diff --git a/.gitignore b/.gitignore index 9797bc4..ec5eb97 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ *.a *.cmxa .DS_Store -pre *.aux *.log *.tex.backup diff --git a/config b/config index aa3cc69..ebc19e5 100644 --- a/config +++ b/config @@ -45,4 +45,4 @@ SWIGRA_ENABLED=false SWIGRA_PATH=../swigra/parser/ # Is sentence selection enabled -SENTENCE_SELECTION_ENABLED=true +SENTENCE_SELECTION_ENABLED=false diff --git a/corpora/CONLL.ml b/corpora/CONLL.ml index f95971b..d0f5a35 100644 --- a/corpora/CONLL.ml +++ b/corpora/CONLL.ml @@ -1,7 +1,7 @@ open Xstd open PreTypes -let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts +let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts then f (snd @@ List.find (fun (m,_) -> m = mode) alts) else f (snd @@ List.find (fun (m,_) -> m = PreTypes.Struct) alts) @@ -54,7 +54,7 @@ let rec string_of_text mode tokens = function (******************) - +(*** let establish_next tokens paths = let n = ExtArray.size tokens in Int.iter 1 (n - 2) (fun i -> @@ -128,7 +128,7 @@ let match_sentence (p_record,tokens) = with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] let match_corpus corpus = - Xlist.map corpus match_sentence + Xlist.map corpus match_sentence***) (******************) diff --git a/diagnostics/LCGfields.ml b/diagnostics/LCGfields.ml index 1630bbe..92b1d59 100644 --- a/diagnostics/LCGfields.ml +++ b/diagnostics/LCGfields.ml @@ -90,14 +90,14 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) = let rec field_of_sentence fields tokens = function RawSentence s -> s - | StructSentence(_,paths,last) -> "StructSentence" - | DepSentence(_,paths) -> "DepSentence" + | StructSentence _ -> "StructSentence" + | DepSentence _ -> "DepSentence" | ENIAMSentence result -> field_of_eniam_sentence fields tokens result | CONLLSentence result -> field_of_conll_sentence fields tokens result | QuotedSentences sentences -> "QuotedSentences" | AltSentence l -> String.concat "\n\t" (Xlist.map l (fun (m, s) -> Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields tokens s))) - (* | _ -> failwith "field_of_sentence: ni" *) + | _ -> failwith "field_of_sentence: ni" let rec field_of_paragraph fields tokens = function RawParagraph s -> print_endline "no fields detected: only raw paragraph"; s diff --git a/diagnostics/treeChange.ml b/diagnostics/treeChange.ml index a1e6a19..77b7153 100644 --- a/diagnostics/treeChange.ml +++ b/diagnostics/treeChange.ml @@ -1,23 +1,125 @@ open Xstd open PreTypes -let remove_interps paths tokens = +let if_lemma lemmas = function + Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas + | _ -> false + +let if_cat cats = function + Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats + | _ -> false + +let if_interps interps token = + let interp = match token with + Lemma(_,_,i) -> i + | _ -> [[[]]] in + let if_interp nr value = + List.exists (fun x -> + List.exists (fun y -> + y = value) (List.nth x nr)) interp in + Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) + +let correct_coordination paths tokens = + let paths_ls = List.mapi (fun i (id,super,label) -> + (i,id,super,label)) (Array.to_list paths) in + + let ps a sons = + print_endline a; + List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; + print_endline "" in + + let rec correct_rec (i,id,super,label) sons = + let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in + (* ps "left:" (List.rev left_s); + ps "right:" right_s; *) + find_father i (List.rev left_s); + find_father i right_s + + and find_father i0 = function + [(i,id,super,label)] -> paths.(i) <- (id,i0,label) + | (a,b,c,d) :: (i,id,super,label) :: t -> + paths.(i) <- (id,i0,label); + if not (if_cat ["conj"; "interp"] (ExtArray.get tokens i).token) + then (prerr_endline "find_father"; failwith "find_father"); + correct_rec (i,id,super,label) (if a < i + then (a,b,c,d) :: t + else List.rev @@ (a,b,c,d) :: t) + | _ -> prerr_endline "find_father"; failwith "find_father" in + + Array.iteri (fun i (id,super,label) -> + if if_cat ["conj"; "interp"] (ExtArray.get tokens i).token + then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in + if (List.length sons > 2) + then correct_rec (i,id,super,label) sons)) paths; + paths + +let replace_tokens paths tokens = +(* for i = 0 to ExtArray.size tokens - 1 do + print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth) +done; *) + let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> + if (ExtArray.get tokens i).orth = orth then i else acc) in + + let multidot i id0 super0 label0 = + let id1, super1, label1 = paths.(super0) in + let id2, super2, label2 = paths.(super1) in + if (ExtArray.get tokens id1).orth = "." && + (ExtArray.get tokens id2).orth = "." + then + (paths.(super1) <- (find_token "..." ,super2, label2); + paths.(super0) <- (0,-1,""); + paths.(i) <- (0,-1,"")) in + + let brev i id super label = + let if_the_last_dot () = + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in + Array.fold_left (fun acc (i2,s,l) -> + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in + + let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () + then "" + else "." in + + let n_orth = (ExtArray.get tokens id).orth ^ dot in + paths.(i) <- (find_token n_orth,super,label) in + + Array.iteri (fun i (id,super,label) -> + if (ExtArray.get tokens id).orth = "." + then multidot i id super label; + if if_cat ["brev"] (ExtArray.get tokens id).token + then brev i id super label) + paths; + paths + +let remove_interps interp paths tokens = let paths_ls = Array.to_list paths in - Array.iter (fun (id,super,label) -> - if ((ExtArray.get tokens id).orth = "," || - (ExtArray.get tokens id).orth = "." || - (ExtArray.get tokens id).orth = "-") && - not (List.exists (fun (_,super,_) -> super = id) paths_ls) - then paths.(id) <- (0,-1,"")) paths; paths - -let move_comp paths tokens = - let correct_dep (id,super,label) = - let is_comp = function - Lemma(_,"comp",_) -> true - | _ -> false in - if ((ExtArray.get tokens id).orth = "by" || (ExtArray.get tokens id).orth = "że") - && is_comp (ExtArray.get tokens id).token - then (let id_S, super_S, label_S = paths.(super) in - paths.(id) <- (id,super_S,label); - paths.(super) <- (id_S, id, label_S)) in - Array.iter correct_dep paths; paths + Array.iteri (fun i (id,super,label) -> + if (ExtArray.get tokens id).orth = interp && + not (List.exists (fun (_,super,_) -> super = i) paths_ls) + then paths.(i) <- (0,-1,"")) paths; + paths + +let swap_dep paths tokens = + let change_dep i (id,super,label) = + let id_S, super_S, label_S = paths.(super) in + paths.(i) <- (id,super_S,label); + paths.(super) <- (id_S, id, label_S) in + let rec correct_dep i (id,super,label) = + let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; + "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in + if (if_cat ["comp"] (ExtArray.get tokens id).token && + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"] (ExtArray.get tokens super).token) || + (if_cat ["conj"] (ExtArray.get tokens id).token && + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"] (ExtArray.get tokens super).token && + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) || + (if_cat ["ppron3"] (ExtArray.get tokens id).token && + if_interps [5,"praep"] (ExtArray.get tokens id).token) || + (if_lemma adv_relators (ExtArray.get tokens id).token && + if_cat ["fin"; "praet"; "winien"; "imps"; "subst"; "pred"] (ExtArray.get tokens super).token) + then + change_dep i (id,super,label); + if (if_lemma adv_relators (ExtArray.get tokens id).token && + if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) + then correct_dep i paths.(i) in + Array.iteri correct_dep paths; paths diff --git a/parser/LCGlexicon.ml b/parser/LCGlexicon.ml index 58292a7..96097c7 100644 --- a/parser/LCGlexicon.ml +++ b/parser/LCGlexicon.ml @@ -596,8 +596,8 @@ let make_adjp numbers cases genders grads d lemma cat = (* FIXME: usunąć niekt with Not_found -> l) | fnum,frame -> failwith ("make_adjp: " ^ lemma ^ ": " ^ WalStringOf.frame lemma frame)) in -let adv_relators = Xlist.fold [ - "jak","Attr",[Int;Rel]; +let adv_relators = Xlist.fold [ (* przy zmianie kluczy, trzeba też zmienić analogicznie zawartość *) + "jak","Attr",[Int;Rel]; (* listy adv_relators w procedurze move_comp w pliku ../diagnostics/treeChange.ml *) "skąd","abl",[Int;Rel]; "dokąd","adl",[Int;Rel]; "gdzie","locat",[Int;Rel]; @@ -1169,6 +1169,7 @@ let rec process_interp (d:PreTypes.token_record) = function (* FIXME: rozpoznawa [LCGrenderer.make_frame_simple quant t d ( batrs)] | _,"xxx",[] -> [] (* FIXME *) | ".","interp",[] -> [LCGrenderer.make_frame_simple [] ["dot"] d (make_node "." "interp" d.weight 0 [])] (* FIXME: to jest potrzebne przy CONLL *) + | "…","interp",[] -> [LCGrenderer.make_frame_simple [] ["multidot"] d (make_node "…" "interp" d.weight 0 [])] (* FIXME: to jest potrzebne przy CONLL *) | lemma,"brev",_ -> [LCGrenderer.make_frame_simple [] ["brev"] d (make_node lemma "brev" d.weight 0 [])] (* FIXME: to jest potrzebne przy CONLL *) | "<conll_root>","interp",[] -> let batrs = (make_node "<conll_root>" "interp" d.weight 0 []) in diff --git a/parser/LCGvalence.ml b/parser/LCGvalence.ml index 1a1740c..5fda46a 100644 --- a/parser/LCGvalence.ml +++ b/parser/LCGvalence.ml @@ -276,7 +276,7 @@ let get_nodes = function Node t -> let attrs,b = extract_nosem [] t.attrs in (* let t = if t.pred = "<query1>" || t.pred = "<query2>" || t.pred = "<query3>" || t.pred = "<query4>" || t.pred = "<query5>" || t.pred = "<query6>" then {t with agf=CORE} else t in *) - let t = if t.pred = "<sentence>" || t.pred = "pro-komunikować" then {t with agf=CORE} else t in (* FIXME: przetestować na mowie niezależnej *) + let t = if t.pred = "<conll_root>" || t.pred = "<sentence>" || t.pred = "pro-komunikować" then {t with agf=CORE} else t in (* FIXME: przetestować na mowie niezależnej *) if t.agf = NOGF then failwith ("get_nodes agf=NOGF: " ^ t.pred) else if b then {t with amorf=mark_nosem_morf t.amorf; attrs=attrs} else t | _ -> failwith "get_nodes" diff --git a/parser/exec.ml b/parser/exec.ml index ed1b891..160a438 100644 --- a/parser/exec.ml +++ b/parser/exec.ml @@ -200,8 +200,12 @@ let conll_parse_sentence timeout test_only_flag paths tokens = let result = empty_conll_parse_result in let time2 = time_fun () in try - let paths = TreeChange.remove_interps paths tokens in - let paths = TreeChange.move_comp paths tokens in + let paths = TreeChange.replace_tokens paths tokens in + let paths = TreeChange.remove_interps "." paths tokens in + let paths = TreeChange.correct_coordination paths tokens in + let paths = TreeChange.remove_interps "," paths tokens in + let paths = TreeChange.remove_interps "-" paths tokens in + let paths = TreeChange.swap_dep paths tokens in let dep_chart = LCGlexicon.dep_create paths tokens in let dep_chart,references = LCGchart.dep_lazify dep_chart in let result = if test_only_flag then result else {result with dep_chart=dep_chart} in diff --git a/parser/makefile b/parser/makefile index 3bd901f..661052a 100755 --- a/parser/makefile +++ b/parser/makefile @@ -13,7 +13,7 @@ LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexO DISAMB= disambSelPref.ml disambLemma.ml SEM= semGraph.ml semTypes.ml semStringOf.ml semLatexOf.ml semMmlOf.ml semMrl.ml #SEM= semGraph.ml semTypes.ml semStringOf.ml semMmlOf.ml semMrl.ml -EXEC= execTypes.ml visualization.ml ../diagnostics/treeChange.ml exec.ml ../diagnostics/LCGfields.ml ../diagnostics/compTrees.ml +EXEC= execTypes.ml visualization.ml ../diagnostics/treeChange.ml exec.ml ../diagnostics/LCGfields.ml #../diagnostics/compTrees.ml all: $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) pipe.ml diff --git a/parser/pipe.ml b/parser/pipe.ml index dc6a813..f386b5f 100644 --- a/parser/pipe.ml +++ b/parser/pipe.ml @@ -129,7 +129,7 @@ let process_id s = let process_conll_corpus filename = let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in print_endline "process_conll_corpus"; - (* let corpus = [List.hd corpus] in *) + let corpus = [List.hd corpus] in let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in Xlist.iter corpus (fun query -> let id = process_id (get_query_id query) in diff --git a/parser/visualization.ml b/parser/visualization.ml index 4a3e886..06574c5 100644 --- a/parser/visualization.ml +++ b/parser/visualization.ml @@ -799,12 +799,13 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) = (* | NotTranslated -> "not_translated: \n" *) | Parsed -> (* print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; *) - (* print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; *) + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; (* LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; *) - sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size (*^ *) + sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ (* sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ *) - (* sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ *) + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ (* sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix *) + "" | _ -> failwith "html_of_eniam_sentence" let html_of_conll_sentence path tokens (result : conll_parse_result) = @@ -841,12 +842,13 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = (* | NotTranslated -> "not_translated: \n" *) | Parsed -> (* print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; *) - (* print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; *) + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; (* LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; *) - sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size (* ^ *) + sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ (* sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ *) - (* sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ *) + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ (* sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix *) + "" | _ -> failwith "html_of_conll_sentence" let html_of_sem_sentence path tokens (result : semantic_processing_result) =