Commit ec7e858820fc252a3c7ecc9ffe687dbb4ae0509a
rozwiązanie konfliktów przy ponownym merge dep_trees
Showing
11 changed files
with
148 additions
and
40 deletions
.gitignore
config
corpora/CONLL.ml
1 | 1 | open Xstd |
2 | 2 | open PreTypes |
3 | 3 | |
4 | -let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts | |
4 | +let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts | |
5 | 5 | then f (snd @@ List.find (fun (m,_) -> m = mode) alts) |
6 | 6 | else f (snd @@ List.find (fun (m,_) -> m = PreTypes.Struct) alts) |
7 | 7 | |
... | ... | @@ -54,7 +54,7 @@ let rec string_of_text mode tokens = function |
54 | 54 | |
55 | 55 | |
56 | 56 | (******************) |
57 | - | |
57 | +(*** | |
58 | 58 | let establish_next tokens paths = |
59 | 59 | let n = ExtArray.size tokens in |
60 | 60 | Int.iter 1 (n - 2) (fun i -> |
... | ... | @@ -128,7 +128,7 @@ let match_sentence (p_record,tokens) = |
128 | 128 | with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] |
129 | 129 | |
130 | 130 | let match_corpus corpus = |
131 | - Xlist.map corpus match_sentence | |
131 | + Xlist.map corpus match_sentence***) | |
132 | 132 | |
133 | 133 | (******************) |
134 | 134 | |
... | ... |
diagnostics/LCGfields.ml
... | ... | @@ -90,14 +90,14 @@ let field_of_conll_sentence fields tokens (result : conll_parse_result) = |
90 | 90 | |
91 | 91 | let rec field_of_sentence fields tokens = function |
92 | 92 | RawSentence s -> s |
93 | - | StructSentence(_,paths,last) -> "StructSentence" | |
94 | - | DepSentence(_,paths) -> "DepSentence" | |
93 | + | StructSentence _ -> "StructSentence" | |
94 | + | DepSentence _ -> "DepSentence" | |
95 | 95 | | ENIAMSentence result -> field_of_eniam_sentence fields tokens result |
96 | 96 | | CONLLSentence result -> field_of_conll_sentence fields tokens result |
97 | 97 | | QuotedSentences sentences -> "QuotedSentences" |
98 | 98 | | AltSentence l -> String.concat "\n\t" (Xlist.map l (fun (m, s) -> |
99 | 99 | Visualization.string_of_mode m ^ "\t" ^ (field_of_sentence fields tokens s))) |
100 | - (* | _ -> failwith "field_of_sentence: ni" *) | |
100 | + | _ -> failwith "field_of_sentence: ni" | |
101 | 101 | |
102 | 102 | let rec field_of_paragraph fields tokens = function |
103 | 103 | RawParagraph s -> print_endline "no fields detected: only raw paragraph"; s |
... | ... |
diagnostics/treeChange.ml
1 | 1 | open Xstd |
2 | 2 | open PreTypes |
3 | 3 | |
4 | -let remove_interps paths tokens = | |
4 | +let if_lemma lemmas = function | |
5 | + Lemma(l,_,_) -> List.exists (fun x -> x = l) lemmas | |
6 | + | _ -> false | |
7 | + | |
8 | +let if_cat cats = function | |
9 | + Lemma(_,cat,_) -> List.exists (fun x -> x = cat) cats | |
10 | + | _ -> false | |
11 | + | |
12 | +let if_interps interps token = | |
13 | + let interp = match token with | |
14 | + Lemma(_,_,i) -> i | |
15 | + | _ -> [[[]]] in | |
16 | + let if_interp nr value = | |
17 | + List.exists (fun x -> | |
18 | + List.exists (fun y -> | |
19 | + y = value) (List.nth x nr)) interp in | |
20 | + Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) | |
21 | + | |
22 | +let correct_coordination paths tokens = | |
23 | + let paths_ls = List.mapi (fun i (id,super,label) -> | |
24 | + (i,id,super,label)) (Array.to_list paths) in | |
25 | + | |
26 | + let ps a sons = | |
27 | + print_endline a; | |
28 | + List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; | |
29 | + print_endline "" in | |
30 | + | |
31 | + let rec correct_rec (i,id,super,label) sons = | |
32 | + let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in | |
33 | + (* ps "left:" (List.rev left_s); | |
34 | + ps "right:" right_s; *) | |
35 | + find_father i (List.rev left_s); | |
36 | + find_father i right_s | |
37 | + | |
38 | + and find_father i0 = function | |
39 | + [(i,id,super,label)] -> paths.(i) <- (id,i0,label) | |
40 | + | (a,b,c,d) :: (i,id,super,label) :: t -> | |
41 | + paths.(i) <- (id,i0,label); | |
42 | + if not (if_cat ["conj"; "interp"] (ExtArray.get tokens i).token) | |
43 | + then (prerr_endline "find_father"; failwith "find_father"); | |
44 | + correct_rec (i,id,super,label) (if a < i | |
45 | + then (a,b,c,d) :: t | |
46 | + else List.rev @@ (a,b,c,d) :: t) | |
47 | + | _ -> prerr_endline "find_father"; failwith "find_father" in | |
48 | + | |
49 | + Array.iteri (fun i (id,super,label) -> | |
50 | + if if_cat ["conj"; "interp"] (ExtArray.get tokens i).token | |
51 | + then (let sons = List.filter (fun (_,_,super,_) -> super = i) paths_ls in | |
52 | + if (List.length sons > 2) | |
53 | + then correct_rec (i,id,super,label) sons)) paths; | |
54 | + paths | |
55 | + | |
56 | +let replace_tokens paths tokens = | |
57 | +(* for i = 0 to ExtArray.size tokens - 1 do | |
58 | + print_endline (string_of_int i ^ ": "^ (ExtArray.get tokens i).orth) | |
59 | +done; *) | |
60 | + let find_token orth = Int.fold 0 (ExtArray.size tokens - 1) 0 (fun acc i -> | |
61 | + if (ExtArray.get tokens i).orth = orth then i else acc) in | |
62 | + | |
63 | + let multidot i id0 super0 label0 = | |
64 | + let id1, super1, label1 = paths.(super0) in | |
65 | + let id2, super2, label2 = paths.(super1) in | |
66 | + if (ExtArray.get tokens id1).orth = "." && | |
67 | + (ExtArray.get tokens id2).orth = "." | |
68 | + then | |
69 | + (paths.(super1) <- (find_token "..." ,super2, label2); | |
70 | + paths.(super0) <- (0,-1,""); | |
71 | + paths.(i) <- (0,-1,"")) in | |
72 | + | |
73 | + let brev i id super label = | |
74 | + let if_the_last_dot () = | |
75 | + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> | |
76 | + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in | |
77 | + Array.fold_left (fun acc (i2,s,l) -> | |
78 | + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in | |
79 | + | |
80 | + let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () | |
81 | + then "" | |
82 | + else "." in | |
83 | + | |
84 | + let n_orth = (ExtArray.get tokens id).orth ^ dot in | |
85 | + paths.(i) <- (find_token n_orth,super,label) in | |
86 | + | |
87 | + Array.iteri (fun i (id,super,label) -> | |
88 | + if (ExtArray.get tokens id).orth = "." | |
89 | + then multidot i id super label; | |
90 | + if if_cat ["brev"] (ExtArray.get tokens id).token | |
91 | + then brev i id super label) | |
92 | + paths; | |
93 | + paths | |
94 | + | |
95 | +let remove_interps interp paths tokens = | |
5 | 96 | let paths_ls = Array.to_list paths in |
6 | - Array.iter (fun (id,super,label) -> | |
7 | - if ((ExtArray.get tokens id).orth = "," || | |
8 | - (ExtArray.get tokens id).orth = "." || | |
9 | - (ExtArray.get tokens id).orth = "-") && | |
10 | - not (List.exists (fun (_,super,_) -> super = id) paths_ls) | |
11 | - then paths.(id) <- (0,-1,"")) paths; paths | |
12 | - | |
13 | -let move_comp paths tokens = | |
14 | - let correct_dep (id,super,label) = | |
15 | - let is_comp = function | |
16 | - Lemma(_,"comp",_) -> true | |
17 | - | _ -> false in | |
18 | - if ((ExtArray.get tokens id).orth = "by" || (ExtArray.get tokens id).orth = "że") | |
19 | - && is_comp (ExtArray.get tokens id).token | |
20 | - then (let id_S, super_S, label_S = paths.(super) in | |
21 | - paths.(id) <- (id,super_S,label); | |
22 | - paths.(super) <- (id_S, id, label_S)) in | |
23 | - Array.iter correct_dep paths; paths | |
97 | + Array.iteri (fun i (id,super,label) -> | |
98 | + if (ExtArray.get tokens id).orth = interp && | |
99 | + not (List.exists (fun (_,super,_) -> super = i) paths_ls) | |
100 | + then paths.(i) <- (0,-1,"")) paths; | |
101 | + paths | |
102 | + | |
103 | +let swap_dep paths tokens = | |
104 | + let change_dep i (id,super,label) = | |
105 | + let id_S, super_S, label_S = paths.(super) in | |
106 | + paths.(i) <- (id,super_S,label); | |
107 | + paths.(super) <- (id_S, id, label_S) in | |
108 | + let rec correct_dep i (id,super,label) = | |
109 | + let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; | |
110 | + "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in | |
111 | + if (if_cat ["comp"] (ExtArray.get tokens id).token && | |
112 | + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"] (ExtArray.get tokens super).token) || | |
113 | + (if_cat ["conj"] (ExtArray.get tokens id).token && | |
114 | + if_cat ["fin"; "praet"; "winien"; "pred"; "imps"] (ExtArray.get tokens super).token && | |
115 | + not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths))) || | |
116 | + (if_cat ["ppron3"] (ExtArray.get tokens id).token && | |
117 | + if_interps [5,"praep"] (ExtArray.get tokens id).token) || | |
118 | + (if_lemma adv_relators (ExtArray.get tokens id).token && | |
119 | + if_cat ["fin"; "praet"; "winien"; "imps"; "subst"; "pred"] (ExtArray.get tokens super).token) | |
120 | + then | |
121 | + change_dep i (id,super,label); | |
122 | + if (if_lemma adv_relators (ExtArray.get tokens id).token && | |
123 | + if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) | |
124 | + then correct_dep i paths.(i) in | |
125 | + Array.iteri correct_dep paths; paths | |
... | ... |
parser/LCGlexicon.ml
... | ... | @@ -596,8 +596,8 @@ let make_adjp numbers cases genders grads d lemma cat = (* FIXME: usunąć niekt |
596 | 596 | with Not_found -> l) |
597 | 597 | | fnum,frame -> failwith ("make_adjp: " ^ lemma ^ ": " ^ WalStringOf.frame lemma frame)) in |
598 | 598 | |
599 | -let adv_relators = Xlist.fold [ | |
600 | - "jak","Attr",[Int;Rel]; | |
599 | +let adv_relators = Xlist.fold [ (* przy zmianie kluczy, trzeba też zmienić analogicznie zawartość *) | |
600 | +	"jak","Attr",[Int;Rel]; (* listy adv_relators w procedurze swap_dep w pliku ../diagnostics/treeChange.ml *) | 
601 | 601 | "skąd","abl",[Int;Rel]; |
602 | 602 | "dokąd","adl",[Int;Rel]; |
603 | 603 | "gdzie","locat",[Int;Rel]; |
... | ... | @@ -1169,6 +1169,7 @@ let rec process_interp (d:PreTypes.token_record) = function (* FIXME: rozpoznawa |
1169 | 1169 | [LCGrenderer.make_frame_simple quant t d ( batrs)] |
1170 | 1170 | | _,"xxx",[] -> [] (* FIXME *) |
1171 | 1171 | | ".","interp",[] -> [LCGrenderer.make_frame_simple [] ["dot"] d (make_node "." "interp" d.weight 0 [])] (* FIXME: to jest potrzebne przy CONLL *) |
1172 | + | "…","interp",[] -> [LCGrenderer.make_frame_simple [] ["multidot"] d (make_node "…" "interp" d.weight 0 [])] (* FIXME: to jest potrzebne przy CONLL *) | |
1172 | 1173 | | lemma,"brev",_ -> [LCGrenderer.make_frame_simple [] ["brev"] d (make_node lemma "brev" d.weight 0 [])] (* FIXME: to jest potrzebne przy CONLL *) |
1173 | 1174 | | "<conll_root>","interp",[] -> |
1174 | 1175 | let batrs = (make_node "<conll_root>" "interp" d.weight 0 []) in |
... | ... |
parser/LCGvalence.ml
... | ... | @@ -276,7 +276,7 @@ let get_nodes = function |
276 | 276 | Node t -> |
277 | 277 | let attrs,b = extract_nosem [] t.attrs in |
278 | 278 | (* let t = if t.pred = "<query1>" || t.pred = "<query2>" || t.pred = "<query3>" || t.pred = "<query4>" || t.pred = "<query5>" || t.pred = "<query6>" then {t with agf=CORE} else t in *) |
279 | - let t = if t.pred = "<sentence>" || t.pred = "pro-komunikować" then {t with agf=CORE} else t in (* FIXME: przetestować na mowie niezależnej *) | |
279 | + let t = if t.pred = "<conll_root>" || t.pred = "<sentence>" || t.pred = "pro-komunikować" then {t with agf=CORE} else t in (* FIXME: przetestować na mowie niezależnej *) | |
280 | 280 | if t.agf = NOGF then failwith ("get_nodes agf=NOGF: " ^ t.pred) else |
281 | 281 | if b then {t with amorf=mark_nosem_morf t.amorf; attrs=attrs} else t |
282 | 282 | | _ -> failwith "get_nodes" |
... | ... |
parser/exec.ml
... | ... | @@ -200,8 +200,12 @@ let conll_parse_sentence timeout test_only_flag paths tokens = |
200 | 200 | let result = empty_conll_parse_result in |
201 | 201 | let time2 = time_fun () in |
202 | 202 | try |
203 | - let paths = TreeChange.remove_interps paths tokens in | |
204 | - let paths = TreeChange.move_comp paths tokens in | |
203 | + let paths = TreeChange.replace_tokens paths tokens in | |
204 | + let paths = TreeChange.remove_interps "." paths tokens in | |
205 | + let paths = TreeChange.correct_coordination paths tokens in | |
206 | + let paths = TreeChange.remove_interps "," paths tokens in | |
207 | + let paths = TreeChange.remove_interps "-" paths tokens in | |
208 | + let paths = TreeChange.swap_dep paths tokens in | |
205 | 209 | let dep_chart = LCGlexicon.dep_create paths tokens in |
206 | 210 | let dep_chart,references = LCGchart.dep_lazify dep_chart in |
207 | 211 | let result = if test_only_flag then result else {result with dep_chart=dep_chart} in |
... | ... |
parser/makefile
... | ... | @@ -13,7 +13,7 @@ LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexO |
13 | 13 | DISAMB= disambSelPref.ml disambLemma.ml |
14 | 14 | SEM= semGraph.ml semTypes.ml semStringOf.ml semLatexOf.ml semMmlOf.ml semMrl.ml |
15 | 15 | #SEM= semGraph.ml semTypes.ml semStringOf.ml semMmlOf.ml semMrl.ml |
16 | -EXEC= execTypes.ml visualization.ml ../diagnostics/treeChange.ml exec.ml ../diagnostics/LCGfields.ml ../diagnostics/compTrees.ml | |
16 | +EXEC= execTypes.ml visualization.ml ../diagnostics/treeChange.ml exec.ml ../diagnostics/LCGfields.ml #../diagnostics/compTrees.ml | |
17 | 17 | |
18 | 18 | all: |
19 | 19 | $(OCAMLOPT) -o pipe $(OCAMLOPTFLAGS) $(PRE) $(LCG) $(DISAMB) $(SEM) $(EXEC) pipe.ml |
... | ... |
parser/pipe.ml
... | ... | @@ -129,7 +129,7 @@ let process_id s = |
129 | 129 | let process_conll_corpus filename = |
130 | 130 | let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in |
131 | 131 | print_endline "process_conll_corpus"; |
132 | - (* let corpus = [List.hd corpus] in *) | |
132 | + let corpus = [List.hd corpus] in | |
133 | 133 | let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in |
134 | 134 | Xlist.iter corpus (fun query -> |
135 | 135 | let id = process_id (get_query_id query) in |
... | ... |
parser/visualization.ml
... | ... | @@ -799,12 +799,13 @@ let html_of_eniam_sentence path tokens (result : eniam_parse_result) = |
799 | 799 | (* | NotTranslated -> "not_translated: \n" *) |
800 | 800 | | Parsed -> |
801 | 801 | (* print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; *) |
802 | - (* print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; *) | |
802 | + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; | |
803 | 803 | (* LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; *) |
804 | - sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size (*^ *) | |
804 | + sprintf "parsed: paths_size=%d chart_size=%d dependency_tree_size=%d\n" result.paths_size result.chart_size result.dependency_tree_size ^ | |
805 | 805 | (* sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ *) |
806 | - (* sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ *) | |
806 | + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ | |
807 | 807 | (* sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix *) |
808 | + "" | |
808 | 809 | | _ -> failwith "html_of_eniam_sentence" |
809 | 810 | |
810 | 811 | let html_of_conll_sentence path tokens (result : conll_parse_result) = |
... | ... | @@ -841,12 +842,13 @@ let html_of_conll_sentence path tokens (result : conll_parse_result) = |
841 | 842 | (* | NotTranslated -> "not_translated: \n" *) |
842 | 843 | | Parsed -> |
843 | 844 | (* print_simplified_dependency_tree path (result.file_prefix ^ "_simplified_dependency_tree") tokens result.dependency_tree; *) |
844 | - (* print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; *) | |
845 | + print_dependency_tree path (result.file_prefix ^ "_dependency_tree") result.dependency_tree; | |
845 | 846 | (* LCGlatexOf.print_dependency_tree path (result.file_prefix ^ "_dependency_tree_references") result.dependency_tree; *) |
846 | - sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size (* ^ *) | |
847 | + sprintf "parsed: paths_size=%d dependency_tree_size=%d\n" result.paths_size result.dependency_tree_size ^ | |
847 | 848 | (* sprintf "<BR><A HREF=\"%s_simplified_dependency_tree.png\">Simplified Dependency Tree</A>\n" result.file_prefix ^ *) |
848 | - (* sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ *) | |
849 | + sprintf "<BR><A HREF=\"%s_dependency_tree.png\">Dependency Tree</A>\n" result.file_prefix ^ | |
849 | 850 | (* sprintf "<BR><A HREF=\"%s_dependency_tree_references.pdf\">Dependency Tree References</A>\n" result.file_prefix *) |
851 | + "" | |
850 | 852 | | _ -> failwith "html_of_conll_sentence" |
851 | 853 | |
852 | 854 | let html_of_sem_sentence path tokens (result : semantic_processing_result) = |
... | ... |