Commit 0bce3b2e8d3baa9d0907f6421e81ca3525540b0e
Integracja parsera zależnościowego z ENIAMlexSemantics
Showing 18 changed files with 551 additions and 273 deletions.
.gitignore
corpora/CONLL.ml
... | ... | @@ -220,13 +220,14 @@ let match_corpus corpus = |
220 | 220 | |
221 | 221 | (******************) |
222 | 222 | |
223 | +exception Comment_line | |
223 | 224 | exception Empty_line |
224 | 225 | exception Empty_sentence |
225 | 226 | exception Id_line of string |
226 | 227 | |
227 | 228 | let load_token in_channel = |
228 | 229 | let fail line = |
229 | - (* failwith ("load_token: " ^ line) *) | |
230 | + print_endline ("load_token: " ^ line); | |
230 | 231 | () in |
231 | 232 | let int_of_super = function |
232 | 233 | "_" -> -1 |
... | ... | @@ -247,7 +248,8 @@ let load_token in_channel = |
247 | 248 | else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line |
248 | 249 | then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in |
249 | 250 | raise (Id_line id) |
250 | - else failwith ("load_token: " ^ line) | |
251 | + else raise Comment_line | |
252 | + (* failwith ("load_token: " ^ line) *) | |
251 | 253 | else |
252 | 254 | match Xstring.split "\t" line with |
253 | 255 | [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] -> |
... | ... | @@ -272,6 +274,7 @@ let load_sentence in_channel = |
272 | 274 | if id_a <> conll_id then failwith "load_sentence: different ids" else |
273 | 275 | pom ((id_a,super,label) :: rev_paths) id |
274 | 276 | with Id_line new_id -> pom rev_paths new_id |
277 | + | Comment_line -> pom rev_paths id | |
275 | 278 | | Empty_line -> rev_paths, id |
276 | 279 | | End_of_file -> if rev_paths = [] |
277 | 280 | then raise End_of_file |
... | ... |
corpora/CONLL_adapter.ml
... | ... | @@ -42,6 +42,34 @@ let if_interps interps token = |
42 | 42 | ) interp in |
43 | 43 | Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) |
44 | 44 | |
45 | +let change_dep paths i (id,super,label) = | |
46 | + let id_S, super_S, label_S = paths.(super) in | |
47 | + paths.(i) <- (id,super_S,label); | |
48 | + paths.(super) <- (id_S, id, label_S) | |
49 | + | |
50 | +let correct_injection paths tokens = Array.iteri (fun i (id,super,label) -> | |
51 | + if label = "punct" then (*musi być pierwszym tokenem o tym ojcu*) | |
52 | + let j = Int.fold (i+1) (Array.length paths - 1) 0 (fun acc n -> | |
53 | + let i2,s2,l2 = paths.(n) in | |
54 | + if super = s2 | |
55 | + then if l2 = "punct" | |
56 | + then n | |
57 | + else 0 | |
58 | + else acc | |
59 | + ) in | |
60 | + let k = Int.fold_down (i-1) 1 i (fun acc n -> | |
61 | + let i2,s2,l2 = paths.(n) in | |
62 | + if super = s2 | |
63 | + then 0 | |
64 | + else acc | |
65 | + ) in | |
66 | + if k == i && j <> 0 && i < super && super < j | |
67 | + then | |
68 | + (paths.(i) <- (0,-1,""); | |
69 | + paths.(j) <- (0,-1,"")) | |
70 | + ) paths; | |
71 | + paths | |
72 | + | |
45 | 73 | let correct_coordination1 paths tokens = |
46 | 74 | let paths_ls = List.mapi (fun i (id,super,label) -> |
47 | 75 | (i,id,super,label)) (Array.to_list paths) in |
... | ... | @@ -136,15 +164,15 @@ let correct_coordination2 paths tokens = |
136 | 164 | let paths_ls () = List.mapi (fun i (id,super,label) -> |
137 | 165 | (i,id,super,label)) (Array.to_list paths_c) in |
138 | 166 | |
139 | - (* let ps a sons = | |
167 | + let ps a sons = | |
140 | 168 | print_endline a; |
141 | 169 | List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; |
142 | - print_endline "" in *) | |
170 | + print_endline "" in | |
143 | 171 | |
144 | 172 | let rec correct_rec (i,id,super,label) sons = |
145 | 173 | let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in |
146 | - (* ps "left:" (List.rev left_s); | |
147 | - ps "right:" right_s; *) | |
174 | + ps "left:" (List.rev left_s); | |
175 | + ps "right:" right_s; | |
148 | 176 | find_father i (List.rev left_s); |
149 | 177 | find_father i right_s |
150 | 178 | |
... | ... | @@ -154,23 +182,35 @@ let correct_coordination2 paths tokens = |
154 | 182 | paths_c.(i) <- (id,i0,label); |
155 | 183 | if not (if_cat ["conj"] (ExtArray.get tokens i).token || |
156 | 184 | (ExtArray.get tokens i).orth = ",") |
157 | - then failwith "find_father"; | |
185 | + then failwith "find_father1"; | |
158 | 186 | correct_rec (i,id,super,label) (if a < i |
159 | 187 | then (a,b,c,d) :: t |
160 | 188 | else List.rev @@ (a,b,c,d) :: t) |
161 | - | _ -> failwith "find_father" in | |
189 | + | [] -> failwith "find_father2" in | |
162 | 190 | |
163 | 191 | let check_previous_for_interp i = |
164 | 192 | if i >= 0 && (ExtArray.get tokens i).orth = "," && |
165 | 193 | not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c)) |
166 | 194 | then paths_c.(i) <- (0,-1,"") in |
167 | 195 | |
196 | + let filter_comp_construction sons = | |
197 | + let rec pom acc = function | |
198 | + (i1,id1,super1,label1) :: (i2,id2,super2,label2) :: t -> | |
199 | + if if_cat ["interp"] (ExtArray.get tokens i1).token && | |
200 | + if_cat ["comp"] (ExtArray.get tokens i2).token | |
201 | + then pom acc t | |
202 | + else pom ((i1,id1,super1,label1) :: acc) ((i2,id2,super2,label2) :: t) | |
203 | + | h :: t -> pom (h :: acc) t | |
204 | + | [] -> List.rev acc in | |
205 | + pom [] sons in | |
206 | + | |
168 | 207 | Array.iteri (fun i (id,super,label) -> |
169 | 208 | if if_cat ["conj"] (ExtArray.get tokens i).token || |
170 | 209 | (ExtArray.get tokens i).orth = "," |
171 | 210 | then |
172 | 211 | (check_previous_for_interp (i-1); |
173 | 212 | let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in |
213 | + (* let sons = filter_comp_construction sons in *) | |
174 | 214 | if (List.length sons > 2) |
175 | 215 | then correct_rec (i,id,super,label) sons)) paths_c; |
176 | 216 | paths_c |
... | ... | @@ -206,15 +246,16 @@ done; *) |
206 | 246 | |
207 | 247 | let brev i id super label = |
208 | 248 | let if_the_last_dot () = |
209 | - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> | |
210 | - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in | |
211 | - Array.fold_left (fun acc (i2,s,l) -> | |
212 | - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in | |
249 | + try | |
250 | + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> | |
251 | + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in | |
252 | + Array.fold_left (fun acc (i2,s,l) -> | |
253 | + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths | |
254 | + with Not_found -> true in | |
213 | 255 | |
214 | 256 | let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () |
215 | 257 | then "" |
216 | 258 | else "." in |
217 | - | |
218 | 259 | let n_orth = (ExtArray.get tokens id).orth ^ dot in |
219 | 260 | paths.(i) <- (find_token n_orth,super,label) in |
220 | 261 | |
... | ... | @@ -317,6 +358,16 @@ let correct_interp_with_father_0 paths tokens = |
317 | 358 | then paths.(i1) <- (id1,0,label1)) paths) paths; |
318 | 359 | paths |
319 | 360 | |
361 | +let corect_complm paths tokens = | |
362 | + Array.iteri (fun i (id,super,label) -> | |
363 | + if label = "complm" && super > 0 | |
364 | + then | |
365 | + let i2,s2,l2 = paths.(super) in | |
366 | + if if_cat ["conj"] (ExtArray.get tokens i2).token | |
367 | + then change_dep paths i (id,super,label) | |
368 | + ) paths; | |
369 | + paths | |
370 | + | |
320 | 371 | let remove_interps interp paths tokens = |
321 | 372 | let paths_ls = Array.to_list paths in |
322 | 373 | Array.iteri (fun i (id,super,label) -> |
... | ... | @@ -339,10 +390,6 @@ let correct_passive_voice paths tokens = |
339 | 390 | paths |
340 | 391 | |
341 | 392 | let swap_dep paths tokens = |
342 | - let change_dep i (id,super,label) = | |
343 | - let id_S, super_S, label_S = paths.(super) in | |
344 | - paths.(i) <- (id,super_S,label); | |
345 | - paths.(super) <- (id_S, id, label_S) in | |
346 | 393 | let rec correct_dep i (id,super,label) = |
347 | 394 | let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; |
348 | 395 | "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in |
... | ... | @@ -356,7 +403,7 @@ let swap_dep paths tokens = |
356 | 403 | (if_lemma adv_relators (ExtArray.get tokens id).token && |
357 | 404 | if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token) |
358 | 405 | then |
359 | - change_dep i (id,super,label); | |
406 | + change_dep paths i (id,super,label); | |
360 | 407 | if (if_lemma adv_relators (ExtArray.get tokens id).token && |
361 | 408 | if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) |
362 | 409 | then correct_dep i paths.(i) in |
... | ... | @@ -367,7 +414,11 @@ let swap_dep paths tokens = |
367 | 414 | nieobsługiwana na razie koordynacja strony biernej - zarówno czasowniki posiłkowe, jak i imiesłowy |
368 | 415 | nieobsługiwana na razie koordynacja podrzędników spójników podrzędnych *) |
369 | 416 | |
370 | -let convert_dep_tree id first_try paths tokens = | |
417 | +let convert_dep_tree path first_try paths tokens = | |
418 | + File.file_out (path ^ "/pre_text_unmodified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
419 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; | |
420 | + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); | |
421 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); | |
371 | 422 | let paths = Array.copy paths in |
372 | 423 | let paths = |
373 | 424 | if first_try |
... | ... | @@ -375,16 +426,27 @@ let convert_dep_tree id first_try paths tokens = |
375 | 426 | let pom = replace_tokens paths tokens in |
376 | 427 | let pom = (remove_interps ".") pom tokens in |
377 | 428 | let pom = replace_hyphens pom tokens in |
429 | + let pom = correct_injection pom tokens in | |
378 | 430 | let pom = correct_coordination1 pom tokens in |
379 | 431 | let pom = correct_interp_with_father_0 pom tokens in |
380 | - let pom = correct_coordination2 pom tokens in | |
381 | - let pom = remove_interps "," pom tokens in | |
432 | + (* File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
433 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; | |
434 | + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); | |
435 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); *) | |
436 | + let pom = try corect_complm pom tokens with | e -> print_endline (Printexc.to_string e); pom in | |
437 | + let pom = try | |
438 | + let pom2 = correct_coordination2 pom tokens in | |
439 | + remove_interps "," pom2 tokens | |
440 | + with | |
441 | + | _ -> (let pom2 = remove_interps "," pom tokens in | |
442 | + correct_coordination2 pom2 tokens) in | |
382 | 443 | let pom = correct_passive_voice pom tokens in |
383 | 444 | praet_qub_aglt pom tokens |
384 | 445 | else |
385 | - swap_dep paths tokens in | |
386 | - (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
387 | - Printf.fprintf file "%s\n" Visualization.html_header; | |
388 | - Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | |
389 | - Printf.fprintf file "%s\n" Visualization.html_trailer); *) | |
446 | + paths in | |
447 | + (* swap_dep paths tokens in *) | |
448 | + File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | |
449 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; | |
450 | + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); | |
451 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); | |
390 | 452 | paths |
... | ... |
corpora/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa eniam-exec.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml |
... | ... |
corpora/test_conll.ml
... | ... | @@ -48,7 +48,7 @@ let clarify_categories senses token = |
48 | 48 | | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) |
49 | 49 | | _ -> [] |
50 | 50 | |
51 | -let create_chart tokens lex_sems paths last = | |
51 | +(* let create_chart tokens lex_sems paths last = | |
52 | 52 | ENIAM_LCGrenderer.reset_variable_numbers (); |
53 | 53 | let chart = ENIAM_LCGchart.make last in |
54 | 54 | let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> |
... | ... | @@ -59,7 +59,7 @@ let create_chart tokens lex_sems paths last = |
59 | 59 | let cats = clarify_categories ["X"] t in |
60 | 60 | let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in |
61 | 61 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in |
62 | - chart | |
62 | + chart *) | |
63 | 63 | |
64 | 64 | let rec split_sons left id right = function |
65 | 65 | [] -> List.rev (List.sort compare left), List.sort compare right |
... | ... | @@ -85,7 +85,7 @@ let create_dep_chart tokens lex_sems paths = |
85 | 85 | ENIAM_LCGrenderer.reset_variable_names (); |
86 | 86 | ENIAM_LCGrenderer.add_variable_numbers (); |
87 | 87 | let cats = clarify_categories ["X"] t in |
88 | - let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | |
88 | + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in | |
89 | 89 | IntMap.add nodes i l) in |
90 | 90 | (* print_endline "create_dep_chart 3"; *) |
91 | 91 | let x = dep_create_rec nodes sons 0 in |
... | ... | @@ -93,7 +93,7 @@ let create_dep_chart tokens lex_sems paths = |
93 | 93 | x |
94 | 94 | |
95 | 95 | |
96 | -let test_example path id tokens lex_sems paths last = | |
96 | +(* let test_example path id tokens lex_sems paths last = | |
97 | 97 | ENIAM_LCGreductions.reset_variant_label (); |
98 | 98 | let chart = create_chart tokens lex_sems paths last in |
99 | 99 | ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart; |
... | ... | @@ -119,43 +119,45 @@ let test_example path id tokens lex_sems paths last = |
119 | 119 | ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; |
120 | 120 | ()) |
121 | 121 | else print_endline "not reduced") |
122 | - else print_endline "not parsed" | |
122 | + else print_endline "not parsed" *) | |
123 | 123 | |
124 | -let test_dep_example path id tokens lex_sems paths = | |
124 | +let rec test_dep_example path id tokens lex_sems first_try paths = | |
125 | + (* print_endline "test_dep_example 1"; *) | |
126 | + let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in | |
125 | 127 | try |
126 | - ENIAM_LCGreductions.reset_variant_label (); | |
127 | - print_endline "test_dep_example 1"; | |
128 | - let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in | |
129 | - print_endline "test_dep_example 2"; | |
130 | - (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) | |
131 | - let chart = create_dep_chart tokens lex_sems paths in | |
132 | - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) | |
133 | - let chart,references = ENIAM_LCGchart.dep_lazify chart in | |
134 | - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *) | |
135 | - (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *) | |
136 | - let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
137 | - (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) | |
138 | - (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *) | |
139 | - if ENIAM_LCGchart.is_dep_parsed chart then ( | |
140 | - let term = ENIAM_LCGchart.get_dep_parsed_term chart in | |
141 | - (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> | |
142 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
143 | - Xlatex.latex_compile_and_clean path (id^"4_term"); *) | |
144 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
145 | - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *) | |
146 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
147 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
148 | - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *) | |
149 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
150 | - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *) | |
151 | - (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *) | |
152 | - (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *) | |
153 | - ()) | |
154 | - else print_endline "not reduced") | |
155 | - else print_endline "not parsed" | |
128 | + ENIAM_LCGreductions.reset_variant_label (); | |
129 | + (* print_endline "test_dep_example 2"; *) | |
130 | + (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) | |
131 | + let chart = create_dep_chart tokens lex_sems paths in | |
132 | + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) | |
133 | + let chart,references = ENIAM_LCGchart.dep_lazify chart in | |
134 | + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *) | |
135 | + (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *) | |
136 | + let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | |
137 | + (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) | |
138 | + (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *) | |
139 | + if ENIAM_LCGchart.is_dep_parsed chart then ( | |
140 | + let term = ENIAM_LCGchart.get_dep_parsed_term chart in | |
141 | + (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> | |
142 | + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | |
143 | + Xlatex.latex_compile_and_clean path (id^"4_term"); *) | |
144 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
145 | + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *) | |
146 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | |
147 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
148 | + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *) | |
149 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | |
150 | + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *) | |
151 | + (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *) | |
152 | + (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *) | |
153 | + ()) | |
154 | + else print_endline "not reduced") | |
155 | + else print_endline "not parsed" | |
156 | 156 | with NotDepParsed(id_ndp,left,l,right) -> ( |
157 | - print_endline "not parsed 2"; | |
158 | - ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right)) | |
157 | + if (first_try) | |
158 | + then test_dep_example path id tokens lex_sems false paths | |
159 | + else (print_endline "not parsed 2"; | |
160 | + ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right))) | |
159 | 161 | |
160 | 162 | let rec parse_sentence name id tokens lex_sems = function |
161 | 163 | RawSentence s -> id |
... | ... | @@ -163,7 +165,7 @@ let rec parse_sentence name id tokens lex_sems = function |
163 | 165 | (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) |
164 | 166 | id + 1 |
165 | 167 | | DepSentence(paths) -> |
166 | - test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; | |
168 | + test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems true paths; | |
167 | 169 | id + 1 |
168 | 170 | | QuotedSentences sentences -> |
169 | 171 | Xlist.fold sentences id (fun id p -> |
... | ... | @@ -212,8 +214,8 @@ let process_id s = |
212 | 214 | else failwith ("process_id: " ^ s) |
213 | 215 | |
214 | 216 | let process_conll_corpus filename = |
215 | - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in | |
216 | - print_endline "process_conll_corpus"; | |
217 | + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in | |
218 | + (* print_endline "process_conll_corpus 1"; *) | |
217 | 219 | (* let corpus = [List.hd corpus] in *) |
218 | 220 | Xlist.iter corpus (fun query -> try |
219 | 221 | let id = process_id (get_query_id query) in |
... | ... | @@ -226,13 +228,17 @@ let process_conll_corpus filename = |
226 | 228 | (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) |
227 | 229 | let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] |
228 | 230 | (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in |
231 | + (* print_endline "process_conll_corpus 2"; *) | |
229 | 232 | let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in |
233 | + (* print_endline "process_conll_corpus 3"; *) | |
230 | 234 | let sentences = match text with |
231 | 235 | AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences |
232 | 236 | | _ -> failwith "process_conll_corpus 1" in |
233 | 237 | let text = AltText[Raw,RawText query; Struct, StructText([ |
234 | 238 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in |
239 | + (* print_endline "process_conll_corpus 4"; *) | |
235 | 240 | let lex_sems = ENIAMlexSemantics.assign tokens text in |
241 | + (* print_endline "process_conll_corpus 5"; *) | |
236 | 242 | ignore(parse_text id 1 tokens lex_sems text) |
237 | 243 | | _ -> failwith "process_conll_corpus 2" |
238 | 244 | with |
... | ... | @@ -241,6 +247,7 @@ let process_conll_corpus filename = |
241 | 247 | |
242 | 248 | let _ = |
243 | 249 | Printexc.record_backtrace true; |
250 | + ENIAMlexSemantics.initialize (); | |
244 | 251 | (* LCGfields.reset (); *) |
245 | 252 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) |
246 | 253 | (* process_conll_corpus "../testy/skladnica-test1.conll"; *) |
... | ... |
exec/ENIAMexec.ml
... | ... | @@ -85,6 +85,37 @@ let create_chart rules tokens lex_sems paths last = |
85 | 85 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in |
86 | 86 | chart |
87 | 87 | |
88 | +let rec split_sons left id right = function | |
89 | + [] -> List.rev (List.sort compare left), List.sort compare right | |
90 | + | x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l | |
91 | + | |
92 | +let rec dep_create_rec nodes sons conll_id = | |
93 | + let node = IntMap.find nodes conll_id in | |
94 | + let l = try IntMap.find sons conll_id with Not_found -> [] in | |
95 | + let left,right = split_sons [] conll_id [] l in | |
96 | + (* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *) | |
97 | + DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons)) | |
98 | + | |
99 | +let create_dep_chart dep_rules tokens lex_sems paths = | |
100 | + (* print_endline "create_dep_chart 1"; *) | |
101 | + let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i -> | |
102 | + let _,super,_ = paths.(i) in | |
103 | + IntMap.add_inc sons super [i] (fun l -> i :: l)) in | |
104 | + (* print_endline "create_dep_chart 2"; *) | |
105 | + let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i -> | |
106 | + let id,_,_ = paths.(i) in | |
107 | + let t = ExtArray.get tokens id in | |
108 | + let s = ExtArray.get lex_sems id in | |
109 | + ENIAM_LCGrenderer.reset_variable_names (); | |
110 | + ENIAM_LCGrenderer.add_variable_numbers (); | |
111 | + let cats = clarify_categories ["X"] t in | |
112 | + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in | |
113 | + IntMap.add nodes i l) in | |
114 | + (* print_endline "create_dep_chart 3"; *) | |
115 | + let x = dep_create_rec nodes sons 0 in | |
116 | + (* print_endline "create_dep_chart 4"; *) | |
117 | + x | |
118 | + | |
88 | 119 | let create_text_fragments tokens paths last = |
89 | 120 | let text_fragments = Array.make last IntMap.empty in |
90 | 121 | Xlist.iter paths (fun (id,lnode,rnode) -> |
... | ... | @@ -156,85 +187,75 @@ let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last = |
156 | 187 | with e -> |
157 | 188 | let time2 = time_fun () in |
158 | 189 | {result with status=LexiconError; msg=string_of_exn e; lex_time=time2 -. time1} |
159 | -(* | |
160 | -let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems = | |
161 | - let result = empty_conll_parse_result in | |
162 | - let time2 = time_fun () in | |
163 | - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *) | |
190 | + | |
191 | +let rec conll_parse_sentence timeout verbosity dep_rules first_try tokens lex_sems paths = | |
192 | + ENIAM_LCGreductions.reset_variant_label (); | |
193 | + let result = {empty_conll_parse_result with paths_size = Xlist.size paths} in | |
194 | + let result = if verbosity = 0 then result else result(*{result with text_fragments=create_dep_text_fragments tokens paths last}*) in (* FIXME *) | |
195 | + let time1 = time_fun () in | |
164 | 196 | try |
165 | - let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in | |
166 | - let dep_chart,references = LCGchart.dep_lazify dep_chart in | |
167 | - let result = if test_only_flag then result else {result with dep_chart=dep_chart} in | |
168 | - let time3 = time_fun () in | |
169 | - let result = {result with lex_time=time3 -. time2} in | |
197 | + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in | |
198 | + let chart = create_chart dep_rules tokens lex_sems paths in | |
199 | + let result = if verbosity = 0 then result else {result with chart1=chart} in | |
200 | + let chart,references = ENIAM_LCGchart.dep_lazify chart in | |
201 | + let result = if verbosity = 0 then result else {result with chart2=chart; references2=ExtArray.copy references} in | |
202 | + let time2 = time_fun () in | |
203 | + let result = {result with lex_time=time2 -. time1} in | |
170 | 204 | try |
171 | - (* print_endline "conll_parse_sentence 1"; *) | |
172 | - (* LCGlatexOf.print_references "results/" "references1" references; *) | |
173 | - let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) | |
174 | - (* print_endline "conll_parse_sentence 2"; *) | |
175 | - (* LCGlatexOf.print_references "results/" "references2" references; *) | |
176 | - let time4 = time_fun () in | |
177 | - let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in | |
178 | - let result = {result with parse_time=time4 -. time3} in | |
179 | - if LCGchart.is_dep_parsed parsed_dep_chart then | |
205 | + let chart = ENIAM_LCGchart.dep_parse chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) | |
206 | + let time3 = time_fun () in | |
207 | + let result = if verbosity = 0 then result else {result with parsed_dep_chart=chart; references3=references} in | |
208 | + let result = {result with parse_time=time3 -. time2; chart_size=ENIAM_LCGchart.get_no_entries chart} in | |
209 | + if ENIAM_LCGchart.is_dep_parsed chart then | |
180 | 210 | try |
181 | - let term = LCGchart.get_dep_parsed_term tokens lex_sems parsed_dep_chart in | |
182 | - (* LCGlatexOf.print_dependency_tree "dep_dependency_tree1" dependency_tree; *) | |
183 | - let dependency_tree = LCGreductions.reduce term references in | |
184 | - let time5 = time_fun () in | |
185 | - let result = if test_only_flag then result else {result with dependency_tree=dependency_tree} in | |
186 | - let result = {result with reduction_time=time5 -. time4; dependency_tree_size=Array.length dependency_tree} in | |
187 | - if LCGreductions.is_reduced_dependency_tree dependency_tree then | |
211 | + let term = ENIAM_LCGchart.get_dep_parsed_term chart in | |
212 | + let result = if verbosity = 0 then result else {result with term4=term} in | |
213 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | |
214 | + let time4 = time_fun () in | |
215 | + let result = if verbosity = 0 then result else {result with dependency_tree4=Array.copy dependency_tree} in | |
216 | + let result = {result with reduction_time=time4 -. time3; dependency_tree_size=Array.length dependency_tree} in | |
217 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then | |
188 | 218 | try |
189 | - (* print_endline "conll_parse_sentence 3"; *) | |
190 | - LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | |
191 | - (* print_endline "conll_parse_sentence 4"; *) | |
192 | - LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | |
193 | -(* if Array.length dependency_tree < 10000 then print_xml_dependency_tree "results/trees/" id dependency_tree; *) | |
194 | - (* print_endline "conll_parse_sentence 5"; *) | |
219 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | |
220 | + let result = if verbosity = 0 then result else {result with dependency_tree5=Array.copy dependency_tree} in | |
221 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | |
222 | + let result = (*if verbosity = 0 then result else*) {result with dependency_tree6=dependency_tree} in | |
195 | 223 | let time6 = time_fun () in |
196 | - {result with status=Parsed; sem_time=time6 -. time5} | |
224 | + {result with status=Parsed; sem_time=time6 -. time4} | |
197 | 225 | with e -> |
198 | 226 | let time6 = time_fun () in |
199 | - {result with status=SemError; msg=string_of_exn e; sem_time=time6 -. time5} | |
227 | + {result with status=SemError1; msg=string_of_exn e; sem_time=time6 -. time4} | |
200 | 228 | else |
201 | 229 | {result with status=NotReduced} |
202 | 230 | with |
203 | 231 | | SemTooBig -> |
204 | - let time5 = time_fun () in | |
205 | - {result with status=TooManyNodes; reduction_time=time5 -. time4} | |
232 | + let time4 = time_fun () in | |
233 | + {result with status=TooManyNodes; reduction_time=time4 -. time3} | |
206 | 234 | | e -> |
207 | - let time5 = time_fun () in | |
208 | - {result with status=ReductionError; msg=string_of_exn e; reduction_time=time5 -. time4} | |
235 | + let time4 = time_fun () in | |
236 | + {result with status=ReductionError; msg=string_of_exn e; reduction_time=time4 -. time3} | |
209 | 237 | else if first_try |
210 | - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | |
238 | + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths | |
211 | 239 | else {result with status=NotParsed} |
212 | 240 | with |
213 | 241 | Timeout t -> |
214 | - let time4 = time_fun () in | |
215 | - {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3} | |
242 | + let time3 = time_fun () in | |
243 | + {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time3 -. time2} | |
216 | 244 | | NotDepParsed(id_ndp,left,l,right) -> |
217 | 245 | if first_try |
218 | - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | |
246 | + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths | |
219 | 247 | else let time4 = time_fun () in |
220 | 248 | {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} |
221 | 249 | | e -> |
222 | - let time4 = time_fun () in | |
223 | - {result with status=ParseError; msg=string_of_exn e; parse_time=time4 -. time3} | |
224 | - with e -> (*print_endline (string_of_exn e);*) | |
225 | - let time3 = time_fun () in | |
250 | + let time3 = time_fun () in | |
251 | + {result with status=ParseError; msg=string_of_exn e; parse_time=time3 -. time2} | |
252 | + with e -> | |
253 | + let time2 = time_fun () in | |
226 | 254 | if first_try |
227 | - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | |
255 | + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths | |
228 | 256 | else {result with status=LexiconError; msg=string_of_exn e; lex_time=time3 -. time2} |
229 | 257 | |
230 | - | |
231 | -let mate_in, mate_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*) | |
232 | - if Paths.config.Paths.mate_parser_enabled then | |
233 | - Unix.open_process ("java -jar " ^ Paths.config.Paths.mate_parser_path ^ "dist/anna-3.5.jar -model " ^ | |
234 | - Paths.config.Paths.mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test") | |
235 | - else stdin, stdout | |
236 | - | |
237 | -let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*) | |
258 | +(*let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*) | |
238 | 259 | if Paths.config.Paths.swigra_enabled then |
239 | 260 | Unix.open_process (Paths.config.Paths.swigra_path ^ "run.sh") |
240 | 261 | else stdin, stdout |
... | ... | @@ -256,38 +277,21 @@ let parse timeout verbosity rules (*name id*) tokens lex_sems = |
256 | 277 | let result = eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last in |
257 | 278 | ENIAMSentence result |
258 | 279 | | _ -> failwith "parse 3") |
259 | - | DepSentence(paths) -> | |
280 | + | DepSentence paths -> | |
260 | 281 | (match mode with |
261 | -(* CONLL -> | |
262 | - let result = conll_parse_sentence timeout verbosity id true paths tokens lex_sems in | |
263 | - let result = {result with | |
282 | + CONLL | Mate -> | |
283 | + let result = conll_parse_sentence timeout verbosity dep_rules true tokens lex_sems paths in | |
284 | + (* let result = {result with | |
264 | 285 | file_prefix = file_prefix_of_mode mode ^ file_prefix; |
265 | - paths = paths} in | |
286 | + paths = paths} in *) | |
266 | 287 | CONLLSentence result |
267 | 288 | (* let xml = DepTree.conll_to_xml paths in |
268 | 289 | let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) |
269 | 290 | Visualization.print_graph "results/" "term_conll" graph; |
270 | 291 | let result = {empty_eniam_parse_result with status=Parsed; term=graph} in |
271 | 292 | ENIAMSentence result, next_id *) |
272 | - | Mate -> | |
273 | - if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else ( | |
274 | - print_endline "parse_sentence 1"; | |
275 | - (* print_endline (Visualization.html_of_dep_sentence tokens paths); *) | |
276 | - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in | |
277 | - print_endline "parse_sentence 2"; | |
278 | - (* printf "|%s|\n" conll; *) | |
279 | - Printf.fprintf mate_out "%s%!" conll; | |
280 | - print_endline "parse_sentence 3"; | |
281 | - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in | |
282 | - print_endline "parse_sentence 4"; | |
283 | - (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *) | |
284 | - let result = conll_parse_sentence timeout verbosity id true new_paths tokens lex_sems in | |
285 | - let result = {result with | |
286 | - file_prefix = file_prefix_of_mode mode ^ file_prefix; | |
287 | - paths=new_paths} in | |
288 | - CONLLSentence result)*) | |
289 | - | _ -> failwith "parse 2") | |
290 | - | _ -> failwith "parse 1") | |
293 | + | _ -> failwith "parse 2") | |
294 | + | _ -> failwith "parse 1") | |
291 | 295 | |
292 | 296 | |
293 | 297 | (* |
... | ... |
exec/ENIAMexecTypes.ml
... | ... | @@ -49,9 +49,9 @@ type eniam_parse_result = { |
49 | 49 | semantic_graph11: ENIAMsemTypes.linear_term; |
50 | 50 | text_fragments: string IntMap.t array; |
51 | 51 | } |
52 | -(* | |
52 | + | |
53 | 53 | type conll_parse_result = { |
54 | - file_prefix: string; | |
54 | +(* file_prefix: string;*) | |
55 | 55 | status: status; |
56 | 56 | msg: string; |
57 | 57 | lex_time: float; |
... | ... | @@ -59,17 +59,29 @@ type conll_parse_result = { |
59 | 59 | reduction_time: float; |
60 | 60 | sem_time: float; |
61 | 61 | paths_size: int; |
62 | + chart_size: int; | |
62 | 63 | dependency_tree_size: int; |
63 | - paths: (int * int * string) array; | |
64 | - dep_chart: LCGtypes.dep_tree; | |
65 | - parsed_dep_chart: (LCGtypes.SymbolMap.key * LCGtypes.linear_term) list; | |
64 | + chart1: dep_tree; | |
65 | + chart2: dep_tree; | |
66 | + references2: linear_term ExtArray.t; | |
67 | + parsed_dep_chart: (SymbolMap.key * linear_term) list; | |
66 | 68 | not_parsed_dep_chart: int * |
67 | - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list * | |
68 | - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list * | |
69 | - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list; | |
70 | - dependency_tree: LCGtypes.linear_term array; | |
69 | + (grammar_symbol * linear_term) list list * | |
70 | + (grammar_symbol * linear_term) list * | |
71 | + (grammar_symbol * linear_term) list list; | |
72 | + references3: linear_term ExtArray.t; | |
73 | + term4: linear_term; | |
74 | + dependency_tree4: linear_term array; | |
75 | + dependency_tree5: linear_term array; | |
76 | + dependency_tree6: linear_term array; | |
77 | + dependency_tree7: linear_term array; | |
78 | + dependency_tree8: linear_term ExtArray.t; | |
79 | + dependency_tree9: linear_term array; | |
80 | + semantic_graph10: ENIAMsemTypes.linear_term array; | |
81 | + semantic_graph11: ENIAMsemTypes.linear_term; | |
82 | + text_fragments: string IntMap.t array; | |
71 | 83 | } |
72 | - | |
84 | +(* | |
73 | 85 | type semantic_processing_result = { |
74 | 86 | file_prefix: string; |
75 | 87 | status: status; |
... | ... | @@ -190,6 +202,35 @@ let empty_eniam_parse_result = { |
190 | 202 | text_fragments=[| |]; |
191 | 203 | } |
192 | 204 | |
205 | +let empty_conll_parse_result = { | |
206 | + (* file_prefix=""; *) | |
207 | + status=Idle; | |
208 | + msg=""; | |
209 | + lex_time=0.; | |
210 | + parse_time=0.; | |
211 | + reduction_time=0.; | |
212 | + sem_time=0.; | |
213 | + paths_size=0; | |
214 | + chart_size=0; | |
215 | + dependency_tree_size=0; | |
216 | + chart1=DepNode(-100,[],[],[]); | |
217 | + chart2=DepNode(-100,[],[],[]); | |
218 | + references2=ExtArray.make 0 Dot; | |
219 | + references3=ExtArray.make 0 Dot; | |
220 | + term4=Dot; | |
221 | + dependency_tree4=[| |]; | |
222 | + dependency_tree5=[| |]; | |
223 | + dependency_tree6=[| |]; | |
224 | + dependency_tree7=[| |]; | |
225 | + dependency_tree8=ExtArray.make 0 Dot; | |
226 | + dependency_tree9=[| |]; | |
227 | + semantic_graph10=[| |]; | |
228 | + semantic_graph11=ENIAMsemTypes.Dot; | |
229 | + text_fragments=[| |]; | |
230 | + parsed_dep_chart=[]; | |
231 | + not_parsed_dep_chart=(-100,[],[],[]); | |
232 | + } | |
233 | + | |
193 | 234 | (* |
194 | 235 | let empty_result = { |
195 | 236 | input_text=RawText ""; |
... | ... | @@ -208,23 +249,6 @@ let empty_result = { |
208 | 249 | lex_sems=ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem; |
209 | 250 | } |
210 | 251 | |
211 | -let empty_conll_parse_result = { | |
212 | - file_prefix=""; | |
213 | - status=Idle; | |
214 | - msg=""; | |
215 | - lex_time=0.; | |
216 | - parse_time=0.; | |
217 | - reduction_time=0.; | |
218 | - sem_time=0.; | |
219 | - paths_size=0; | |
220 | - dependency_tree_size=0; | |
221 | - paths=[| |]; | |
222 | - dep_chart=DepNode(-100,[],[],[]); | |
223 | - parsed_dep_chart=[]; | |
224 | - not_parsed_dep_chart=(-100,[],[],[]); | |
225 | - dependency_tree=[| |]; | |
226 | - } | |
227 | - | |
228 | 252 | let empty_semantic_processing_result = { |
229 | 253 | file_prefix=""; |
230 | 254 | status=Idle; |
... | ... | @@ -321,3 +345,5 @@ let rec fold_text mode s f = function |
321 | 345 | | AltText l -> |
322 | 346 | Xlist.fold l s (fun s (mode,text) -> |
323 | 347 | fold_text mode s f text) |
348 | + | |
349 | +let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic" | |
... | ... |
exec/ENIAMvisualization.ml
... | ... | @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = |
702 | 702 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ |
703 | 703 | sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ |
704 | 704 | "</table>" |
705 | -(* | |
705 | + | |
706 | 706 | let html_of_dep_sentence tokens paths = |
707 | 707 | "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ |
708 | 708 | String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> |
... | ... | @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = |
711 | 711 | (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" |
712 | 712 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ |
713 | 713 | "</table>" |
714 | - | |
714 | +(* | |
715 | 715 | let html_of_tokens tokens = |
716 | 716 | "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ |
717 | 717 | String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> |
... | ... | @@ -1048,7 +1048,7 @@ let file_prefix_of_mode = function |
1048 | 1048 | let rec html_of_sentence path file_prefix mode img verbosity tokens = function |
1049 | 1049 | RawSentence s -> s |
1050 | 1050 | | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last |
1051 | - (* | DepSentence paths -> html_of_dep_sentence img verbosity tokens paths *) | |
1051 | + | DepSentence paths -> html_of_dep_sentence tokens paths | |
1052 | 1052 | | ENIAMSentence result -> |
1053 | 1053 | let file_prefix = file_prefix_of_mode mode ^ file_prefix in |
1054 | 1054 | html_of_eniam_sentence path file_prefix img verbosity tokens result |
... | ... | @@ -1062,7 +1062,7 @@ let rec html_of_sentence path file_prefix mode img verbosity tokens = function |
1062 | 1062 | String.concat "\n" (Xlist.map l (fun (mode,sentence) -> |
1063 | 1063 | sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence path file_prefix mode img verbosity tokens sentence))) ^ |
1064 | 1064 | "</table>" |
1065 | - | _ -> failwith "html_of_sentence: ni" | |
1065 | + (* | _ -> failwith "html_of_sentence: ni" *) | |
1066 | 1066 | |
1067 | 1067 | let rec html_of_paragraph path mode img verbosity tokens = function |
1068 | 1068 | RawParagraph s -> (*print_endline "RawParagraph";*) s |
... | ... |
exec/makefile
... | ... | @@ -19,6 +19,13 @@ install: all |
19 | 19 | cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR) |
20 | 20 | cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR) |
21 | 21 | |
22 | +install-local: all | |
23 | + mkdir -p $(INSTALLDIR) | |
24 | + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) | |
25 | + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) | |
26 | + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) | |
27 | + mkdir -p /usr/local/share/eniam/exec | |
28 | + cp resources/* /usr/local/share/eniam/exec | |
22 | 29 | |
23 | 30 | eniam-exec.cma: $(SOURCES) |
24 | 31 | ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ |
... | ... |
integration/ENIAMpreIntegration.ml
... | ... | @@ -198,3 +198,9 @@ let rec parse_text mode tokens = function |
198 | 198 | StructText(List.rev paragraphs) |
199 | 199 | | AltText l -> AltText(Xlist.map l (fun (mode,text) -> |
200 | 200 | mode, parse_text mode tokens text)) |
201 | + | |
202 | +let catch_parse_text mode tokens text = | |
203 | + try | |
204 | + parse_text mode tokens text,"" | |
205 | + with e -> | |
206 | + text, Printexc.to_string e | |
... | ... |
lexSemantics/ENIAMwalParser.ml
... | ... | @@ -73,14 +73,6 @@ let split_text schema = |
73 | 73 | | Str.Delim "'" -> Quot |
74 | 74 | | _ -> failwith "parse_text")) |
75 | 75 | |
76 | -let rec split_symbol symb rev = function | |
77 | - [] -> [List.rev rev](*failwith "split_symbol"*) | |
78 | - | s :: l -> | |
79 | - if s = symb then | |
80 | - if l = [] then (*[List.rev rev]*)failwith "split_symbol" | |
81 | - else (List.rev rev) :: (split_symbol symb [] l) | |
82 | - else split_symbol symb (s :: rev) l | |
83 | - | |
84 | 76 | let rec string_of_token = function |
85 | 77 | Text s -> s |
86 | 78 | | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" |
... | ... | @@ -101,6 +93,14 @@ let rec string_of_token = function |
101 | 93 | let string_of_token_list l = |
102 | 94 | String.concat "" (Xlist.map l string_of_token) |
103 | 95 | |
96 | +let rec split_symbol symb rev = function | |
97 | + [] -> [List.rev rev](*failwith "split_symbol"*) | |
98 | + | s :: l -> | |
99 | + if s = symb then | |
100 | + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb) | |
101 | + else (List.rev rev) :: (split_symbol symb [] l) | |
102 | + else split_symbol symb (s :: rev) l | |
103 | + | |
104 | 104 | let parse_case = function |
105 | 105 | [Text "nom"] -> Case "nom" |
106 | 106 | | [Text "gen"] -> Case "gen" |
... | ... |
lexSemantics/interface.ml
... | ... | @@ -23,6 +23,7 @@ let output = ref Text |
23 | 23 | let comm_stdio = ref true |
24 | 24 | (* let sentence_split = ref true *) |
25 | 25 | let port = ref 5439 |
26 | +let perform_integration = ref false | |
26 | 27 | |
27 | 28 | let spec_list = [ |
28 | 29 | (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; |
... | ... | @@ -33,6 +34,13 @@ let spec_list = [ |
33 | 34 | "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; |
34 | 35 | "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; |
35 | 36 | "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; |
37 | + "--dep_parser", Arg.Unit (fun () -> | |
38 | + ENIAMpreIntegration.concraft_enabled := true; | |
39 | + ENIAMpreIntegration.mate_parser_enabled := true; | |
40 | + perform_integration := true), "Enable dependency parser"; | |
41 | + "--no_dep_parser", Arg.Unit (fun () -> | |
42 | + ENIAMpreIntegration.concraft_enabled := false; | |
43 | + ENIAMpreIntegration.mate_parser_enabled := false), "Disable dependency parser (default)"; | |
36 | 44 | (* "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; *) |
37 | 45 | (* "-r", Arg.String (fun p -> |
38 | 46 | ENIAMtokenizerTypes.set_resource_path p; |
... | ... | @@ -65,6 +73,9 @@ let rec main_loop in_chan out_chan = |
65 | 73 | print_endline text; |
66 | 74 | print_endline "input text end"; *) |
67 | 75 | let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in |
76 | + let text,msg = | |
77 | + if msg <> "" || not !perform_integration then text,msg else | |
78 | + ENIAMpreIntegration.catch_parse_text ENIAMsubsyntaxTypes.Struct tokens text in | |
68 | 79 | let lex_sems,msg = |
69 | 80 | if msg <> "" then ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem, msg |
70 | 81 | else ENIAMlexSemantics.catch_assign tokens text in |
... | ... | @@ -84,6 +95,7 @@ let _ = |
84 | 95 | prerr_endline message; |
85 | 96 | Arg.parse spec_list anon_fun usage_msg; |
86 | 97 | ENIAMlexSemantics.initialize (); |
98 | + ENIAMpreIntegration.initialize (); | |
87 | 99 | Gc.compact (); |
88 | 100 | prerr_endline "Ready!"; |
89 | 101 | if !comm_stdio then main_loop stdin stdout |
... | ... |
lexSemantics/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \ |
... | ... | @@ -40,6 +40,9 @@ eniam-lexSemantics.cmxa: $(SOURCES) |
40 | 40 | test: test.ml |
41 | 41 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ |
42 | 42 | |
43 | +inttest: inttest.ml | |
44 | + $(OCAMLOPT) -o inttest $(OCAMLOPTFLAGS) $^ | |
45 | + | |
43 | 46 | interface: interface.ml |
44 | 47 | $(OCAMLOPT) -o lexSemantics $(OCAMLOPTFLAGS) interface.ml |
45 | 48 | |
... | ... | @@ -65,4 +68,4 @@ interface: interface.ml |
65 | 68 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
66 | 69 | |
67 | 70 | clean: |
68 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test | |
71 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test inttest | |
... | ... |
semantics/ENIAMsemLexicon.ml
... | ... | @@ -47,7 +47,7 @@ let parse_multi p = function |
47 | 47 | let parse_morf p = function |
48 | 48 | [T "1"] -> {p with is_necessary=Opt} |
49 | 49 | | tokens -> |
50 | - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function | |
50 | + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function | |
51 | 51 | [T s] -> Atom s |
52 | 52 | | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in |
53 | 53 | {p with morfs=LCG (Tensor l) :: p.morfs} |
... | ... | @@ -57,7 +57,7 @@ let parse_arg tokens p = |
57 | 57 | let tokens,p = parse_dir p tokens in |
58 | 58 | let tokens,p = parse_multi p tokens in |
59 | 59 | match Lexer.find_brackets ["(",")"] [] tokens with |
60 | - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf | |
60 | + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf | |
61 | 61 | | tokens -> parse_morf p tokens |
62 | 62 | |
63 | 63 | |
... | ... | @@ -75,7 +75,7 @@ let parse_entry = function |
75 | 75 | [T symbol; T ":"; T "null"] -> symbol,[] |
76 | 76 | | T symbol :: T ":" :: tokens -> |
77 | 77 | (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) |
78 | - let tokens = Lexer.split_symbol (T ":") [] tokens in | |
78 | + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in | |
79 | 79 | let tokens = manage_tokens tokens in |
80 | 80 | let positions = Xlist.map tokens (fun (arg,role) -> |
81 | 81 | parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in |
... | ... | @@ -91,7 +91,7 @@ let load_lexicon filename = |
91 | 91 | | T "\t" -> tokens |
92 | 92 | | T "\r" -> tokens |
93 | 93 | | t -> t :: tokens)) in |
94 | - let entries = Lexer.split_symbol (T ";") [] tokens in | |
94 | + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in | |
95 | 95 | Xlist.fold entries StringMap.empty (fun map entry -> |
96 | 96 | let symbol,args = parse_entry entry in |
97 | 97 | StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) |
... | ... |
testy/skladnica-test1-Failure.conll
1 | -1 - - interp interp _ 3 punct _ _ | |
2 | -2 Panowie pan subst subst pl|nom|m1 3 subj _ _ | |
3 | -3 przyszli przyjść praet praet pl|m1|perf 0 pred _ _ | |
4 | -4 . . interp interp _ 3 punct _ _ | |
5 | - | |
6 | 1 | 1 O o prep prep loc 12 comp _ _ |
7 | 2 | 2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _ |
8 | 3 | 3 piekle piekło subst subst sg|loc|n 1 comp _ _ |
... | ... | @@ -21,84 +16,118 @@ |
21 | 16 | 16 br bieżący_rok brev brev pun 15 ne _ _ |
22 | 17 | 17 . . interp interp _ 12 punct _ _ |
23 | 18 | |
24 | -1 Następnie następnie adv adv _ 2 adjunct _ _ | |
25 | -2 rozłożyła rozłożyć praet praet sg|f|perf 10 conjunct _ _ | |
26 | -3 wysoki wysoki adj adj sg|acc|m3|pos 4 adjunct _ _ | |
27 | -4 statyw statyw subst subst sg|acc|m3 2 obj _ _ | |
28 | -5 , , interp interp _ 10 coord_punct _ _ | |
29 | -6 zawiesiła zawiesić praet praet sg|f|perf 10 conjunct _ _ | |
30 | -7 na na prep prep loc 6 adjunct _ _ | |
31 | -8 nim on ppron3 ppron3 sg|loc|m3|ter|akc|praep 7 comp _ _ | |
32 | -9 pudełko pudełko subst subst sg|acc|n 6 obj _ _ | |
33 | -10 , , interp interp _ 0 pred _ _ | |
34 | -11 przeprowadziła przeprowadzić praet praet sg|f|perf 10 conjunct _ _ | |
35 | -12 od od prep prep gen|nwok 11 adjunct _ _ | |
36 | -13 niego on ppron3 ppron3 sg|gen|n|ter|akc|praep 12 comp _ _ | |
37 | -14 przezroczysty przezroczysty adj adj sg|acc|m3|pos 15 adjunct _ _ | |
38 | -15 przewód przewód subst subst sg|acc|m3 11 obj _ _ | |
39 | -16 do do prep prep gen 11 adjunct _ _ | |
40 | -17 igły igła subst subst sg|gen|f 16 comp _ _ | |
41 | -18 , , interp interp _ 23 punct _ _ | |
42 | -19 którą który adj adj sg|acc|f|pos 23 obj _ _ | |
43 | -20 wcześniej wcześnie adv adv com 23 adjunct _ _ | |
44 | -21 automatyczny automatyczny adj adj sg|nom|m3|pos 22 adjunct _ _ | |
45 | -22 iniektor iniektor subst subst sg|nom|m3 23 subj _ _ | |
46 | -23 umieścił umieścić praet praet sg|m3|perf 17 adjunct _ _ | |
47 | -24 w w prep prep loc|nwok 23 comp _ _ | |
48 | -25 żyle żyła subst subst sg|loc|f 24 comp _ _ | |
49 | -26 na na prep prep loc 25 adjunct _ _ | |
50 | -27 przedramieniu przedramię subst subst sg|loc|n 26 comp _ _ | |
51 | -28 Irka Irek subst subst sg|gen|m1 27 adjunct _ _ | |
52 | -29 . . interp interp _ 10 punct _ _ | |
19 | +1 W w prep prep loc|nwok 9 adjunct _ _ | |
20 | +2 stanie stan subst subst sg|loc|m3 1 comp _ _ | |
21 | +3 obrzydzenia obrzydzenie subst subst sg|gen|n 2 adjunct _ _ | |
22 | +4 przyprawiającego przyprawiać pact pact sg|gen|n|imperf|aff 3 adjunct _ _ | |
23 | +5 o o prep prep acc 4 comp _ _ | |
24 | +6 nowe nowy adj adj pl|acc|n|pos 7 adjunct _ _ | |
25 | +7 mdłości mdłości subst subst pl|acc|n 5 comp _ _ | |
26 | +8 nie nie qub qub _ 9 neg _ _ | |
27 | +9 zauważył zauważyć praet praet sg|m1|perf 0 pred _ _ | |
28 | +10 nawet nawet qub qub _ 9 adjunct _ _ | |
29 | +11 , , interp interp _ 15 punct _ _ | |
30 | +12 że że comp comp _ 15 complm _ _ | |
31 | +13 wielki wielki adj adj sg|nom|m3|pos 14 adjunct _ _ | |
32 | +14 ból ból subst subst sg|nom|m3 15 subj _ _ | |
33 | +15 zaczyna zaczynać fin fin sg|ter|imperf 9 comp_fin _ _ | |
34 | +16 z z prep prep acc|nwok 18 adjunct _ _ | |
35 | +17 wolna wolny adj adjp _ 16 mwe _ _ | |
36 | +18 zanikać zanikać inf inf imperf 15 comp_inf _ _ | |
37 | +19 . . interp interp _ 9 punct _ _ | |
38 | + | |
39 | +1 - - interp interp _ 7 punct _ _ | |
40 | +2 W w prep prep loc|nwok 4 comp _ _ | |
41 | +3 szkole szkoła subst subst sg|loc|f 2 comp _ _ | |
42 | +4 jest być fin fin sg|ter|imperf 7 conjunct _ _ | |
43 | +5 mniej mało num num pl|nom 4 subj _ _ | |
44 | +6 uczniów uczeń subst subst pl|gen|m1 5 comp _ _ | |
45 | +7 , , interp interp _ 0 coord_punct _ _ | |
46 | +8 dlatego dlatego adv adv _ 9 adjunct _ _ | |
47 | +9 musiał musieć praet praet sg|m1|imperf 7 conjunct _ _ | |
48 | +10 em być aglt aglt sg|pri|imperf|wok 9 aglt _ _ | |
49 | +11 tym ten adj adj pl|dat|f|pos 12 adjunct _ _ | |
50 | +12 paniom pani subst subst pl|dat|f 13 obj_th _ _ | |
51 | +13 podziękować podziękować inf inf perf 9 comp_inf _ _ | |
52 | +14 . . interp interp _ 7 punct _ _ | |
53 | + | |
54 | +1 Od od prep prep gen|nwok 9 adjunct _ _ | |
55 | +2 końca koniec subst subst sg|gen|m3 1 comp _ _ | |
56 | +3 XVIII XVIII adj adj sg|gen|m3|pos 4 ne _ _ | |
57 | +4 w wiek brev brev pun 2 comp _ _ | |
58 | +5 . . interp interp _ 4 abbrev_punct _ _ | |
59 | +6 informacje informacja subst subst pl|nom|f 9 subj _ _ | |
60 | +7 o o prep prep loc 6 adjunct _ _ | |
61 | +8 głodach głód subst subst pl|loc|m3 7 comp _ _ | |
62 | +9 stają stawać fin fin pl|ter|imperf 0 pred _ _ | |
63 | +10 się się qub qub _ 9 refl _ _ | |
64 | +11 coraz coraz adv adv _ 12 adjunct _ _ | |
65 | +12 rzadsze rzadki adj adj pl|nom|f|com 9 pd _ _ | |
66 | +13 . . interp interp _ 9 punct _ _ | |
67 | + | |
68 | +1 Zabrał zabrać praet praet sg|m1|perf 0 pred _ _ | |
69 | +2 ponad ponad qub qub _ 3 adjunct _ _ | |
70 | +3 30 30 num num pl|acc|m3|rec 1 obj _ _ | |
71 | +4 tys tysiąc brev brev pun 3 mwe _ _ | |
72 | +5 . . interp interp _ 4 abbrev_punct _ _ | |
73 | +6 zł złoty brev brev npun 3 comp _ _ | |
74 | +7 . . interp interp _ 1 punct _ _ | |
75 | + | |
76 | +1 ( ( interp interp _ 8 punct _ _ | |
77 | +2 Kiedyś kiedyś adv adv _ 4 adjunct _ _ | |
78 | +3 też też qub qub _ 4 adjunct _ _ | |
79 | +4 miała mieć praet praet sg|f|imperf 8 conjunct _ _ | |
80 | +5 m być aglt aglt sg|pri|imperf|nwok 4 aglt _ _ | |
81 | +6 takie taki adj adj pl|acc|f|pos 7 adjunct _ _ | |
82 | +7 ambicje ambicja subst subst pl|acc|f 4 obj_th _ _ | |
83 | +8 , , interp interp _ 0 pred _ _ | |
84 | +9 zrezygnowała zrezygnować praet praet sg|f|perf 8 conjunct _ _ | |
85 | +10 m być aglt aglt sg|pri|imperf|nwok 9 aglt _ _ | |
86 | +11 . . interp interp _ 8 punct _ _ | |
87 | +12 ) ) interp interp _ 8 punct _ _ | |
53 | 88 | |
54 | -1 - - interp interp _ 4 punct _ _ | |
55 | -2 Co co subst subst sg|nom|n 4 pd _ _ | |
56 | -3 to to subst subst sg|nom|n 4 subj _ _ | |
57 | -4 jest być fin fin sg|ter|imperf 0 pred _ _ | |
58 | -5 ? ? interp interp _ 4 punct _ _ | |
89 | +1 Zawsze zawsze adv adv _ 2 adjunct _ _ | |
90 | +2 mówię mówić fin fin sg|pri|imperf 0 pred _ _ | |
91 | +3 , , interp interp _ 5 punct _ _ | |
92 | +4 że że comp comp _ 5 complm _ _ | |
93 | +5 mogę móc fin fin sg|pri|imperf 2 comp_fin _ _ | |
94 | +6 pracować pracować inf inf imperf 5 comp_inf _ _ | |
95 | +7 , , interp interp _ 5 punct _ _ | |
96 | +8 bo bo comp comp _ 5 adjunct _ _ | |
97 | +9 mam mieć fin fin sg|pri|imperf 13 conjunct _ _ | |
98 | +10 dobre dobry adj adj sg|acc|n|pos 11 adjunct _ _ | |
99 | +11 zdrowie zdrowie subst subst sg|acc|n 9 obj_th _ _ | |
100 | +12 , , interp interp _ 13 punct _ _ | |
101 | +13 a a conj conj _ 8 comp_fin _ _ | |
102 | +14 to to subst subst sg|nom|n 15 subj _ _ | |
103 | +15 jest być fin fin sg|ter|imperf 13 conjunct _ _ | |
104 | +16 darmo darmo adv adv _ 17 adjunct _ _ | |
105 | +17 dane dany adj adj sg|nom|n|perf|aff 15 pd _ _ | |
106 | +18 . . interp interp _ 2 punct _ _ | |
59 | 107 | |
60 | -1 Prosi prosić fin fin sg|ter|imperf 0 pred _ _ | |
61 | -2 się się qub qub _ 1 refl _ _ | |
62 | -3 też też qub qub _ 1 adjunct _ _ | |
63 | -4 zakłady zakład subst subst pl|acc|m3 1 obj _ _ | |
64 | -5 pracy praca subst subst sg|gen|f 4 adjunct _ _ | |
65 | -6 , , interp interp _ 8 punct _ _ | |
66 | -7 które który adj adj pl|nom|m3|pos 8 subj _ _ | |
67 | -8 dysponują dysponować fin fin pl|ter|imperf 4 adjunct _ _ | |
68 | -9 autobusami autobus subst subst pl|inst|m3 8 comp _ _ | |
69 | -10 , , interp interp _ 12 punct _ _ | |
70 | -11 by by comp comp _ 12 complm _ _ | |
71 | -12 wspomogły wspomóc praet praet pl|m3|perf 1 comp_fin _ _ | |
72 | -13 komunikację komunikacja subst subst sg|acc|f 12 obj _ _ | |
73 | -14 zastępczą zastępczy adj adj sg|acc|f|pos 13 adjunct _ _ | |
74 | -15 . . interp interp _ 1 punct _ _ | |
108 | +1 " " interp interp _ 2 punct _ _ | |
109 | +2 Zrobimy zrobić fin fin pl|pri|perf 0 pred _ _ | |
110 | +3 " " interp interp _ 2 punct _ _ | |
111 | +4 ! ! interp interp _ 2 punct _ _ | |
75 | 112 | |
76 | 113 | 1 - - interp interp _ 3 punct _ _ |
77 | -2 Nie nie qub qub _ 3 neg _ _ | |
78 | -3 chcą chcieć fin fin pl|ter|imperf 0 pred _ _ | |
79 | -4 , , interp interp _ 8 punct _ _ | |
80 | -5 by by comp comp _ 8 complm _ _ | |
81 | -6 m być aglt aglt sg|pri|imperf|nwok 8 aglt _ _ | |
82 | -7 ich on ppron3 ppron3 pl|acc|m1|ter|akc|npraep 8 obj _ _ | |
83 | -8 utrzymywał utrzymywać praet praet sg|m1|imperf 3 comp_fin _ _ | |
84 | -9 . . interp interp _ 3 punct _ _ | |
114 | +2 No no qub qub _ 3 adjunct _ _ | |
115 | +3 wie wiedzieć fin fin sg|ter|imperf 0 pred _ _ | |
116 | +4 pan pan subst subst sg|nom|m1 3 subj _ _ | |
117 | +5 ! ! interp interp _ 3 punct _ _ | |
118 | +6 . . interp interp _ 5 punct _ _ | |
119 | +7 . . interp interp _ 6 punct _ _ | |
120 | +8 . . interp interp _ 7 punct _ _ | |
85 | 121 | |
86 | -1 Wzięli wziąć praet praet pl|m1|perf 0 pred _ _ | |
87 | -2 w w prep prep loc|nwok 4 adjunct _ _ | |
88 | -3 niej on ppron3 ppron3 sg|loc|f|ter|akc|praep 2 comp _ _ | |
89 | -4 udział udział subst subst sg|acc|m3 1 obj _ _ | |
90 | -5 przedstawiciele przedstawiciel subst subst pl|nom|m1 1 subj _ _ | |
91 | -6 policji policja subst subst sg|gen|f 5 adjunct _ _ | |
92 | -7 z z prep prep gen|nwok 5 adjunct _ _ | |
93 | -8 Niemiec Niemcy subst subst pl|gen|n 17 conjunct _ _ | |
94 | -9 , , interp interp _ 17 coord_punct _ _ | |
95 | -10 Czech Czechy subst subst pl|gen|n 17 conjunct _ _ | |
96 | -11 , , interp interp _ 17 coord_punct _ _ | |
97 | -12 Słowacji Słowacja subst subst sg|gen|f 17 conjunct _ _ | |
98 | -13 , , interp interp _ 17 coord_punct _ _ | |
99 | -14 Węgier Węgry subst subst pl|gen|n 17 conjunct _ _ | |
100 | -15 , , interp interp _ 17 coord_punct _ _ | |
101 | -16 Ukrainy Ukraina subst subst sg|gen|f 17 conjunct _ _ | |
102 | -17 i i conj conj _ 7 comp _ _ | |
103 | -18 Polski Polska subst subst sg|gen|f 17 conjunct _ _ | |
104 | -19 . . interp interp _ 1 punct _ _ | |
122 | +1 ( ( interp interp _ 6 punct _ _ | |
123 | +2 Myszkinku Myszkinek subst subst sg|voc|m3 6 adjunct _ _ | |
124 | +3 , , interp interp _ 2 punct _ _ | |
125 | +4 jakie jaki adj adj sg|acc|n|pos 7 adjunct _ _ | |
126 | +5 ty ty ppron12 ppron12 sg|nom|m2|sec 6 subj _ _ | |
127 | +6 masz mieć fin fin sg|sec|imperf 0 pred _ _ | |
128 | +7 futerko futerko subst subst sg|acc|n 6 obj_th _ _ | |
129 | +8 , , interp interp _ 7 punct _ _ | |
130 | +9 lazurowe lazurowy adj adj sg|acc|n|pos 7 adjunct _ _ | |
131 | +10 po po prep prep acc 9 adjunct _ _ | |
132 | +11 prostu prosty adjp adjp _ 10 mwe _ _ | |
133 | +12 ! ! interp interp _ 6 punct _ _ | |
... | ... |
testy/skladnica-test1-Not_parsed.conll
0 → 100644
1 | +1 Cmentarz cmentarz subst subst sg|nom|m3 2 subj _ _ | |
2 | +2 jest być fin fin sg|ter|imperf 0 pred _ _ | |
3 | +3 taki taki adj adj sg|nom|m3|pos 4 adjunct _ _ | |
4 | +4 pusty pusty adj adj sg|nom|m3|pos 2 pd _ _ | |
5 | +5 ! ! interp interp _ 2 punct _ _ | |
6 | + | |
7 | +1 Mówi mówić fin fin sg|ter|imperf 0 pred _ _ | |
8 | +2 się się qub qub _ 1 refl _ _ | |
9 | +3 przecież przecież qub qub _ 1 adjunct _ _ | |
10 | +4 , , interp interp _ 7 punct _ _ | |
11 | +5 że że comp comp _ 7 complm _ _ | |
12 | +6 broń broń subst subst sg|nom|f 7 subj _ _ | |
13 | +7 była być praet praet sg|f|imperf 1 comp_fin _ _ | |
14 | +8 w w prep prep loc|nwok 7 adjunct _ _ | |
15 | +9 szkole szkoła subst subst sg|loc|f 8 comp _ _ | |
16 | +10 schowana schować ppas ppas sg|nom|f|perf|aff 7 pd _ _ | |
17 | +11 jeszcze jeszcze qub qub _ 12 adjunct _ _ | |
18 | +12 latem lato subst subst sg|inst|n 7 adjunct _ _ | |
19 | +13 w w prep prep loc|nwok 12 adjunct _ _ | |
20 | +14 czasie czas subst subst sg|loc|m3 13 mwe _ _ | |
21 | +15 remontu remont subst subst sg|gen|m3 14 comp _ _ | |
22 | +16 . . interp interp _ 1 punct _ _ | |
23 | + | |
24 | +1 Bo bo comp comp _ 9 adjunct _ _ | |
25 | +2 jak jak adv adv _ 9 adjunct _ _ | |
26 | +3 ona on ppron3 ppron3 sg|nom|f|ter|akc|npraep 9 subj _ _ | |
27 | +4 , , interp interp _ 3 punct _ _ | |
28 | +5 chora chory adj adj sg|nom|f|pos 3 adjunct _ _ | |
29 | +6 na na prep prep acc 5 adjunct _ _ | |
30 | +7 cukrzycę cukrzyca subst subst sg|acc|f 6 comp _ _ | |
31 | +8 , , interp interp _ 3 punct _ _ | |
32 | +9 przeżyła przeżyć praet praet sg|f|perf 0 pred _ _ | |
33 | +10 trzy trzy num num pl|acc|m3|congr 9 obj _ _ | |
34 | +11 dni dzień subst subst pl|acc|m3 10 comp _ _ | |
35 | +12 bez bez prep prep gen|nwok 9 comp _ _ | |
36 | +13 wody woda subst subst sg|gen|f 14 conjunct _ _ | |
37 | +14 i i conj conj _ 12 comp _ _ | |
38 | +15 jedzenia jedzenie subst subst sg|gen|n 14 conjunct _ _ | |
39 | +16 ? ? interp interp _ 9 punct _ _ | |
40 | + | |
41 | +1 Jednak jednak qub qub _ 9 adjunct _ _ | |
42 | +2 już już qub qub _ 3 adjunct _ _ | |
43 | +3 wkrótce wkrótce adv adv _ 9 adjunct _ _ | |
44 | +4 Nizioł Nizioł subst subst sg|nom|m1 5 conjunct _ _ | |
45 | +5 i i conj conj _ 9 subj _ _ | |
46 | +6 Wapiński Wapiński subst subst sg|nom|m1 5 conjunct _ _ | |
47 | +7 ze z prep prep inst|wok 9 adjunct _ _ | |
48 | +8 zdumieniem zdumienie subst subst sg|inst|n 7 comp _ _ | |
49 | +9 odkryli odkryć praet praet pl|m1|perf 0 pred _ _ | |
50 | +10 , , interp interp _ 14 punct _ _ | |
51 | +11 że że comp comp _ 14 complm _ _ | |
52 | +12 Łapiński Łapiński subst subst sg|nom|m1 14 subj _ _ | |
53 | +13 nie nie qub qub _ 14 neg _ _ | |
54 | +14 dotrzymuje dotrzymywać fin fin sg|ter|imperf 9 comp_fin _ _ | |
55 | +15 wcześniej wcześnie adv adv com 16 adjunct _ _ | |
56 | +16 danego dać ppas ppas sg|gen|n|perf|aff 17 adjunct _ _ | |
57 | +17 słowa słowo subst subst sg|gen|n 14 obj _ _ | |
58 | +18 . . interp interp _ 9 punct _ _ | |
59 | + | |
60 | +1 A a qub qub _ 8 adjunct _ _ | |
61 | +2 pan pan subst subst sg|nom|m1 8 subj _ _ | |
62 | +3 nigdy nigdy adv adv _ 8 adjunct _ _ | |
63 | +4 się się qub qub _ 8 refl _ _ | |
64 | +5 z z prep prep inst|nwok 8 comp _ _ | |
65 | +6 nimi on ppron3 ppron3 pl|inst|m1|ter|akc|praep 5 comp _ _ | |
66 | +7 nie nie qub qub _ 8 neg _ _ | |
67 | +8 zetknął zetknąć praet praet sg|m1|perf 0 pred _ _ | |
68 | +9 ? ? interp interp _ 8 punct _ _ | |
69 | + | |
70 | +1 Załapać załapać inf inf perf 3 comp_inf _ _ | |
71 | +2 się się qub qub _ 1 refl _ _ | |
72 | +3 trzeba trzeba pred pred _ 0 pred _ _ | |
73 | +4 teraz teraz adv adv _ 3 adjunct _ _ | |
74 | +5 , , interp interp _ 3 punct _ _ | |
75 | +6 bo bo comp comp _ 3 adjunct _ _ | |
76 | +7 potem potem adv adv _ 8 adjunct _ _ | |
77 | +8 będzie być bedzie bedzie sg|ter|imperf 6 comp_fin _ _ | |
78 | +9 trudniej trudno adv adv com 8 pd _ _ | |
79 | +10 . . interp interp _ 3 punct _ _ | |
80 | + | |
81 | +1 Medykamenty medykament subst subst pl|nom|m3 4 subj _ _ | |
82 | +2 współczesne współczesny adj adj pl|nom|m3|pos 1 adjunct _ _ | |
83 | +3 dostępne dostępny adj adj pl|nom|m3|pos 4 pd _ _ | |
84 | +4 są być fin fin pl|ter|imperf 0 pred _ _ | |
85 | +5 na na prep prep loc 4 adjunct _ _ | |
86 | +6 czarnym czarny adj adj sg|loc|m3|pos 7 adjunct _ _ | |
87 | +7 rynku rynek subst subst sg|loc|m3 5 comp _ _ | |
88 | +8 . . interp interp _ 4 punct _ _ | |
89 | + | |
90 | +1 To to subst subst sg|nom|n 3 subj _ _ | |
91 | +2 samo sam adj adj sg|nom|n|pos 1 adjunct _ _ | |
92 | +3 dotyczy dotyczyć fin fin sg|ter|imperf 5 conjunct _ _ | |
93 | +4 leczenia leczenie subst subst sg|gen|n 3 obj_th _ _ | |
94 | +5 , , interp interp _ 0 coord_punct _ _ | |
95 | +6 służba służba subst subst sg|nom|f 9 subj _ _ | |
96 | +7 zdrowia zdrowie subst subst sg|gen|n 6 adjunct _ _ | |
97 | +8 praktycznie praktycznie adv adv pos 9 adjunct _ _ | |
98 | +9 przestała przestać praet praet sg|f|perf 5 conjunct _ _ | |
99 | +10 istnieć istnieć inf inf imperf 9 comp_inf _ _ | |
100 | +11 . . interp interp _ 5 punct _ _ | |
101 | + | |
102 | +1 Zwykły zwykły adj adj sg|nom|m1|pos 2 adjunct _ _ | |
103 | +2 mieszkaniec mieszkaniec subst subst sg|nom|m1 4 subj _ _ | |
104 | +3 kraju kraj subst subst sg|gen|m3 2 adjunct _ _ | |
105 | +4 ma mieć fin fin sg|ter|imperf 0 pred _ _ | |
106 | +5 się się qub qub _ 6 refl _ _ | |
107 | +6 leczyć leczyć inf inf imperf 4 comp_inf _ _ | |
108 | +7 ziołami ziele subst subst pl|inst|n 6 obj_th _ _ | |
109 | +8 , , interp interp _ 10 punct _ _ | |
110 | +9 które który adj adj pl|acc|n|pos 10 obj _ _ | |
111 | +10 zaleca zalecać fin fin sg|ter|imperf 7 adjunct _ _ | |
112 | +11 tradycyjna tradycyjny adj adj sg|nom|f|pos 12 adjunct _ _ | |
113 | +12 medycyna medycyna subst subst sg|nom|f 10 subj _ _ | |
114 | +13 koreańska koreański adj adj sg|nom|f|pos 12 adjunct _ _ | |
115 | +14 . . interp interp _ 4 punct _ _ | |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function |
814 | 814 | | (Sign "?") :: (Sign "?") :: l -> |
815 | 815 | create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true |
816 | 816 | (* | (Sign "?") :: (Sign ".") :: l -> *) |
817 | + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l -> | |
818 | + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true | |
817 | 819 | | (Sign "!") :: (Sign "?") :: l -> |
818 | 820 | create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true |
819 | 821 | | (Sign "?") :: (Sign "…") :: l -> |
... | ... |