Commit 0bce3b2e8d3baa9d0907f6421e81ca3525540b0e
Integracja parsera zależnościowego z ENIAMlexSemantics
Showing
18 changed files
with
551 additions
and
273 deletions
.gitignore
corpora/CONLL.ml
@@ -220,13 +220,14 @@ let match_corpus corpus = | @@ -220,13 +220,14 @@ let match_corpus corpus = | ||
220 | 220 | ||
221 | (******************) | 221 | (******************) |
222 | 222 | ||
223 | +exception Comment_line | ||
223 | exception Empty_line | 224 | exception Empty_line |
224 | exception Empty_sentence | 225 | exception Empty_sentence |
225 | exception Id_line of string | 226 | exception Id_line of string |
226 | 227 | ||
227 | let load_token in_channel = | 228 | let load_token in_channel = |
228 | let fail line = | 229 | let fail line = |
229 | - (* failwith ("load_token: " ^ line) *) | 230 | + print_endline ("load_token: " ^ line); |
230 | () in | 231 | () in |
231 | let int_of_super = function | 232 | let int_of_super = function |
232 | "_" -> -1 | 233 | "_" -> -1 |
@@ -247,7 +248,8 @@ let load_token in_channel = | @@ -247,7 +248,8 @@ let load_token in_channel = | ||
247 | else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line | 248 | else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line |
248 | then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in | 249 | then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in |
249 | raise (Id_line id) | 250 | raise (Id_line id) |
250 | - else failwith ("load_token: " ^ line) | 251 | + else raise Comment_line |
252 | + (* failwith ("load_token: " ^ line) *) | ||
251 | else | 253 | else |
252 | match Xstring.split "\t" line with | 254 | match Xstring.split "\t" line with |
253 | [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] -> | 255 | [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] -> |
@@ -272,6 +274,7 @@ let load_sentence in_channel = | @@ -272,6 +274,7 @@ let load_sentence in_channel = | ||
272 | if id_a <> conll_id then failwith "load_sentence: different ids" else | 274 | if id_a <> conll_id then failwith "load_sentence: different ids" else |
273 | pom ((id_a,super,label) :: rev_paths) id | 275 | pom ((id_a,super,label) :: rev_paths) id |
274 | with Id_line new_id -> pom rev_paths new_id | 276 | with Id_line new_id -> pom rev_paths new_id |
277 | + | Comment_line -> pom rev_paths id | ||
275 | | Empty_line -> rev_paths, id | 278 | | Empty_line -> rev_paths, id |
276 | | End_of_file -> if rev_paths = [] | 279 | | End_of_file -> if rev_paths = [] |
277 | then raise End_of_file | 280 | then raise End_of_file |
corpora/CONLL_adapter.ml
@@ -42,6 +42,34 @@ let if_interps interps token = | @@ -42,6 +42,34 @@ let if_interps interps token = | ||
42 | ) interp in | 42 | ) interp in |
43 | Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) | 43 | Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) |
44 | 44 | ||
(* [change_dep paths i (id,super,label)] re-attaches entry [i] above its parent, in place.
   It reads the parent entry paths.(super), then:
   - paths.(i) keeps [id] and [label] but takes over the parent field of the old parent (super_S);
   - paths.(super) keeps its own id and label, and its parent field is set to [id].
   Note that the value written as the new parent is the token id [id], not the array index [i].
   paths.(super) is accessed unconditionally, so the caller has to pass a [super] that is a
   valid index of [paths]. Returns unit; the array is mutated. *)
45 | +let change_dep paths i (id,super,label) = | ||
46 | + let id_S, super_S, label_S = paths.(super) in | ||
47 | + paths.(i) <- (id,super_S,label); | ||
48 | + paths.(super) <- (id_S, id, label_S) | ||
49 | + | ||
(* [correct_injection paths tokens] blanks out pairs of "punct" entries that surround their
   common parent, mutating [paths] in place and returning the same array.
   For every entry [i] labelled "punct" two helper values are computed:
   - [j]: folded over the later indices with initial value 0. Each later entry with the same
     parent overwrites the accumulator with its own index if it is labelled "punct" and
     with 0 otherwise. Presumably Int.fold visits i+1 .. length-1 in ascending order, so
     [j] reflects the last such entry; Int.fold is defined outside this view, TODO confirm.
   - [k]: starts as [i] and becomes 0 as soon as any entry in the range passed to
     Int.fold_down (presumably indices i-1 down to 1, TODO confirm) has the same parent.
   When k is still i, j is non-zero and i < super < j, both paths.(i) and paths.(j) are
   overwritten with (0,-1,"").
   [k == i] is physical equality; on immediate ints it gives the same answer as [=].
   The [tokens] argument is not used in this body. *)
50 | +let correct_injection paths tokens = Array.iteri (fun i (id,super,label) -> | ||
51 | + if label = "punct" then (* must be the first token with this father *) | ||
52 | + let j = Int.fold (i+1) (Array.length paths - 1) 0 (fun acc n -> | ||
53 | + let i2,s2,l2 = paths.(n) in | ||
54 | + if super = s2 | ||
55 | + then if l2 = "punct" | ||
56 | + then n | ||
57 | + else 0 | ||
58 | + else acc | ||
59 | + ) in | ||
60 | + let k = Int.fold_down (i-1) 1 i (fun acc n -> | ||
61 | + let i2,s2,l2 = paths.(n) in | ||
62 | + if super = s2 | ||
63 | + then 0 | ||
64 | + else acc | ||
65 | + ) in | ||
66 | + if k == i && j <> 0 && i < super && super < j | ||
67 | + then | ||
68 | + (paths.(i) <- (0,-1,""); | ||
69 | + paths.(j) <- (0,-1,"")) | ||
70 | + ) paths; | ||
71 | + paths | ||
72 | + | ||
45 | let correct_coordination1 paths tokens = | 73 | let correct_coordination1 paths tokens = |
46 | let paths_ls = List.mapi (fun i (id,super,label) -> | 74 | let paths_ls = List.mapi (fun i (id,super,label) -> |
47 | (i,id,super,label)) (Array.to_list paths) in | 75 | (i,id,super,label)) (Array.to_list paths) in |
@@ -136,15 +164,15 @@ let correct_coordination2 paths tokens = | @@ -136,15 +164,15 @@ let correct_coordination2 paths tokens = | ||
136 | let paths_ls () = List.mapi (fun i (id,super,label) -> | 164 | let paths_ls () = List.mapi (fun i (id,super,label) -> |
137 | (i,id,super,label)) (Array.to_list paths_c) in | 165 | (i,id,super,label)) (Array.to_list paths_c) in |
138 | 166 | ||
139 | - (* let ps a sons = | 167 | + let ps a sons = |
140 | print_endline a; | 168 | print_endline a; |
141 | List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; | 169 | List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; |
142 | - print_endline "" in *) | 170 | + print_endline "" in |
143 | 171 | ||
144 | let rec correct_rec (i,id,super,label) sons = | 172 | let rec correct_rec (i,id,super,label) sons = |
145 | let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in | 173 | let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in |
146 | - (* ps "left:" (List.rev left_s); | ||
147 | - ps "right:" right_s; *) | 174 | + ps "left:" (List.rev left_s); |
175 | + ps "right:" right_s; | ||
148 | find_father i (List.rev left_s); | 176 | find_father i (List.rev left_s); |
149 | find_father i right_s | 177 | find_father i right_s |
150 | 178 | ||
@@ -154,23 +182,35 @@ let correct_coordination2 paths tokens = | @@ -154,23 +182,35 @@ let correct_coordination2 paths tokens = | ||
154 | paths_c.(i) <- (id,i0,label); | 182 | paths_c.(i) <- (id,i0,label); |
155 | if not (if_cat ["conj"] (ExtArray.get tokens i).token || | 183 | if not (if_cat ["conj"] (ExtArray.get tokens i).token || |
156 | (ExtArray.get tokens i).orth = ",") | 184 | (ExtArray.get tokens i).orth = ",") |
157 | - then failwith "find_father"; | 185 | + then failwith "find_father1"; |
158 | correct_rec (i,id,super,label) (if a < i | 186 | correct_rec (i,id,super,label) (if a < i |
159 | then (a,b,c,d) :: t | 187 | then (a,b,c,d) :: t |
160 | else List.rev @@ (a,b,c,d) :: t) | 188 | else List.rev @@ (a,b,c,d) :: t) |
161 | - | _ -> failwith "find_father" in | 189 | + | [] -> failwith "find_father2" in |
162 | 190 | ||
163 | let check_previous_for_interp i = | 191 | let check_previous_for_interp i = |
164 | if i >= 0 && (ExtArray.get tokens i).orth = "," && | 192 | if i >= 0 && (ExtArray.get tokens i).orth = "," && |
165 | not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c)) | 193 | not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c)) |
166 | then paths_c.(i) <- (0,-1,"") in | 194 | then paths_c.(i) <- (0,-1,"") in |
167 | 195 | ||
(* [filter_comp_construction sons] walks the list of (index, id, super, label) tuples from
   left to right. Whenever an element whose token satisfies if_cat ["interp"] is immediately
   followed by one whose token satisfies if_cat ["comp"], both elements are dropped; every
   other element is kept, and the result preserves the original order (the accumulator is
   reversed at the end). Tokens are looked up in the enclosing [tokens] array by the first
   tuple component. if_cat is defined outside this view.
   NOTE(review): the only call visible in this hunk is commented out, so the helper looks
   unused at the moment. *)
196 | + let filter_comp_construction sons = | ||
197 | + let rec pom acc = function | ||
198 | + (i1,id1,super1,label1) :: (i2,id2,super2,label2) :: t -> | ||
199 | + if if_cat ["interp"] (ExtArray.get tokens i1).token && | ||
200 | + if_cat ["comp"] (ExtArray.get tokens i2).token | ||
201 | + then pom acc t | ||
202 | + else pom ((i1,id1,super1,label1) :: acc) ((i2,id2,super2,label2) :: t) | ||
203 | + | h :: t -> pom (h :: acc) t | ||
204 | + | [] -> List.rev acc in | ||
205 | + pom [] sons in | ||
206 | + | ||
168 | Array.iteri (fun i (id,super,label) -> | 207 | Array.iteri (fun i (id,super,label) -> |
169 | if if_cat ["conj"] (ExtArray.get tokens i).token || | 208 | if if_cat ["conj"] (ExtArray.get tokens i).token || |
170 | (ExtArray.get tokens i).orth = "," | 209 | (ExtArray.get tokens i).orth = "," |
171 | then | 210 | then |
172 | (check_previous_for_interp (i-1); | 211 | (check_previous_for_interp (i-1); |
173 | let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in | 212 | let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in |
213 | + (* let sons = filter_comp_construction sons in *) | ||
174 | if (List.length sons > 2) | 214 | if (List.length sons > 2) |
175 | then correct_rec (i,id,super,label) sons)) paths_c; | 215 | then correct_rec (i,id,super,label) sons)) paths_c; |
176 | paths_c | 216 | paths_c |
@@ -206,15 +246,16 @@ done; *) | @@ -206,15 +246,16 @@ done; *) | ||
206 | 246 | ||
207 | let brev i id super label = | 247 | let brev i id super label = |
208 | let if_the_last_dot () = | 248 | let if_the_last_dot () = |
209 | - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> | ||
210 | - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in | ||
211 | - Array.fold_left (fun acc (i2,s,l) -> | ||
212 | - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in | 249 | + try |
250 | + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> | ||
251 | + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in | ||
252 | + Array.fold_left (fun acc (i2,s,l) -> | ||
253 | + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths | ||
254 | + with Not_found -> true in | ||
213 | 255 | ||
214 | let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () | 256 | let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () |
215 | then "" | 257 | then "" |
216 | else "." in | 258 | else "." in |
217 | - | ||
218 | let n_orth = (ExtArray.get tokens id).orth ^ dot in | 259 | let n_orth = (ExtArray.get tokens id).orth ^ dot in |
219 | paths.(i) <- (find_token n_orth,super,label) in | 260 | paths.(i) <- (find_token n_orth,super,label) in |
220 | 261 | ||
@@ -317,6 +358,16 @@ let correct_interp_with_father_0 paths tokens = | @@ -317,6 +358,16 @@ let correct_interp_with_father_0 paths tokens = | ||
317 | then paths.(i1) <- (id1,0,label1)) paths) paths; | 358 | then paths.(i1) <- (id1,0,label1)) paths) paths; |
318 | paths | 359 | paths |
319 | 360 | ||
(* [corect_complm paths tokens] rewires "complm" dependents of conjunctions.
   For every entry [i] whose label is "complm" and whose parent index [super] is greater
   than 0, the parent entry paths.(super) is read; if the token referenced by the parent
   entry satisfies if_cat ["conj"], change_dep is applied to entry [i], which swaps the
   direction of the link between the entry and its parent.
   [paths] is mutated in place while Array.iteri is running over it, and the same array is
   returned. Entries with super <= 0 are left untouched. *)
361 | +let corect_complm paths tokens = | ||
362 | + Array.iteri (fun i (id,super,label) -> | ||
363 | + if label = "complm" && super > 0 | ||
364 | + then | ||
365 | + let i2,s2,l2 = paths.(super) in | ||
366 | + if if_cat ["conj"] (ExtArray.get tokens i2).token | ||
367 | + then change_dep paths i (id,super,label) | ||
368 | + ) paths; | ||
369 | + paths | ||
370 | + | ||
320 | let remove_interps interp paths tokens = | 371 | let remove_interps interp paths tokens = |
321 | let paths_ls = Array.to_list paths in | 372 | let paths_ls = Array.to_list paths in |
322 | Array.iteri (fun i (id,super,label) -> | 373 | Array.iteri (fun i (id,super,label) -> |
@@ -339,10 +390,6 @@ let correct_passive_voice paths tokens = | @@ -339,10 +390,6 @@ let correct_passive_voice paths tokens = | ||
339 | paths | 390 | paths |
340 | 391 | ||
341 | let swap_dep paths tokens = | 392 | let swap_dep paths tokens = |
342 | - let change_dep i (id,super,label) = | ||
343 | - let id_S, super_S, label_S = paths.(super) in | ||
344 | - paths.(i) <- (id,super_S,label); | ||
345 | - paths.(super) <- (id_S, id, label_S) in | ||
346 | let rec correct_dep i (id,super,label) = | 393 | let rec correct_dep i (id,super,label) = |
347 | let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; | 394 | let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; |
348 | "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in | 395 | "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in |
@@ -356,7 +403,7 @@ let swap_dep paths tokens = | @@ -356,7 +403,7 @@ let swap_dep paths tokens = | ||
356 | (if_lemma adv_relators (ExtArray.get tokens id).token && | 403 | (if_lemma adv_relators (ExtArray.get tokens id).token && |
357 | if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token) | 404 | if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token) |
358 | then | 405 | then |
359 | - change_dep i (id,super,label); | 406 | + change_dep paths i (id,super,label); |
360 | if (if_lemma adv_relators (ExtArray.get tokens id).token && | 407 | if (if_lemma adv_relators (ExtArray.get tokens id).token && |
361 | if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) | 408 | if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) |
362 | then correct_dep i paths.(i) in | 409 | then correct_dep i paths.(i) in |
@@ -367,7 +414,11 @@ let swap_dep paths tokens = | @@ -367,7 +414,11 @@ let swap_dep paths tokens = | ||
367 | nieobsługiwana na razie koordynacja strony biernej - zarówno czasowniki posiłkowe, jak i imiesłowy | 414 | nieobsługiwana na razie koordynacja strony biernej - zarówno czasowniki posiłkowe, jak i imiesłowy |
368 | nieobsługiwana na razie koordynacja podrzędników spójników podrzędnych *) | 415 | nieobsługiwana na razie koordynacja podrzędników spójników podrzędnych *) |
369 | 416 | ||
370 | -let convert_dep_tree id first_try paths tokens = | 417 | +let convert_dep_tree path first_try paths tokens = |
418 | + File.file_out (path ^ "/pre_text_unmodified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | ||
419 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; | ||
420 | + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); | ||
421 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); | ||
371 | let paths = Array.copy paths in | 422 | let paths = Array.copy paths in |
372 | let paths = | 423 | let paths = |
373 | if first_try | 424 | if first_try |
@@ -375,16 +426,27 @@ let convert_dep_tree id first_try paths tokens = | @@ -375,16 +426,27 @@ let convert_dep_tree id first_try paths tokens = | ||
375 | let pom = replace_tokens paths tokens in | 426 | let pom = replace_tokens paths tokens in |
376 | let pom = (remove_interps ".") pom tokens in | 427 | let pom = (remove_interps ".") pom tokens in |
377 | let pom = replace_hyphens pom tokens in | 428 | let pom = replace_hyphens pom tokens in |
429 | + let pom = correct_injection pom tokens in | ||
378 | let pom = correct_coordination1 pom tokens in | 430 | let pom = correct_coordination1 pom tokens in |
379 | let pom = correct_interp_with_father_0 pom tokens in | 431 | let pom = correct_interp_with_father_0 pom tokens in |
380 | - let pom = correct_coordination2 pom tokens in | ||
381 | - let pom = remove_interps "," pom tokens in | 432 | + (* File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> |
433 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; | ||
434 | + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); | ||
435 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); *) | ||
436 | + let pom = try corect_complm pom tokens with | e -> print_endline (Printexc.to_string e); pom in | ||
437 | + let pom = try | ||
438 | + let pom2 = correct_coordination2 pom tokens in | ||
439 | + remove_interps "," pom2 tokens | ||
440 | + with | ||
441 | + | _ -> (let pom2 = remove_interps "," pom tokens in | ||
442 | + correct_coordination2 pom2 tokens) in | ||
382 | let pom = correct_passive_voice pom tokens in | 443 | let pom = correct_passive_voice pom tokens in |
383 | praet_qub_aglt pom tokens | 444 | praet_qub_aglt pom tokens |
384 | else | 445 | else |
385 | - swap_dep paths tokens in | ||
386 | - (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | ||
387 | - Printf.fprintf file "%s\n" Visualization.html_header; | ||
388 | - Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); | ||
389 | - Printf.fprintf file "%s\n" Visualization.html_trailer); *) | 446 | + paths in |
447 | + (* swap_dep paths tokens in *) | ||
448 | + File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> | ||
449 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; | ||
450 | + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); | ||
451 | + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); | ||
390 | paths | 452 | paths |
corpora/makefile
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt | ||
3 | OCAMLDEP=ocamldep | 3 | OCAMLDEP=ocamldep |
4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | OCAMLFLAGS=$(INCLUDES) -g | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa | 6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa eniam-exec.cmxa |
7 | INSTALLDIR=`ocamlc -where`/eniam | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | ||
9 | SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml | 9 | SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml |
corpora/test_conll.ml
@@ -48,7 +48,7 @@ let clarify_categories senses token = | @@ -48,7 +48,7 @@ let clarify_categories senses token = | ||
48 | | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) | 48 | | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) |
49 | | _ -> [] | 49 | | _ -> [] |
50 | 50 | ||
51 | -let create_chart tokens lex_sems paths last = | 51 | +(* let create_chart tokens lex_sems paths last = |
52 | ENIAM_LCGrenderer.reset_variable_numbers (); | 52 | ENIAM_LCGrenderer.reset_variable_numbers (); |
53 | let chart = ENIAM_LCGchart.make last in | 53 | let chart = ENIAM_LCGchart.make last in |
54 | let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> | 54 | let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> |
@@ -59,7 +59,7 @@ let create_chart tokens lex_sems paths last = | @@ -59,7 +59,7 @@ let create_chart tokens lex_sems paths last = | ||
59 | let cats = clarify_categories ["X"] t in | 59 | let cats = clarify_categories ["X"] t in |
60 | let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | 60 | let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in |
61 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | 61 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in |
62 | - chart | 62 | + chart *) |
63 | 63 | ||
64 | let rec split_sons left id right = function | 64 | let rec split_sons left id right = function |
65 | [] -> List.rev (List.sort compare left), List.sort compare right | 65 | [] -> List.rev (List.sort compare left), List.sort compare right |
@@ -85,7 +85,7 @@ let create_dep_chart tokens lex_sems paths = | @@ -85,7 +85,7 @@ let create_dep_chart tokens lex_sems paths = | ||
85 | ENIAM_LCGrenderer.reset_variable_names (); | 85 | ENIAM_LCGrenderer.reset_variable_names (); |
86 | ENIAM_LCGrenderer.add_variable_numbers (); | 86 | ENIAM_LCGrenderer.add_variable_numbers (); |
87 | let cats = clarify_categories ["X"] t in | 87 | let cats = clarify_categories ["X"] t in |
88 | - let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in | 88 | + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in |
89 | IntMap.add nodes i l) in | 89 | IntMap.add nodes i l) in |
90 | (* print_endline "create_dep_chart 3"; *) | 90 | (* print_endline "create_dep_chart 3"; *) |
91 | let x = dep_create_rec nodes sons 0 in | 91 | let x = dep_create_rec nodes sons 0 in |
@@ -93,7 +93,7 @@ let create_dep_chart tokens lex_sems paths = | @@ -93,7 +93,7 @@ let create_dep_chart tokens lex_sems paths = | ||
93 | x | 93 | x |
94 | 94 | ||
95 | 95 | ||
96 | -let test_example path id tokens lex_sems paths last = | 96 | +(* let test_example path id tokens lex_sems paths last = |
97 | ENIAM_LCGreductions.reset_variant_label (); | 97 | ENIAM_LCGreductions.reset_variant_label (); |
98 | let chart = create_chart tokens lex_sems paths last in | 98 | let chart = create_chart tokens lex_sems paths last in |
99 | ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart; | 99 | ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart; |
@@ -119,43 +119,45 @@ let test_example path id tokens lex_sems paths last = | @@ -119,43 +119,45 @@ let test_example path id tokens lex_sems paths last = | ||
119 | ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; | 119 | ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; |
120 | ()) | 120 | ()) |
121 | else print_endline "not reduced") | 121 | else print_endline "not reduced") |
122 | - else print_endline "not parsed" | 122 | + else print_endline "not parsed" *) |
123 | 123 | ||
124 | -let test_dep_example path id tokens lex_sems paths = | 124 | +let rec test_dep_example path id tokens lex_sems first_try paths = |
125 | + (* print_endline "test_dep_example 1"; *) | ||
126 | + let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in | ||
125 | try | 127 | try |
126 | - ENIAM_LCGreductions.reset_variant_label (); | ||
127 | - print_endline "test_dep_example 1"; | ||
128 | - let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in | ||
129 | - print_endline "test_dep_example 2"; | ||
130 | - (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) | ||
131 | - let chart = create_dep_chart tokens lex_sems paths in | ||
132 | - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) | ||
133 | - let chart,references = ENIAM_LCGchart.dep_lazify chart in | ||
134 | - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *) | ||
135 | - (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *) | ||
136 | - let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | ||
137 | - (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) | ||
138 | - (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *) | ||
139 | - if ENIAM_LCGchart.is_dep_parsed chart then ( | ||
140 | - let term = ENIAM_LCGchart.get_dep_parsed_term chart in | ||
141 | - (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> | ||
142 | - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | ||
143 | - Xlatex.latex_compile_and_clean path (id^"4_term"); *) | ||
144 | - let dependency_tree = ENIAM_LCGreductions.reduce term references in | ||
145 | - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *) | ||
146 | - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | ||
147 | - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | ||
148 | - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *) | ||
149 | - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | ||
150 | - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *) | ||
151 | - (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *) | ||
152 | - (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *) | ||
153 | - ()) | ||
154 | - else print_endline "not reduced") | ||
155 | - else print_endline "not parsed" | 128 | + ENIAM_LCGreductions.reset_variant_label (); |
129 | + (* print_endline "test_dep_example 2"; *) | ||
130 | + (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) | ||
131 | + let chart = create_dep_chart tokens lex_sems paths in | ||
132 | + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) | ||
133 | + let chart,references = ENIAM_LCGchart.dep_lazify chart in | ||
134 | + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *) | ||
135 | + (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *) | ||
136 | + let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) | ||
137 | + (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) | ||
138 | + (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *) | ||
139 | + if ENIAM_LCGchart.is_dep_parsed chart then ( | ||
140 | + let term = ENIAM_LCGchart.get_dep_parsed_term chart in | ||
141 | + (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> | ||
142 | + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); | ||
143 | + Xlatex.latex_compile_and_clean path (id^"4_term"); *) | ||
144 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | ||
145 | + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *) | ||
146 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( | ||
147 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | ||
148 | + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *) | ||
149 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) | ||
150 | + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *) | ||
151 | + (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *) | ||
152 | + (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *) | ||
153 | + ()) | ||
154 | + else print_endline "not reduced") | ||
155 | + else print_endline "not parsed" | ||
156 | with NotDepParsed(id_ndp,left,l,right) -> ( | 156 | with NotDepParsed(id_ndp,left,l,right) -> ( |
157 | - print_endline "not parsed 2"; | ||
158 | - ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right)) | 157 | + if (first_try) |
158 | + then test_dep_example path id tokens lex_sems false paths | ||
159 | + else (print_endline "not parsed 2"; | ||
160 | + ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right))) | ||
159 | 161 | ||
160 | let rec parse_sentence name id tokens lex_sems = function | 162 | let rec parse_sentence name id tokens lex_sems = function |
161 | RawSentence s -> id | 163 | RawSentence s -> id |
@@ -163,7 +165,7 @@ let rec parse_sentence name id tokens lex_sems = function | @@ -163,7 +165,7 @@ let rec parse_sentence name id tokens lex_sems = function | ||
163 | (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) | 165 | (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) |
164 | id + 1 | 166 | id + 1 |
165 | | DepSentence(paths) -> | 167 | | DepSentence(paths) -> |
166 | - test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; | 168 | + test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems true paths; |
167 | id + 1 | 169 | id + 1 |
168 | | QuotedSentences sentences -> | 170 | | QuotedSentences sentences -> |
169 | Xlist.fold sentences id (fun id p -> | 171 | Xlist.fold sentences id (fun id p -> |
@@ -212,8 +214,8 @@ let process_id s = | @@ -212,8 +214,8 @@ let process_id s = | ||
212 | else failwith ("process_id: " ^ s) | 214 | else failwith ("process_id: " ^ s) |
213 | 215 | ||
214 | let process_conll_corpus filename = | 216 | let process_conll_corpus filename = |
215 | - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in | ||
216 | - print_endline "process_conll_corpus"; | 217 | + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in |
218 | + (* print_endline "process_conll_corpus 1"; *) | ||
217 | (* let corpus = [List.hd corpus] in *) | 219 | (* let corpus = [List.hd corpus] in *) |
218 | Xlist.iter corpus (fun query -> try | 220 | Xlist.iter corpus (fun query -> try |
219 | let id = process_id (get_query_id query) in | 221 | let id = process_id (get_query_id query) in |
@@ -226,13 +228,17 @@ let process_conll_corpus filename = | @@ -226,13 +228,17 @@ let process_conll_corpus filename = | ||
226 | (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) | 228 | (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) |
227 | let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] | 229 | let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] |
228 | (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in | 230 | (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in |
231 | + (* print_endline "process_conll_corpus 2"; *) | ||
229 | let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in | 232 | let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in |
233 | + (* print_endline "process_conll_corpus 3"; *) | ||
230 | let sentences = match text with | 234 | let sentences = match text with |
231 | AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences | 235 | AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences |
232 | | _ -> failwith "process_conll_corpus 1" in | 236 | | _ -> failwith "process_conll_corpus 1" in |
233 | let text = AltText[Raw,RawText query; Struct, StructText([ | 237 | let text = AltText[Raw,RawText query; Struct, StructText([ |
234 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in | 238 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in |
239 | + (* print_endline "process_conll_corpus 4"; *) | ||
235 | let lex_sems = ENIAMlexSemantics.assign tokens text in | 240 | let lex_sems = ENIAMlexSemantics.assign tokens text in |
241 | + (* print_endline "process_conll_corpus 5"; *) | ||
236 | ignore(parse_text id 1 tokens lex_sems text) | 242 | ignore(parse_text id 1 tokens lex_sems text) |
237 | | _ -> failwith "process_conll_corpus 2" | 243 | | _ -> failwith "process_conll_corpus 2" |
238 | with | 244 | with |
@@ -241,6 +247,7 @@ let process_conll_corpus filename = | @@ -241,6 +247,7 @@ let process_conll_corpus filename = | ||
241 | 247 | ||
242 | let _ = | 248 | let _ = |
243 | Printexc.record_backtrace true; | 249 | Printexc.record_backtrace true; |
250 | + ENIAMlexSemantics.initialize (); | ||
244 | (* LCGfields.reset (); *) | 251 | (* LCGfields.reset (); *) |
245 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) | 252 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) |
246 | (* process_conll_corpus "../testy/skladnica-test1.conll"; *) | 253 | (* process_conll_corpus "../testy/skladnica-test1.conll"; *) |
exec/ENIAMexec.ml
@@ -85,6 +85,37 @@ let create_chart rules tokens lex_sems paths last = | @@ -85,6 +85,37 @@ let create_chart rules tokens lex_sems paths last = | ||
85 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in | 85 | ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in |
86 | chart | 86 | chart |
87 | 87 | ||
88 | +let rec split_sons left id right = function | ||
89 | + [] -> List.rev (List.sort compare left), List.sort compare right | ||
90 | + | x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l | ||
91 | + | ||
92 | +let rec dep_create_rec nodes sons conll_id = | ||
93 | + let node = IntMap.find nodes conll_id in | ||
94 | + let l = try IntMap.find sons conll_id with Not_found -> [] in | ||
95 | + let left,right = split_sons [] conll_id [] l in | ||
96 | + (* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *) | ||
97 | + DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons)) | ||
98 | + | ||
99 | +let create_dep_chart dep_rules tokens lex_sems paths = | ||
100 | + (* print_endline "create_dep_chart 1"; *) | ||
101 | + let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i -> | ||
102 | + let _,super,_ = paths.(i) in | ||
103 | + IntMap.add_inc sons super [i] (fun l -> i :: l)) in | ||
104 | + (* print_endline "create_dep_chart 2"; *) | ||
105 | + let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i -> | ||
106 | + let id,_,_ = paths.(i) in | ||
107 | + let t = ExtArray.get tokens id in | ||
108 | + let s = ExtArray.get lex_sems id in | ||
109 | + ENIAM_LCGrenderer.reset_variable_names (); | ||
110 | + ENIAM_LCGrenderer.add_variable_numbers (); | ||
111 | + let cats = clarify_categories ["X"] t in | ||
112 | + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in | ||
113 | + IntMap.add nodes i l) in | ||
114 | + (* print_endline "create_dep_chart 3"; *) | ||
115 | + let x = dep_create_rec nodes sons 0 in | ||
116 | + (* print_endline "create_dep_chart 4"; *) | ||
117 | + x | ||
118 | + | ||
88 | let create_text_fragments tokens paths last = | 119 | let create_text_fragments tokens paths last = |
89 | let text_fragments = Array.make last IntMap.empty in | 120 | let text_fragments = Array.make last IntMap.empty in |
90 | Xlist.iter paths (fun (id,lnode,rnode) -> | 121 | Xlist.iter paths (fun (id,lnode,rnode) -> |
@@ -156,85 +187,75 @@ let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last = | @@ -156,85 +187,75 @@ let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last = | ||
156 | with e -> | 187 | with e -> |
157 | let time2 = time_fun () in | 188 | let time2 = time_fun () in |
158 | {result with status=LexiconError; msg=string_of_exn e; lex_time=time2 -. time1} | 189 | {result with status=LexiconError; msg=string_of_exn e; lex_time=time2 -. time1} |
159 | -(* | ||
160 | -let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems = | ||
161 | - let result = empty_conll_parse_result in | ||
162 | - let time2 = time_fun () in | ||
163 | - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *) | 190 | + |
191 | +let rec conll_parse_sentence timeout verbosity dep_rules first_try tokens lex_sems paths = | ||
192 | + ENIAM_LCGreductions.reset_variant_label (); | ||
193 | + let result = {empty_conll_parse_result with paths_size = Xlist.size paths} in | ||
194 | + let result = if verbosity = 0 then result else result(*{result with text_fragments=create_dep_text_fragments tokens paths last}*) in (* FIXME *) | ||
195 | + let time1 = time_fun () in | ||
164 | try | 196 | try |
165 | - let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in | ||
166 | - let dep_chart,references = LCGchart.dep_lazify dep_chart in | ||
167 | - let result = if test_only_flag then result else {result with dep_chart=dep_chart} in | ||
168 | - let time3 = time_fun () in | ||
169 | - let result = {result with lex_time=time3 -. time2} in | 197 | + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in |
198 | + let chart = create_chart dep_rules tokens lex_sems paths in | ||
199 | + let result = if verbosity = 0 then result else {result with chart1=chart} in | ||
200 | + let chart,references = ENIAM_LCGchart.dep_lazify chart in | ||
201 | + let result = if verbosity = 0 then result else {result with chart2=chart; references2=ExtArray.copy references} in | ||
202 | + let time2 = time_fun () in | ||
203 | + let result = {result with lex_time=time2 -. time1} in | ||
170 | try | 204 | try |
171 | - (* print_endline "conll_parse_sentence 1"; *) | ||
172 | - (* LCGlatexOf.print_references "results/" "references1" references; *) | ||
173 | - let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) | ||
174 | - (* print_endline "conll_parse_sentence 2"; *) | ||
175 | - (* LCGlatexOf.print_references "results/" "references2" references; *) | ||
176 | - let time4 = time_fun () in | ||
177 | - let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in | ||
178 | - let result = {result with parse_time=time4 -. time3} in | ||
179 | - if LCGchart.is_dep_parsed parsed_dep_chart then | 205 | + let chart = ENIAM_LCGchart.dep_parse chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) |
206 | + let time3 = time_fun () in | ||
207 | + let result = if verbosity = 0 then result else {result with parsed_dep_chart=chart; references3=references} in | ||
208 | + let result = {result with parse_time=time3 -. time2; chart_size=ENIAM_LCGchart.get_no_entries chart} in | ||
209 | + if ENIAM_LCGchart.is_dep_parsed chart then | ||
180 | try | 210 | try |
181 | - let term = LCGchart.get_dep_parsed_term tokens lex_sems parsed_dep_chart in | ||
182 | - (* LCGlatexOf.print_dependency_tree "dep_dependency_tree1" dependency_tree; *) | ||
183 | - let dependency_tree = LCGreductions.reduce term references in | ||
184 | - let time5 = time_fun () in | ||
185 | - let result = if test_only_flag then result else {result with dependency_tree=dependency_tree} in | ||
186 | - let result = {result with reduction_time=time5 -. time4; dependency_tree_size=Array.length dependency_tree} in | ||
187 | - if LCGreductions.is_reduced_dependency_tree dependency_tree then | 211 | + let term = ENIAM_LCGchart.get_dep_parsed_term chart in |
212 | + let result = if verbosity = 0 then result else {result with term4=term} in | ||
213 | + let dependency_tree = ENIAM_LCGreductions.reduce term references in | ||
214 | + let time4 = time_fun () in | ||
215 | + let result = if verbosity = 0 then result else {result with dependency_tree4=Array.copy dependency_tree} in | ||
216 | + let result = {result with reduction_time=time4 -. time3; dependency_tree_size=Array.length dependency_tree} in | ||
217 | + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then | ||
188 | try | 218 | try |
189 | - (* print_endline "conll_parse_sentence 3"; *) | ||
190 | - LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | ||
191 | - (* print_endline "conll_parse_sentence 4"; *) | ||
192 | - LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | ||
193 | -(* if Array.length dependency_tree < 10000 then print_xml_dependency_tree "results/trees/" id dependency_tree; *) | ||
194 | - (* print_endline "conll_parse_sentence 5"; *) | 219 | + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) |
220 | + let result = if verbosity = 0 then result else {result with dependency_tree5=Array.copy dependency_tree} in | ||
221 | + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) | ||
222 | + let result = (*if verbosity = 0 then result else*) {result with dependency_tree6=dependency_tree} in | ||
195 | let time6 = time_fun () in | 223 | let time6 = time_fun () in |
196 | - {result with status=Parsed; sem_time=time6 -. time5} | 224 | + {result with status=Parsed; sem_time=time6 -. time4} |
197 | with e -> | 225 | with e -> |
198 | let time6 = time_fun () in | 226 | let time6 = time_fun () in |
199 | - {result with status=SemError; msg=string_of_exn e; sem_time=time6 -. time5} | 227 | + {result with status=SemError1; msg=string_of_exn e; sem_time=time6 -. time4} |
200 | else | 228 | else |
201 | {result with status=NotReduced} | 229 | {result with status=NotReduced} |
202 | with | 230 | with |
203 | | SemTooBig -> | 231 | | SemTooBig -> |
204 | - let time5 = time_fun () in | ||
205 | - {result with status=TooManyNodes; reduction_time=time5 -. time4} | 232 | + let time4 = time_fun () in |
233 | + {result with status=TooManyNodes; reduction_time=time4 -. time3} | ||
206 | | e -> | 234 | | e -> |
207 | - let time5 = time_fun () in | ||
208 | - {result with status=ReductionError; msg=string_of_exn e; reduction_time=time5 -. time4} | 235 | + let time4 = time_fun () in |
236 | + {result with status=ReductionError; msg=string_of_exn e; reduction_time=time4 -. time3} | ||
209 | else if first_try | 237 | else if first_try |
210 | - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | 238 | + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths |
211 | else {result with status=NotParsed} | 239 | else {result with status=NotParsed} |
212 | with | 240 | with |
213 | Timeout t -> | 241 | Timeout t -> |
214 | - let time4 = time_fun () in | ||
215 | - {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3} | 242 | + let time3 = time_fun () in |
243 | + {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time3 -. time2} | ||
216 | | NotDepParsed(id_ndp,left,l,right) -> | 244 | | NotDepParsed(id_ndp,left,l,right) -> |
217 | if first_try | 245 | if first_try |
218 | - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | 246 | + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths |
219 | else let time4 = time_fun () in | 247 | else let time4 = time_fun () in |
220 | {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} | 248 | {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} |
221 | | e -> | 249 | | e -> |
222 | - let time4 = time_fun () in | ||
223 | - {result with status=ParseError; msg=string_of_exn e; parse_time=time4 -. time3} | ||
224 | - with e -> (*print_endline (string_of_exn e);*) | ||
225 | - let time3 = time_fun () in | 250 | + let time3 = time_fun () in |
251 | + {result with status=ParseError; msg=string_of_exn e; parse_time=time3 -. time2} | ||
252 | + with e -> | ||
253 | + let time2 = time_fun () in | ||
226 | if first_try | 254 | if first_try |
227 | - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems | 255 | + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths |
228 | else {result with status=LexiconError; msg=string_of_exn e; lex_time=time3 -. time2} | 256 | else {result with status=LexiconError; msg=string_of_exn e; lex_time=time3 -. time2} |
229 | 257 | ||
230 | - | ||
231 | -let mate_in, mate_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*) | ||
232 | - if Paths.config.Paths.mate_parser_enabled then | ||
233 | - Unix.open_process ("java -jar " ^ Paths.config.Paths.mate_parser_path ^ "dist/anna-3.5.jar -model " ^ | ||
234 | - Paths.config.Paths.mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test") | ||
235 | - else stdin, stdout | ||
236 | - | ||
237 | -let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*) | 258 | +(*let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*) |
238 | if Paths.config.Paths.swigra_enabled then | 259 | if Paths.config.Paths.swigra_enabled then |
239 | Unix.open_process (Paths.config.Paths.swigra_path ^ "run.sh") | 260 | Unix.open_process (Paths.config.Paths.swigra_path ^ "run.sh") |
240 | else stdin, stdout | 261 | else stdin, stdout |
@@ -256,38 +277,21 @@ let parse timeout verbosity rules (*name id*) tokens lex_sems = | @@ -256,38 +277,21 @@ let parse timeout verbosity rules (*name id*) tokens lex_sems = | ||
256 | let result = eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last in | 277 | let result = eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last in |
257 | ENIAMSentence result | 278 | ENIAMSentence result |
258 | | _ -> failwith "parse 3") | 279 | | _ -> failwith "parse 3") |
259 | - | DepSentence(paths) -> | 280 | + | DepSentence paths -> |
260 | (match mode with | 281 | (match mode with |
261 | -(* CONLL -> | ||
262 | - let result = conll_parse_sentence timeout verbosity id true paths tokens lex_sems in | ||
263 | - let result = {result with | 282 | + CONLL | Mate -> |
283 | + let result = conll_parse_sentence timeout verbosity dep_rules true tokens lex_sems paths in | ||
284 | + (* let result = {result with | ||
264 | file_prefix = file_prefix_of_mode mode ^ file_prefix; | 285 | file_prefix = file_prefix_of_mode mode ^ file_prefix; |
265 | - paths = paths} in | 286 | + paths = paths} in *) |
266 | CONLLSentence result | 287 | CONLLSentence result |
267 | (* let xml = DepTree.conll_to_xml paths in | 288 | (* let xml = DepTree.conll_to_xml paths in |
268 | let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) | 289 | let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) |
269 | Visualization.print_graph "results/" "term_conll" graph; | 290 | Visualization.print_graph "results/" "term_conll" graph; |
270 | let result = {empty_eniam_parse_result with status=Parsed; term=graph} in | 291 | let result = {empty_eniam_parse_result with status=Parsed; term=graph} in |
271 | ENIAMSentence result, next_id *) | 292 | ENIAMSentence result, next_id *) |
272 | - | Mate -> | ||
273 | - if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else ( | ||
274 | - print_endline "parse_sentence 1"; | ||
275 | - (* print_endline (Visualization.html_of_dep_sentence tokens paths); *) | ||
276 | - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in | ||
277 | - print_endline "parse_sentence 2"; | ||
278 | - (* printf "|%s|\n" conll; *) | ||
279 | - Printf.fprintf mate_out "%s%!" conll; | ||
280 | - print_endline "parse_sentence 3"; | ||
281 | - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in | ||
282 | - print_endline "parse_sentence 4"; | ||
283 | - (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *) | ||
284 | - let result = conll_parse_sentence timeout verbosity id true new_paths tokens lex_sems in | ||
285 | - let result = {result with | ||
286 | - file_prefix = file_prefix_of_mode mode ^ file_prefix; | ||
287 | - paths=new_paths} in | ||
288 | - CONLLSentence result)*) | ||
289 | - | _ -> failwith "parse 2") | ||
290 | - | _ -> failwith "parse 1") | 293 | + | _ -> failwith "parse 2") |
294 | + | _ -> failwith "parse 1") | ||
291 | 295 | ||
292 | 296 | ||
293 | (* | 297 | (* |
exec/ENIAMexecTypes.ml
@@ -49,9 +49,9 @@ type eniam_parse_result = { | @@ -49,9 +49,9 @@ type eniam_parse_result = { | ||
49 | semantic_graph11: ENIAMsemTypes.linear_term; | 49 | semantic_graph11: ENIAMsemTypes.linear_term; |
50 | text_fragments: string IntMap.t array; | 50 | text_fragments: string IntMap.t array; |
51 | } | 51 | } |
52 | -(* | 52 | + |
53 | type conll_parse_result = { | 53 | type conll_parse_result = { |
54 | - file_prefix: string; | 54 | +(* file_prefix: string;*) |
55 | status: status; | 55 | status: status; |
56 | msg: string; | 56 | msg: string; |
57 | lex_time: float; | 57 | lex_time: float; |
@@ -59,17 +59,29 @@ type conll_parse_result = { | @@ -59,17 +59,29 @@ type conll_parse_result = { | ||
59 | reduction_time: float; | 59 | reduction_time: float; |
60 | sem_time: float; | 60 | sem_time: float; |
61 | paths_size: int; | 61 | paths_size: int; |
62 | + chart_size: int; | ||
62 | dependency_tree_size: int; | 63 | dependency_tree_size: int; |
63 | - paths: (int * int * string) array; | ||
64 | - dep_chart: LCGtypes.dep_tree; | ||
65 | - parsed_dep_chart: (LCGtypes.SymbolMap.key * LCGtypes.linear_term) list; | 64 | + chart1: dep_tree; |
65 | + chart2: dep_tree; | ||
66 | + references2: linear_term ExtArray.t; | ||
67 | + parsed_dep_chart: (SymbolMap.key * linear_term) list; | ||
66 | not_parsed_dep_chart: int * | 68 | not_parsed_dep_chart: int * |
67 | - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list * | ||
68 | - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list * | ||
69 | - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list; | ||
70 | - dependency_tree: LCGtypes.linear_term array; | 69 | + (grammar_symbol * linear_term) list list * |
70 | + (grammar_symbol * linear_term) list * | ||
71 | + (grammar_symbol * linear_term) list list; | ||
72 | + references3: linear_term ExtArray.t; | ||
73 | + term4: linear_term; | ||
74 | + dependency_tree4: linear_term array; | ||
75 | + dependency_tree5: linear_term array; | ||
76 | + dependency_tree6: linear_term array; | ||
77 | + dependency_tree7: linear_term array; | ||
78 | + dependency_tree8: linear_term ExtArray.t; | ||
79 | + dependency_tree9: linear_term array; | ||
80 | + semantic_graph10: ENIAMsemTypes.linear_term array; | ||
81 | + semantic_graph11: ENIAMsemTypes.linear_term; | ||
82 | + text_fragments: string IntMap.t array; | ||
71 | } | 83 | } |
72 | - | 84 | +(* |
73 | type semantic_processing_result = { | 85 | type semantic_processing_result = { |
74 | file_prefix: string; | 86 | file_prefix: string; |
75 | status: status; | 87 | status: status; |
@@ -190,6 +202,35 @@ let empty_eniam_parse_result = { | @@ -190,6 +202,35 @@ let empty_eniam_parse_result = { | ||
190 | text_fragments=[| |]; | 202 | text_fragments=[| |]; |
191 | } | 203 | } |
192 | 204 | ||
205 | +let empty_conll_parse_result = { | ||
206 | + (* file_prefix=""; *) | ||
207 | + status=Idle; | ||
208 | + msg=""; | ||
209 | + lex_time=0.; | ||
210 | + parse_time=0.; | ||
211 | + reduction_time=0.; | ||
212 | + sem_time=0.; | ||
213 | + paths_size=0; | ||
214 | + chart_size=0; | ||
215 | + dependency_tree_size=0; | ||
216 | + chart1=DepNode(-100,[],[],[]); | ||
217 | + chart2=DepNode(-100,[],[],[]); | ||
218 | + references2=ExtArray.make 0 Dot; | ||
219 | + references3=ExtArray.make 0 Dot; | ||
220 | + term4=Dot; | ||
221 | + dependency_tree4=[| |]; | ||
222 | + dependency_tree5=[| |]; | ||
223 | + dependency_tree6=[| |]; | ||
224 | + dependency_tree7=[| |]; | ||
225 | + dependency_tree8=ExtArray.make 0 Dot; | ||
226 | + dependency_tree9=[| |]; | ||
227 | + semantic_graph10=[| |]; | ||
228 | + semantic_graph11=ENIAMsemTypes.Dot; | ||
229 | + text_fragments=[| |]; | ||
230 | + parsed_dep_chart=[]; | ||
231 | + not_parsed_dep_chart=(-100,[],[],[]); | ||
232 | + } | ||
233 | + | ||
193 | (* | 234 | (* |
194 | let empty_result = { | 235 | let empty_result = { |
195 | input_text=RawText ""; | 236 | input_text=RawText ""; |
@@ -208,23 +249,6 @@ let empty_result = { | @@ -208,23 +249,6 @@ let empty_result = { | ||
208 | lex_sems=ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem; | 249 | lex_sems=ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem; |
209 | } | 250 | } |
210 | 251 | ||
211 | -let empty_conll_parse_result = { | ||
212 | - file_prefix=""; | ||
213 | - status=Idle; | ||
214 | - msg=""; | ||
215 | - lex_time=0.; | ||
216 | - parse_time=0.; | ||
217 | - reduction_time=0.; | ||
218 | - sem_time=0.; | ||
219 | - paths_size=0; | ||
220 | - dependency_tree_size=0; | ||
221 | - paths=[| |]; | ||
222 | - dep_chart=DepNode(-100,[],[],[]); | ||
223 | - parsed_dep_chart=[]; | ||
224 | - not_parsed_dep_chart=(-100,[],[],[]); | ||
225 | - dependency_tree=[| |]; | ||
226 | - } | ||
227 | - | ||
228 | let empty_semantic_processing_result = { | 252 | let empty_semantic_processing_result = { |
229 | file_prefix=""; | 253 | file_prefix=""; |
230 | status=Idle; | 254 | status=Idle; |
@@ -321,3 +345,5 @@ let rec fold_text mode s f = function | @@ -321,3 +345,5 @@ let rec fold_text mode s f = function | ||
321 | | AltText l -> | 345 | | AltText l -> |
322 | Xlist.fold l s (fun s (mode,text) -> | 346 | Xlist.fold l s (fun s (mode,text) -> |
323 | fold_text mode s f text) | 347 | fold_text mode s f text) |
348 | + | ||
349 | +let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic" |
exec/ENIAMvisualization.ml
@@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = | @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = | ||
702 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ | 702 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ |
703 | sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ | 703 | sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ |
704 | "</table>" | 704 | "</table>" |
705 | -(* | 705 | + |
706 | let html_of_dep_sentence tokens paths = | 706 | let html_of_dep_sentence tokens paths = |
707 | "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ | 707 | "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ |
708 | String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> | 708 | String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> |
@@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = | @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = | ||
711 | (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" | 711 | (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" |
712 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ | 712 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ |
713 | "</table>" | 713 | "</table>" |
714 | - | 714 | +(* |
715 | let html_of_tokens tokens = | 715 | let html_of_tokens tokens = |
716 | "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ | 716 | "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ |
717 | String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> | 717 | String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> |
@@ -1048,7 +1048,7 @@ let file_prefix_of_mode = function | @@ -1048,7 +1048,7 @@ let file_prefix_of_mode = function | ||
1048 | let rec html_of_sentence path file_prefix mode img verbosity tokens = function | 1048 | let rec html_of_sentence path file_prefix mode img verbosity tokens = function |
1049 | RawSentence s -> s | 1049 | RawSentence s -> s |
1050 | | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last | 1050 | | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last |
1051 | - (* | DepSentence paths -> html_of_dep_sentence img verbosity tokens paths *) | 1051 | + | DepSentence paths -> html_of_dep_sentence tokens paths |
1052 | | ENIAMSentence result -> | 1052 | | ENIAMSentence result -> |
1053 | let file_prefix = file_prefix_of_mode mode ^ file_prefix in | 1053 | let file_prefix = file_prefix_of_mode mode ^ file_prefix in |
1054 | html_of_eniam_sentence path file_prefix img verbosity tokens result | 1054 | html_of_eniam_sentence path file_prefix img verbosity tokens result |
@@ -1062,7 +1062,7 @@ let rec html_of_sentence path file_prefix mode img verbosity tokens = function | @@ -1062,7 +1062,7 @@ let rec html_of_sentence path file_prefix mode img verbosity tokens = function | ||
1062 | String.concat "\n" (Xlist.map l (fun (mode,sentence) -> | 1062 | String.concat "\n" (Xlist.map l (fun (mode,sentence) -> |
1063 | sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence path file_prefix mode img verbosity tokens sentence))) ^ | 1063 | sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence path file_prefix mode img verbosity tokens sentence))) ^ |
1064 | "</table>" | 1064 | "</table>" |
1065 | - | _ -> failwith "html_of_sentence: ni" | 1065 | + (* | _ -> failwith "html_of_sentence: ni" *) |
1066 | 1066 | ||
1067 | let rec html_of_paragraph path mode img verbosity tokens = function | 1067 | let rec html_of_paragraph path mode img verbosity tokens = function |
1068 | RawParagraph s -> (*print_endline "RawParagraph";*) s | 1068 | RawParagraph s -> (*print_endline "RawParagraph";*) s |
exec/makefile
@@ -19,6 +19,13 @@ install: all | @@ -19,6 +19,13 @@ install: all | ||
19 | cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR) | 19 | cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR) |
20 | cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR) | 20 | cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR) |
21 | 21 | ||
22 | +install-local: all | ||
23 | + mkdir -p $(INSTALLDIR) | ||
24 | + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) | ||
25 | + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) | ||
26 | + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) | ||
27 | + mkdir -p /usr/local/share/eniam/exec | ||
28 | + cp resources/* /usr/local/share/eniam/exec | ||
22 | 29 | ||
23 | eniam-exec.cma: $(SOURCES) | 30 | eniam-exec.cma: $(SOURCES) |
24 | ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ | 31 | ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ |
integration/ENIAMpreIntegration.ml
@@ -198,3 +198,9 @@ let rec parse_text mode tokens = function | @@ -198,3 +198,9 @@ let rec parse_text mode tokens = function | ||
198 | StructText(List.rev paragraphs) | 198 | StructText(List.rev paragraphs) |
199 | | AltText l -> AltText(Xlist.map l (fun (mode,text) -> | 199 | | AltText l -> AltText(Xlist.map l (fun (mode,text) -> |
200 | mode, parse_text mode tokens text)) | 200 | mode, parse_text mode tokens text)) |
201 | + | ||
202 | +let catch_parse_text mode tokens text = | ||
203 | + try | ||
204 | + parse_text mode tokens text,"" | ||
205 | + with e -> | ||
206 | + text, Printexc.to_string e |
lexSemantics/ENIAMwalParser.ml
@@ -73,14 +73,6 @@ let split_text schema = | @@ -73,14 +73,6 @@ let split_text schema = | ||
73 | | Str.Delim "'" -> Quot | 73 | | Str.Delim "'" -> Quot |
74 | | _ -> failwith "parse_text")) | 74 | | _ -> failwith "parse_text")) |
75 | 75 | ||
76 | -let rec split_symbol symb rev = function | ||
77 | - [] -> [List.rev rev](*failwith "split_symbol"*) | ||
78 | - | s :: l -> | ||
79 | - if s = symb then | ||
80 | - if l = [] then (*[List.rev rev]*)failwith "split_symbol" | ||
81 | - else (List.rev rev) :: (split_symbol symb [] l) | ||
82 | - else split_symbol symb (s :: rev) l | ||
83 | - | ||
84 | let rec string_of_token = function | 76 | let rec string_of_token = function |
85 | Text s -> s | 77 | Text s -> s |
86 | | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" | 78 | | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" |
@@ -101,6 +93,14 @@ let rec string_of_token = function | @@ -101,6 +93,14 @@ let rec string_of_token = function | ||
101 | let string_of_token_list l = | 93 | let string_of_token_list l = |
102 | String.concat "" (Xlist.map l string_of_token) | 94 | String.concat "" (Xlist.map l string_of_token) |
103 | 95 | ||
96 | +let rec split_symbol symb rev = function | ||
97 | + [] -> [List.rev rev](*failwith "split_symbol"*) | ||
98 | + | s :: l -> | ||
99 | + if s = symb then | ||
100 | + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb) | ||
101 | + else (List.rev rev) :: (split_symbol symb [] l) | ||
102 | + else split_symbol symb (s :: rev) l | ||
103 | + | ||
104 | let parse_case = function | 104 | let parse_case = function |
105 | [Text "nom"] -> Case "nom" | 105 | [Text "nom"] -> Case "nom" |
106 | | [Text "gen"] -> Case "gen" | 106 | | [Text "gen"] -> Case "gen" |
lexSemantics/interface.ml
@@ -23,6 +23,7 @@ let output = ref Text | @@ -23,6 +23,7 @@ let output = ref Text | ||
23 | let comm_stdio = ref true | 23 | let comm_stdio = ref true |
24 | (* let sentence_split = ref true *) | 24 | (* let sentence_split = ref true *) |
25 | let port = ref 5439 | 25 | let port = ref 5439 |
26 | +let perform_integration = ref false | ||
26 | 27 | ||
27 | let spec_list = [ | 28 | let spec_list = [ |
28 | (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; | 29 | (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; |
@@ -33,6 +34,13 @@ let spec_list = [ | @@ -33,6 +34,13 @@ let spec_list = [ | ||
33 | "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; | 34 | "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; |
34 | "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; | 35 | "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; |
35 | "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; | 36 | "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; |
37 | + "--dep_parser", Arg.Unit (fun () -> | ||
38 | + ENIAMpreIntegration.concraft_enabled := true; | ||
39 | + ENIAMpreIntegration.mate_parser_enabled := true; | ||
40 | + perform_integration := true), "Enable dependency parser"; | ||
41 | + "--no_dep_parser", Arg.Unit (fun () -> | ||
42 | + ENIAMpreIntegration.concraft_enabled := false; | ||
43 | + ENIAMpreIntegration.mate_parser_enabled := false), "Disable dependency parser (default)"; | ||
36 | (* "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; *) | 44 | (* "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; *) |
37 | (* "-r", Arg.String (fun p -> | 45 | (* "-r", Arg.String (fun p -> |
38 | ENIAMtokenizerTypes.set_resource_path p; | 46 | ENIAMtokenizerTypes.set_resource_path p; |
@@ -65,6 +73,9 @@ let rec main_loop in_chan out_chan = | @@ -65,6 +73,9 @@ let rec main_loop in_chan out_chan = | ||
65 | print_endline text; | 73 | print_endline text; |
66 | print_endline "input text end"; *) | 74 | print_endline "input text end"; *) |
67 | let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in | 75 | let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in |
76 | + let text,msg = | ||
77 | + if msg <> "" || not !perform_integration then text,msg else | ||
78 | + ENIAMpreIntegration.catch_parse_text ENIAMsubsyntaxTypes.Struct tokens text in | ||
68 | let lex_sems,msg = | 79 | let lex_sems,msg = |
69 | if msg <> "" then ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem, msg | 80 | if msg <> "" then ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem, msg |
70 | else ENIAMlexSemantics.catch_assign tokens text in | 81 | else ENIAMlexSemantics.catch_assign tokens text in |
@@ -84,6 +95,7 @@ let _ = | @@ -84,6 +95,7 @@ let _ = | ||
84 | prerr_endline message; | 95 | prerr_endline message; |
85 | Arg.parse spec_list anon_fun usage_msg; | 96 | Arg.parse spec_list anon_fun usage_msg; |
86 | ENIAMlexSemantics.initialize (); | 97 | ENIAMlexSemantics.initialize (); |
98 | + ENIAMpreIntegration.initialize (); | ||
87 | Gc.compact (); | 99 | Gc.compact (); |
88 | prerr_endline "Ready!"; | 100 | prerr_endline "Ready!"; |
89 | if !comm_stdio then main_loop stdin stdout | 101 | if !comm_stdio then main_loop stdin stdout |
lexSemantics/makefile
@@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt | ||
3 | OCAMLDEP=ocamldep | 3 | OCAMLDEP=ocamldep |
4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | OCAMLFLAGS=$(INCLUDES) -g | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa | 6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa |
7 | INSTALLDIR=`ocamlc -where`/eniam | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | ||
9 | SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \ | 9 | SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \ |
@@ -40,6 +40,9 @@ eniam-lexSemantics.cmxa: $(SOURCES) | @@ -40,6 +40,9 @@ eniam-lexSemantics.cmxa: $(SOURCES) | ||
40 | test: test.ml | 40 | test: test.ml |
41 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ | 41 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ |
42 | 42 | ||
43 | +inttest: inttest.ml | ||
44 | + $(OCAMLOPT) -o inttest $(OCAMLOPTFLAGS) $^ | ||
45 | + | ||
43 | interface: interface.ml | 46 | interface: interface.ml |
44 | $(OCAMLOPT) -o lexSemantics $(OCAMLOPTFLAGS) interface.ml | 47 | $(OCAMLOPT) -o lexSemantics $(OCAMLOPTFLAGS) interface.ml |
45 | 48 | ||
@@ -65,4 +68,4 @@ interface: interface.ml | @@ -65,4 +68,4 @@ interface: interface.ml | ||
65 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< | 68 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
66 | 69 | ||
67 | clean: | 70 | clean: |
68 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test | 71 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test inttest |
semantics/ENIAMsemLexicon.ml
@@ -47,7 +47,7 @@ let parse_multi p = function | @@ -47,7 +47,7 @@ let parse_multi p = function | ||
47 | let parse_morf p = function | 47 | let parse_morf p = function |
48 | [T "1"] -> {p with is_necessary=Opt} | 48 | [T "1"] -> {p with is_necessary=Opt} |
49 | | tokens -> | 49 | | tokens -> |
50 | - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function | 50 | + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function |
51 | [T s] -> Atom s | 51 | [T s] -> Atom s |
52 | | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in | 52 | | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in |
53 | {p with morfs=LCG (Tensor l) :: p.morfs} | 53 | {p with morfs=LCG (Tensor l) :: p.morfs} |
@@ -57,7 +57,7 @@ let parse_arg tokens p = | @@ -57,7 +57,7 @@ let parse_arg tokens p = | ||
57 | let tokens,p = parse_dir p tokens in | 57 | let tokens,p = parse_dir p tokens in |
58 | let tokens,p = parse_multi p tokens in | 58 | let tokens,p = parse_multi p tokens in |
59 | match Lexer.find_brackets ["(",")"] [] tokens with | 59 | match Lexer.find_brackets ["(",")"] [] tokens with |
60 | - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf | 60 | + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf |
61 | | tokens -> parse_morf p tokens | 61 | | tokens -> parse_morf p tokens |
62 | 62 | ||
63 | 63 | ||
@@ -75,7 +75,7 @@ let parse_entry = function | @@ -75,7 +75,7 @@ let parse_entry = function | ||
75 | [T symbol; T ":"; T "null"] -> symbol,[] | 75 | [T symbol; T ":"; T "null"] -> symbol,[] |
76 | | T symbol :: T ":" :: tokens -> | 76 | | T symbol :: T ":" :: tokens -> |
77 | (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) | 77 | (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) |
78 | - let tokens = Lexer.split_symbol (T ":") [] tokens in | 78 | + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in |
79 | let tokens = manage_tokens tokens in | 79 | let tokens = manage_tokens tokens in |
80 | let positions = Xlist.map tokens (fun (arg,role) -> | 80 | let positions = Xlist.map tokens (fun (arg,role) -> |
81 | parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in | 81 | parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in |
@@ -91,7 +91,7 @@ let load_lexicon filename = | @@ -91,7 +91,7 @@ let load_lexicon filename = | ||
91 | | T "\t" -> tokens | 91 | | T "\t" -> tokens |
92 | | T "\r" -> tokens | 92 | | T "\r" -> tokens |
93 | | t -> t :: tokens)) in | 93 | | t -> t :: tokens)) in |
94 | - let entries = Lexer.split_symbol (T ";") [] tokens in | 94 | + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in |
95 | Xlist.fold entries StringMap.empty (fun map entry -> | 95 | Xlist.fold entries StringMap.empty (fun map entry -> |
96 | let symbol,args = parse_entry entry in | 96 | let symbol,args = parse_entry entry in |
97 | StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) | 97 | StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) |
testy/skladnica-test1-Failure.conll
1 | -1 - - interp interp _ 3 punct _ _ | ||
2 | -2 Panowie pan subst subst pl|nom|m1 3 subj _ _ | ||
3 | -3 przyszli przyjść praet praet pl|m1|perf 0 pred _ _ | ||
4 | -4 . . interp interp _ 3 punct _ _ | ||
5 | - | ||
6 | 1 O o prep prep loc 12 comp _ _ | 1 | 1 O o prep prep loc 12 comp _ _ |
7 | 2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _ | 2 | 2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _ |
8 | 3 piekle piekło subst subst sg|loc|n 1 comp _ _ | 3 | 3 piekle piekło subst subst sg|loc|n 1 comp _ _ |
@@ -21,84 +16,118 @@ | @@ -21,84 +16,118 @@ | ||
21 | 16 br bieżący_rok brev brev pun 15 ne _ _ | 16 | 16 br bieżący_rok brev brev pun 15 ne _ _ |
22 | 17 . . interp interp _ 12 punct _ _ | 17 | 17 . . interp interp _ 12 punct _ _ |
23 | 18 | ||
24 | -1 Następnie następnie adv adv _ 2 adjunct _ _ | ||
25 | -2 rozłożyła rozłożyć praet praet sg|f|perf 10 conjunct _ _ | ||
26 | -3 wysoki wysoki adj adj sg|acc|m3|pos 4 adjunct _ _ | ||
27 | -4 statyw statyw subst subst sg|acc|m3 2 obj _ _ | ||
28 | -5 , , interp interp _ 10 coord_punct _ _ | ||
29 | -6 zawiesiła zawiesić praet praet sg|f|perf 10 conjunct _ _ | ||
30 | -7 na na prep prep loc 6 adjunct _ _ | ||
31 | -8 nim on ppron3 ppron3 sg|loc|m3|ter|akc|praep 7 comp _ _ | ||
32 | -9 pudełko pudełko subst subst sg|acc|n 6 obj _ _ | ||
33 | -10 , , interp interp _ 0 pred _ _ | ||
34 | -11 przeprowadziła przeprowadzić praet praet sg|f|perf 10 conjunct _ _ | ||
35 | -12 od od prep prep gen|nwok 11 adjunct _ _ | ||
36 | -13 niego on ppron3 ppron3 sg|gen|n|ter|akc|praep 12 comp _ _ | ||
37 | -14 przezroczysty przezroczysty adj adj sg|acc|m3|pos 15 adjunct _ _ | ||
38 | -15 przewód przewód subst subst sg|acc|m3 11 obj _ _ | ||
39 | -16 do do prep prep gen 11 adjunct _ _ | ||
40 | -17 igły igła subst subst sg|gen|f 16 comp _ _ | ||
41 | -18 , , interp interp _ 23 punct _ _ | ||
42 | -19 którą który adj adj sg|acc|f|pos 23 obj _ _ | ||
43 | -20 wcześniej wcześnie adv adv com 23 adjunct _ _ | ||
44 | -21 automatyczny automatyczny adj adj sg|nom|m3|pos 22 adjunct _ _ | ||
45 | -22 iniektor iniektor subst subst sg|nom|m3 23 subj _ _ | ||
46 | -23 umieścił umieścić praet praet sg|m3|perf 17 adjunct _ _ | ||
47 | -24 w w prep prep loc|nwok 23 comp _ _ | ||
48 | -25 żyle żyła subst subst sg|loc|f 24 comp _ _ | ||
49 | -26 na na prep prep loc 25 adjunct _ _ | ||
50 | -27 przedramieniu przedramię subst subst sg|loc|n 26 comp _ _ | ||
51 | -28 Irka Irek subst subst sg|gen|m1 27 adjunct _ _ | ||
52 | -29 . . interp interp _ 10 punct _ _ | 19 | +1 W w prep prep loc|nwok 9 adjunct _ _ |
20 | +2 stanie stan subst subst sg|loc|m3 1 comp _ _ | ||
21 | +3 obrzydzenia obrzydzenie subst subst sg|gen|n 2 adjunct _ _ | ||
22 | +4 przyprawiającego przyprawiać pact pact sg|gen|n|imperf|aff 3 adjunct _ _ | ||
23 | +5 o o prep prep acc 4 comp _ _ | ||
24 | +6 nowe nowy adj adj pl|acc|n|pos 7 adjunct _ _ | ||
25 | +7 mdłości mdłości subst subst pl|acc|n 5 comp _ _ | ||
26 | +8 nie nie qub qub _ 9 neg _ _ | ||
27 | +9 zauważył zauważyć praet praet sg|m1|perf 0 pred _ _ | ||
28 | +10 nawet nawet qub qub _ 9 adjunct _ _ | ||
29 | +11 , , interp interp _ 15 punct _ _ | ||
30 | +12 że że comp comp _ 15 complm _ _ | ||
31 | +13 wielki wielki adj adj sg|nom|m3|pos 14 adjunct _ _ | ||
32 | +14 ból ból subst subst sg|nom|m3 15 subj _ _ | ||
33 | +15 zaczyna zaczynać fin fin sg|ter|imperf 9 comp_fin _ _ | ||
34 | +16 z z prep prep acc|nwok 18 adjunct _ _ | ||
35 | +17 wolna wolny adj adjp _ 16 mwe _ _ | ||
36 | +18 zanikać zanikać inf inf imperf 15 comp_inf _ _ | ||
37 | +19 . . interp interp _ 9 punct _ _ | ||
38 | + | ||
39 | +1 - - interp interp _ 7 punct _ _ | ||
40 | +2 W w prep prep loc|nwok 4 comp _ _ | ||
41 | +3 szkole szkoła subst subst sg|loc|f 2 comp _ _ | ||
42 | +4 jest być fin fin sg|ter|imperf 7 conjunct _ _ | ||
43 | +5 mniej mało num num pl|nom 4 subj _ _ | ||
44 | +6 uczniów uczeń subst subst pl|gen|m1 5 comp _ _ | ||
45 | +7 , , interp interp _ 0 coord_punct _ _ | ||
46 | +8 dlatego dlatego adv adv _ 9 adjunct _ _ | ||
47 | +9 musiał musieć praet praet sg|m1|imperf 7 conjunct _ _ | ||
48 | +10 em być aglt aglt sg|pri|imperf|wok 9 aglt _ _ | ||
49 | +11 tym ten adj adj pl|dat|f|pos 12 adjunct _ _ | ||
50 | +12 paniom pani subst subst pl|dat|f 13 obj_th _ _ | ||
51 | +13 podziękować podziękować inf inf perf 9 comp_inf _ _ | ||
52 | +14 . . interp interp _ 7 punct _ _ | ||
53 | + | ||
54 | +1 Od od prep prep gen|nwok 9 adjunct _ _ | ||
55 | +2 końca koniec subst subst sg|gen|m3 1 comp _ _ | ||
56 | +3 XVIII XVIII adj adj sg|gen|m3|pos 4 ne _ _ | ||
57 | +4 w wiek brev brev pun 2 comp _ _ | ||
58 | +5 . . interp interp _ 4 abbrev_punct _ _ | ||
59 | +6 informacje informacja subst subst pl|nom|f 9 subj _ _ | ||
60 | +7 o o prep prep loc 6 adjunct _ _ | ||
61 | +8 głodach głód subst subst pl|loc|m3 7 comp _ _ | ||
62 | +9 stają stawać fin fin pl|ter|imperf 0 pred _ _ | ||
63 | +10 się się qub qub _ 9 refl _ _ | ||
64 | +11 coraz coraz adv adv _ 12 adjunct _ _ | ||
65 | +12 rzadsze rzadki adj adj pl|nom|f|com 9 pd _ _ | ||
66 | +13 . . interp interp _ 9 punct _ _ | ||
67 | + | ||
68 | +1 Zabrał zabrać praet praet sg|m1|perf 0 pred _ _ | ||
69 | +2 ponad ponad qub qub _ 3 adjunct _ _ | ||
70 | +3 30 30 num num pl|acc|m3|rec 1 obj _ _ | ||
71 | +4 tys tysiąc brev brev pun 3 mwe _ _ | ||
72 | +5 . . interp interp _ 4 abbrev_punct _ _ | ||
73 | +6 zł złoty brev brev npun 3 comp _ _ | ||
74 | +7 . . interp interp _ 1 punct _ _ | ||
75 | + | ||
76 | +1 ( ( interp interp _ 8 punct _ _ | ||
77 | +2 Kiedyś kiedyś adv adv _ 4 adjunct _ _ | ||
78 | +3 też też qub qub _ 4 adjunct _ _ | ||
79 | +4 miała mieć praet praet sg|f|imperf 8 conjunct _ _ | ||
80 | +5 m być aglt aglt sg|pri|imperf|nwok 4 aglt _ _ | ||
81 | +6 takie taki adj adj pl|acc|f|pos 7 adjunct _ _ | ||
82 | +7 ambicje ambicja subst subst pl|acc|f 4 obj_th _ _ | ||
83 | +8 , , interp interp _ 0 pred _ _ | ||
84 | +9 zrezygnowała zrezygnować praet praet sg|f|perf 8 conjunct _ _ | ||
85 | +10 m być aglt aglt sg|pri|imperf|nwok 9 aglt _ _ | ||
86 | +11 . . interp interp _ 8 punct _ _ | ||
87 | +12 ) ) interp interp _ 8 punct _ _ | ||
53 | 88 | ||
54 | -1 - - interp interp _ 4 punct _ _ | ||
55 | -2 Co co subst subst sg|nom|n 4 pd _ _ | ||
56 | -3 to to subst subst sg|nom|n 4 subj _ _ | ||
57 | -4 jest być fin fin sg|ter|imperf 0 pred _ _ | ||
58 | -5 ? ? interp interp _ 4 punct _ _ | 89 | +1 Zawsze zawsze adv adv _ 2 adjunct _ _ |
90 | +2 mówię mówić fin fin sg|pri|imperf 0 pred _ _ | ||
91 | +3 , , interp interp _ 5 punct _ _ | ||
92 | +4 że że comp comp _ 5 complm _ _ | ||
93 | +5 mogę móc fin fin sg|pri|imperf 2 comp_fin _ _ | ||
94 | +6 pracować pracować inf inf imperf 5 comp_inf _ _ | ||
95 | +7 , , interp interp _ 5 punct _ _ | ||
96 | +8 bo bo comp comp _ 5 adjunct _ _ | ||
97 | +9 mam mieć fin fin sg|pri|imperf 13 conjunct _ _ | ||
98 | +10 dobre dobry adj adj sg|acc|n|pos 11 adjunct _ _ | ||
99 | +11 zdrowie zdrowie subst subst sg|acc|n 9 obj_th _ _ | ||
100 | +12 , , interp interp _ 13 punct _ _ | ||
101 | +13 a a conj conj _ 8 comp_fin _ _ | ||
102 | +14 to to subst subst sg|nom|n 15 subj _ _ | ||
103 | +15 jest być fin fin sg|ter|imperf 13 conjunct _ _ | ||
104 | +16 darmo darmo adv adv _ 17 adjunct _ _ | ||
105 | +17 dane dany adj adj sg|nom|n|perf|aff 15 pd _ _ | ||
106 | +18 . . interp interp _ 2 punct _ _ | ||
59 | 107 | ||
60 | -1 Prosi prosić fin fin sg|ter|imperf 0 pred _ _ | ||
61 | -2 się się qub qub _ 1 refl _ _ | ||
62 | -3 też też qub qub _ 1 adjunct _ _ | ||
63 | -4 zakłady zakład subst subst pl|acc|m3 1 obj _ _ | ||
64 | -5 pracy praca subst subst sg|gen|f 4 adjunct _ _ | ||
65 | -6 , , interp interp _ 8 punct _ _ | ||
66 | -7 które który adj adj pl|nom|m3|pos 8 subj _ _ | ||
67 | -8 dysponują dysponować fin fin pl|ter|imperf 4 adjunct _ _ | ||
68 | -9 autobusami autobus subst subst pl|inst|m3 8 comp _ _ | ||
69 | -10 , , interp interp _ 12 punct _ _ | ||
70 | -11 by by comp comp _ 12 complm _ _ | ||
71 | -12 wspomogły wspomóc praet praet pl|m3|perf 1 comp_fin _ _ | ||
72 | -13 komunikację komunikacja subst subst sg|acc|f 12 obj _ _ | ||
73 | -14 zastępczą zastępczy adj adj sg|acc|f|pos 13 adjunct _ _ | ||
74 | -15 . . interp interp _ 1 punct _ _ | 108 | +1 " " interp interp _ 2 punct _ _ |
109 | +2 Zrobimy zrobić fin fin pl|pri|perf 0 pred _ _ | ||
110 | +3 " " interp interp _ 2 punct _ _ | ||
111 | +4 ! ! interp interp _ 2 punct _ _ | ||
75 | 112 | ||
76 | 1 - - interp interp _ 3 punct _ _ | 113 | 1 - - interp interp _ 3 punct _ _ |
77 | -2 Nie nie qub qub _ 3 neg _ _ | ||
78 | -3 chcą chcieć fin fin pl|ter|imperf 0 pred _ _ | ||
79 | -4 , , interp interp _ 8 punct _ _ | ||
80 | -5 by by comp comp _ 8 complm _ _ | ||
81 | -6 m być aglt aglt sg|pri|imperf|nwok 8 aglt _ _ | ||
82 | -7 ich on ppron3 ppron3 pl|acc|m1|ter|akc|npraep 8 obj _ _ | ||
83 | -8 utrzymywał utrzymywać praet praet sg|m1|imperf 3 comp_fin _ _ | ||
84 | -9 . . interp interp _ 3 punct _ _ | 114 | +2 No no qub qub _ 3 adjunct _ _ |
115 | +3 wie wiedzieć fin fin sg|ter|imperf 0 pred _ _ | ||
116 | +4 pan pan subst subst sg|nom|m1 3 subj _ _ | ||
117 | +5 ! ! interp interp _ 3 punct _ _ | ||
118 | +6 . . interp interp _ 5 punct _ _ | ||
119 | +7 . . interp interp _ 6 punct _ _ | ||
120 | +8 . . interp interp _ 7 punct _ _ | ||
85 | 121 | ||
86 | -1 Wzięli wziąć praet praet pl|m1|perf 0 pred _ _ | ||
87 | -2 w w prep prep loc|nwok 4 adjunct _ _ | ||
88 | -3 niej on ppron3 ppron3 sg|loc|f|ter|akc|praep 2 comp _ _ | ||
89 | -4 udział udział subst subst sg|acc|m3 1 obj _ _ | ||
90 | -5 przedstawiciele przedstawiciel subst subst pl|nom|m1 1 subj _ _ | ||
91 | -6 policji policja subst subst sg|gen|f 5 adjunct _ _ | ||
92 | -7 z z prep prep gen|nwok 5 adjunct _ _ | ||
93 | -8 Niemiec Niemcy subst subst pl|gen|n 17 conjunct _ _ | ||
94 | -9 , , interp interp _ 17 coord_punct _ _ | ||
95 | -10 Czech Czechy subst subst pl|gen|n 17 conjunct _ _ | ||
96 | -11 , , interp interp _ 17 coord_punct _ _ | ||
97 | -12 Słowacji Słowacja subst subst sg|gen|f 17 conjunct _ _ | ||
98 | -13 , , interp interp _ 17 coord_punct _ _ | ||
99 | -14 Węgier Węgry subst subst pl|gen|n 17 conjunct _ _ | ||
100 | -15 , , interp interp _ 17 coord_punct _ _ | ||
101 | -16 Ukrainy Ukraina subst subst sg|gen|f 17 conjunct _ _ | ||
102 | -17 i i conj conj _ 7 comp _ _ | ||
103 | -18 Polski Polska subst subst sg|gen|f 17 conjunct _ _ | ||
104 | -19 . . interp interp _ 1 punct _ _ | 122 | +1 ( ( interp interp _ 6 punct _ _ |
123 | +2 Myszkinku Myszkinek subst subst sg|voc|m3 6 adjunct _ _ | ||
124 | +3 , , interp interp _ 2 punct _ _ | ||
125 | +4 jakie jaki adj adj sg|acc|n|pos 7 adjunct _ _ | ||
126 | +5 ty ty ppron12 ppron12 sg|nom|m2|sec 6 subj _ _ | ||
127 | +6 masz mieć fin fin sg|sec|imperf 0 pred _ _ | ||
128 | +7 futerko futerko subst subst sg|acc|n 6 obj_th _ _ | ||
129 | +8 , , interp interp _ 7 punct _ _ | ||
130 | +9 lazurowe lazurowy adj adj sg|acc|n|pos 7 adjunct _ _ | ||
131 | +10 po po prep prep acc 9 adjunct _ _ | ||
132 | +11 prostu prosty adjp adjp _ 10 mwe _ _ | ||
133 | +12 ! ! interp interp _ 6 punct _ _ |
testy/skladnica-test1-Not_parsed.conll
0 → 100644
1 | +1 Cmentarz cmentarz subst subst sg|nom|m3 2 subj _ _ | ||
2 | +2 jest być fin fin sg|ter|imperf 0 pred _ _ | ||
3 | +3 taki taki adj adj sg|nom|m3|pos 4 adjunct _ _ | ||
4 | +4 pusty pusty adj adj sg|nom|m3|pos 2 pd _ _ | ||
5 | +5 ! ! interp interp _ 2 punct _ _ | ||
6 | + | ||
7 | +1 Mówi mówić fin fin sg|ter|imperf 0 pred _ _ | ||
8 | +2 się się qub qub _ 1 refl _ _ | ||
9 | +3 przecież przecież qub qub _ 1 adjunct _ _ | ||
10 | +4 , , interp interp _ 7 punct _ _ | ||
11 | +5 że że comp comp _ 7 complm _ _ | ||
12 | +6 broń broń subst subst sg|nom|f 7 subj _ _ | ||
13 | +7 była być praet praet sg|f|imperf 1 comp_fin _ _ | ||
14 | +8 w w prep prep loc|nwok 7 adjunct _ _ | ||
15 | +9 szkole szkoła subst subst sg|loc|f 8 comp _ _ | ||
16 | +10 schowana schować ppas ppas sg|nom|f|perf|aff 7 pd _ _ | ||
17 | +11 jeszcze jeszcze qub qub _ 12 adjunct _ _ | ||
18 | +12 latem lato subst subst sg|inst|n 7 adjunct _ _ | ||
19 | +13 w w prep prep loc|nwok 12 adjunct _ _ | ||
20 | +14 czasie czas subst subst sg|loc|m3 13 mwe _ _ | ||
21 | +15 remontu remont subst subst sg|gen|m3 14 comp _ _ | ||
22 | +16 . . interp interp _ 1 punct _ _ | ||
23 | + | ||
24 | +1 Bo bo comp comp _ 9 adjunct _ _ | ||
25 | +2 jak jak adv adv _ 9 adjunct _ _ | ||
26 | +3 ona on ppron3 ppron3 sg|nom|f|ter|akc|npraep 9 subj _ _ | ||
27 | +4 , , interp interp _ 3 punct _ _ | ||
28 | +5 chora chory adj adj sg|nom|f|pos 3 adjunct _ _ | ||
29 | +6 na na prep prep acc 5 adjunct _ _ | ||
30 | +7 cukrzycę cukrzyca subst subst sg|acc|f 6 comp _ _ | ||
31 | +8 , , interp interp _ 3 punct _ _ | ||
32 | +9 przeżyła przeżyć praet praet sg|f|perf 0 pred _ _ | ||
33 | +10 trzy trzy num num pl|acc|m3|congr 9 obj _ _ | ||
34 | +11 dni dzień subst subst pl|acc|m3 10 comp _ _ | ||
35 | +12 bez bez prep prep gen|nwok 9 comp _ _ | ||
36 | +13 wody woda subst subst sg|gen|f 14 conjunct _ _ | ||
37 | +14 i i conj conj _ 12 comp _ _ | ||
38 | +15 jedzenia jedzenie subst subst sg|gen|n 14 conjunct _ _ | ||
39 | +16 ? ? interp interp _ 9 punct _ _ | ||
40 | + | ||
41 | +1 Jednak jednak qub qub _ 9 adjunct _ _ | ||
42 | +2 już już qub qub _ 3 adjunct _ _ | ||
43 | +3 wkrótce wkrótce adv adv _ 9 adjunct _ _ | ||
44 | +4 Nizioł Nizioł subst subst sg|nom|m1 5 conjunct _ _ | ||
45 | +5 i i conj conj _ 9 subj _ _ | ||
46 | +6 Wapiński Wapiński subst subst sg|nom|m1 5 conjunct _ _ | ||
47 | +7 ze z prep prep inst|wok 9 adjunct _ _ | ||
48 | +8 zdumieniem zdumienie subst subst sg|inst|n 7 comp _ _ | ||
49 | +9 odkryli odkryć praet praet pl|m1|perf 0 pred _ _ | ||
50 | +10 , , interp interp _ 14 punct _ _ | ||
51 | +11 że że comp comp _ 14 complm _ _ | ||
52 | +12 Łapiński Łapiński subst subst sg|nom|m1 14 subj _ _ | ||
53 | +13 nie nie qub qub _ 14 neg _ _ | ||
54 | +14 dotrzymuje dotrzymywać fin fin sg|ter|imperf 9 comp_fin _ _ | ||
55 | +15 wcześniej wcześnie adv adv com 16 adjunct _ _ | ||
56 | +16 danego dać ppas ppas sg|gen|n|perf|aff 17 adjunct _ _ | ||
57 | +17 słowa słowo subst subst sg|gen|n 14 obj _ _ | ||
58 | +18 . . interp interp _ 9 punct _ _ | ||
59 | + | ||
60 | +1 A a qub qub _ 8 adjunct _ _ | ||
61 | +2 pan pan subst subst sg|nom|m1 8 subj _ _ | ||
62 | +3 nigdy nigdy adv adv _ 8 adjunct _ _ | ||
63 | +4 się się qub qub _ 8 refl _ _ | ||
64 | +5 z z prep prep inst|nwok 8 comp _ _ | ||
65 | +6 nimi on ppron3 ppron3 pl|inst|m1|ter|akc|praep 5 comp _ _ | ||
66 | +7 nie nie qub qub _ 8 neg _ _ | ||
67 | +8 zetknął zetknąć praet praet sg|m1|perf 0 pred _ _ | ||
68 | +9 ? ? interp interp _ 8 punct _ _ | ||
69 | + | ||
70 | +1 Załapać załapać inf inf perf 3 comp_inf _ _ | ||
71 | +2 się się qub qub _ 1 refl _ _ | ||
72 | +3 trzeba trzeba pred pred _ 0 pred _ _ | ||
73 | +4 teraz teraz adv adv _ 3 adjunct _ _ | ||
74 | +5 , , interp interp _ 3 punct _ _ | ||
75 | +6 bo bo comp comp _ 3 adjunct _ _ | ||
76 | +7 potem potem adv adv _ 8 adjunct _ _ | ||
77 | +8 będzie być bedzie bedzie sg|ter|imperf 6 comp_fin _ _ | ||
78 | +9 trudniej trudno adv adv com 8 pd _ _ | ||
79 | +10 . . interp interp _ 3 punct _ _ | ||
80 | + | ||
81 | +1 Medykamenty medykament subst subst pl|nom|m3 4 subj _ _ | ||
82 | +2 współczesne współczesny adj adj pl|nom|m3|pos 1 adjunct _ _ | ||
83 | +3 dostępne dostępny adj adj pl|nom|m3|pos 4 pd _ _ | ||
84 | +4 są być fin fin pl|ter|imperf 0 pred _ _ | ||
85 | +5 na na prep prep loc 4 adjunct _ _ | ||
86 | +6 czarnym czarny adj adj sg|loc|m3|pos 7 adjunct _ _ | ||
87 | +7 rynku rynek subst subst sg|loc|m3 5 comp _ _ | ||
88 | +8 . . interp interp _ 4 punct _ _ | ||
89 | + | ||
90 | +1 To to subst subst sg|nom|n 3 subj _ _ | ||
91 | +2 samo sam adj adj sg|nom|n|pos 1 adjunct _ _ | ||
92 | +3 dotyczy dotyczyć fin fin sg|ter|imperf 5 conjunct _ _ | ||
93 | +4 leczenia leczenie subst subst sg|gen|n 3 obj_th _ _ | ||
94 | +5 , , interp interp _ 0 coord_punct _ _ | ||
95 | +6 służba służba subst subst sg|nom|f 9 subj _ _ | ||
96 | +7 zdrowia zdrowie subst subst sg|gen|n 6 adjunct _ _ | ||
97 | +8 praktycznie praktycznie adv adv pos 9 adjunct _ _ | ||
98 | +9 przestała przestać praet praet sg|f|perf 5 conjunct _ _ | ||
99 | +10 istnieć istnieć inf inf imperf 9 comp_inf _ _ | ||
100 | +11 . . interp interp _ 5 punct _ _ | ||
101 | + | ||
102 | +1 Zwykły zwykły adj adj sg|nom|m1|pos 2 adjunct _ _ | ||
103 | +2 mieszkaniec mieszkaniec subst subst sg|nom|m1 4 subj _ _ | ||
104 | +3 kraju kraj subst subst sg|gen|m3 2 adjunct _ _ | ||
105 | +4 ma mieć fin fin sg|ter|imperf 0 pred _ _ | ||
106 | +5 się się qub qub _ 6 refl _ _ | ||
107 | +6 leczyć leczyć inf inf imperf 4 comp_inf _ _ | ||
108 | +7 ziołami ziele subst subst pl|inst|n 6 obj_th _ _ | ||
109 | +8 , , interp interp _ 10 punct _ _ | ||
110 | +9 które który adj adj pl|acc|n|pos 10 obj _ _ | ||
111 | +10 zaleca zalecać fin fin sg|ter|imperf 7 adjunct _ _ | ||
112 | +11 tradycyjna tradycyjny adj adj sg|nom|f|pos 12 adjunct _ _ | ||
113 | +12 medycyna medycyna subst subst sg|nom|f 10 subj _ _ | ||
114 | +13 koreańska koreański adj adj sg|nom|f|pos 12 adjunct _ _ | ||
115 | +14 . . interp interp _ 4 punct _ _ |
tokenizer/ENIAMtokens.ml
@@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function | @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function | ||
814 | | (Sign "?") :: (Sign "?") :: l -> | 814 | | (Sign "?") :: (Sign "?") :: l -> |
815 | create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true | 815 | create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true |
816 | (* | (Sign "?") :: (Sign ".") :: l -> *) | 816 | (* | (Sign "?") :: (Sign ".") :: l -> *) |
817 | + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l -> | ||
818 | + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true | ||
817 | | (Sign "!") :: (Sign "?") :: l -> | 819 | | (Sign "!") :: (Sign "?") :: l -> |
818 | create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true | 820 | create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true |
819 | | (Sign "?") :: (Sign "…") :: l -> | 821 | | (Sign "?") :: (Sign "…") :: l -> |