Commit 0bce3b2e8d3baa9d0907f6421e81ca3525540b0e

Authored by Wojciech Jaworski
2 parents 95e859a0 f18f9cc0

Integration of the dependency parser with ENIAMlexSemantics

.gitignore
... ... @@ -7,3 +7,4 @@
7 7 *.aux
8 8 *.log
9 9 *.tex.backup
  10 +tools/mate-tools/dist/*
... ...
corpora/CONLL.ml
... ... @@ -220,13 +220,14 @@ let match_corpus corpus =
220 220  
221 221 (******************)
222 222  
  223 +exception Comment_line
223 224 exception Empty_line
224 225 exception Empty_sentence
225 226 exception Id_line of string
226 227  
227 228 let load_token in_channel =
228 229 let fail line =
229   - (* failwith ("load_token: " ^ line) *)
  230 + print_endline ("load_token: " ^ line);
230 231 () in
231 232 let int_of_super = function
232 233 "_" -> -1
... ... @@ -247,7 +248,8 @@ let load_token in_channel =
247 248 else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line
248 249 then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in
249 250 raise (Id_line id)
250   - else failwith ("load_token: " ^ line)
  251 + else raise Comment_line
  252 + (* failwith ("load_token: " ^ line) *)
251 253 else
252 254 match Xstring.split "\t" line with
253 255 [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
... ... @@ -272,6 +274,7 @@ let load_sentence in_channel =
272 274 if id_a <> conll_id then failwith "load_sentence: different ids" else
273 275 pom ((id_a,super,label) :: rev_paths) id
274 276 with Id_line new_id -> pom rev_paths new_id
  277 + | Comment_line -> pom rev_paths id
275 278 | Empty_line -> rev_paths, id
276 279 | End_of_file -> if rev_paths = []
277 280 then raise End_of_file
... ...
corpora/CONLL_adapter.ml
... ... @@ -42,6 +42,34 @@ let if_interps interps token =
42 42 ) interp in
43 43 Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value))
44 44  
  45 +let change_dep paths i (id,super,label) =
  46 + let id_S, super_S, label_S = paths.(super) in
  47 + paths.(i) <- (id,super_S,label);
  48 + paths.(super) <- (id_S, id, label_S)
  49 +
  50 +let correct_injection paths tokens = Array.iteri (fun i (id,super,label) ->
  51 + if label = "punct" then (* must be the first token with this parent *)
  52 + let j = Int.fold (i+1) (Array.length paths - 1) 0 (fun acc n ->
  53 + let i2,s2,l2 = paths.(n) in
  54 + if super = s2
  55 + then if l2 = "punct"
  56 + then n
  57 + else 0
  58 + else acc
  59 + ) in
  60 + let k = Int.fold_down (i-1) 1 i (fun acc n ->
  61 + let i2,s2,l2 = paths.(n) in
  62 + if super = s2
  63 + then 0
  64 + else acc
  65 + ) in
  66 + if k == i && j <> 0 && i < super && super < j
  67 + then
  68 + (paths.(i) <- (0,-1,"");
  69 + paths.(j) <- (0,-1,""))
  70 + ) paths;
  71 + paths
  72 +
45 73 let correct_coordination1 paths tokens =
46 74 let paths_ls = List.mapi (fun i (id,super,label) ->
47 75 (i,id,super,label)) (Array.to_list paths) in
... ... @@ -136,15 +164,15 @@ let correct_coordination2 paths tokens =
136 164 let paths_ls () = List.mapi (fun i (id,super,label) ->
137 165 (i,id,super,label)) (Array.to_list paths_c) in
138 166  
139   - (* let ps a sons =
  167 + let ps a sons =
140 168 print_endline a;
141 169 List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons;
142   - print_endline "" in *)
  170 + print_endline "" in
143 171  
144 172 let rec correct_rec (i,id,super,label) sons =
145 173 let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in
146   - (* ps "left:" (List.rev left_s);
147   - ps "right:" right_s; *)
  174 + ps "left:" (List.rev left_s);
  175 + ps "right:" right_s;
148 176 find_father i (List.rev left_s);
149 177 find_father i right_s
150 178  
... ... @@ -154,23 +182,35 @@ let correct_coordination2 paths tokens =
154 182 paths_c.(i) <- (id,i0,label);
155 183 if not (if_cat ["conj"] (ExtArray.get tokens i).token ||
156 184 (ExtArray.get tokens i).orth = ",")
157   - then failwith "find_father";
  185 + then failwith "find_father1";
158 186 correct_rec (i,id,super,label) (if a < i
159 187 then (a,b,c,d) :: t
160 188 else List.rev @@ (a,b,c,d) :: t)
161   - | _ -> failwith "find_father" in
  189 + | [] -> failwith "find_father2" in
162 190  
163 191 let check_previous_for_interp i =
164 192 if i >= 0 && (ExtArray.get tokens i).orth = "," &&
165 193 not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c))
166 194 then paths_c.(i) <- (0,-1,"") in
167 195  
  196 + let filter_comp_construction sons =
  197 + let rec pom acc = function
  198 + (i1,id1,super1,label1) :: (i2,id2,super2,label2) :: t ->
  199 + if if_cat ["interp"] (ExtArray.get tokens i1).token &&
  200 + if_cat ["comp"] (ExtArray.get tokens i2).token
  201 + then pom acc t
  202 + else pom ((i1,id1,super1,label1) :: acc) ((i2,id2,super2,label2) :: t)
  203 + | h :: t -> pom (h :: acc) t
  204 + | [] -> List.rev acc in
  205 + pom [] sons in
  206 +
168 207 Array.iteri (fun i (id,super,label) ->
169 208 if if_cat ["conj"] (ExtArray.get tokens i).token ||
170 209 (ExtArray.get tokens i).orth = ","
171 210 then
172 211 (check_previous_for_interp (i-1);
173 212 let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in
  213 + (* let sons = filter_comp_construction sons in *)
174 214 if (List.length sons > 2)
175 215 then correct_rec (i,id,super,label) sons)) paths_c;
176 216 paths_c
... ... @@ -206,15 +246,16 @@ done; *)
206 246  
207 247 let brev i id super label =
208 248 let if_the_last_dot () =
209   - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) ->
210   - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in
211   - Array.fold_left (fun acc (i2,s,l) ->
212   - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in
  249 + try
  250 + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) ->
  251 + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in
  252 + Array.fold_left (fun acc (i2,s,l) ->
  253 + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths
  254 + with Not_found -> true in
213 255  
214 256 let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot ()
215 257 then ""
216 258 else "." in
217   -
218 259 let n_orth = (ExtArray.get tokens id).orth ^ dot in
219 260 paths.(i) <- (find_token n_orth,super,label) in
220 261  
... ... @@ -317,6 +358,16 @@ let correct_interp_with_father_0 paths tokens =
317 358 then paths.(i1) <- (id1,0,label1)) paths) paths;
318 359 paths
319 360  
  361 +let corect_complm paths tokens =
  362 + Array.iteri (fun i (id,super,label) ->
  363 + if label = "complm" && super > 0
  364 + then
  365 + let i2,s2,l2 = paths.(super) in
  366 + if if_cat ["conj"] (ExtArray.get tokens i2).token
  367 + then change_dep paths i (id,super,label)
  368 + ) paths;
  369 + paths
  370 +
320 371 let remove_interps interp paths tokens =
321 372 let paths_ls = Array.to_list paths in
322 373 Array.iteri (fun i (id,super,label) ->
... ... @@ -339,10 +390,6 @@ let correct_passive_voice paths tokens =
339 390 paths
340 391  
341 392 let swap_dep paths tokens =
342   - let change_dep i (id,super,label) =
343   - let id_S, super_S, label_S = paths.(super) in
344   - paths.(i) <- (id,super_S,label);
345   - paths.(super) <- (id_S, id, label_S) in
346 393 let rec correct_dep i (id,super,label) =
347 394 let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który";
348 395 "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in
... ... @@ -356,7 +403,7 @@ let swap_dep paths tokens =
356 403 (if_lemma adv_relators (ExtArray.get tokens id).token &&
357 404 if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token)
358 405 then
359   - change_dep i (id,super,label);
  406 + change_dep paths i (id,super,label);
360 407 if (if_lemma adv_relators (ExtArray.get tokens id).token &&
361 408 if_cat ["subst"; "pred"] (ExtArray.get tokens super).token)
362 409 then correct_dep i paths.(i) in
... ... @@ -367,7 +414,11 @@ let swap_dep paths tokens =
367 414 coordination of the passive voice is not handled yet - both the auxiliary verbs and the participles
368 415 coordination of the dependents of subordinating conjunctions is not handled yet *)
369 416  
370   -let convert_dep_tree id first_try paths tokens =
  417 +let convert_dep_tree path first_try paths tokens =
  418 + File.file_out (path ^ "/pre_text_unmodified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
  419 + Printf.fprintf file "%s\n" ENIAMvisualization.html_header;
  420 + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths);
  421 + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer);
371 422 let paths = Array.copy paths in
372 423 let paths =
373 424 if first_try
... ... @@ -375,16 +426,27 @@ let convert_dep_tree id first_try paths tokens =
375 426 let pom = replace_tokens paths tokens in
376 427 let pom = (remove_interps ".") pom tokens in
377 428 let pom = replace_hyphens pom tokens in
  429 + let pom = correct_injection pom tokens in
378 430 let pom = correct_coordination1 pom tokens in
379 431 let pom = correct_interp_with_father_0 pom tokens in
380   - let pom = correct_coordination2 pom tokens in
381   - let pom = remove_interps "," pom tokens in
  432 + (* File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
  433 + Printf.fprintf file "%s\n" ENIAMvisualization.html_header;
  434 + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths);
  435 + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); *)
  436 + let pom = try corect_complm pom tokens with | e -> print_endline (Printexc.to_string e); pom in
  437 + let pom = try
  438 + let pom2 = correct_coordination2 pom tokens in
  439 + remove_interps "," pom2 tokens
  440 + with
  441 + | _ -> (let pom2 = remove_interps "," pom tokens in
  442 + correct_coordination2 pom2 tokens) in
382 443 let pom = correct_passive_voice pom tokens in
383 444 praet_qub_aglt pom tokens
384 445 else
385   - swap_dep paths tokens in
386   - (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
387   - Printf.fprintf file "%s\n" Visualization.html_header;
388   - Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths);
389   - Printf.fprintf file "%s\n" Visualization.html_trailer); *)
  446 + paths in
  447 + (* swap_dep paths tokens in *)
  448 + File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file ->
  449 + Printf.fprintf file "%s\n" ENIAMvisualization.html_header;
  450 + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths);
  451 + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer);
390 452 paths
... ...
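A minimal, self-contained sketch (not part of the commit) of what the hoisted change_dep helper in CONLL_adapter.ml does: token i is re-attached to its former grandparent, and its former head is re-attached under i. The toy paths array and the driver below are hypothetical.

let change_dep paths i (id, super, label) =
  let id_s, super_s, label_s = paths.(super) in
  paths.(i) <- (id, super_s, label);      (* i takes over its head's parent *)
  paths.(super) <- (id_s, id, label_s)    (* the old head now hangs off i *)

let () =
  (* toy tree: 2 -> 1 -> 0 (root); after change_dep on index 2: 1 -> 2 -> 0 *)
  let paths = [| (0, -1, ""); (1, 0, "pred"); (2, 1, "complm") |] in
  change_dep paths 2 paths.(2);
  Array.iter (fun (id, super, label) ->
    Printf.printf "id=%d super=%d label=%s\n" id super label) paths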
corpora/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa eniam-exec.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml
... ...
corpora/test_conll.ml
... ... @@ -48,7 +48,7 @@ let clarify_categories senses token =
48 48 | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[])
49 49 | _ -> []
50 50  
51   -let create_chart tokens lex_sems paths last =
  51 +(* let create_chart tokens lex_sems paths last =
52 52 ENIAM_LCGrenderer.reset_variable_numbers ();
53 53 let chart = ENIAM_LCGchart.make last in
54 54 let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
... ... @@ -59,7 +59,7 @@ let create_chart tokens lex_sems paths last =
59 59 let cats = clarify_categories ["X"] t in
60 60 let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
61 61 ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
62   - chart
  62 + chart *)
63 63  
64 64 let rec split_sons left id right = function
65 65 [] -> List.rev (List.sort compare left), List.sort compare right
... ... @@ -85,7 +85,7 @@ let create_dep_chart tokens lex_sems paths =
85 85 ENIAM_LCGrenderer.reset_variable_names ();
86 86 ENIAM_LCGrenderer.add_variable_numbers ();
87 87 let cats = clarify_categories ["X"] t in
88   - let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
  88 + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in
89 89 IntMap.add nodes i l) in
90 90 (* print_endline "create_dep_chart 3"; *)
91 91 let x = dep_create_rec nodes sons 0 in
... ... @@ -93,7 +93,7 @@ let create_dep_chart tokens lex_sems paths =
93 93 x
94 94  
95 95  
96   -let test_example path id tokens lex_sems paths last =
  96 +(* let test_example path id tokens lex_sems paths last =
97 97 ENIAM_LCGreductions.reset_variant_label ();
98 98 let chart = create_chart tokens lex_sems paths last in
99 99 ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart;
... ... @@ -119,43 +119,45 @@ let test_example path id tokens lex_sems paths last =
119 119 ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree;
120 120 ())
121 121 else print_endline "not reduced")
122   - else print_endline "not parsed"
  122 + else print_endline "not parsed" *)
123 123  
124   -let test_dep_example path id tokens lex_sems paths =
  124 +let rec test_dep_example path id tokens lex_sems first_try paths =
  125 + (* print_endline "test_dep_example 1"; *)
  126 + let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in
125 127 try
126   - ENIAM_LCGreductions.reset_variant_label ();
127   - print_endline "test_dep_example 1";
128   - let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in
129   - print_endline "test_dep_example 2";
130   - (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *)
131   - let chart = create_dep_chart tokens lex_sems paths in
132   - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *)
133   - let chart,references = ENIAM_LCGchart.dep_lazify chart in
134   - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *)
135   - (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *)
136   - let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
137   - (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *)
138   - (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *)
139   - if ENIAM_LCGchart.is_dep_parsed chart then (
140   - let term = ENIAM_LCGchart.get_dep_parsed_term chart in
141   - (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
142   - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
143   - Xlatex.latex_compile_and_clean path (id^"4_term"); *)
144   - let dependency_tree = ENIAM_LCGreductions.reduce term references in
145   - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *)
146   - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
147   - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
148   - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *)
149   - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
150   - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *)
151   - (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *)
152   - (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *)
153   - ())
154   - else print_endline "not reduced")
155   - else print_endline "not parsed"
  128 + ENIAM_LCGreductions.reset_variant_label ();
  129 + (* print_endline "test_dep_example 2"; *)
  130 + (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *)
  131 + let chart = create_dep_chart tokens lex_sems paths in
  132 + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *)
  133 + let chart,references = ENIAM_LCGchart.dep_lazify chart in
  134 + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *)
  135 + (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *)
  136 + let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
  137 + (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *)
  138 + (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *)
  139 + if ENIAM_LCGchart.is_dep_parsed chart then (
  140 + let term = ENIAM_LCGchart.get_dep_parsed_term chart in
  141 + (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
  142 + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
  143 + Xlatex.latex_compile_and_clean path (id^"4_term"); *)
  144 + let dependency_tree = ENIAM_LCGreductions.reduce term references in
  145 + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *)
  146 + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
  147 + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
  148 + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *)
  149 + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
  150 + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *)
  151 + (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *)
  152 + (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *)
  153 + ())
  154 + else print_endline "not reduced")
  155 + else print_endline "not parsed"
156 156 with NotDepParsed(id_ndp,left,l,right) -> (
157   - print_endline "not parsed 2";
158   - ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right))
  157 + if (first_try)
  158 + then test_dep_example path id tokens lex_sems false paths
  159 + else (print_endline "not parsed 2";
  160 + ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right)))
159 161  
160 162 let rec parse_sentence name id tokens lex_sems = function
161 163 RawSentence s -> id
... ... @@ -163,7 +165,7 @@ let rec parse_sentence name id tokens lex_sems = function
163 165 (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *)
164 166 id + 1
165 167 | DepSentence(paths) ->
166   - test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths;
  168 + test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems true paths;
167 169 id + 1
168 170 | QuotedSentences sentences ->
169 171 Xlist.fold sentences id (fun id p ->
... ... @@ -212,8 +214,8 @@ let process_id s =
212 214 else failwith ("process_id: " ^ s)
213 215  
214 216 let process_conll_corpus filename =
215   - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in
216   - print_endline "process_conll_corpus";
  217 + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
  218 + (* print_endline "process_conll_corpus 1"; *)
217 219 (* let corpus = [List.hd corpus] in *)
218 220 Xlist.iter corpus (fun query -> try
219 221 let id = process_id (get_query_id query) in
... ... @@ -226,13 +228,17 @@ let process_conll_corpus filename =
226 228 (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
227 229 let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
228 230 (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
  231 + (* print_endline "process_conll_corpus 2"; *)
229 232 let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in
  233 + (* print_endline "process_conll_corpus 3"; *)
230 234 let sentences = match text with
231 235 AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
232 236 | _ -> failwith "process_conll_corpus 1" in
233 237 let text = AltText[Raw,RawText query; Struct, StructText([
234 238 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
  239 + (* print_endline "process_conll_corpus 4"; *)
235 240 let lex_sems = ENIAMlexSemantics.assign tokens text in
  241 + (* print_endline "process_conll_corpus 5"; *)
236 242 ignore(parse_text id 1 tokens lex_sems text)
237 243 | _ -> failwith "process_conll_corpus 2"
238 244 with
... ... @@ -241,6 +247,7 @@ let process_conll_corpus filename =
241 247  
242 248 let _ =
243 249 Printexc.record_backtrace true;
  250 + ENIAMlexSemantics.initialize ();
244 251 (* LCGfields.reset (); *)
245 252 (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
246 253 (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
... ...
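The reworked test_dep_example is now a two-pass routine: it first converts the tree with CONLL_adapter.convert_dep_tree and first_try = true, and when ENIAM_LCGchart.dep_parse raises NotDepParsed it calls itself once more with first_try = false before reporting "not parsed 2". A minimal sketch of that control flow, with hypothetical convert and parse placeholders standing in for the ENIAM functions:

exception Not_dep_parsed

let rec parse_with_retry ~first_try convert parse paths =
  let converted = convert ~first_try paths in   (* tree corrections depend on first_try *)
  try parse converted
  with Not_dep_parsed ->
    if first_try
    then parse_with_retry ~first_try:false convert parse paths
    else print_endline "not parsed 2"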
exec/ENIAMexec.ml
... ... @@ -85,6 +85,37 @@ let create_chart rules tokens lex_sems paths last =
85 85 ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
86 86 chart
87 87  
  88 +let rec split_sons left id right = function
  89 + [] -> List.rev (List.sort compare left), List.sort compare right
  90 + | x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l
  91 +
  92 +let rec dep_create_rec nodes sons conll_id =
  93 + let node = IntMap.find nodes conll_id in
  94 + let l = try IntMap.find sons conll_id with Not_found -> [] in
  95 + let left,right = split_sons [] conll_id [] l in
  96 + (* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *)
  97 + DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons))
  98 +
  99 +let create_dep_chart dep_rules tokens lex_sems paths =
  100 + (* print_endline "create_dep_chart 1"; *)
  101 + let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i ->
  102 + let _,super,_ = paths.(i) in
  103 + IntMap.add_inc sons super [i] (fun l -> i :: l)) in
  104 + (* print_endline "create_dep_chart 2"; *)
  105 + let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i ->
  106 + let id,_,_ = paths.(i) in
  107 + let t = ExtArray.get tokens id in
  108 + let s = ExtArray.get lex_sems id in
  109 + ENIAM_LCGrenderer.reset_variable_names ();
  110 + ENIAM_LCGrenderer.add_variable_numbers ();
  111 + let cats = clarify_categories ["X"] t in
  112 + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in
  113 + IntMap.add nodes i l) in
  114 + (* print_endline "create_dep_chart 3"; *)
  115 + let x = dep_create_rec nodes sons 0 in
  116 + (* print_endline "create_dep_chart 4"; *)
  117 + x
  118 +
88 119 let create_text_fragments tokens paths last =
89 120 let text_fragments = Array.make last IntMap.empty in
90 121 Xlist.iter paths (fun (id,lnode,rnode) ->
... ... @@ -156,85 +187,75 @@ let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last =
156 187 with e ->
157 188 let time2 = time_fun () in
158 189 {result with status=LexiconError; msg=string_of_exn e; lex_time=time2 -. time1}
159   -(*
160   -let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems =
161   - let result = empty_conll_parse_result in
162   - let time2 = time_fun () in
163   - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *)
  190 +
  191 +let rec conll_parse_sentence timeout verbosity dep_rules first_try tokens lex_sems paths =
  192 + ENIAM_LCGreductions.reset_variant_label ();
  193 + let result = {empty_conll_parse_result with paths_size = Xlist.size paths} in
  194 + let result = if verbosity = 0 then result else result(*{result with text_fragments=create_dep_text_fragments tokens paths last}*) in (* FIXME *)
  195 + let time1 = time_fun () in
164 196 try
165   - let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in
166   - let dep_chart,references = LCGchart.dep_lazify dep_chart in
167   - let result = if test_only_flag then result else {result with dep_chart=dep_chart} in
168   - let time3 = time_fun () in
169   - let result = {result with lex_time=time3 -. time2} in
  197 + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in
  198 + let chart = create_chart dep_rules tokens lex_sems paths in
  199 + let result = if verbosity = 0 then result else {result with chart1=chart} in
  200 + let chart,references = ENIAM_LCGchart.dep_lazify chart in
  201 + let result = if verbosity = 0 then result else {result with chart2=chart; references2=ExtArray.copy references} in
  202 + let time2 = time_fun () in
  203 + let result = {result with lex_time=time2 -. time1} in
170 204 try
171   - (* print_endline "conll_parse_sentence 1"; *)
172   - (* LCGlatexOf.print_references "results/" "references1" references; *)
173   - let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *)
174   - (* print_endline "conll_parse_sentence 2"; *)
175   - (* LCGlatexOf.print_references "results/" "references2" references; *)
176   - let time4 = time_fun () in
177   - let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in
178   - let result = {result with parse_time=time4 -. time3} in
179   - if LCGchart.is_dep_parsed parsed_dep_chart then
  205 + let chart = ENIAM_LCGchart.dep_parse chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *)
  206 + let time3 = time_fun () in
  207 + let result = if verbosity = 0 then result else {result with parsed_dep_chart=chart; references3=references} in
  208 + let result = {result with parse_time=time3 -. time2; chart_size=ENIAM_LCGchart.get_no_entries chart} in
  209 + if ENIAM_LCGchart.is_dep_parsed chart then
180 210 try
181   - let term = LCGchart.get_dep_parsed_term tokens lex_sems parsed_dep_chart in
182   - (* LCGlatexOf.print_dependency_tree "dep_dependency_tree1" dependency_tree; *)
183   - let dependency_tree = LCGreductions.reduce term references in
184   - let time5 = time_fun () in
185   - let result = if test_only_flag then result else {result with dependency_tree=dependency_tree} in
186   - let result = {result with reduction_time=time5 -. time4; dependency_tree_size=Array.length dependency_tree} in
187   - if LCGreductions.is_reduced_dependency_tree dependency_tree then
  211 + let term = ENIAM_LCGchart.get_dep_parsed_term chart in
  212 + let result = if verbosity = 0 then result else {result with term4=term} in
  213 + let dependency_tree = ENIAM_LCGreductions.reduce term references in
  214 + let time4 = time_fun () in
  215 + let result = if verbosity = 0 then result else {result with dependency_tree4=Array.copy dependency_tree} in
  216 + let result = {result with reduction_time=time4 -. time3; dependency_tree_size=Array.length dependency_tree} in
  217 + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then
188 218 try
189   - (* print_endline "conll_parse_sentence 3"; *)
190   - LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *)
191   - (* print_endline "conll_parse_sentence 4"; *)
192   - LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *)
193   -(* if Array.length dependency_tree < 10000 then print_xml_dependency_tree "results/trees/" id dependency_tree; *)
194   - (* print_endline "conll_parse_sentence 5"; *)
  219 + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *)
  220 + let result = if verbosity = 0 then result else {result with dependency_tree5=Array.copy dependency_tree} in
  221 + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *)
  222 + let result = (*if verbosity = 0 then result else*) {result with dependency_tree6=dependency_tree} in
195 223 let time6 = time_fun () in
196   - {result with status=Parsed; sem_time=time6 -. time5}
  224 + {result with status=Parsed; sem_time=time6 -. time4}
197 225 with e ->
198 226 let time6 = time_fun () in
199   - {result with status=SemError; msg=string_of_exn e; sem_time=time6 -. time5}
  227 + {result with status=SemError1; msg=string_of_exn e; sem_time=time6 -. time4}
200 228 else
201 229 {result with status=NotReduced}
202 230 with
203 231 | SemTooBig ->
204   - let time5 = time_fun () in
205   - {result with status=TooManyNodes; reduction_time=time5 -. time4}
  232 + let time4 = time_fun () in
  233 + {result with status=TooManyNodes; reduction_time=time4 -. time3}
206 234 | e ->
207   - let time5 = time_fun () in
208   - {result with status=ReductionError; msg=string_of_exn e; reduction_time=time5 -. time4}
  235 + let time4 = time_fun () in
  236 + {result with status=ReductionError; msg=string_of_exn e; reduction_time=time4 -. time3}
209 237 else if first_try
210   - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
  238 + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths
211 239 else {result with status=NotParsed}
212 240 with
213 241 Timeout t ->
214   - let time4 = time_fun () in
215   - {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3}
  242 + let time3 = time_fun () in
  243 + {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time3 -. time2}
216 244 | NotDepParsed(id_ndp,left,l,right) ->
217 245 if first_try
218   - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
  246 + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths
219 247 else let time4 = time_fun () in
220 248 {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3}
221 249 | e ->
222   - let time4 = time_fun () in
223   - {result with status=ParseError; msg=string_of_exn e; parse_time=time4 -. time3}
224   - with e -> (*print_endline (string_of_exn e);*)
225   - let time3 = time_fun () in
  250 + let time3 = time_fun () in
  251 + {result with status=ParseError; msg=string_of_exn e; parse_time=time3 -. time2}
  252 + with e ->
  253 + let time2 = time_fun () in
226 254 if first_try
227   - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems
  255 + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths
228 256 else {result with status=LexiconError; msg=string_of_exn e; lex_time=time3 -. time2}
229 257  
230   -
231   -let mate_in, mate_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*)
232   - if Paths.config.Paths.mate_parser_enabled then
233   - Unix.open_process ("java -jar " ^ Paths.config.Paths.mate_parser_path ^ "dist/anna-3.5.jar -model " ^
234   - Paths.config.Paths.mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test")
235   - else stdin, stdout
236   -
237   -let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*)
  258 +(*let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*)
238 259 if Paths.config.Paths.swigra_enabled then
239 260 Unix.open_process (Paths.config.Paths.swigra_path ^ "run.sh")
240 261 else stdin, stdout
... ... @@ -256,38 +277,21 @@ let parse timeout verbosity rules (*name id*) tokens lex_sems =
256 277 let result = eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last in
257 278 ENIAMSentence result
258 279 | _ -> failwith "parse 3")
259   - | DepSentence(paths) ->
  280 + | DepSentence paths ->
260 281 (match mode with
261   -(* CONLL ->
262   - let result = conll_parse_sentence timeout verbosity id true paths tokens lex_sems in
263   - let result = {result with
  282 + CONLL | Mate ->
  283 + let result = conll_parse_sentence timeout verbosity dep_rules true tokens lex_sems paths in
  284 + (* let result = {result with
264 285 file_prefix = file_prefix_of_mode mode ^ file_prefix;
265   - paths = paths} in
  286 + paths = paths} in *)
266 287 CONLLSentence result
267 288 (* let xml = DepTree.conll_to_xml paths in
268 289 let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *)
269 290 Visualization.print_graph "results/" "term_conll" graph;
270 291 let result = {empty_eniam_parse_result with status=Parsed; term=graph} in
271 292 ENIAMSentence result, next_id *)
272   - | Mate ->
273   - if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else (
274   - print_endline "parse_sentence 1";
275   - (* print_endline (Visualization.html_of_dep_sentence tokens paths); *)
276   - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in
277   - print_endline "parse_sentence 2";
278   - (* printf "|%s|\n" conll; *)
279   - Printf.fprintf mate_out "%s%!" conll;
280   - print_endline "parse_sentence 3";
281   - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in
282   - print_endline "parse_sentence 4";
283   - (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *)
284   - let result = conll_parse_sentence timeout verbosity id true new_paths tokens lex_sems in
285   - let result = {result with
286   - file_prefix = file_prefix_of_mode mode ^ file_prefix;
287   - paths=new_paths} in
288   - CONLLSentence result)*)
289   - | _ -> failwith "parse 2")
290   - | _ -> failwith "parse 1")
  293 + | _ -> failwith "parse 2")
  294 + | _ -> failwith "parse 1")
291 295  
292 296  
293 297 (*
... ...
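create_dep_chart, copied here from test_conll.ml, turns the (id, super, label) array into a DepNode tree: tokens are grouped under their head index and each head's children are split into those standing to its left and to its right of it. A simplified, standard-library-only sketch of that construction (the node payloads, IntMap and the ENIAM lexicon entries are omitted):

type dep_node = DepNode of int * dep_node list * dep_node list

let build_dep_tree paths =
  let n = Array.length paths in
  (* sons.(h) holds the indices whose head is h, in increasing order *)
  let sons = Array.make n [] in
  for i = n - 1 downto 1 do
    let _, super, _ = paths.(i) in
    if super >= 0 && super < n then sons.(super) <- i :: sons.(super)
  done;
  let rec build i =
    let left, right = List.partition (fun j -> j < i) sons.(i) in
    DepNode (i, List.map build left, List.map build right) in
  build 0   (* conll_id 0 is the artificial root *)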
exec/ENIAMexecTypes.ml
... ... @@ -49,9 +49,9 @@ type eniam_parse_result = {
49 49 semantic_graph11: ENIAMsemTypes.linear_term;
50 50 text_fragments: string IntMap.t array;
51 51 }
52   -(*
  52 +
53 53 type conll_parse_result = {
54   - file_prefix: string;
  54 +(* file_prefix: string;*)
55 55 status: status;
56 56 msg: string;
57 57 lex_time: float;
... ... @@ -59,17 +59,29 @@ type conll_parse_result = {
59 59 reduction_time: float;
60 60 sem_time: float;
61 61 paths_size: int;
  62 + chart_size: int;
62 63 dependency_tree_size: int;
63   - paths: (int * int * string) array;
64   - dep_chart: LCGtypes.dep_tree;
65   - parsed_dep_chart: (LCGtypes.SymbolMap.key * LCGtypes.linear_term) list;
  64 + chart1: dep_tree;
  65 + chart2: dep_tree;
  66 + references2: linear_term ExtArray.t;
  67 + parsed_dep_chart: (SymbolMap.key * linear_term) list;
66 68 not_parsed_dep_chart: int *
67   - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list *
68   - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list *
69   - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list;
70   - dependency_tree: LCGtypes.linear_term array;
  69 + (grammar_symbol * linear_term) list list *
  70 + (grammar_symbol * linear_term) list *
  71 + (grammar_symbol * linear_term) list list;
  72 + references3: linear_term ExtArray.t;
  73 + term4: linear_term;
  74 + dependency_tree4: linear_term array;
  75 + dependency_tree5: linear_term array;
  76 + dependency_tree6: linear_term array;
  77 + dependency_tree7: linear_term array;
  78 + dependency_tree8: linear_term ExtArray.t;
  79 + dependency_tree9: linear_term array;
  80 + semantic_graph10: ENIAMsemTypes.linear_term array;
  81 + semantic_graph11: ENIAMsemTypes.linear_term;
  82 + text_fragments: string IntMap.t array;
71 83 }
72   -
  84 +(*
73 85 type semantic_processing_result = {
74 86 file_prefix: string;
75 87 status: status;
... ... @@ -190,6 +202,35 @@ let empty_eniam_parse_result = {
190 202 text_fragments=[| |];
191 203 }
192 204  
  205 +let empty_conll_parse_result = {
  206 + (* file_prefix=""; *)
  207 + status=Idle;
  208 + msg="";
  209 + lex_time=0.;
  210 + parse_time=0.;
  211 + reduction_time=0.;
  212 + sem_time=0.;
  213 + paths_size=0;
  214 + chart_size=0;
  215 + dependency_tree_size=0;
  216 + chart1=DepNode(-100,[],[],[]);
  217 + chart2=DepNode(-100,[],[],[]);
  218 + references2=ExtArray.make 0 Dot;
  219 + references3=ExtArray.make 0 Dot;
  220 + term4=Dot;
  221 + dependency_tree4=[| |];
  222 + dependency_tree5=[| |];
  223 + dependency_tree6=[| |];
  224 + dependency_tree7=[| |];
  225 + dependency_tree8=ExtArray.make 0 Dot;
  226 + dependency_tree9=[| |];
  227 + semantic_graph10=[| |];
  228 + semantic_graph11=ENIAMsemTypes.Dot;
  229 + text_fragments=[| |];
  230 + parsed_dep_chart=[];
  231 + not_parsed_dep_chart=(-100,[],[],[]);
  232 + }
  233 +
193 234 (*
194 235 let empty_result = {
195 236 input_text=RawText "";
... ... @@ -208,23 +249,6 @@ let empty_result = {
208 249 lex_sems=ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem;
209 250 }
210 251  
211   -let empty_conll_parse_result = {
212   - file_prefix="";
213   - status=Idle;
214   - msg="";
215   - lex_time=0.;
216   - parse_time=0.;
217   - reduction_time=0.;
218   - sem_time=0.;
219   - paths_size=0;
220   - dependency_tree_size=0;
221   - paths=[| |];
222   - dep_chart=DepNode(-100,[],[],[]);
223   - parsed_dep_chart=[];
224   - not_parsed_dep_chart=(-100,[],[],[]);
225   - dependency_tree=[| |];
226   - }
227   -
228 252 let empty_semantic_processing_result = {
229 253 file_prefix="";
230 254 status=Idle;
... ... @@ -321,3 +345,5 @@ let rec fold_text mode s f = function
321 345 | AltText l ->
322 346 Xlist.fold l s (fun s (mode,text) ->
323 347 fold_text mode s f text)
  348 +
  349 +let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic"
... ...
exec/ENIAMvisualization.ml
... ... @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last =
702 702 t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^
703 703 sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^
704 704 "</table>"
705   -(*
  705 +
706 706 let html_of_dep_sentence tokens paths =
707 707 "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^
708 708 String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id ->
... ... @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths =
711 711 (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>"
712 712 t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^
713 713 "</table>"
714   -
  714 +(*
715 715 let html_of_tokens tokens =
716 716 "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^
717 717 String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id ->
... ... @@ -1048,7 +1048,7 @@ let file_prefix_of_mode = function
1048 1048 let rec html_of_sentence path file_prefix mode img verbosity tokens = function
1049 1049 RawSentence s -> s
1050 1050 | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last
1051   - (* | DepSentence paths -> html_of_dep_sentence img verbosity tokens paths *)
  1051 + | DepSentence paths -> html_of_dep_sentence tokens paths
1052 1052 | ENIAMSentence result ->
1053 1053 let file_prefix = file_prefix_of_mode mode ^ file_prefix in
1054 1054 html_of_eniam_sentence path file_prefix img verbosity tokens result
... ... @@ -1062,7 +1062,7 @@ let rec html_of_sentence path file_prefix mode img verbosity tokens = function
1062 1062 String.concat "\n" (Xlist.map l (fun (mode,sentence) ->
1063 1063 sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence path file_prefix mode img verbosity tokens sentence))) ^
1064 1064 "</table>"
1065   - | _ -> failwith "html_of_sentence: ni"
  1065 + (* | _ -> failwith "html_of_sentence: ni" *)
1066 1066  
1067 1067 let rec html_of_paragraph path mode img verbosity tokens = function
1068 1068 RawParagraph s -> (*print_endline "RawParagraph";*) s
... ...
exec/makefile
... ... @@ -19,6 +19,13 @@ install: all
19 19 cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR)
20 20 cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR)
21 21  
  22 +install-local: all
  23 + mkdir -p $(INSTALLDIR)
  24 + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR)
  25 + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR)
  26 + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR)
  27 + mkdir -p /usr/local/share/eniam/exec
  28 + cp resources/* /usr/local/share/eniam/exec
22 29  
23 30 eniam-exec.cma: $(SOURCES)
24 31 ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^
... ...
integration/ENIAMpreIntegration.ml
... ... @@ -198,3 +198,9 @@ let rec parse_text mode tokens = function
198 198 StructText(List.rev paragraphs)
199 199 | AltText l -> AltText(Xlist.map l (fun (mode,text) ->
200 200 mode, parse_text mode tokens text))
  201 +
  202 +let catch_parse_text mode tokens text =
  203 + try
  204 + parse_text mode tokens text,""
  205 + with e ->
  206 + text, Printexc.to_string e
... ...
lexSemantics/.gitignore
1 1 test
2 2 lexSemantics
  3 +inttest
... ...
lexSemantics/ENIAMwalParser.ml
... ... @@ -73,14 +73,6 @@ let split_text schema =
73 73 | Str.Delim "'" -> Quot
74 74 | _ -> failwith "parse_text"))
75 75  
76   -let rec split_symbol symb rev = function
77   - [] -> [List.rev rev](*failwith "split_symbol"*)
78   - | s :: l ->
79   - if s = symb then
80   - if l = [] then (*[List.rev rev]*)failwith "split_symbol"
81   - else (List.rev rev) :: (split_symbol symb [] l)
82   - else split_symbol symb (s :: rev) l
83   -
84 76 let rec string_of_token = function
85 77 Text s -> s
86 78 | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")"
... ... @@ -101,6 +93,14 @@ let rec string_of_token = function
101 93 let string_of_token_list l =
102 94 String.concat "" (Xlist.map l string_of_token)
103 95  
  96 +let rec split_symbol symb rev = function
  97 + [] -> [List.rev rev](*failwith "split_symbol"*)
  98 + | s :: l ->
  99 + if s = symb then
  100 + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb)
  101 + else (List.rev rev) :: (split_symbol symb [] l)
  102 + else split_symbol symb (s :: rev) l
  103 +
104 104 let parse_case = function
105 105 [Text "nom"] -> Case "nom"
106 106 | [Text "gen"] -> Case "gen"
... ...
lexSemantics/interface.ml
... ... @@ -23,6 +23,7 @@ let output = ref Text
23 23 let comm_stdio = ref true
24 24 (* let sentence_split = ref true *)
25 25 let port = ref 5439
  26 +let perform_integration = ref false
26 27  
27 28 let spec_list = [
28 29 (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
... ... @@ -33,6 +34,13 @@ let spec_list = [
33 34 "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
34 35 "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
35 36 "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
  37 + "--dep_parser", Arg.Unit (fun () ->
  38 + ENIAMpreIntegration.concraft_enabled := true;
  39 + ENIAMpreIntegration.mate_parser_enabled := true;
  40 + perform_integration := true), "Enable dependency parser";
  41 + "--no_dep_parser", Arg.Unit (fun () ->
  42 + ENIAMpreIntegration.concraft_enabled := false;
  43 + ENIAMpreIntegration.mate_parser_enabled := false), "Disable dependency parser (default)";
36 44 (* "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; *)
37 45 (* "-r", Arg.String (fun p ->
38 46 ENIAMtokenizerTypes.set_resource_path p;
... ... @@ -65,6 +73,9 @@ let rec main_loop in_chan out_chan =
65 73 print_endline text;
66 74 print_endline "input text end"; *)
67 75 let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in
  76 + let text,msg =
  77 + if msg <> "" || not !perform_integration then text,msg else
  78 + ENIAMpreIntegration.catch_parse_text ENIAMsubsyntaxTypes.Struct tokens text in
68 79 let lex_sems,msg =
69 80 if msg <> "" then ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem, msg
70 81 else ENIAMlexSemantics.catch_assign tokens text in
... ... @@ -84,6 +95,7 @@ let _ =
84 95 prerr_endline message;
85 96 Arg.parse spec_list anon_fun usage_msg;
86 97 ENIAMlexSemantics.initialize ();
  98 + ENIAMpreIntegration.initialize ();
87 99 Gc.compact ();
88 100 prerr_endline "Ready!";
89 101 if !comm_stdio then main_loop stdin stdout
... ...
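The integration hook added to main_loop keeps the (value, message) convention used by ENIAMsubsyntax.catch_parse_text and ENIAMlexSemantics.catch_assign: a stage runs only if no earlier stage reported an error, and ENIAMpreIntegration.catch_parse_text returns the unchanged text plus the exception text on failure. A small sketch of that convention with hypothetical stage functions:

(* Hypothetical stages; each returns its result plus an error message,
   "" meaning success, mirroring the catch_* wrappers in the pipeline. *)
let catch_stage f x =
  try f x, "" with e -> x, Printexc.to_string e

let run_pipeline stage1 stage2 input =
  let v, msg = catch_stage stage1 input in
  let v, msg = if msg <> "" then v, msg else catch_stage stage2 v in
  if msg = "" then print_endline "ok" else print_endline ("error: " ^ msg);
  v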
lexSemantics/makefile
... ... @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9 9 SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \
... ... @@ -40,6 +40,9 @@ eniam-lexSemantics.cmxa: $(SOURCES)
40 40 test: test.ml
41 41 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
42 42  
  43 +inttest: inttest.ml
  44 + $(OCAMLOPT) -o inttest $(OCAMLOPTFLAGS) $^
  45 +
43 46 interface: interface.ml
44 47 $(OCAMLOPT) -o lexSemantics $(OCAMLOPTFLAGS) interface.ml
45 48  
... ... @@ -65,4 +68,4 @@ interface: interface.ml
65 68 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
66 69  
67 70 clean:
68   - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
  71 + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test inttest
... ...
semantics/ENIAMsemLexicon.ml
... ... @@ -47,7 +47,7 @@ let parse_multi p = function
47 47 let parse_morf p = function
48 48 [T "1"] -> {p with is_necessary=Opt}
49 49 | tokens ->
50   - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function
  50 + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function
51 51 [T s] -> Atom s
52 52 | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in
53 53 {p with morfs=LCG (Tensor l) :: p.morfs}
... ... @@ -57,7 +57,7 @@ let parse_arg tokens p =
57 57 let tokens,p = parse_dir p tokens in
58 58 let tokens,p = parse_multi p tokens in
59 59 match Lexer.find_brackets ["(",")"] [] tokens with
60   - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf
  60 + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf
61 61 | tokens -> parse_morf p tokens
62 62  
63 63  
... ... @@ -75,7 +75,7 @@ let parse_entry = function
75 75 [T symbol; T ":"; T "null"] -> symbol,[]
76 76 | T symbol :: T ":" :: tokens ->
77 77 (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *)
78   - let tokens = Lexer.split_symbol (T ":") [] tokens in
  78 + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in
79 79 let tokens = manage_tokens tokens in
80 80 let positions = Xlist.map tokens (fun (arg,role) ->
81 81 parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in
... ... @@ -91,7 +91,7 @@ let load_lexicon filename =
91 91 | T "\t" -> tokens
92 92 | T "\r" -> tokens
93 93 | t -> t :: tokens)) in
94   - let entries = Lexer.split_symbol (T ";") [] tokens in
  94 + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in
95 95 Xlist.fold entries StringMap.empty (fun map entry ->
96 96 let symbol,args = parse_entry entry in
97 97 StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol)))
... ...
testy/skladnica-test1-Failure.conll
1   -1 - - interp interp _ 3 punct _ _
2   -2 Panowie pan subst subst pl|nom|m1 3 subj _ _
3   -3 przyszli przyjść praet praet pl|m1|perf 0 pred _ _
4   -4 . . interp interp _ 3 punct _ _
5   -
6 1 1 O o prep prep loc 12 comp _ _
7 2 2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _
8 3 3 piekle piekło subst subst sg|loc|n 1 comp _ _
... ... @@ -21,84 +16,118 @@
21 16 16 br bieżący_rok brev brev pun 15 ne _ _
22 17 17 . . interp interp _ 12 punct _ _
23 18  
24   -1 Następnie następnie adv adv _ 2 adjunct _ _
25   -2 rozłożyła rozłożyć praet praet sg|f|perf 10 conjunct _ _
26   -3 wysoki wysoki adj adj sg|acc|m3|pos 4 adjunct _ _
27   -4 statyw statyw subst subst sg|acc|m3 2 obj _ _
28   -5 , , interp interp _ 10 coord_punct _ _
29   -6 zawiesiła zawiesić praet praet sg|f|perf 10 conjunct _ _
30   -7 na na prep prep loc 6 adjunct _ _
31   -8 nim on ppron3 ppron3 sg|loc|m3|ter|akc|praep 7 comp _ _
32   -9 pudełko pudełko subst subst sg|acc|n 6 obj _ _
33   -10 , , interp interp _ 0 pred _ _
34   -11 przeprowadziła przeprowadzić praet praet sg|f|perf 10 conjunct _ _
35   -12 od od prep prep gen|nwok 11 adjunct _ _
36   -13 niego on ppron3 ppron3 sg|gen|n|ter|akc|praep 12 comp _ _
37   -14 przezroczysty przezroczysty adj adj sg|acc|m3|pos 15 adjunct _ _
38   -15 przewód przewód subst subst sg|acc|m3 11 obj _ _
39   -16 do do prep prep gen 11 adjunct _ _
40   -17 igły igła subst subst sg|gen|f 16 comp _ _
41   -18 , , interp interp _ 23 punct _ _
42   -19 którą który adj adj sg|acc|f|pos 23 obj _ _
43   -20 wcześniej wcześnie adv adv com 23 adjunct _ _
44   -21 automatyczny automatyczny adj adj sg|nom|m3|pos 22 adjunct _ _
45   -22 iniektor iniektor subst subst sg|nom|m3 23 subj _ _
46   -23 umieścił umieścić praet praet sg|m3|perf 17 adjunct _ _
47   -24 w w prep prep loc|nwok 23 comp _ _
48   -25 żyle żyła subst subst sg|loc|f 24 comp _ _
49   -26 na na prep prep loc 25 adjunct _ _
50   -27 przedramieniu przedramię subst subst sg|loc|n 26 comp _ _
51   -28 Irka Irek subst subst sg|gen|m1 27 adjunct _ _
52   -29 . . interp interp _ 10 punct _ _
  19 +1 W w prep prep loc|nwok 9 adjunct _ _
  20 +2 stanie stan subst subst sg|loc|m3 1 comp _ _
  21 +3 obrzydzenia obrzydzenie subst subst sg|gen|n 2 adjunct _ _
  22 +4 przyprawiającego przyprawiać pact pact sg|gen|n|imperf|aff 3 adjunct _ _
  23 +5 o o prep prep acc 4 comp _ _
  24 +6 nowe nowy adj adj pl|acc|n|pos 7 adjunct _ _
  25 +7 mdłości mdłości subst subst pl|acc|n 5 comp _ _
  26 +8 nie nie qub qub _ 9 neg _ _
  27 +9 zauważył zauważyć praet praet sg|m1|perf 0 pred _ _
  28 +10 nawet nawet qub qub _ 9 adjunct _ _
  29 +11 , , interp interp _ 15 punct _ _
  30 +12 że że comp comp _ 15 complm _ _
  31 +13 wielki wielki adj adj sg|nom|m3|pos 14 adjunct _ _
  32 +14 ból ból subst subst sg|nom|m3 15 subj _ _
  33 +15 zaczyna zaczynać fin fin sg|ter|imperf 9 comp_fin _ _
  34 +16 z z prep prep acc|nwok 18 adjunct _ _
  35 +17 wolna wolny adj adjp _ 16 mwe _ _
  36 +18 zanikać zanikać inf inf imperf 15 comp_inf _ _
  37 +19 . . interp interp _ 9 punct _ _
  38 +
  39 +1 - - interp interp _ 7 punct _ _
  40 +2 W w prep prep loc|nwok 4 comp _ _
  41 +3 szkole szkoła subst subst sg|loc|f 2 comp _ _
  42 +4 jest być fin fin sg|ter|imperf 7 conjunct _ _
  43 +5 mniej mało num num pl|nom 4 subj _ _
  44 +6 uczniów uczeń subst subst pl|gen|m1 5 comp _ _
  45 +7 , , interp interp _ 0 coord_punct _ _
  46 +8 dlatego dlatego adv adv _ 9 adjunct _ _
  47 +9 musiał musieć praet praet sg|m1|imperf 7 conjunct _ _
  48 +10 em być aglt aglt sg|pri|imperf|wok 9 aglt _ _
  49 +11 tym ten adj adj pl|dat|f|pos 12 adjunct _ _
  50 +12 paniom pani subst subst pl|dat|f 13 obj_th _ _
  51 +13 podziękować podziękować inf inf perf 9 comp_inf _ _
  52 +14 . . interp interp _ 7 punct _ _
  53 +
  54 +1 Od od prep prep gen|nwok 9 adjunct _ _
  55 +2 końca koniec subst subst sg|gen|m3 1 comp _ _
  56 +3 XVIII XVIII adj adj sg|gen|m3|pos 4 ne _ _
  57 +4 w wiek brev brev pun 2 comp _ _
  58 +5 . . interp interp _ 4 abbrev_punct _ _
  59 +6 informacje informacja subst subst pl|nom|f 9 subj _ _
  60 +7 o o prep prep loc 6 adjunct _ _
  61 +8 głodach głód subst subst pl|loc|m3 7 comp _ _
  62 +9 stają stawać fin fin pl|ter|imperf 0 pred _ _
  63 +10 się się qub qub _ 9 refl _ _
  64 +11 coraz coraz adv adv _ 12 adjunct _ _
  65 +12 rzadsze rzadki adj adj pl|nom|f|com 9 pd _ _
  66 +13 . . interp interp _ 9 punct _ _
  67 +
  68 +1 Zabrał zabrać praet praet sg|m1|perf 0 pred _ _
  69 +2 ponad ponad qub qub _ 3 adjunct _ _
  70 +3 30 30 num num pl|acc|m3|rec 1 obj _ _
  71 +4 tys tysiąc brev brev pun 3 mwe _ _
  72 +5 . . interp interp _ 4 abbrev_punct _ _
  73 +6 zł złoty brev brev npun 3 comp _ _
  74 +7 . . interp interp _ 1 punct _ _
  75 +
  76 +1 ( ( interp interp _ 8 punct _ _
  77 +2 Kiedyś kiedyś adv adv _ 4 adjunct _ _
  78 +3 też też qub qub _ 4 adjunct _ _
  79 +4 miała mieć praet praet sg|f|imperf 8 conjunct _ _
  80 +5 m być aglt aglt sg|pri|imperf|nwok 4 aglt _ _
  81 +6 takie taki adj adj pl|acc|f|pos 7 adjunct _ _
  82 +7 ambicje ambicja subst subst pl|acc|f 4 obj_th _ _
  83 +8 , , interp interp _ 0 pred _ _
  84 +9 zrezygnowała zrezygnować praet praet sg|f|perf 8 conjunct _ _
  85 +10 m być aglt aglt sg|pri|imperf|nwok 9 aglt _ _
  86 +11 . . interp interp _ 8 punct _ _
  87 +12 ) ) interp interp _ 8 punct _ _
53 88  
54   -1 - - interp interp _ 4 punct _ _
55   -2 Co co subst subst sg|nom|n 4 pd _ _
56   -3 to to subst subst sg|nom|n 4 subj _ _
57   -4 jest być fin fin sg|ter|imperf 0 pred _ _
58   -5 ? ? interp interp _ 4 punct _ _
  89 +1 Zawsze zawsze adv adv _ 2 adjunct _ _
  90 +2 mówię mówić fin fin sg|pri|imperf 0 pred _ _
  91 +3 , , interp interp _ 5 punct _ _
  92 +4 że że comp comp _ 5 complm _ _
  93 +5 mogę móc fin fin sg|pri|imperf 2 comp_fin _ _
  94 +6 pracować pracować inf inf imperf 5 comp_inf _ _
  95 +7 , , interp interp _ 5 punct _ _
  96 +8 bo bo comp comp _ 5 adjunct _ _
  97 +9 mam mieć fin fin sg|pri|imperf 13 conjunct _ _
  98 +10 dobre dobry adj adj sg|acc|n|pos 11 adjunct _ _
  99 +11 zdrowie zdrowie subst subst sg|acc|n 9 obj_th _ _
  100 +12 , , interp interp _ 13 punct _ _
  101 +13 a a conj conj _ 8 comp_fin _ _
  102 +14 to to subst subst sg|nom|n 15 subj _ _
  103 +15 jest być fin fin sg|ter|imperf 13 conjunct _ _
  104 +16 darmo darmo adv adv _ 17 adjunct _ _
  105 +17 dane dany adj adj sg|nom|n|perf|aff 15 pd _ _
  106 +18 . . interp interp _ 2 punct _ _
59 107  
60   -1 Prosi prosić fin fin sg|ter|imperf 0 pred _ _
61   -2 się się qub qub _ 1 refl _ _
62   -3 też też qub qub _ 1 adjunct _ _
63   -4 zakłady zakład subst subst pl|acc|m3 1 obj _ _
64   -5 pracy praca subst subst sg|gen|f 4 adjunct _ _
65   -6 , , interp interp _ 8 punct _ _
66   -7 które który adj adj pl|nom|m3|pos 8 subj _ _
67   -8 dysponują dysponować fin fin pl|ter|imperf 4 adjunct _ _
68   -9 autobusami autobus subst subst pl|inst|m3 8 comp _ _
69   -10 , , interp interp _ 12 punct _ _
70   -11 by by comp comp _ 12 complm _ _
71   -12 wspomogły wspomóc praet praet pl|m3|perf 1 comp_fin _ _
72   -13 komunikację komunikacja subst subst sg|acc|f 12 obj _ _
73   -14 zastępczą zastępczy adj adj sg|acc|f|pos 13 adjunct _ _
74   -15 . . interp interp _ 1 punct _ _
  108 +1 " " interp interp _ 2 punct _ _
  109 +2 Zrobimy zrobić fin fin pl|pri|perf 0 pred _ _
  110 +3 " " interp interp _ 2 punct _ _
  111 +4 ! ! interp interp _ 2 punct _ _
75 112  
76 113 1 - - interp interp _ 3 punct _ _
77   -2 Nie nie qub qub _ 3 neg _ _
78   -3 chcą chcieć fin fin pl|ter|imperf 0 pred _ _
79   -4 , , interp interp _ 8 punct _ _
80   -5 by by comp comp _ 8 complm _ _
81   -6 m być aglt aglt sg|pri|imperf|nwok 8 aglt _ _
82   -7 ich on ppron3 ppron3 pl|acc|m1|ter|akc|npraep 8 obj _ _
83   -8 utrzymywał utrzymywać praet praet sg|m1|imperf 3 comp_fin _ _
84   -9 . . interp interp _ 3 punct _ _
  114 +2 No no qub qub _ 3 adjunct _ _
  115 +3 wie wiedzieć fin fin sg|ter|imperf 0 pred _ _
  116 +4 pan pan subst subst sg|nom|m1 3 subj _ _
  117 +5 ! ! interp interp _ 3 punct _ _
  118 +6 . . interp interp _ 5 punct _ _
  119 +7 . . interp interp _ 6 punct _ _
  120 +8 . . interp interp _ 7 punct _ _
85 121  
86   -1 Wzięli wziąć praet praet pl|m1|perf 0 pred _ _
87   -2 w w prep prep loc|nwok 4 adjunct _ _
88   -3 niej on ppron3 ppron3 sg|loc|f|ter|akc|praep 2 comp _ _
89   -4 udział udział subst subst sg|acc|m3 1 obj _ _
90   -5 przedstawiciele przedstawiciel subst subst pl|nom|m1 1 subj _ _
91   -6 policji policja subst subst sg|gen|f 5 adjunct _ _
92   -7 z z prep prep gen|nwok 5 adjunct _ _
93   -8 Niemiec Niemcy subst subst pl|gen|n 17 conjunct _ _
94   -9 , , interp interp _ 17 coord_punct _ _
95   -10 Czech Czechy subst subst pl|gen|n 17 conjunct _ _
96   -11 , , interp interp _ 17 coord_punct _ _
97   -12 Słowacji Słowacja subst subst sg|gen|f 17 conjunct _ _
98   -13 , , interp interp _ 17 coord_punct _ _
99   -14 Węgier Węgry subst subst pl|gen|n 17 conjunct _ _
100   -15 , , interp interp _ 17 coord_punct _ _
101   -16 Ukrainy Ukraina subst subst sg|gen|f 17 conjunct _ _
102   -17 i i conj conj _ 7 comp _ _
103   -18 Polski Polska subst subst sg|gen|f 17 conjunct _ _
104   -19 . . interp interp _ 1 punct _ _
  122 +1 ( ( interp interp _ 6 punct _ _
  123 +2 Myszkinku Myszkinek subst subst sg|voc|m3 6 adjunct _ _
  124 +3 , , interp interp _ 2 punct _ _
  125 +4 jakie jaki adj adj sg|acc|n|pos 7 adjunct _ _
  126 +5 ty ty ppron12 ppron12 sg|nom|m2|sec 6 subj _ _
  127 +6 masz mieć fin fin sg|sec|imperf 0 pred _ _
  128 +7 futerko futerko subst subst sg|acc|n 6 obj_th _ _
  129 +8 , , interp interp _ 7 punct _ _
  130 +9 lazurowe lazurowy adj adj sg|acc|n|pos 7 adjunct _ _
  131 +10 po po prep prep acc 9 adjunct _ _
  132 +11 prostu prosty adjp adjp _ 10 mwe _ _
  133 +12 ! ! interp interp _ 6 punct _ _
... ...
testy/skladnica-test1-Not_parsed.conll 0 → 100644
  1 +1 Cmentarz cmentarz subst subst sg|nom|m3 2 subj _ _
  2 +2 jest być fin fin sg|ter|imperf 0 pred _ _
  3 +3 taki taki adj adj sg|nom|m3|pos 4 adjunct _ _
  4 +4 pusty pusty adj adj sg|nom|m3|pos 2 pd _ _
  5 +5 ! ! interp interp _ 2 punct _ _
  6 +
  7 +1 Mówi mówić fin fin sg|ter|imperf 0 pred _ _
  8 +2 się się qub qub _ 1 refl _ _
  9 +3 przecież przecież qub qub _ 1 adjunct _ _
  10 +4 , , interp interp _ 7 punct _ _
  11 +5 że że comp comp _ 7 complm _ _
  12 +6 broń broń subst subst sg|nom|f 7 subj _ _
  13 +7 była być praet praet sg|f|imperf 1 comp_fin _ _
  14 +8 w w prep prep loc|nwok 7 adjunct _ _
  15 +9 szkole szkoła subst subst sg|loc|f 8 comp _ _
  16 +10 schowana schować ppas ppas sg|nom|f|perf|aff 7 pd _ _
  17 +11 jeszcze jeszcze qub qub _ 12 adjunct _ _
  18 +12 latem lato subst subst sg|inst|n 7 adjunct _ _
  19 +13 w w prep prep loc|nwok 12 adjunct _ _
  20 +14 czasie czas subst subst sg|loc|m3 13 mwe _ _
  21 +15 remontu remont subst subst sg|gen|m3 14 comp _ _
  22 +16 . . interp interp _ 1 punct _ _
  23 +
  24 +1 Bo bo comp comp _ 9 adjunct _ _
  25 +2 jak jak adv adv _ 9 adjunct _ _
  26 +3 ona on ppron3 ppron3 sg|nom|f|ter|akc|npraep 9 subj _ _
  27 +4 , , interp interp _ 3 punct _ _
  28 +5 chora chory adj adj sg|nom|f|pos 3 adjunct _ _
  29 +6 na na prep prep acc 5 adjunct _ _
  30 +7 cukrzycę cukrzyca subst subst sg|acc|f 6 comp _ _
  31 +8 , , interp interp _ 3 punct _ _
  32 +9 przeżyła przeżyć praet praet sg|f|perf 0 pred _ _
  33 +10 trzy trzy num num pl|acc|m3|congr 9 obj _ _
  34 +11 dni dzień subst subst pl|acc|m3 10 comp _ _
  35 +12 bez bez prep prep gen|nwok 9 comp _ _
  36 +13 wody woda subst subst sg|gen|f 14 conjunct _ _
  37 +14 i i conj conj _ 12 comp _ _
  38 +15 jedzenia jedzenie subst subst sg|gen|n 14 conjunct _ _
  39 +16 ? ? interp interp _ 9 punct _ _
  40 +
  41 +1 Jednak jednak qub qub _ 9 adjunct _ _
  42 +2 już już qub qub _ 3 adjunct _ _
  43 +3 wkrótce wkrótce adv adv _ 9 adjunct _ _
  44 +4 Nizioł Nizioł subst subst sg|nom|m1 5 conjunct _ _
  45 +5 i i conj conj _ 9 subj _ _
  46 +6 Wapiński Wapiński subst subst sg|nom|m1 5 conjunct _ _
  47 +7 ze z prep prep inst|wok 9 adjunct _ _
  48 +8 zdumieniem zdumienie subst subst sg|inst|n 7 comp _ _
  49 +9 odkryli odkryć praet praet pl|m1|perf 0 pred _ _
  50 +10 , , interp interp _ 14 punct _ _
  51 +11 że że comp comp _ 14 complm _ _
  52 +12 Łapiński Łapiński subst subst sg|nom|m1 14 subj _ _
  53 +13 nie nie qub qub _ 14 neg _ _
  54 +14 dotrzymuje dotrzymywać fin fin sg|ter|imperf 9 comp_fin _ _
  55 +15 wcześniej wcześnie adv adv com 16 adjunct _ _
  56 +16 danego dać ppas ppas sg|gen|n|perf|aff 17 adjunct _ _
  57 +17 słowa słowo subst subst sg|gen|n 14 obj _ _
  58 +18 . . interp interp _ 9 punct _ _
  59 +
  60 +1 A a qub qub _ 8 adjunct _ _
  61 +2 pan pan subst subst sg|nom|m1 8 subj _ _
  62 +3 nigdy nigdy adv adv _ 8 adjunct _ _
  63 +4 się się qub qub _ 8 refl _ _
  64 +5 z z prep prep inst|nwok 8 comp _ _
  65 +6 nimi on ppron3 ppron3 pl|inst|m1|ter|akc|praep 5 comp _ _
  66 +7 nie nie qub qub _ 8 neg _ _
  67 +8 zetknął zetknąć praet praet sg|m1|perf 0 pred _ _
  68 +9 ? ? interp interp _ 8 punct _ _
  69 +
  70 +1 Załapać załapać inf inf perf 3 comp_inf _ _
  71 +2 się się qub qub _ 1 refl _ _
  72 +3 trzeba trzeba pred pred _ 0 pred _ _
  73 +4 teraz teraz adv adv _ 3 adjunct _ _
  74 +5 , , interp interp _ 3 punct _ _
  75 +6 bo bo comp comp _ 3 adjunct _ _
  76 +7 potem potem adv adv _ 8 adjunct _ _
  77 +8 będzie być bedzie bedzie sg|ter|imperf 6 comp_fin _ _
  78 +9 trudniej trudno adv adv com 8 pd _ _
  79 +10 . . interp interp _ 3 punct _ _
  80 +
  81 +1 Medykamenty medykament subst subst pl|nom|m3 4 subj _ _
  82 +2 współczesne współczesny adj adj pl|nom|m3|pos 1 adjunct _ _
  83 +3 dostępne dostępny adj adj pl|nom|m3|pos 4 pd _ _
  84 +4 są być fin fin pl|ter|imperf 0 pred _ _
  85 +5 na na prep prep loc 4 adjunct _ _
  86 +6 czarnym czarny adj adj sg|loc|m3|pos 7 adjunct _ _
  87 +7 rynku rynek subst subst sg|loc|m3 5 comp _ _
  88 +8 . . interp interp _ 4 punct _ _
  89 +
  90 +1 To to subst subst sg|nom|n 3 subj _ _
  91 +2 samo sam adj adj sg|nom|n|pos 1 adjunct _ _
  92 +3 dotyczy dotyczyć fin fin sg|ter|imperf 5 conjunct _ _
  93 +4 leczenia leczenie subst subst sg|gen|n 3 obj_th _ _
  94 +5 , , interp interp _ 0 coord_punct _ _
  95 +6 służba służba subst subst sg|nom|f 9 subj _ _
  96 +7 zdrowia zdrowie subst subst sg|gen|n 6 adjunct _ _
  97 +8 praktycznie praktycznie adv adv pos 9 adjunct _ _
  98 +9 przestała przestać praet praet sg|f|perf 5 conjunct _ _
  99 +10 istnieć istnieć inf inf imperf 9 comp_inf _ _
  100 +11 . . interp interp _ 5 punct _ _
  101 +
  102 +1 Zwykły zwykły adj adj sg|nom|m1|pos 2 adjunct _ _
  103 +2 mieszkaniec mieszkaniec subst subst sg|nom|m1 4 subj _ _
  104 +3 kraju kraj subst subst sg|gen|m3 2 adjunct _ _
  105 +4 ma mieć fin fin sg|ter|imperf 0 pred _ _
  106 +5 się się qub qub _ 6 refl _ _
  107 +6 leczyć leczyć inf inf imperf 4 comp_inf _ _
  108 +7 ziołami ziele subst subst pl|inst|n 6 obj_th _ _
  109 +8 , , interp interp _ 10 punct _ _
  110 +9 które który adj adj pl|acc|n|pos 10 obj _ _
  111 +10 zaleca zalecać fin fin sg|ter|imperf 7 adjunct _ _
  112 +11 tradycyjna tradycyjny adj adj sg|nom|f|pos 12 adjunct _ _
  113 +12 medycyna medycyna subst subst sg|nom|f 10 subj _ _
  114 +13 koreańska koreański adj adj sg|nom|f|pos 12 adjunct _ _
  115 +14 . . interp interp _ 4 punct _ _
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function
814 814 | (Sign "?") :: (Sign "?") :: l ->
815 815 create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true
816 816 (* | (Sign "?") :: (Sign ".") :: l -> *)
  817 + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l ->
  818 + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true
817 819 | (Sign "!") :: (Sign "?") :: l ->
818 820 create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true
819 821 | (Sign "?") :: (Sign "…") :: l ->
... ...