diff --git a/.gitignore b/.gitignore index 4063cbc..1785ff3 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ *.aux *.log *.tex.backup +tools/mate-tools/dist/* diff --git a/corpora/CONLL.ml b/corpora/CONLL.ml index ca10b29..6812181 100644 --- a/corpora/CONLL.ml +++ b/corpora/CONLL.ml @@ -220,13 +220,14 @@ let match_corpus corpus = (******************) +exception Comment_line exception Empty_line exception Empty_sentence exception Id_line of string let load_token in_channel = let fail line = - (* failwith ("load_token: " ^ line) *) + print_endline ("load_token: " ^ line); () in let int_of_super = function "_" -> -1 @@ -247,7 +248,8 @@ let load_token in_channel = else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in raise (Id_line id) - else failwith ("load_token: " ^ line) + else raise Comment_line + (* failwith ("load_token: " ^ line) *) else match Xstring.split "\t" line with [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] -> @@ -272,6 +274,7 @@ let load_sentence in_channel = if id_a <> conll_id then failwith "load_sentence: different ids" else pom ((id_a,super,label) :: rev_paths) id with Id_line new_id -> pom rev_paths new_id + | Comment_line -> pom rev_paths id | Empty_line -> rev_paths, id | End_of_file -> if rev_paths = [] then raise End_of_file diff --git a/corpora/CONLL_adapter.ml b/corpora/CONLL_adapter.ml index 747831d..949c933 100644 --- a/corpora/CONLL_adapter.ml +++ b/corpora/CONLL_adapter.ml @@ -42,6 +42,34 @@ let if_interps interps token = ) interp in Xlist.fold interps true (fun acc (nr,value) -> acc && (if_interp nr value)) +let change_dep paths i (id,super,label) = + let id_S, super_S, label_S = paths.(super) in + paths.(i) <- (id,super_S,label); + paths.(super) <- (id_S, id, label_S) + +let correct_injection paths tokens = Array.iteri (fun i (id,super,label) -> + if label = "punct" then (*musi być pierwszym tokenem o tym ojcu*) + let j = Int.fold (i+1) (Array.length paths - 1) 0 (fun acc n -> + let i2,s2,l2 = paths.(n) in + if super = s2 + then if l2 = "punct" + then n + else 0 + else acc + ) in + let k = Int.fold_down (i-1) 1 i (fun acc n -> + let i2,s2,l2 = paths.(n) in + if super = s2 + then 0 + else acc + ) in + if k == i && j <> 0 && i < super && super < j + then + (paths.(i) <- (0,-1,""); + paths.(j) <- (0,-1,"")) + ) paths; + paths + let correct_coordination1 paths tokens = let paths_ls = List.mapi (fun i (id,super,label) -> (i,id,super,label)) (Array.to_list paths) in @@ -136,15 +164,15 @@ let correct_coordination2 paths tokens = let paths_ls () = List.mapi (fun i (id,super,label) -> (i,id,super,label)) (Array.to_list paths_c) in - (* let ps a sons = + let ps a sons = print_endline a; List.iter (fun (i,_,_,_) -> print_endline (ExtArray.get tokens i).orth) sons; - print_endline "" in *) + print_endline "" in let rec correct_rec (i,id,super,label) sons = let left_s, right_s = List.partition (fun (a,b,c,d) -> a < i) sons in - (* ps "left:" (List.rev left_s); - ps "right:" right_s; *) + ps "left:" (List.rev left_s); + ps "right:" right_s; find_father i (List.rev left_s); find_father i right_s @@ -154,23 +182,35 @@ let correct_coordination2 paths tokens = paths_c.(i) <- (id,i0,label); if not (if_cat ["conj"] (ExtArray.get tokens i).token || (ExtArray.get tokens i).orth = ",") - then failwith "find_father"; + then failwith "find_father1"; correct_rec (i,id,super,label) (if a < i then (a,b,c,d) :: t else List.rev @@ (a,b,c,d) :: t) - | _ -> failwith "find_father" in + | [] -> failwith "find_father2" in let check_previous_for_interp i = if i >= 0 && (ExtArray.get tokens i).orth = "," && not (List.exists (fun (_,super,_) -> super = i) (Array.to_list paths_c)) then paths_c.(i) <- (0,-1,"") in + let filter_comp_construction sons = + let rec pom acc = function + (i1,id1,super1,label1) :: (i2,id2,super2,label2) :: t -> + if if_cat ["interp"] (ExtArray.get tokens i1).token && + if_cat ["comp"] (ExtArray.get tokens i2).token + then pom acc t + else pom ((i1,id1,super1,label1) :: acc) ((i2,id2,super2,label2) :: t) + | h :: t -> pom (h :: acc) t + | [] -> List.rev acc in + pom [] sons in + Array.iteri (fun i (id,super,label) -> if if_cat ["conj"] (ExtArray.get tokens i).token || (ExtArray.get tokens i).orth = "," then (check_previous_for_interp (i-1); let sons = List.filter (fun (_,_,super,_) -> super = i) (paths_ls ()) in + (* let sons = filter_comp_construction sons in *) if (List.length sons > 2) then correct_rec (i,id,super,label) sons)) paths_c; paths_c @@ -206,15 +246,16 @@ done; *) let brev i id super label = let if_the_last_dot () = - let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> - s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in - Array.fold_left (fun acc (i2,s,l) -> - acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths in + try + let (id_dot, s_dot, l_dot) = List.find (fun (i2,s,l) -> + s = i && ((ExtArray.get tokens i2).orth = "." || (ExtArray.get tokens i2).orth = "...")) (Array.to_list paths) in + Array.fold_left (fun acc (i2,s,l) -> + acc && (ExtArray.get tokens i2).beg <= (ExtArray.get tokens id_dot).beg) true paths + with Not_found -> true in let dot = if if_interps [0,"npun"] (ExtArray.get tokens id).token || if_the_last_dot () then "" else "." in - let n_orth = (ExtArray.get tokens id).orth ^ dot in paths.(i) <- (find_token n_orth,super,label) in @@ -317,6 +358,16 @@ let correct_interp_with_father_0 paths tokens = then paths.(i1) <- (id1,0,label1)) paths) paths; paths +let corect_complm paths tokens = + Array.iteri (fun i (id,super,label) -> + if label = "complm" && super > 0 + then + let i2,s2,l2 = paths.(super) in + if if_cat ["conj"] (ExtArray.get tokens i2).token + then change_dep paths i (id,super,label) + ) paths; + paths + let remove_interps interp paths tokens = let paths_ls = Array.to_list paths in Array.iteri (fun i (id,super,label) -> @@ -339,10 +390,6 @@ let correct_passive_voice paths tokens = paths let swap_dep paths tokens = - let change_dep i (id,super,label) = - let id_S, super_S, label_S = paths.(super) in - paths.(i) <- (id,super_S,label); - paths.(super) <- (id_S, id, label_S) in let rec correct_dep i (id,super,label) = let adv_relators = ["kto";"co";"ile";"czyj";"jaki";"który"; "jak";"skąd";"dokąd";"gdzie";"którędy";"kiedy";"odkąd";"dlaczego";"czemu";"gdy"] in @@ -356,7 +403,7 @@ let swap_dep paths tokens = (if_lemma adv_relators (ExtArray.get tokens id).token && if_cat ["fin"; "praet"; "winien"; "pred"; "imps"; "ppas"; "subst"] (ExtArray.get tokens super).token) then - change_dep i (id,super,label); + change_dep paths i (id,super,label); if (if_lemma adv_relators (ExtArray.get tokens id).token && if_cat ["subst"; "pred"] (ExtArray.get tokens super).token) then correct_dep i paths.(i) in @@ -367,7 +414,11 @@ let swap_dep paths tokens = nieobsługiwana na razie koordynacja strony biernej - zarówno czasowniki posiłkowe, jak i imiesłowy nieobsługiwana na razie koordynacja podrzędników spójników podrzędnych *) -let convert_dep_tree id first_try paths tokens = +let convert_dep_tree path first_try paths tokens = + File.file_out (path ^ "/pre_text_unmodified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); let paths = Array.copy paths in let paths = if first_try @@ -375,16 +426,27 @@ let convert_dep_tree id first_try paths tokens = let pom = replace_tokens paths tokens in let pom = (remove_interps ".") pom tokens in let pom = replace_hyphens pom tokens in + let pom = correct_injection pom tokens in let pom = correct_coordination1 pom tokens in let pom = correct_interp_with_father_0 pom tokens in - let pom = correct_coordination2 pom tokens in - let pom = remove_interps "," pom tokens in + (* File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); *) + let pom = try corect_complm pom tokens with | e -> print_endline (Printexc.to_string e); pom in + let pom = try + let pom2 = correct_coordination2 pom tokens in + remove_interps "," pom2 tokens + with + | _ -> (let pom2 = remove_interps "," pom tokens in + correct_coordination2 pom2 tokens) in let pom = correct_passive_voice pom tokens in praet_qub_aglt pom tokens else - swap_dep paths tokens in - (* File.file_out ("results/" ^ id ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> - Printf.fprintf file "%s\n" Visualization.html_header; - Printf.fprintf file "%s\n" (Visualization.html_of_dep_sentence tokens paths); - Printf.fprintf file "%s\n" Visualization.html_trailer); *) + paths in + (* swap_dep paths tokens in *) + File.file_out (path ^ "/pre_text_modified_" ^ (string_of_bool first_try) ^ ".html") (fun file -> + Printf.fprintf file "%s\n" ENIAMvisualization.html_header; + Printf.fprintf file "%s\n" (ENIAMvisualization.html_of_dep_sentence tokens paths); + Printf.fprintf file "%s\n" ENIAMvisualization.html_trailer); paths diff --git a/corpora/makefile b/corpora/makefile index cbab48d..eca683d 100755 --- a/corpora/makefile +++ b/corpora/makefile @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt OCAMLDEP=ocamldep INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam OCAMLFLAGS=$(INCLUDES) -g -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa eniam-exec.cmxa INSTALLDIR=`ocamlc -where`/eniam SOURCES= types.ml CONLL.ml CONLL_adapter.ml resources.ml conllParser.ml interpsInCorpus.ml generate.ml diff --git a/corpora/test_conll.ml b/corpora/test_conll.ml index f0439d9..19c5be9 100644 --- a/corpora/test_conll.ml +++ b/corpora/test_conll.ml @@ -48,7 +48,7 @@ let clarify_categories senses token = | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[]) | _ -> [] -let create_chart tokens lex_sems paths last = +(* let create_chart tokens lex_sems paths last = ENIAM_LCGrenderer.reset_variable_numbers (); let chart = ENIAM_LCGchart.make last in let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) -> @@ -59,7 +59,7 @@ let create_chart tokens lex_sems paths last = let cats = clarify_categories ["X"] t in let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in - chart + chart *) let rec split_sons left id right = function [] -> List.rev (List.sort compare left), List.sort compare right @@ -85,7 +85,7 @@ let create_dep_chart tokens lex_sems paths = ENIAM_LCGrenderer.reset_variable_names (); ENIAM_LCGrenderer.add_variable_numbers (); let cats = clarify_categories ["X"] t in - let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in IntMap.add nodes i l) in (* print_endline "create_dep_chart 3"; *) let x = dep_create_rec nodes sons 0 in @@ -93,7 +93,7 @@ let create_dep_chart tokens lex_sems paths = x -let test_example path id tokens lex_sems paths last = +(* let test_example path id tokens lex_sems paths last = ENIAM_LCGreductions.reset_variant_label (); let chart = create_chart tokens lex_sems paths last in ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart; @@ -119,43 +119,45 @@ let test_example path id tokens lex_sems paths last = ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; ()) else print_endline "not reduced") - else print_endline "not parsed" + else print_endline "not parsed" *) -let test_dep_example path id tokens lex_sems paths = +let rec test_dep_example path id tokens lex_sems first_try paths = + (* print_endline "test_dep_example 1"; *) + let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in try - ENIAM_LCGreductions.reset_variant_label (); - print_endline "test_dep_example 1"; - let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in - print_endline "test_dep_example 2"; - (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) - let chart = create_dep_chart tokens lex_sems paths in - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) - let chart,references = ENIAM_LCGchart.dep_lazify chart in - (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *) - (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *) - let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) - (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) - (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *) - if ENIAM_LCGchart.is_dep_parsed chart then ( - let term = ENIAM_LCGchart.get_dep_parsed_term chart in - (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> - Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); - Xlatex.latex_compile_and_clean path (id^"4_term"); *) - let dependency_tree = ENIAM_LCGreductions.reduce term references in - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *) - if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( - ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *) - ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) - (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *) - (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *) - (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *) - ()) - else print_endline "not reduced") - else print_endline "not parsed" + ENIAM_LCGreductions.reset_variant_label (); + (* print_endline "test_dep_example 2"; *) + (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) + let chart = create_dep_chart tokens lex_sems paths in + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) + let chart,references = ENIAM_LCGchart.dep_lazify chart in + (* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *) + (* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *) + let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *) + (* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *) + (* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *) + if ENIAM_LCGchart.is_dep_parsed chart then ( + let term = ENIAM_LCGchart.get_dep_parsed_term chart in + (* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file -> + Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term)); + Xlatex.latex_compile_and_clean path (id^"4_term"); *) + let dependency_tree = ENIAM_LCGreductions.reduce term references in + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *) + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then ( + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *) + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *) + (* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *) + (* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *) + (* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *) + ()) + else print_endline "not reduced") + else print_endline "not parsed" with NotDepParsed(id_ndp,left,l,right) -> ( - print_endline "not parsed 2"; - ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right)) + if (first_try) + then test_dep_example path id tokens lex_sems false paths + else (print_endline "not parsed 2"; + ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right))) let rec parse_sentence name id tokens lex_sems = function RawSentence s -> id @@ -163,7 +165,7 @@ let rec parse_sentence name id tokens lex_sems = function (* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *) id + 1 | DepSentence(paths) -> - test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths; + test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems true paths; id + 1 | QuotedSentences sentences -> Xlist.fold sentences id (fun id p -> @@ -212,8 +214,8 @@ let process_id s = else failwith ("process_id: " ^ s) let process_conll_corpus filename = - let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in - print_endline "process_conll_corpus"; + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in + (* print_endline "process_conll_corpus 1"; *) (* let corpus = [List.hd corpus] in *) Xlist.iter corpus (fun query -> try let id = process_id (get_query_id query) in @@ -226,13 +228,17 @@ let process_conll_corpus filename = (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in + (* print_endline "process_conll_corpus 2"; *) let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in + (* print_endline "process_conll_corpus 3"; *) let sentences = match text with AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences | _ -> failwith "process_conll_corpus 1" in let text = AltText[Raw,RawText query; Struct, StructText([ AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in + (* print_endline "process_conll_corpus 4"; *) let lex_sems = ENIAMlexSemantics.assign tokens text in + (* print_endline "process_conll_corpus 5"; *) ignore(parse_text id 1 tokens lex_sems text) | _ -> failwith "process_conll_corpus 2" with @@ -241,6 +247,7 @@ let process_conll_corpus filename = let _ = Printexc.record_backtrace true; + ENIAMlexSemantics.initialize (); (* LCGfields.reset (); *) (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) (* process_conll_corpus "../testy/skladnica-test1.conll"; *) diff --git a/exec/ENIAMexec.ml b/exec/ENIAMexec.ml index eb0ec47..8393f9e 100644 --- a/exec/ENIAMexec.ml +++ b/exec/ENIAMexec.ml @@ -85,6 +85,37 @@ let create_chart rules tokens lex_sems paths last = ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in chart +let rec split_sons left id right = function + [] -> List.rev (List.sort compare left), List.sort compare right + | x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l + +let rec dep_create_rec nodes sons conll_id = + let node = IntMap.find nodes conll_id in + let l = try IntMap.find sons conll_id with Not_found -> [] in + let left,right = split_sons [] conll_id [] l in + (* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *) + DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons)) + +let create_dep_chart dep_rules tokens lex_sems paths = + (* print_endline "create_dep_chart 1"; *) + let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i -> + let _,super,_ = paths.(i) in + IntMap.add_inc sons super [i] (fun l -> i :: l)) in + (* print_endline "create_dep_chart 2"; *) + let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i -> + let id,_,_ = paths.(i) in + let t = ExtArray.get tokens id in + let s = ExtArray.get lex_sems id in + ENIAM_LCGrenderer.reset_variable_names (); + ENIAM_LCGrenderer.add_variable_numbers (); + let cats = clarify_categories ["X"] t in + let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in + IntMap.add nodes i l) in + (* print_endline "create_dep_chart 3"; *) + let x = dep_create_rec nodes sons 0 in + (* print_endline "create_dep_chart 4"; *) + x + let create_text_fragments tokens paths last = let text_fragments = Array.make last IntMap.empty in Xlist.iter paths (fun (id,lnode,rnode) -> @@ -156,85 +187,75 @@ let eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last = with e -> let time2 = time_fun () in {result with status=LexiconError; msg=string_of_exn e; lex_time=time2 -. time1} -(* -let rec conll_parse_sentence timeout test_only_flag id first_try paths tokens lex_sems = - let result = empty_conll_parse_result in - let time2 = time_fun () in - (* let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in *) + +let rec conll_parse_sentence timeout verbosity dep_rules first_try tokens lex_sems paths = + ENIAM_LCGreductions.reset_variant_label (); + let result = {empty_conll_parse_result with paths_size = Xlist.size paths} in + let result = if verbosity = 0 then result else result(*{result with text_fragments=create_dep_text_fragments tokens paths last}*) in (* FIXME *) + let time1 = time_fun () in try - let dep_chart = LCGlexicon.dep_create paths tokens lex_sems in - let dep_chart,references = LCGchart.dep_lazify dep_chart in - let result = if test_only_flag then result else {result with dep_chart=dep_chart} in - let time3 = time_fun () in - let result = {result with lex_time=time3 -. time2} in + let paths = CONLL_adapter.convert_dep_tree id first_try paths tokens lex_sems in + let chart = create_chart dep_rules tokens lex_sems paths in + let result = if verbosity = 0 then result else {result with chart1=chart} in + let chart,references = ENIAM_LCGchart.dep_lazify chart in + let result = if verbosity = 0 then result else {result with chart2=chart; references2=ExtArray.copy references} in + let time2 = time_fun () in + let result = {result with lex_time=time2 -. time1} in try - (* print_endline "conll_parse_sentence 1"; *) - (* LCGlatexOf.print_references "results/" "references1" references; *) - let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) - (* print_endline "conll_parse_sentence 2"; *) - (* LCGlatexOf.print_references "results/" "references2" references; *) - let time4 = time_fun () in - let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in - let result = {result with parse_time=time4 -. time3} in - if LCGchart.is_dep_parsed parsed_dep_chart then + let chart = ENIAM_LCGchart.dep_parse chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *) + let time3 = time_fun () in + let result = if verbosity = 0 then result else {result with parsed_dep_chart=chart; references3=references} in + let result = {result with parse_time=time3 -. time2; chart_size=ENIAM_LCGchart.get_no_entries chart} in + if ENIAM_LCGchart.is_dep_parsed chart then try - let term = LCGchart.get_dep_parsed_term tokens lex_sems parsed_dep_chart in - (* LCGlatexOf.print_dependency_tree "dep_dependency_tree1" dependency_tree; *) - let dependency_tree = LCGreductions.reduce term references in - let time5 = time_fun () in - let result = if test_only_flag then result else {result with dependency_tree=dependency_tree} in - let result = {result with reduction_time=time5 -. time4; dependency_tree_size=Array.length dependency_tree} in - if LCGreductions.is_reduced_dependency_tree dependency_tree then + let term = ENIAM_LCGchart.get_dep_parsed_term chart in + let result = if verbosity = 0 then result else {result with term4=term} in + let dependency_tree = ENIAM_LCGreductions.reduce term references in + let time4 = time_fun () in + let result = if verbosity = 0 then result else {result with dependency_tree4=Array.copy dependency_tree} in + let result = {result with reduction_time=time4 -. time3; dependency_tree_size=Array.length dependency_tree} in + if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then try - (* print_endline "conll_parse_sentence 3"; *) - LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) - (* print_endline "conll_parse_sentence 4"; *) - LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) -(* if Array.length dependency_tree < 10000 then print_xml_dependency_tree "results/trees/" id dependency_tree; *) - (* print_endline "conll_parse_sentence 5"; *) + ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) + let result = if verbosity = 0 then result else {result with dependency_tree5=Array.copy dependency_tree} in + ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w result *) + let result = (*if verbosity = 0 then result else*) {result with dependency_tree6=dependency_tree} in let time6 = time_fun () in - {result with status=Parsed; sem_time=time6 -. time5} + {result with status=Parsed; sem_time=time6 -. time4} with e -> let time6 = time_fun () in - {result with status=SemError; msg=string_of_exn e; sem_time=time6 -. time5} + {result with status=SemError1; msg=string_of_exn e; sem_time=time6 -. time4} else {result with status=NotReduced} with | SemTooBig -> - let time5 = time_fun () in - {result with status=TooManyNodes; reduction_time=time5 -. time4} + let time4 = time_fun () in + {result with status=TooManyNodes; reduction_time=time4 -. time3} | e -> - let time5 = time_fun () in - {result with status=ReductionError; msg=string_of_exn e; reduction_time=time5 -. time4} + let time4 = time_fun () in + {result with status=ReductionError; msg=string_of_exn e; reduction_time=time4 -. time3} else if first_try - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths else {result with status=NotParsed} with Timeout t -> - let time4 = time_fun () in - {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3} + let time3 = time_fun () in + {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time3 -. time2} | NotDepParsed(id_ndp,left,l,right) -> if first_try - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths else let time4 = time_fun () in {result with status=NotParsed; not_parsed_dep_chart=(id_ndp,left,l,right); parse_time=time4 -. time3} | e -> - let time4 = time_fun () in - {result with status=ParseError; msg=string_of_exn e; parse_time=time4 -. time3} - with e -> (*print_endline (string_of_exn e);*) - let time3 = time_fun () in + let time3 = time_fun () in + {result with status=ParseError; msg=string_of_exn e; parse_time=time3 -. time2} + with e -> + let time2 = time_fun () in if first_try - then conll_parse_sentence timeout test_only_flag id false paths tokens lex_sems + then conll_parse_sentence timeout verbosity dep_rules false tokens lex_sems paths else {result with status=LexiconError; msg=string_of_exn e; lex_time=time3 -. time2} - -let mate_in, mate_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*) - if Paths.config.Paths.mate_parser_enabled then - Unix.open_process ("java -jar " ^ Paths.config.Paths.mate_parser_path ^ "dist/anna-3.5.jar -model " ^ - Paths.config.Paths.mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test") - else stdin, stdout - -let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*) +(*let swigra_in, swigra_out = (*Unix.open_process "../swigra/parser/run.sh"*) if Paths.config.Paths.swigra_enabled then Unix.open_process (Paths.config.Paths.swigra_path ^ "run.sh") else stdin, stdout @@ -256,38 +277,21 @@ let parse timeout verbosity rules (*name id*) tokens lex_sems = let result = eniam_parse_sentence timeout verbosity rules tokens lex_sems paths last in ENIAMSentence result | _ -> failwith "parse 3") - | DepSentence(paths) -> + | DepSentence paths -> (match mode with -(* CONLL -> - let result = conll_parse_sentence timeout verbosity id true paths tokens lex_sems in - let result = {result with + CONLL | Mate -> + let result = conll_parse_sentence timeout verbosity dep_rules true tokens lex_sems paths in + (* let result = {result with file_prefix = file_prefix_of_mode mode ^ file_prefix; - paths = paths} in + paths = paths} in *) CONLLSentence result (* let xml = DepTree.conll_to_xml paths in let graph = XmlPrinter.graph_of_xml xml in (* FIXME: do poprawy *) Visualization.print_graph "results/" "term_conll" graph; let result = {empty_eniam_parse_result with status=Parsed; term=graph} in ENIAMSentence result, next_id *) - | Mate -> - if not Paths.config.Paths.mate_parser_enabled then DepSentence paths else ( - print_endline "parse_sentence 1"; - (* print_endline (Visualization.html_of_dep_sentence tokens paths); *) - let conll = ENIAM_CONLL.string_of_paths ENIAMsubsyntaxTypes.Mate tokens paths in - print_endline "parse_sentence 2"; - (* printf "|%s|\n" conll; *) - Printf.fprintf mate_out "%s%!" conll; - print_endline "parse_sentence 3"; - let new_paths = get_paths paths (ENIAM_CONLL.load_sentence mate_in) in - print_endline "parse_sentence 4"; - (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *) - let result = conll_parse_sentence timeout verbosity id true new_paths tokens lex_sems in - let result = {result with - file_prefix = file_prefix_of_mode mode ^ file_prefix; - paths=new_paths} in - CONLLSentence result)*) - | _ -> failwith "parse 2") - | _ -> failwith "parse 1") + | _ -> failwith "parse 2") + | _ -> failwith "parse 1") (* diff --git a/exec/ENIAMexecTypes.ml b/exec/ENIAMexecTypes.ml index 4fe50be..45844f1 100644 --- a/exec/ENIAMexecTypes.ml +++ b/exec/ENIAMexecTypes.ml @@ -49,9 +49,9 @@ type eniam_parse_result = { semantic_graph11: ENIAMsemTypes.linear_term; text_fragments: string IntMap.t array; } -(* + type conll_parse_result = { - file_prefix: string; +(* file_prefix: string;*) status: status; msg: string; lex_time: float; @@ -59,17 +59,29 @@ type conll_parse_result = { reduction_time: float; sem_time: float; paths_size: int; + chart_size: int; dependency_tree_size: int; - paths: (int * int * string) array; - dep_chart: LCGtypes.dep_tree; - parsed_dep_chart: (LCGtypes.SymbolMap.key * LCGtypes.linear_term) list; + chart1: dep_tree; + chart2: dep_tree; + references2: linear_term ExtArray.t; + parsed_dep_chart: (SymbolMap.key * linear_term) list; not_parsed_dep_chart: int * - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list * - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list * - (LCGtypes.grammar_symbol * LCGtypes.linear_term) list list; - dependency_tree: LCGtypes.linear_term array; + (grammar_symbol * linear_term) list list * + (grammar_symbol * linear_term) list * + (grammar_symbol * linear_term) list list; + references3: linear_term ExtArray.t; + term4: linear_term; + dependency_tree4: linear_term array; + dependency_tree5: linear_term array; + dependency_tree6: linear_term array; + dependency_tree7: linear_term array; + dependency_tree8: linear_term ExtArray.t; + dependency_tree9: linear_term array; + semantic_graph10: ENIAMsemTypes.linear_term array; + semantic_graph11: ENIAMsemTypes.linear_term; + text_fragments: string IntMap.t array; } - +(* type semantic_processing_result = { file_prefix: string; status: status; @@ -190,6 +202,35 @@ let empty_eniam_parse_result = { text_fragments=[| |]; } +let empty_conll_parse_result = { + (* file_prefix=""; *) + status=Idle; + msg=""; + lex_time=0.; + parse_time=0.; + reduction_time=0.; + sem_time=0.; + paths_size=0; + chart_size=0; + dependency_tree_size=0; + chart1=DepNode(-100,[],[],[]); + chart2=DepNode(-100,[],[],[]); + references2=ExtArray.make 0 Dot; + references3=ExtArray.make 0 Dot; + term4=Dot; + dependency_tree4=[| |]; + dependency_tree5=[| |]; + dependency_tree6=[| |]; + dependency_tree7=[| |]; + dependency_tree8=ExtArray.make 0 Dot; + dependency_tree9=[| |]; + semantic_graph10=[| |]; + semantic_graph11=ENIAMsemTypes.Dot; + text_fragments=[| |]; + parsed_dep_chart=[]; + not_parsed_dep_chart=(-100,[],[],[]); + } + (* let empty_result = { input_text=RawText ""; @@ -208,23 +249,6 @@ let empty_result = { lex_sems=ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem; } -let empty_conll_parse_result = { - file_prefix=""; - status=Idle; - msg=""; - lex_time=0.; - parse_time=0.; - reduction_time=0.; - sem_time=0.; - paths_size=0; - dependency_tree_size=0; - paths=[| |]; - dep_chart=DepNode(-100,[],[],[]); - parsed_dep_chart=[]; - not_parsed_dep_chart=(-100,[],[],[]); - dependency_tree=[| |]; - } - let empty_semantic_processing_result = { file_prefix=""; status=Idle; @@ -321,3 +345,5 @@ let rec fold_text mode s f = function | AltText l -> Xlist.fold l s (fun s (mode,text) -> fold_text mode s f text) + +let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic" diff --git a/exec/ENIAMvisualization.ml b/exec/ENIAMvisualization.ml index b81222c..ebccf94 100644 --- a/exec/ENIAMvisualization.ml +++ b/exec/ENIAMvisualization.ml @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ "</table>" -(* + let html_of_dep_sentence tokens paths = "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ "</table>" - +(* let html_of_tokens tokens = "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> @@ -1048,7 +1048,7 @@ let file_prefix_of_mode = function let rec html_of_sentence path file_prefix mode img verbosity tokens = function RawSentence s -> s | StructSentence(paths,last) -> html_of_struct_sentence tokens paths last - (* | DepSentence paths -> html_of_dep_sentence img verbosity tokens paths *) + | DepSentence paths -> html_of_dep_sentence tokens paths | ENIAMSentence result -> let file_prefix = file_prefix_of_mode mode ^ file_prefix in html_of_eniam_sentence path file_prefix img verbosity tokens result @@ -1062,7 +1062,7 @@ let rec html_of_sentence path file_prefix mode img verbosity tokens = function String.concat "\n" (Xlist.map l (fun (mode,sentence) -> sprintf "<tr><td>%s</td><td>%s</td></tr>" (string_of_mode mode) (html_of_sentence path file_prefix mode img verbosity tokens sentence))) ^ "</table>" - | _ -> failwith "html_of_sentence: ni" + (* | _ -> failwith "html_of_sentence: ni" *) let rec html_of_paragraph path mode img verbosity tokens = function RawParagraph s -> (*print_endline "RawParagraph";*) s diff --git a/exec/makefile b/exec/makefile index bdaf8df..38b45b6 100755 --- a/exec/makefile +++ b/exec/makefile @@ -19,6 +19,13 @@ install: all cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMvisualization.cmi $(INSTALLDIR) cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMvisualization.cmx $(INSTALLDIR) +install-local: all + mkdir -p $(INSTALLDIR) + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) + mkdir -p /usr/local/share/eniam/exec + cp resources/* /usr/local/share/eniam/exec eniam-exec.cma: $(SOURCES) ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ diff --git a/integration/ENIAMpreIntegration.ml b/integration/ENIAMpreIntegration.ml index 2f2dd02..8679c37 100644 --- a/integration/ENIAMpreIntegration.ml +++ b/integration/ENIAMpreIntegration.ml @@ -198,3 +198,9 @@ let rec parse_text mode tokens = function StructText(List.rev paragraphs) | AltText l -> AltText(Xlist.map l (fun (mode,text) -> mode, parse_text mode tokens text)) + +let catch_parse_text mode tokens text = + try + parse_text mode tokens text,"" + with e -> + text, Printexc.to_string e diff --git a/lexSemantics/.gitignore b/lexSemantics/.gitignore index 6a4b4e5..6d2ac49 100644 --- a/lexSemantics/.gitignore +++ b/lexSemantics/.gitignore @@ -1,2 +1,3 @@ test lexSemantics +inttest diff --git a/lexSemantics/ENIAMwalParser.ml b/lexSemantics/ENIAMwalParser.ml index 2b30aa0..1b3e0a2 100644 --- a/lexSemantics/ENIAMwalParser.ml +++ b/lexSemantics/ENIAMwalParser.ml @@ -73,14 +73,6 @@ let split_text schema = | Str.Delim "'" -> Quot | _ -> failwith "parse_text")) -let rec split_symbol symb rev = function - [] -> [List.rev rev](*failwith "split_symbol"*) - | s :: l -> - if s = symb then - if l = [] then (*[List.rev rev]*)failwith "split_symbol" - else (List.rev rev) :: (split_symbol symb [] l) - else split_symbol symb (s :: rev) l - let rec string_of_token = function Text s -> s | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" @@ -101,6 +93,14 @@ let rec string_of_token = function let string_of_token_list l = String.concat "" (Xlist.map l string_of_token) +let rec split_symbol symb rev = function + [] -> [List.rev rev](*failwith "split_symbol"*) + | s :: l -> + if s = symb then + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb) + else (List.rev rev) :: (split_symbol symb [] l) + else split_symbol symb (s :: rev) l + let parse_case = function [Text "nom"] -> Case "nom" | [Text "gen"] -> Case "gen" diff --git a/lexSemantics/interface.ml b/lexSemantics/interface.ml index 6d9923e..a9f8ead 100644 --- a/lexSemantics/interface.ml +++ b/lexSemantics/interface.ml @@ -23,6 +23,7 @@ let output = ref Text let comm_stdio = ref true (* let sentence_split = ref true *) let port = ref 5439 +let perform_integration = ref false let spec_list = [ (* "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)"; @@ -33,6 +34,13 @@ let spec_list = [ "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML"; "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure"; "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML"; + "--dep_parser", Arg.Unit (fun () -> + ENIAMpreIntegration.concraft_enabled := true; + ENIAMpreIntegration.mate_parser_enabled := true; + perform_integration := true), "Enable dependency parser"; + "--no_dep_parser", Arg.Unit (fun () -> + ENIAMpreIntegration.concraft_enabled := false; + ENIAMpreIntegration.mate_parser_enabled := false), "Disable dependency parser (default)"; (* "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off"; *) (* "-r", Arg.String (fun p -> ENIAMtokenizerTypes.set_resource_path p; @@ -65,6 +73,9 @@ let rec main_loop in_chan out_chan = print_endline text; print_endline "input text end"; *) let text,tokens,msg = ENIAMsubsyntax.catch_parse_text text in + let text,msg = + if msg <> "" || not !perform_integration then text,msg else + ENIAMpreIntegration.catch_parse_text ENIAMsubsyntaxTypes.Struct tokens text in let lex_sems,msg = if msg <> "" then ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem, msg else ENIAMlexSemantics.catch_assign tokens text in @@ -84,6 +95,7 @@ let _ = prerr_endline message; Arg.parse spec_list anon_fun usage_msg; ENIAMlexSemantics.initialize (); + ENIAMpreIntegration.initialize (); Gc.compact (); prerr_endline "Ready!"; if !comm_stdio then main_loop stdin stdout diff --git a/lexSemantics/makefile b/lexSemantics/makefile index b4afd18..af2eb69 100644 --- a/lexSemantics/makefile +++ b/lexSemantics/makefile @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt OCAMLDEP=ocamldep INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam OCAMLFLAGS=$(INCLUDES) -g -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa eniam-integration.cmxa eniam-lcg-parser.cmxa eniam-lcg-lexicon.cmxa eniam-lexSemantics.cmxa INSTALLDIR=`ocamlc -where`/eniam SOURCES= entries.ml ENIAMwalTypes.ml ENIAMwalStringOf.ml ENIAMwalParser.ml ENIAMwalReduce.ml ENIAMlexSemanticsTypes.ml ENIAMlexSemanticsData.ml ENIAMvalence.ml ENIAMwalRenderer.ml ENIAMadjuncts.ml \ @@ -40,6 +40,9 @@ eniam-lexSemantics.cmxa: $(SOURCES) test: test.ml $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ +inttest: inttest.ml + $(OCAMLOPT) -o inttest $(OCAMLOPTFLAGS) $^ + interface: interface.ml $(OCAMLOPT) -o lexSemantics $(OCAMLOPTFLAGS) interface.ml @@ -65,4 +68,4 @@ interface: interface.ml $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< clean: - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test inttest diff --git a/semantics/ENIAMsemLexicon.ml b/semantics/ENIAMsemLexicon.ml index e65c434..328be91 100644 --- a/semantics/ENIAMsemLexicon.ml +++ b/semantics/ENIAMsemLexicon.ml @@ -47,7 +47,7 @@ let parse_multi p = function let parse_morf p = function [T "1"] -> {p with is_necessary=Opt} | tokens -> - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function [T s] -> Atom s | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in {p with morfs=LCG (Tensor l) :: p.morfs} @@ -57,7 +57,7 @@ let parse_arg tokens p = let tokens,p = parse_dir p tokens in let tokens,p = parse_multi p tokens in match Lexer.find_brackets ["(",")"] [] tokens with - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf | tokens -> parse_morf p tokens @@ -75,7 +75,7 @@ let parse_entry = function [T symbol; T ":"; T "null"] -> symbol,[] | T symbol :: T ":" :: tokens -> (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) - let tokens = Lexer.split_symbol (T ":") [] tokens in + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in let tokens = manage_tokens tokens in let positions = Xlist.map tokens (fun (arg,role) -> parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in @@ -91,7 +91,7 @@ let load_lexicon filename = | T "\t" -> tokens | T "\r" -> tokens | t -> t :: tokens)) in - let entries = Lexer.split_symbol (T ";") [] tokens in + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in Xlist.fold entries StringMap.empty (fun map entry -> let symbol,args = parse_entry entry in StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) diff --git a/testy/skladnica-test1-Failure.conll b/testy/skladnica-test1-Failure.conll index 91faeaf..1ccebc1 100644 --- a/testy/skladnica-test1-Failure.conll +++ b/testy/skladnica-test1-Failure.conll @@ -1,8 +1,3 @@ -1 - - interp interp _ 3 punct _ _ -2 Panowie pan subst subst pl|nom|m1 3 subj _ _ -3 przyszli przyjść praet praet pl|m1|perf 0 pred _ _ -4 . . interp interp _ 3 punct _ _ - 1 O o prep prep loc 12 comp _ _ 2 klasztornym klasztorny adj adj sg|loc|n|pos 3 adjunct _ _ 3 piekle piekło subst subst sg|loc|n 1 comp _ _ @@ -21,84 +16,118 @@ 16 br bieżący_rok brev brev pun 15 ne _ _ 17 . . interp interp _ 12 punct _ _ -1 Następnie następnie adv adv _ 2 adjunct _ _ -2 rozłożyła rozłożyć praet praet sg|f|perf 10 conjunct _ _ -3 wysoki wysoki adj adj sg|acc|m3|pos 4 adjunct _ _ -4 statyw statyw subst subst sg|acc|m3 2 obj _ _ -5 , , interp interp _ 10 coord_punct _ _ -6 zawiesiła zawiesić praet praet sg|f|perf 10 conjunct _ _ -7 na na prep prep loc 6 adjunct _ _ -8 nim on ppron3 ppron3 sg|loc|m3|ter|akc|praep 7 comp _ _ -9 pudełko pudełko subst subst sg|acc|n 6 obj _ _ -10 , , interp interp _ 0 pred _ _ -11 przeprowadziła przeprowadzić praet praet sg|f|perf 10 conjunct _ _ -12 od od prep prep gen|nwok 11 adjunct _ _ -13 niego on ppron3 ppron3 sg|gen|n|ter|akc|praep 12 comp _ _ -14 przezroczysty przezroczysty adj adj sg|acc|m3|pos 15 adjunct _ _ -15 przewód przewód subst subst sg|acc|m3 11 obj _ _ -16 do do prep prep gen 11 adjunct _ _ -17 igły igła subst subst sg|gen|f 16 comp _ _ -18 , , interp interp _ 23 punct _ _ -19 którą który adj adj sg|acc|f|pos 23 obj _ _ -20 wcześniej wcześnie adv adv com 23 adjunct _ _ -21 automatyczny automatyczny adj adj sg|nom|m3|pos 22 adjunct _ _ -22 iniektor iniektor subst subst sg|nom|m3 23 subj _ _ -23 umieścił umieścić praet praet sg|m3|perf 17 adjunct _ _ -24 w w prep prep loc|nwok 23 comp _ _ -25 żyle żyła subst subst sg|loc|f 24 comp _ _ -26 na na prep prep loc 25 adjunct _ _ -27 przedramieniu przedramię subst subst sg|loc|n 26 comp _ _ -28 Irka Irek subst subst sg|gen|m1 27 adjunct _ _ -29 . . interp interp _ 10 punct _ _ +1 W w prep prep loc|nwok 9 adjunct _ _ +2 stanie stan subst subst sg|loc|m3 1 comp _ _ +3 obrzydzenia obrzydzenie subst subst sg|gen|n 2 adjunct _ _ +4 przyprawiającego przyprawiać pact pact sg|gen|n|imperf|aff 3 adjunct _ _ +5 o o prep prep acc 4 comp _ _ +6 nowe nowy adj adj pl|acc|n|pos 7 adjunct _ _ +7 mdłości mdłości subst subst pl|acc|n 5 comp _ _ +8 nie nie qub qub _ 9 neg _ _ +9 zauważył zauważyć praet praet sg|m1|perf 0 pred _ _ +10 nawet nawet qub qub _ 9 adjunct _ _ +11 , , interp interp _ 15 punct _ _ +12 że że comp comp _ 15 complm _ _ +13 wielki wielki adj adj sg|nom|m3|pos 14 adjunct _ _ +14 ból ból subst subst sg|nom|m3 15 subj _ _ +15 zaczyna zaczynać fin fin sg|ter|imperf 9 comp_fin _ _ +16 z z prep prep acc|nwok 18 adjunct _ _ +17 wolna wolny adj adjp _ 16 mwe _ _ +18 zanikać zanikać inf inf imperf 15 comp_inf _ _ +19 . . interp interp _ 9 punct _ _ + +1 - - interp interp _ 7 punct _ _ +2 W w prep prep loc|nwok 4 comp _ _ +3 szkole szkoła subst subst sg|loc|f 2 comp _ _ +4 jest być fin fin sg|ter|imperf 7 conjunct _ _ +5 mniej mało num num pl|nom 4 subj _ _ +6 uczniów uczeń subst subst pl|gen|m1 5 comp _ _ +7 , , interp interp _ 0 coord_punct _ _ +8 dlatego dlatego adv adv _ 9 adjunct _ _ +9 musiał musieć praet praet sg|m1|imperf 7 conjunct _ _ +10 em być aglt aglt sg|pri|imperf|wok 9 aglt _ _ +11 tym ten adj adj pl|dat|f|pos 12 adjunct _ _ +12 paniom pani subst subst pl|dat|f 13 obj_th _ _ +13 podziękować podziękować inf inf perf 9 comp_inf _ _ +14 . . interp interp _ 7 punct _ _ + +1 Od od prep prep gen|nwok 9 adjunct _ _ +2 końca koniec subst subst sg|gen|m3 1 comp _ _ +3 XVIII XVIII adj adj sg|gen|m3|pos 4 ne _ _ +4 w wiek brev brev pun 2 comp _ _ +5 . . interp interp _ 4 abbrev_punct _ _ +6 informacje informacja subst subst pl|nom|f 9 subj _ _ +7 o o prep prep loc 6 adjunct _ _ +8 głodach głód subst subst pl|loc|m3 7 comp _ _ +9 stają stawać fin fin pl|ter|imperf 0 pred _ _ +10 się się qub qub _ 9 refl _ _ +11 coraz coraz adv adv _ 12 adjunct _ _ +12 rzadsze rzadki adj adj pl|nom|f|com 9 pd _ _ +13 . . interp interp _ 9 punct _ _ + +1 Zabrał zabrać praet praet sg|m1|perf 0 pred _ _ +2 ponad ponad qub qub _ 3 adjunct _ _ +3 30 30 num num pl|acc|m3|rec 1 obj _ _ +4 tys tysiąc brev brev pun 3 mwe _ _ +5 . . interp interp _ 4 abbrev_punct _ _ +6 zł złoty brev brev npun 3 comp _ _ +7 . . interp interp _ 1 punct _ _ + +1 ( ( interp interp _ 8 punct _ _ +2 Kiedyś kiedyś adv adv _ 4 adjunct _ _ +3 też też qub qub _ 4 adjunct _ _ +4 miała mieć praet praet sg|f|imperf 8 conjunct _ _ +5 m być aglt aglt sg|pri|imperf|nwok 4 aglt _ _ +6 takie taki adj adj pl|acc|f|pos 7 adjunct _ _ +7 ambicje ambicja subst subst pl|acc|f 4 obj_th _ _ +8 , , interp interp _ 0 pred _ _ +9 zrezygnowała zrezygnować praet praet sg|f|perf 8 conjunct _ _ +10 m być aglt aglt sg|pri|imperf|nwok 9 aglt _ _ +11 . . interp interp _ 8 punct _ _ +12 ) ) interp interp _ 8 punct _ _ -1 - - interp interp _ 4 punct _ _ -2 Co co subst subst sg|nom|n 4 pd _ _ -3 to to subst subst sg|nom|n 4 subj _ _ -4 jest być fin fin sg|ter|imperf 0 pred _ _ -5 ? ? interp interp _ 4 punct _ _ +1 Zawsze zawsze adv adv _ 2 adjunct _ _ +2 mówię mówić fin fin sg|pri|imperf 0 pred _ _ +3 , , interp interp _ 5 punct _ _ +4 że że comp comp _ 5 complm _ _ +5 mogę móc fin fin sg|pri|imperf 2 comp_fin _ _ +6 pracować pracować inf inf imperf 5 comp_inf _ _ +7 , , interp interp _ 5 punct _ _ +8 bo bo comp comp _ 5 adjunct _ _ +9 mam mieć fin fin sg|pri|imperf 13 conjunct _ _ +10 dobre dobry adj adj sg|acc|n|pos 11 adjunct _ _ +11 zdrowie zdrowie subst subst sg|acc|n 9 obj_th _ _ +12 , , interp interp _ 13 punct _ _ +13 a a conj conj _ 8 comp_fin _ _ +14 to to subst subst sg|nom|n 15 subj _ _ +15 jest być fin fin sg|ter|imperf 13 conjunct _ _ +16 darmo darmo adv adv _ 17 adjunct _ _ +17 dane dany adj adj sg|nom|n|perf|aff 15 pd _ _ +18 . . interp interp _ 2 punct _ _ -1 Prosi prosić fin fin sg|ter|imperf 0 pred _ _ -2 się się qub qub _ 1 refl _ _ -3 też też qub qub _ 1 adjunct _ _ -4 zakłady zakład subst subst pl|acc|m3 1 obj _ _ -5 pracy praca subst subst sg|gen|f 4 adjunct _ _ -6 , , interp interp _ 8 punct _ _ -7 które który adj adj pl|nom|m3|pos 8 subj _ _ -8 dysponują dysponować fin fin pl|ter|imperf 4 adjunct _ _ -9 autobusami autobus subst subst pl|inst|m3 8 comp _ _ -10 , , interp interp _ 12 punct _ _ -11 by by comp comp _ 12 complm _ _ -12 wspomogły wspomóc praet praet pl|m3|perf 1 comp_fin _ _ -13 komunikację komunikacja subst subst sg|acc|f 12 obj _ _ -14 zastępczą zastępczy adj adj sg|acc|f|pos 13 adjunct _ _ -15 . . interp interp _ 1 punct _ _ +1 " " interp interp _ 2 punct _ _ +2 Zrobimy zrobić fin fin pl|pri|perf 0 pred _ _ +3 " " interp interp _ 2 punct _ _ +4 ! ! interp interp _ 2 punct _ _ 1 - - interp interp _ 3 punct _ _ -2 Nie nie qub qub _ 3 neg _ _ -3 chcą chcieć fin fin pl|ter|imperf 0 pred _ _ -4 , , interp interp _ 8 punct _ _ -5 by by comp comp _ 8 complm _ _ -6 m być aglt aglt sg|pri|imperf|nwok 8 aglt _ _ -7 ich on ppron3 ppron3 pl|acc|m1|ter|akc|npraep 8 obj _ _ -8 utrzymywał utrzymywać praet praet sg|m1|imperf 3 comp_fin _ _ -9 . . interp interp _ 3 punct _ _ +2 No no qub qub _ 3 adjunct _ _ +3 wie wiedzieć fin fin sg|ter|imperf 0 pred _ _ +4 pan pan subst subst sg|nom|m1 3 subj _ _ +5 ! ! interp interp _ 3 punct _ _ +6 . . interp interp _ 5 punct _ _ +7 . . interp interp _ 6 punct _ _ +8 . . interp interp _ 7 punct _ _ -1 Wzięli wziąć praet praet pl|m1|perf 0 pred _ _ -2 w w prep prep loc|nwok 4 adjunct _ _ -3 niej on ppron3 ppron3 sg|loc|f|ter|akc|praep 2 comp _ _ -4 udział udział subst subst sg|acc|m3 1 obj _ _ -5 przedstawiciele przedstawiciel subst subst pl|nom|m1 1 subj _ _ -6 policji policja subst subst sg|gen|f 5 adjunct _ _ -7 z z prep prep gen|nwok 5 adjunct _ _ -8 Niemiec Niemcy subst subst pl|gen|n 17 conjunct _ _ -9 , , interp interp _ 17 coord_punct _ _ -10 Czech Czechy subst subst pl|gen|n 17 conjunct _ _ -11 , , interp interp _ 17 coord_punct _ _ -12 Słowacji Słowacja subst subst sg|gen|f 17 conjunct _ _ -13 , , interp interp _ 17 coord_punct _ _ -14 Węgier Węgry subst subst pl|gen|n 17 conjunct _ _ -15 , , interp interp _ 17 coord_punct _ _ -16 Ukrainy Ukraina subst subst sg|gen|f 17 conjunct _ _ -17 i i conj conj _ 7 comp _ _ -18 Polski Polska subst subst sg|gen|f 17 conjunct _ _ -19 . . interp interp _ 1 punct _ _ +1 ( ( interp interp _ 6 punct _ _ +2 Myszkinku Myszkinek subst subst sg|voc|m3 6 adjunct _ _ +3 , , interp interp _ 2 punct _ _ +4 jakie jaki adj adj sg|acc|n|pos 7 adjunct _ _ +5 ty ty ppron12 ppron12 sg|nom|m2|sec 6 subj _ _ +6 masz mieć fin fin sg|sec|imperf 0 pred _ _ +7 futerko futerko subst subst sg|acc|n 6 obj_th _ _ +8 , , interp interp _ 7 punct _ _ +9 lazurowe lazurowy adj adj sg|acc|n|pos 7 adjunct _ _ +10 po po prep prep acc 9 adjunct _ _ +11 prostu prosty adjp adjp _ 10 mwe _ _ +12 ! ! interp interp _ 6 punct _ _ diff --git a/testy/skladnica-test1-Not_parsed.conll b/testy/skladnica-test1-Not_parsed.conll new file mode 100644 index 0000000..6f510e9 --- /dev/null +++ b/testy/skladnica-test1-Not_parsed.conll @@ -0,0 +1,115 @@ +1 Cmentarz cmentarz subst subst sg|nom|m3 2 subj _ _ +2 jest być fin fin sg|ter|imperf 0 pred _ _ +3 taki taki adj adj sg|nom|m3|pos 4 adjunct _ _ +4 pusty pusty adj adj sg|nom|m3|pos 2 pd _ _ +5 ! ! interp interp _ 2 punct _ _ + +1 Mówi mówić fin fin sg|ter|imperf 0 pred _ _ +2 się się qub qub _ 1 refl _ _ +3 przecież przecież qub qub _ 1 adjunct _ _ +4 , , interp interp _ 7 punct _ _ +5 że że comp comp _ 7 complm _ _ +6 broń broń subst subst sg|nom|f 7 subj _ _ +7 była być praet praet sg|f|imperf 1 comp_fin _ _ +8 w w prep prep loc|nwok 7 adjunct _ _ +9 szkole szkoła subst subst sg|loc|f 8 comp _ _ +10 schowana schować ppas ppas sg|nom|f|perf|aff 7 pd _ _ +11 jeszcze jeszcze qub qub _ 12 adjunct _ _ +12 latem lato subst subst sg|inst|n 7 adjunct _ _ +13 w w prep prep loc|nwok 12 adjunct _ _ +14 czasie czas subst subst sg|loc|m3 13 mwe _ _ +15 remontu remont subst subst sg|gen|m3 14 comp _ _ +16 . . interp interp _ 1 punct _ _ + +1 Bo bo comp comp _ 9 adjunct _ _ +2 jak jak adv adv _ 9 adjunct _ _ +3 ona on ppron3 ppron3 sg|nom|f|ter|akc|npraep 9 subj _ _ +4 , , interp interp _ 3 punct _ _ +5 chora chory adj adj sg|nom|f|pos 3 adjunct _ _ +6 na na prep prep acc 5 adjunct _ _ +7 cukrzycę cukrzyca subst subst sg|acc|f 6 comp _ _ +8 , , interp interp _ 3 punct _ _ +9 przeżyła przeżyć praet praet sg|f|perf 0 pred _ _ +10 trzy trzy num num pl|acc|m3|congr 9 obj _ _ +11 dni dzień subst subst pl|acc|m3 10 comp _ _ +12 bez bez prep prep gen|nwok 9 comp _ _ +13 wody woda subst subst sg|gen|f 14 conjunct _ _ +14 i i conj conj _ 12 comp _ _ +15 jedzenia jedzenie subst subst sg|gen|n 14 conjunct _ _ +16 ? ? interp interp _ 9 punct _ _ + +1 Jednak jednak qub qub _ 9 adjunct _ _ +2 już już qub qub _ 3 adjunct _ _ +3 wkrótce wkrótce adv adv _ 9 adjunct _ _ +4 Nizioł Nizioł subst subst sg|nom|m1 5 conjunct _ _ +5 i i conj conj _ 9 subj _ _ +6 Wapiński Wapiński subst subst sg|nom|m1 5 conjunct _ _ +7 ze z prep prep inst|wok 9 adjunct _ _ +8 zdumieniem zdumienie subst subst sg|inst|n 7 comp _ _ +9 odkryli odkryć praet praet pl|m1|perf 0 pred _ _ +10 , , interp interp _ 14 punct _ _ +11 że że comp comp _ 14 complm _ _ +12 Łapiński Łapiński subst subst sg|nom|m1 14 subj _ _ +13 nie nie qub qub _ 14 neg _ _ +14 dotrzymuje dotrzymywać fin fin sg|ter|imperf 9 comp_fin _ _ +15 wcześniej wcześnie adv adv com 16 adjunct _ _ +16 danego dać ppas ppas sg|gen|n|perf|aff 17 adjunct _ _ +17 słowa słowo subst subst sg|gen|n 14 obj _ _ +18 . . interp interp _ 9 punct _ _ + +1 A a qub qub _ 8 adjunct _ _ +2 pan pan subst subst sg|nom|m1 8 subj _ _ +3 nigdy nigdy adv adv _ 8 adjunct _ _ +4 się się qub qub _ 8 refl _ _ +5 z z prep prep inst|nwok 8 comp _ _ +6 nimi on ppron3 ppron3 pl|inst|m1|ter|akc|praep 5 comp _ _ +7 nie nie qub qub _ 8 neg _ _ +8 zetknął zetknąć praet praet sg|m1|perf 0 pred _ _ +9 ? ? interp interp _ 8 punct _ _ + +1 Załapać załapać inf inf perf 3 comp_inf _ _ +2 się się qub qub _ 1 refl _ _ +3 trzeba trzeba pred pred _ 0 pred _ _ +4 teraz teraz adv adv _ 3 adjunct _ _ +5 , , interp interp _ 3 punct _ _ +6 bo bo comp comp _ 3 adjunct _ _ +7 potem potem adv adv _ 8 adjunct _ _ +8 będzie być bedzie bedzie sg|ter|imperf 6 comp_fin _ _ +9 trudniej trudno adv adv com 8 pd _ _ +10 . . interp interp _ 3 punct _ _ + +1 Medykamenty medykament subst subst pl|nom|m3 4 subj _ _ +2 współczesne współczesny adj adj pl|nom|m3|pos 1 adjunct _ _ +3 dostępne dostępny adj adj pl|nom|m3|pos 4 pd _ _ +4 są być fin fin pl|ter|imperf 0 pred _ _ +5 na na prep prep loc 4 adjunct _ _ +6 czarnym czarny adj adj sg|loc|m3|pos 7 adjunct _ _ +7 rynku rynek subst subst sg|loc|m3 5 comp _ _ +8 . . interp interp _ 4 punct _ _ + +1 To to subst subst sg|nom|n 3 subj _ _ +2 samo sam adj adj sg|nom|n|pos 1 adjunct _ _ +3 dotyczy dotyczyć fin fin sg|ter|imperf 5 conjunct _ _ +4 leczenia leczenie subst subst sg|gen|n 3 obj_th _ _ +5 , , interp interp _ 0 coord_punct _ _ +6 służba służba subst subst sg|nom|f 9 subj _ _ +7 zdrowia zdrowie subst subst sg|gen|n 6 adjunct _ _ +8 praktycznie praktycznie adv adv pos 9 adjunct _ _ +9 przestała przestać praet praet sg|f|perf 5 conjunct _ _ +10 istnieć istnieć inf inf imperf 9 comp_inf _ _ +11 . . interp interp _ 5 punct _ _ + +1 Zwykły zwykły adj adj sg|nom|m1|pos 2 adjunct _ _ +2 mieszkaniec mieszkaniec subst subst sg|nom|m1 4 subj _ _ +3 kraju kraj subst subst sg|gen|m3 2 adjunct _ _ +4 ma mieć fin fin sg|ter|imperf 0 pred _ _ +5 się się qub qub _ 6 refl _ _ +6 leczyć leczyć inf inf imperf 4 comp_inf _ _ +7 ziołami ziele subst subst pl|inst|n 6 obj_th _ _ +8 , , interp interp _ 10 punct _ _ +9 które który adj adj pl|acc|n|pos 10 obj _ _ +10 zaleca zalecać fin fin sg|ter|imperf 7 adjunct _ _ +11 tradycyjna tradycyjny adj adj sg|nom|f|pos 12 adjunct _ _ +12 medycyna medycyna subst subst sg|nom|f 10 subj _ _ +13 koreańska koreański adj adj sg|nom|f|pos 12 adjunct _ _ +14 . . interp interp _ 4 punct _ _ diff --git a/tokenizer/ENIAMtokens.ml b/tokenizer/ENIAMtokens.ml index 1f967b5..03bd65e 100644 --- a/tokenizer/ENIAMtokens.ml +++ b/tokenizer/ENIAMtokens.ml @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function | (Sign "?") :: (Sign "?") :: l -> create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true (* | (Sign "?") :: (Sign ".") :: l -> *) + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l -> + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true | (Sign "!") :: (Sign "?") :: l -> create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true | (Sign "?") :: (Sign "…") :: l ->