From f18f9cc018bb278049234468c21ac27e041114f7 Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Sun, 28 May 2017 20:20:21 +0200 Subject: [PATCH] Korekta błędów przy parsowaniu deptrees --- corpora/test_conll.ml | 13 +++++++++---- exec/ENIAMexecTypes.ml | 2 ++ exec/ENIAMsemLexicon.ml | 10 +++++----- exec/ENIAMvisualization.ml | 4 ++-- exec/makefile | 9 +++++++++ lexSemantics/ENIAMlexSemantics.ml | 8 ++++---- lexSemantics/ENIAMwalParser.ml | 16 ++++++++-------- tokenizer/ENIAMtokens.ml | 2 ++ 8 files changed, 41 insertions(+), 23 deletions(-) diff --git a/corpora/test_conll.ml b/corpora/test_conll.ml index b87507e..19c5be9 100644 --- a/corpora/test_conll.ml +++ b/corpora/test_conll.ml @@ -122,11 +122,11 @@ let create_dep_chart tokens lex_sems paths = else print_endline "not parsed" *) let rec test_dep_example path id tokens lex_sems first_try paths = + (* print_endline "test_dep_example 1"; *) let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in try ENIAM_LCGreductions.reset_variant_label (); - print_endline "test_dep_example 1"; - print_endline "test_dep_example 2"; + (* print_endline "test_dep_example 2"; *) (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) let chart = create_dep_chart tokens lex_sems paths in (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) @@ -215,7 +215,7 @@ let process_id s = let process_conll_corpus filename = let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in - print_endline "process_conll_corpus"; + (* print_endline "process_conll_corpus 1"; *) (* let corpus = [List.hd corpus] in *) Xlist.iter corpus (fun query -> try let id = process_id (get_query_id query) in @@ -228,13 +228,17 @@ let process_conll_corpus filename = (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in + (* print_endline "process_conll_corpus 2"; *) let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in + (* print_endline "process_conll_corpus 3"; *) let sentences = match text with AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences | _ -> failwith "process_conll_corpus 1" in let text = AltText[Raw,RawText query; Struct, StructText([ AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in + (* print_endline "process_conll_corpus 4"; *) let lex_sems = ENIAMlexSemantics.assign tokens text in + (* print_endline "process_conll_corpus 5"; *) ignore(parse_text id 1 tokens lex_sems text) | _ -> failwith "process_conll_corpus 2" with @@ -243,8 +247,9 @@ let process_conll_corpus filename = let _ = Printexc.record_backtrace true; + ENIAMlexSemantics.initialize (); (* LCGfields.reset (); *) (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) (* process_conll_corpus "../testy/skladnica-test1.conll"; *) - process_conll_corpus "../testy/skladnica-test1-Find_father.conll"; + process_conll_corpus "../testy/skladnica-test1-Failure.conll"; (* LCGfields.print_results () *) diff --git a/exec/ENIAMexecTypes.ml b/exec/ENIAMexecTypes.ml index 5a1fafc..03f66e4 100644 --- a/exec/ENIAMexecTypes.ml +++ b/exec/ENIAMexecTypes.ml @@ -311,3 +311,5 @@ let rec fold_text mode s f = function | AltText l -> Xlist.fold l s (fun s (mode,text) -> fold_text mode s f text) + +let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic" diff --git a/exec/ENIAMsemLexicon.ml b/exec/ENIAMsemLexicon.ml index 6ceef37..121e2a6 100644 --- a/exec/ENIAMsemLexicon.ml +++ b/exec/ENIAMsemLexicon.ml @@ -47,7 +47,7 @@ let parse_multi p = function let parse_morf p = function [T "1"] -> {p with is_necessary=Opt} | tokens -> - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function [T s] -> Atom s | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in {p with morfs=LCG (Tensor l) :: p.morfs} @@ -57,7 +57,7 @@ let parse_arg tokens p = let tokens,p = parse_dir p tokens in let tokens,p = parse_multi p tokens in match Lexer.find_brackets ["(",")"] [] tokens with - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf | tokens -> parse_morf p tokens @@ -73,7 +73,7 @@ let parse_entry = function [T symbol; T ":"; T "null"] -> symbol,[] | T symbol :: T ":" :: tokens -> (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) - let tokens = Lexer.split_symbol (T ":") [] tokens in + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in let tokens = manage_tokens tokens in let positions = Xlist.map tokens (fun (arg,role) -> parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in @@ -89,12 +89,12 @@ let load_lexicon filename = | T "\t" -> tokens | T "\r" -> tokens | t -> t :: tokens)) in - let entries = Lexer.split_symbol (T ";") [] tokens in + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in Xlist.fold entries StringMap.empty (fun map entry -> let symbol,args = parse_entry entry in StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) -let sem_lexicon = load_lexicon "resources/lexicon-pl.dic" +let sem_lexicon = StringMap.empty (*load_lexicon ENIAMexecTypes.rules_filename*)(* FIXME!!! *) let extend_frame symbol frame = try diff --git a/exec/ENIAMvisualization.ml b/exec/ENIAMvisualization.ml index 1da9f91..968e52d 100644 --- a/exec/ENIAMvisualization.ml +++ b/exec/ENIAMvisualization.ml @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ "</table>" -(* + let html_of_dep_sentence tokens paths = "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ "</table>" - +(* let html_of_tokens tokens = "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> diff --git a/exec/makefile b/exec/makefile index 41ddcbd..2140132 100755 --- a/exec/makefile +++ b/exec/makefile @@ -18,7 +18,16 @@ install: all cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) + mkdir -p /usr/share/eniam/exec + cp resources/* /usr/share/eniam/exec +install-local: all + mkdir -p $(INSTALLDIR) + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) + mkdir -p /usr/local/share/eniam/exec + cp resources/* /usr/local/share/eniam/exec eniam-exec.cma: $(SOURCES) ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ diff --git a/lexSemantics/ENIAMlexSemantics.ml b/lexSemantics/ENIAMlexSemantics.ml index 9f18280..a8a3fde 100644 --- a/lexSemantics/ENIAMlexSemantics.ml +++ b/lexSemantics/ENIAMlexSemantics.ml @@ -166,15 +166,15 @@ let semantize lemma pos (selectors,schema) = let assign_prep_semantics lemma = let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in - Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); + (* Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); *) Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) -> - Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; + (* Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; *) let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *) - print_endline "assign_prep_semantics 1"; + (* print_endline "assign_prep_semantics 1"; *) let positions = [{empty_position with sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_; morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in - print_endline "assign_prep_semantics 2"; + (* print_endline "assign_prep_semantics 2"; *) {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions; arole=arole; arole_attr=arole_attr; arev=false}) diff --git a/lexSemantics/ENIAMwalParser.ml b/lexSemantics/ENIAMwalParser.ml index 2b30aa0..1b3e0a2 100644 --- a/lexSemantics/ENIAMwalParser.ml +++ b/lexSemantics/ENIAMwalParser.ml @@ -73,14 +73,6 @@ let split_text schema = | Str.Delim "'" -> Quot | _ -> failwith "parse_text")) -let rec split_symbol symb rev = function - [] -> [List.rev rev](*failwith "split_symbol"*) - | s :: l -> - if s = symb then - if l = [] then (*[List.rev rev]*)failwith "split_symbol" - else (List.rev rev) :: (split_symbol symb [] l) - else split_symbol symb (s :: rev) l - let rec string_of_token = function Text s -> s | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" @@ -101,6 +93,14 @@ let rec string_of_token = function let string_of_token_list l = String.concat "" (Xlist.map l string_of_token) +let rec split_symbol symb rev = function + [] -> [List.rev rev](*failwith "split_symbol"*) + | s :: l -> + if s = symb then + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb) + else (List.rev rev) :: (split_symbol symb [] l) + else split_symbol symb (s :: rev) l + let parse_case = function [Text "nom"] -> Case "nom" | [Text "gen"] -> Case "gen" diff --git a/tokenizer/ENIAMtokens.ml b/tokenizer/ENIAMtokens.ml index 1f967b5..03bd65e 100644 --- a/tokenizer/ENIAMtokens.ml +++ b/tokenizer/ENIAMtokens.ml @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function | (Sign "?") :: (Sign "?") :: l -> create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true (* | (Sign "?") :: (Sign ".") :: l -> *) + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l -> + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true | (Sign "!") :: (Sign "?") :: l -> create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true | (Sign "?") :: (Sign "…") :: l -> -- libgit2 0.22.2