Commit f18f9cc018bb278049234468c21ac27e041114f7

Authored by Wojciech Jaworski
1 parent dd02bfeb

Korekta błędów przy parsowaniu deptrees

corpora/test_conll.ml
@@ -122,11 +122,11 @@ let create_dep_chart tokens lex_sems paths = @@ -122,11 +122,11 @@ let create_dep_chart tokens lex_sems paths =
122 else print_endline "not parsed" *) 122 else print_endline "not parsed" *)
123 123
124 let rec test_dep_example path id tokens lex_sems first_try paths = 124 let rec test_dep_example path id tokens lex_sems first_try paths =
  125 + (* print_endline "test_dep_example 1"; *)
125 let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in 126 let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in
126 try 127 try
127 ENIAM_LCGreductions.reset_variant_label (); 128 ENIAM_LCGreductions.reset_variant_label ();
128 - print_endline "test_dep_example 1";  
129 - print_endline "test_dep_example 2"; 129 + (* print_endline "test_dep_example 2"; *)
130 (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) 130 (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *)
131 let chart = create_dep_chart tokens lex_sems paths in 131 let chart = create_dep_chart tokens lex_sems paths in
132 (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) 132 (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *)
@@ -215,7 +215,7 @@ let process_id s = @@ -215,7 +215,7 @@ let process_id s =
215 215
216 let process_conll_corpus filename = 216 let process_conll_corpus filename =
217 let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in 217 let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
218 - print_endline "process_conll_corpus"; 218 + (* print_endline "process_conll_corpus 1"; *)
219 (* let corpus = [List.hd corpus] in *) 219 (* let corpus = [List.hd corpus] in *)
220 Xlist.iter corpus (fun query -> try 220 Xlist.iter corpus (fun query -> try
221 let id = process_id (get_query_id query) in 221 let id = process_id (get_query_id query) in
@@ -228,13 +228,17 @@ let process_conll_corpus filename = @@ -228,13 +228,17 @@ let process_conll_corpus filename =
228 (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) 228 (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
229 let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] 229 let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
230 (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in 230 (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
  231 + (* print_endline "process_conll_corpus 2"; *)
231 let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in 232 let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in
  233 + (* print_endline "process_conll_corpus 3"; *)
232 let sentences = match text with 234 let sentences = match text with
233 AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences 235 AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
234 | _ -> failwith "process_conll_corpus 1" in 236 | _ -> failwith "process_conll_corpus 1" in
235 let text = AltText[Raw,RawText query; Struct, StructText([ 237 let text = AltText[Raw,RawText query; Struct, StructText([
236 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in 238 AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
  239 + (* print_endline "process_conll_corpus 4"; *)
237 let lex_sems = ENIAMlexSemantics.assign tokens text in 240 let lex_sems = ENIAMlexSemantics.assign tokens text in
  241 + (* print_endline "process_conll_corpus 5"; *)
238 ignore(parse_text id 1 tokens lex_sems text) 242 ignore(parse_text id 1 tokens lex_sems text)
239 | _ -> failwith "process_conll_corpus 2" 243 | _ -> failwith "process_conll_corpus 2"
240 with 244 with
@@ -243,8 +247,9 @@ let process_conll_corpus filename = @@ -243,8 +247,9 @@ let process_conll_corpus filename =
243 247
244 let _ = 248 let _ =
245 Printexc.record_backtrace true; 249 Printexc.record_backtrace true;
  250 + ENIAMlexSemantics.initialize ();
246 (* LCGfields.reset (); *) 251 (* LCGfields.reset (); *)
247 (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) 252 (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
248 (* process_conll_corpus "../testy/skladnica-test1.conll"; *) 253 (* process_conll_corpus "../testy/skladnica-test1.conll"; *)
249 - process_conll_corpus "../testy/skladnica-test1-Find_father.conll"; 254 + process_conll_corpus "../testy/skladnica-test1-Failure.conll";
250 (* LCGfields.print_results () *) 255 (* LCGfields.print_results () *)
exec/ENIAMexecTypes.ml
@@ -311,3 +311,5 @@ let rec fold_text mode s f = function @@ -311,3 +311,5 @@ let rec fold_text mode s f = function
311 | AltText l -> 311 | AltText l ->
312 Xlist.fold l s (fun s (mode,text) -> 312 Xlist.fold l s (fun s (mode,text) ->
313 fold_text mode s f text) 313 fold_text mode s f text)
  314 +
  315 +let rules_filename = ENIAM_LCGlexiconTypes.resource_path ^ "/LCGlexicon/lexicon-pl.dic"
exec/ENIAMsemLexicon.ml
@@ -47,7 +47,7 @@ let parse_multi p = function @@ -47,7 +47,7 @@ let parse_multi p = function
47 let parse_morf p = function 47 let parse_morf p = function
48 [T "1"] -> {p with is_necessary=Opt} 48 [T "1"] -> {p with is_necessary=Opt}
49 | tokens -> 49 | tokens ->
50 - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function 50 + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function
51 [T s] -> Atom s 51 [T s] -> Atom s
52 | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in 52 | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in
53 {p with morfs=LCG (Tensor l) :: p.morfs} 53 {p with morfs=LCG (Tensor l) :: p.morfs}
@@ -57,7 +57,7 @@ let parse_arg tokens p = @@ -57,7 +57,7 @@ let parse_arg tokens p =
57 let tokens,p = parse_dir p tokens in 57 let tokens,p = parse_dir p tokens in
58 let tokens,p = parse_multi p tokens in 58 let tokens,p = parse_multi p tokens in
59 match Lexer.find_brackets ["(",")"] [] tokens with 59 match Lexer.find_brackets ["(",")"] [] tokens with
60 - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf 60 + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf
61 | tokens -> parse_morf p tokens 61 | tokens -> parse_morf p tokens
62 62
63 63
@@ -73,7 +73,7 @@ let parse_entry = function @@ -73,7 +73,7 @@ let parse_entry = function
73 [T symbol; T ":"; T "null"] -> symbol,[] 73 [T symbol; T ":"; T "null"] -> symbol,[]
74 | T symbol :: T ":" :: tokens -> 74 | T symbol :: T ":" :: tokens ->
75 (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) 75 (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *)
76 - let tokens = Lexer.split_symbol (T ":") [] tokens in 76 + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in
77 let tokens = manage_tokens tokens in 77 let tokens = manage_tokens tokens in
78 let positions = Xlist.map tokens (fun (arg,role) -> 78 let positions = Xlist.map tokens (fun (arg,role) ->
79 parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in 79 parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in
@@ -89,12 +89,12 @@ let load_lexicon filename = @@ -89,12 +89,12 @@ let load_lexicon filename =
89 | T "\t" -> tokens 89 | T "\t" -> tokens
90 | T "\r" -> tokens 90 | T "\r" -> tokens
91 | t -> t :: tokens)) in 91 | t -> t :: tokens)) in
92 - let entries = Lexer.split_symbol (T ";") [] tokens in 92 + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in
93 Xlist.fold entries StringMap.empty (fun map entry -> 93 Xlist.fold entries StringMap.empty (fun map entry ->
94 let symbol,args = parse_entry entry in 94 let symbol,args = parse_entry entry in
95 StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) 95 StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol)))
96 96
97 -let sem_lexicon = load_lexicon "resources/lexicon-pl.dic" 97 +let sem_lexicon = StringMap.empty (*load_lexicon ENIAMexecTypes.rules_filename*)(* FIXME!!! *)
98 98
99 let extend_frame symbol frame = 99 let extend_frame symbol frame =
100 try 100 try
exec/ENIAMvisualization.ml
@@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last =
702 t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ 702 t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^
703 sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ 703 sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^
704 "</table>" 704 "</table>"
705 -(* 705 +
706 let html_of_dep_sentence tokens paths = 706 let html_of_dep_sentence tokens paths =
707 "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ 707 "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^
708 String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> 708 String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id ->
@@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths =
711 (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" 711 (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>"
712 t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ 712 t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^
713 "</table>" 713 "</table>"
714 - 714 +(*
715 let html_of_tokens tokens = 715 let html_of_tokens tokens =
716 "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ 716 "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^
717 String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> 717 String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id ->
exec/makefile
@@ -18,7 +18,16 @@ install: all @@ -18,7 +18,16 @@ install: all
18 cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) 18 cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR)
19 cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) 19 cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR)
20 cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) 20 cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR)
  21 + mkdir -p /usr/share/eniam/exec
  22 + cp resources/* /usr/share/eniam/exec
21 23
  24 +install-local: all
  25 + mkdir -p $(INSTALLDIR)
  26 + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR)
  27 + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR)
  28 + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR)
  29 + mkdir -p /usr/local/share/eniam/exec
  30 + cp resources/* /usr/local/share/eniam/exec
22 31
23 eniam-exec.cma: $(SOURCES) 32 eniam-exec.cma: $(SOURCES)
24 ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ 33 ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^
lexSemantics/ENIAMlexSemantics.ml
@@ -166,15 +166,15 @@ let semantize lemma pos (selectors,schema) = @@ -166,15 +166,15 @@ let semantize lemma pos (selectors,schema) =
166 166
167 let assign_prep_semantics lemma = 167 let assign_prep_semantics lemma =
168 let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in 168 let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in
169 - Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); 169 + (* Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); *)
170 Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) -> 170 Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) ->
171 - Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; 171 + (* Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; *)
172 let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *) 172 let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *)
173 - print_endline "assign_prep_semantics 1"; 173 + (* print_endline "assign_prep_semantics 1"; *)
174 let positions = [{empty_position with 174 let positions = [{empty_position with
175 sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_; 175 sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_;
176 morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in 176 morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in
177 - print_endline "assign_prep_semantics 2"; 177 + (* print_endline "assign_prep_semantics 2"; *)
178 {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions; 178 {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions;
179 arole=arole; arole_attr=arole_attr; arev=false}) 179 arole=arole; arole_attr=arole_attr; arev=false})
180 180
lexSemantics/ENIAMwalParser.ml
@@ -73,14 +73,6 @@ let split_text schema = @@ -73,14 +73,6 @@ let split_text schema =
73 | Str.Delim "'" -> Quot 73 | Str.Delim "'" -> Quot
74 | _ -> failwith "parse_text")) 74 | _ -> failwith "parse_text"))
75 75
76 -let rec split_symbol symb rev = function  
77 - [] -> [List.rev rev](*failwith "split_symbol"*)  
78 - | s :: l ->  
79 - if s = symb then  
80 - if l = [] then (*[List.rev rev]*)failwith "split_symbol"  
81 - else (List.rev rev) :: (split_symbol symb [] l)  
82 - else split_symbol symb (s :: rev) l  
83 -  
84 let rec string_of_token = function 76 let rec string_of_token = function
85 Text s -> s 77 Text s -> s
86 | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" 78 | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")"
@@ -101,6 +93,14 @@ let rec string_of_token = function @@ -101,6 +93,14 @@ let rec string_of_token = function
101 let string_of_token_list l = 93 let string_of_token_list l =
102 String.concat "" (Xlist.map l string_of_token) 94 String.concat "" (Xlist.map l string_of_token)
103 95
  96 +let rec split_symbol symb rev = function
  97 + [] -> [List.rev rev](*failwith "split_symbol"*)
  98 + | s :: l ->
  99 + if s = symb then
  100 + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb)
  101 + else (List.rev rev) :: (split_symbol symb [] l)
  102 + else split_symbol symb (s :: rev) l
  103 +
104 let parse_case = function 104 let parse_case = function
105 [Text "nom"] -> Case "nom" 105 [Text "nom"] -> Case "nom"
106 | [Text "gen"] -> Case "gen" 106 | [Text "gen"] -> Case "gen"
tokenizer/ENIAMtokens.ml
@@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function
814 | (Sign "?") :: (Sign "?") :: l -> 814 | (Sign "?") :: (Sign "?") :: l ->
815 create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true 815 create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true
816 (* | (Sign "?") :: (Sign ".") :: l -> *) 816 (* | (Sign "?") :: (Sign ".") :: l -> *)
  817 + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l ->
  818 + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true
817 | (Sign "!") :: (Sign "?") :: l -> 819 | (Sign "!") :: (Sign "?") :: l ->
818 create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true 820 create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true
819 | (Sign "?") :: (Sign "…") :: l -> 821 | (Sign "?") :: (Sign "…") :: l ->