Commit f18f9cc018bb278049234468c21ac27e041114f7
1 parent: dd02bfeb
Korekta błędów przy parsowaniu deptrees
Showing 8 changed files with 41 additions and 23 deletions
corpora/test_conll.ml
... | ... | @@ -122,11 +122,11 @@ let create_dep_chart tokens lex_sems paths = |
122 | 122 | else print_endline "not parsed" *) |
123 | 123 | |
124 | 124 | let rec test_dep_example path id tokens lex_sems first_try paths = |
125 | + (* print_endline "test_dep_example 1"; *) | |
125 | 126 | let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in |
126 | 127 | try |
127 | 128 | ENIAM_LCGreductions.reset_variant_label (); |
128 | - print_endline "test_dep_example 1"; | |
129 | - print_endline "test_dep_example 2"; | |
129 | + (* print_endline "test_dep_example 2"; *) | |
130 | 130 | (* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *) |
131 | 131 | let chart = create_dep_chart tokens lex_sems paths in |
132 | 132 | (* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *) |
... | ... | @@ -215,7 +215,7 @@ let process_id s = |
215 | 215 | |
216 | 216 | let process_conll_corpus filename = |
217 | 217 | let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in |
218 | - print_endline "process_conll_corpus"; | |
218 | + (* print_endline "process_conll_corpus 1"; *) | |
219 | 219 | (* let corpus = [List.hd corpus] in *) |
220 | 220 | Xlist.iter corpus (fun query -> try |
221 | 221 | let id = process_id (get_query_id query) in |
... | ... | @@ -228,13 +228,17 @@ let process_conll_corpus filename = |
228 | 228 | (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *) |
229 | 229 | let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths] |
230 | 230 | (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in |
231 | + (* print_endline "process_conll_corpus 2"; *) | |
231 | 232 | let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in |
233 | + (* print_endline "process_conll_corpus 3"; *) | |
232 | 234 | let sentences = match text with |
233 | 235 | AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences |
234 | 236 | | _ -> failwith "process_conll_corpus 1" in |
235 | 237 | let text = AltText[Raw,RawText query; Struct, StructText([ |
236 | 238 | AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in |
239 | + (* print_endline "process_conll_corpus 4"; *) | |
237 | 240 | let lex_sems = ENIAMlexSemantics.assign tokens text in |
241 | + (* print_endline "process_conll_corpus 5"; *) | |
238 | 242 | ignore(parse_text id 1 tokens lex_sems text) |
239 | 243 | | _ -> failwith "process_conll_corpus 2" |
240 | 244 | with |
... | ... | @@ -243,8 +247,9 @@ let process_conll_corpus filename = |
243 | 247 | |
244 | 248 | let _ = |
245 | 249 | Printexc.record_backtrace true; |
250 | + ENIAMlexSemantics.initialize (); | |
246 | 251 | (* LCGfields.reset (); *) |
247 | 252 | (* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *) |
248 | 253 | (* process_conll_corpus "../testy/skladnica-test1.conll"; *) |
249 | - process_conll_corpus "../testy/skladnica-test1-Find_father.conll"; | |
254 | + process_conll_corpus "../testy/skladnica-test1-Failure.conll"; | |
250 | 255 | (* LCGfields.print_results () *) |
... | ... |
exec/ENIAMexecTypes.ml
exec/ENIAMsemLexicon.ml
... | ... | @@ -47,7 +47,7 @@ let parse_multi p = function |
47 | 47 | let parse_morf p = function |
48 | 48 | [T "1"] -> {p with is_necessary=Opt} |
49 | 49 | | tokens -> |
50 | - let l = Xlist.map (Lexer.split_symbol (T "*") [] tokens) (function | |
50 | + let l = Xlist.map (try Lexer.split_symbol (T "*") [] tokens with _ -> failwith "parse_morf: split_symbol *") (function | |
51 | 51 | [T s] -> Atom s |
52 | 52 | | tokens -> failwith ("parse_morf: " ^ Lexer.string_of_token_list tokens)) in |
53 | 53 | {p with morfs=LCG (Tensor l) :: p.morfs} |
... | ... | @@ -57,7 +57,7 @@ let parse_arg tokens p = |
57 | 57 | let tokens,p = parse_dir p tokens in |
58 | 58 | let tokens,p = parse_multi p tokens in |
59 | 59 | match Lexer.find_brackets ["(",")"] [] tokens with |
60 | - [B("(",")",tokens)] -> Xlist.fold (Lexer.split_symbol (T "+") [] tokens) p parse_morf | |
60 | + [B("(",")",tokens)] -> Xlist.fold (try Lexer.split_symbol (T "+") [] tokens with _ -> failwith "parse_arg: split_symbol +") p parse_morf | |
61 | 61 | | tokens -> parse_morf p tokens |
62 | 62 | |
63 | 63 | |
... | ... | @@ -73,7 +73,7 @@ let parse_entry = function |
73 | 73 | [T symbol; T ":"; T "null"] -> symbol,[] |
74 | 74 | | T symbol :: T ":" :: tokens -> |
75 | 75 | (* Printf.printf "parse_entry: %s\n" (Lexer.string_of_token_list tokens); *) |
76 | - let tokens = Lexer.split_symbol (T ":") [] tokens in | |
76 | + let tokens = try Lexer.split_symbol (T ":") [] tokens with _ -> failwith "parse_entry: split_symbol :" in | |
77 | 77 | let tokens = manage_tokens tokens in |
78 | 78 | let positions = Xlist.map tokens (fun (arg,role) -> |
79 | 79 | parse_arg arg (parse_role {empty_position with is_necessary=Req} role)) in |
... | ... | @@ -89,12 +89,12 @@ let load_lexicon filename = |
89 | 89 | | T "\t" -> tokens |
90 | 90 | | T "\r" -> tokens |
91 | 91 | | t -> t :: tokens)) in |
92 | - let entries = Lexer.split_symbol (T ";") [] tokens in | |
92 | + let entries = try Lexer.split_symbol (T ";") [] tokens with _ -> failwith "load_lexicon: split_symbol ;" in | |
93 | 93 | Xlist.fold entries StringMap.empty (fun map entry -> |
94 | 94 | let symbol,args = parse_entry entry in |
95 | 95 | StringMap.add_inc map symbol args (fun _ -> failwith ("load_lexicon: " ^ symbol))) |
96 | 96 | |
97 | -let sem_lexicon = load_lexicon "resources/lexicon-pl.dic" | |
97 | +let sem_lexicon = StringMap.empty (*load_lexicon ENIAMexecTypes.rules_filename*)(* FIXME!!! *) | |
98 | 98 | |
99 | 99 | let extend_frame symbol frame = |
100 | 100 | try |
... | ... |
exec/ENIAMvisualization.ml
... | ... | @@ -702,7 +702,7 @@ let html_of_struct_sentence tokens paths last = |
702 | 702 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id lnode rnode)) ^ |
703 | 703 | sprintf "<tr><td></td><td></td><td></td><td>%d</td><td></td></tr>" last ^ |
704 | 704 | "</table>" |
705 | -(* | |
705 | + | |
706 | 706 | let html_of_dep_sentence tokens paths = |
707 | 707 | "<table><tr><td><b>orth</b></td><td><b>token</b></td><td><b>id</b></td><td><b>conll_id</b></td><td><b>super</b></td><td><b>label</b></td></tr>" ^ |
708 | 708 | String.concat "\n" (List.rev (Int.fold 0 (Array.length paths - 1) [] (fun l conll_id -> |
... | ... | @@ -711,7 +711,7 @@ let html_of_dep_sentence tokens paths = |
711 | 711 | (sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%d</td><td>%d</td><td>%s</td></tr>" |
712 | 712 | t.ENIAMtokenizerTypes.orth (escape_html (ENIAMtokens.string_of_token t.ENIAMtokenizerTypes.token)) id conll_id super label) :: l))) ^ |
713 | 713 | "</table>" |
714 | - | |
714 | +(* | |
715 | 715 | let html_of_tokens tokens = |
716 | 716 | "<table><tr><td><b>id</b></td><td><b>orth</b></td><td><b>beg</b></td><td><b>len</b></td><td><b>next</b></td><td><b>token</b></td></td><td><b>attrs</b></td></tr>" ^ |
717 | 717 | String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size tokens - 1) [] (fun l id -> |
... | ... |
exec/makefile
... | ... | @@ -18,7 +18,16 @@ install: all |
18 | 18 | cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) |
19 | 19 | cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) |
20 | 20 | cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) |
21 | + mkdir -p /usr/share/eniam/exec | |
22 | + cp resources/* /usr/share/eniam/exec | |
21 | 23 | |
24 | +install-local: all | |
25 | + mkdir -p $(INSTALLDIR) | |
26 | + cp eniam-exec.cmxa eniam-exec.a eniam-exec.cma $(INSTALLDIR) | |
27 | + cp ENIAMexecTypes.cmi ENIAMexec.cmi ENIAMselectSent.cmi ENIAMsemLexicon.cmi ENIAMsemValence.cmi ENIAMvisualization.cmi $(INSTALLDIR) | |
28 | + cp ENIAMexecTypes.cmx ENIAMexec.cmx ENIAMselectSent.cmx ENIAMsemLexicon.cmx ENIAMsemValence.cmx ENIAMvisualization.cmx $(INSTALLDIR) | |
29 | + mkdir -p /usr/local/share/eniam/exec | |
30 | + cp resources/* /usr/local/share/eniam/exec | |
22 | 31 | |
23 | 32 | eniam-exec.cma: $(SOURCES) |
24 | 33 | ocamlc -linkall -a -o eniam-exec.cma $(OCAMLFLAGS) $^ |
... | ... |
lexSemantics/ENIAMlexSemantics.ml
... | ... | @@ -166,15 +166,15 @@ let semantize lemma pos (selectors,schema) = |
166 | 166 | |
167 | 167 | let assign_prep_semantics lemma = |
168 | 168 | let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in |
169 | - Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); | |
169 | + (* Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); *) | |
170 | 170 | Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) -> |
171 | - Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; | |
171 | + (* Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; *) | |
172 | 172 | let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *) |
173 | - print_endline "assign_prep_semantics 1"; | |
173 | + (* print_endline "assign_prep_semantics 1"; *) | |
174 | 174 | let positions = [{empty_position with |
175 | 175 | sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_; |
176 | 176 | morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in |
177 | - print_endline "assign_prep_semantics 2"; | |
177 | + (* print_endline "assign_prep_semantics 2"; *) | |
178 | 178 | {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions; |
179 | 179 | arole=arole; arole_attr=arole_attr; arev=false}) |
180 | 180 | |
... | ... |
lexSemantics/ENIAMwalParser.ml
... | ... | @@ -73,14 +73,6 @@ let split_text schema = |
73 | 73 | | Str.Delim "'" -> Quot |
74 | 74 | | _ -> failwith "parse_text")) |
75 | 75 | |
76 | -let rec split_symbol symb rev = function | |
77 | - [] -> [List.rev rev](*failwith "split_symbol"*) | |
78 | - | s :: l -> | |
79 | - if s = symb then | |
80 | - if l = [] then (*[List.rev rev]*)failwith "split_symbol" | |
81 | - else (List.rev rev) :: (split_symbol symb [] l) | |
82 | - else split_symbol symb (s :: rev) l | |
83 | - | |
84 | 76 | let rec string_of_token = function |
85 | 77 | Text s -> s |
86 | 78 | | Paren l -> "(" ^ String.concat "" (Xlist.map l string_of_token) ^ ")" |
... | ... | @@ -101,6 +93,14 @@ let rec string_of_token = function |
101 | 93 | let string_of_token_list l = |
102 | 94 | String.concat "" (Xlist.map l string_of_token) |
103 | 95 | |
96 | +let rec split_symbol symb rev = function | |
97 | + [] -> [List.rev rev](*failwith "split_symbol"*) | |
98 | + | s :: l -> | |
99 | + if s = symb then | |
100 | + if l = [] then (*[List.rev rev]*)failwith ("split_symbol: " ^ string_of_token symb) | |
101 | + else (List.rev rev) :: (split_symbol symb [] l) | |
102 | + else split_symbol symb (s :: rev) l | |
103 | + | |
104 | 104 | let parse_case = function |
105 | 105 | [Text "nom"] -> Case "nom" |
106 | 106 | | [Text "gen"] -> Case "gen" |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -814,6 +814,8 @@ let rec recognize_sign_group poss_s_beg i = function |
814 | 814 | | (Sign "?") :: (Sign "?") :: l -> |
815 | 815 | create_sentence_seq_q i ((Sign "?") :: (Sign "?") :: []) l "??",i+2*factor,l,true |
816 | 816 | (* | (Sign "?") :: (Sign ".") :: l -> *) |
817 | + | (Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: l -> | |
818 | + create_sentence_seq_q i ((Sign "!") :: (Sign ".") :: (Sign ".") :: (Sign ".") :: []) l "!...",i+4*factor,l,true | |
817 | 819 | | (Sign "!") :: (Sign "?") :: l -> |
818 | 820 | create_sentence_seq_q i ((Sign "!") :: (Sign "?") :: []) l "!?",i+2*factor,l,true |
819 | 821 | | (Sign "?") :: (Sign "…") :: l -> |
... | ... |