(* test_conll.ml *)
(*
* ENIAMcorpora is a library that integrates ENIAM with corpora in CONLL format
* Copyright (C) 2016 Daniel Oklesinski <oklesinski dot daniel atSPAMfree gmail dot com>
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGlexiconTypes
open ENIAM_LCGtypes
open ENIAMsubsyntaxTypes
(* let parsed = ref 1 *)
(* Two rule sets built from the same rules file.  They differ only in the
   boolean handed to [ENIAM_LCGlexicon.make_rules]: [false] for [rules],
   [true] for [dep_rules].
   NOTE(review): the flag presumably selects dependency-style rules, as the
   name [dep_rules] suggests; confirm against ENIAM_LCGlexicon. *)
let rules, dep_rules =
  let load flag =
    ENIAM_LCGlexicon.make_rules flag ENIAM_LCGlexiconTypes.rules_filename in
  (* Sequenced with let-bindings so that the plain rules are loaded first,
     exactly as the two separate definitions did. *)
  let plain = load false in
  let dependency = load true in
  plain, dependency
(* Sample inputs: pairs of a short label and a Polish sentence.
   Only the entries in the list below are active; the ones that were switched
   off are kept here for reference:
     "Szpak","Szpak śpiewa.";
     "miał","Miałem miał.";
     "Ala","Ala ma kota.";
     "Ale","Ale mają kota:";
     "zima","Szpak frunie zimą.";
     "październik","Kot miauczy w październiku.";
     "Szpak-Kot","Szpak frunie. Kot miauczy.";
     "powiedział","Szpak powiedział: „Frunę. Kiszę.”";
     "arabia","Arabia Saudyjska biegnie.";
     "Tom","Tom idzie.";
*)
let examples : (string * string) list =
  [ ("teraz", "Teraz frunie jakiś szpak.");
    ("chłopcy", "Chłopcy mają ulicę kwiatami.") ]
(* let clarify_categories senses token =
match token.ENIAMtokenizerTypes.token with
ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp)))
| ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp)))
| ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[])
| _ -> [] *)
(* let create_chart tokens lex_sems paths last =
ENIAM_LCGrenderer.reset_variable_numbers ();
let chart = ENIAM_LCGchart.make last in
let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
let t = ExtArray.get tokens id in
let s = ExtArray.get lex_sems id in
ENIAM_LCGrenderer.reset_variable_names ();
ENIAM_LCGrenderer.add_variable_numbers ();
let cats = clarify_categories ["X"] t in
let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
chart *)
(*let rec split_sons left id right = function
[] -> List.rev (List.sort compare left), List.sort compare right
| x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l
let rec dep_create_rec nodes sons conll_id =
let node = IntMap.find nodes conll_id in
let l = try IntMap.find sons conll_id with Not_found -> [] in
let left,right = split_sons [] conll_id [] l in
(* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *)
DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons))
let create_dep_chart tokens lex_sems paths =
(* print_endline "create_dep_chart 1"; *)
let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i ->
let _,super,_ = paths.(i) in
IntMap.add_inc sons super [i] (fun l -> i :: l)) in
(* print_endline "create_dep_chart 2"; *)
let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i ->
let id,_,_ = paths.(i) in
let t = ExtArray.get tokens id in
let s = ExtArray.get lex_sems id in
ENIAM_LCGrenderer.reset_variable_names ();
ENIAM_LCGrenderer.add_variable_numbers ();
let cats = clarify_categories ["X"] t in
let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata s.ENIAMlexSemanticsTypes.lex_entries in
IntMap.add nodes i l) in
(* print_endline "create_dep_chart 3"; *)
let x = dep_create_rec nodes sons 0 in
(* print_endline "create_dep_chart 4"; *)
x*)
(* let test_example path id tokens lex_sems paths last =
ENIAM_LCGreductions.reset_variant_label ();
let chart = create_chart tokens lex_sems paths last in
ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart;
let chart,references = ENIAM_LCGchart.lazify chart in
ENIAM_LCGlatexOf.print_chart path (id^"2_chart") "a4" chart;
ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references;
let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart;
ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references;
if ENIAM_LCGchart.is_parsed chart then (
let term = ENIAM_LCGchart.get_parsed_term chart in
Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
Xlatex.latex_compile_and_clean path (id^"4_term");
let dependency_tree = ENIAM_LCGreductions.reduce term references in
ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree;
if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree;
ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree;
ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree;
ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree;
())
else print_endline "not reduced")
else print_endline "not parsed" *)
(* let rec test_dep_example path id tokens lex_sems first_try paths =
(* print_endline "test_dep_example 1"; *)
let paths = CONLL_adapter.convert_dep_tree path first_try paths tokens in
try
ENIAM_LCGreductions.reset_variant_label ();
(* print_endline "test_dep_example 2"; *)
(* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *)
let chart = create_dep_chart tokens lex_sems paths in
(* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *)
let chart,references = ENIAM_LCGchart.dep_lazify chart in
(* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *)
(* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *)
let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
(* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *)
(* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *)
if ENIAM_LCGchart.is_dep_parsed chart then (
let term = ENIAM_LCGchart.get_dep_parsed_term chart in
(* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
Xlatex.latex_compile_and_clean path (id^"4_term"); *)
let dependency_tree = ENIAM_LCGreductions.reduce term references in
(* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *)
if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
(* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *)
ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
(* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *)
(* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *)
(* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *)
())
else print_endline "not reduced")
else (print_endline "not parsed";
parsed := 0)
with NotDepParsed(id_ndp,left,l,right) -> (
if (first_try)
then test_dep_example path id tokens lex_sems false paths
else (print_endline "not parsed 2";
parsed := 0;
ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right)))
let rec parse_sentence name id tokens lex_sems = function
RawSentence s -> id
| StructSentence(paths,last) ->
(* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *)
id + 1
| DepSentence(paths) ->
(* test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems true paths; *)
ENIAMexec.conll_parse_sentence 30. 1 (*("results/" ^ name ^ "/") (string_of_int id ^ "_")*) dep_rules tokens lex_sems (*false*) paths;
id + 1
| QuotedSentences sentences ->
Xlist.fold sentences id (fun id p ->
parse_sentence name id tokens lex_sems p.sentence)
| AltSentence l ->
Xlist.fold l id (fun id (mode,sentence) ->
parse_sentence name id tokens lex_sems sentence)
let rec parse_paragraph name id tokens lex_sems = function
RawParagraph s -> id
| StructParagraph sentences ->
Xlist.fold sentences id (fun id p ->
parse_sentence name id tokens lex_sems p.sentence)
| AltParagraph l ->
Xlist.fold l id (fun id (mode,paragraph) ->
parse_paragraph name id tokens lex_sems paragraph)
let rec parse_text name id tokens lex_sems = function
RawText s -> id
| StructText paragraphs ->
Xlist.fold paragraphs id (fun id paragraph ->
parse_paragraph name id tokens lex_sems paragraph)
| AltText l ->
Xlist.fold l id (fun id (mode,text) ->
parse_text name id tokens lex_sems text) *)
(* Number of identifiers handed out so far by [get_id]. *)
let id_counter = ref 0

(* [get_id ()] bumps [id_counter] and returns a fresh identifier of the form
   ID_<n>, where <n> is the new counter value (the first call yields ID_1). *)
let get_id () =
  let next = !id_counter + 1 in
  id_counter := next;
  Printf.sprintf "ID_%d" next
(* [get_query_id query] returns the identifier of a corpus query.

   The query is a pair whose first component must be an [AltText] holding a
   CONLL-mode [StructText] with exactly one [StructParagraph] of exactly one
   element, either as the only alternative or as the second of two.  The
   identifier stored in that element is returned; when it is the empty string
   a fresh one is generated with [get_id].

   Raises [Failure "get_query_id"] for any other shape. *)
let get_query_id query =
  match query with
  | AltText [_; (CONLL, StructText [StructParagraph [par]])], _
  | AltText [(CONLL, StructText [StructParagraph [par]])], _ ->
    (match par.id with
     | "" -> get_id ()
     | given -> given)
  | _ -> failwith "get_query_id"
(* [process_id s] normalises a sentence identifier.

   - An identifier starting with ID_ is returned unchanged.
   - Otherwise [s] must split on "/" into exactly three parts of the form
       NKJP_1M_<a> / morph_<b>-p / morph_<c>-s
     and the result is <a>.<c> (the middle part is only validated).

   Raises [Failure ("process_id: " ^ s)] when [s] fits neither form. *)
let process_id s =
  let reject () = failwith ("process_id: " ^ s) in
  if Xstring.check_prefix "ID_" s then s
  else
    match Xstring.split_delim "/" s with
    | [corpus; paragraph; sentence] ->
      let recognised =
        Xstring.check_prefix "NKJP_1M_" corpus
        && Xstring.check_prefix "morph_" paragraph
        && Xstring.check_sufix "-p" paragraph
        && Xstring.check_prefix "morph_" sentence
        && Xstring.check_sufix "-s" sentence in
      if not recognised then reject ()
      else
        let sentence_part =
          Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" sentence) in
        let corpus_part = Xstring.cut_prefix "NKJP_1M_" corpus in
        corpus_part ^ "." ^ sentence_part
    | _ -> reject ()
(* [process_conll_corpus filename] reads the CONLL corpus in [filename] and
   tries to parse every sentence in it, printing the parse statuses to stdout.

   For each query the function:
   - derives an identifier with [process_id (get_query_id query)] and creates
     the directory results/<id>/ (best effort, the exit status is ignored);
   - prints the CONLL sentence, converts its dependency tree twice with
     [CONLL_adapter.convert_dep_tree] (flag [true], then [false]) and stores
     both conversions as alternatives of one [DepSentence];
   - re-analyses the raw text with [ENIAMsubsyntax.parse_text_tokens] and puts
     the resulting sentences next to the CONLL ones in a single [AltParagraph];
   - assigns lexical semantics, runs [ENIAMexec.parse] and prints the status
     of every [ENIAMSentence] found in the result, followed by a newline.

   Raises [Failure "process_conll_corpus 2"] when a query does not have the
   expected Raw + CONLL single-sentence shape, [Failure "process_conll_corpus 1"]
   when the subsyntax output has an unexpected shape, and whatever
   [get_query_id] and [process_id] raise. *)
let process_conll_corpus filename =
  (* let escaped str = String.map (fun ch -> if ch = ' ' then '/' else
       if ch = '(' || ch = ')' then '_' else ch) (String.escaped str) in
     let sort_results folder path id str_query =
       (ignore (Sys.command ("mkdir -p " ^ "results/" ^ (escaped folder) ^ "/" ^ id ^ "/"));
        ignore (Sys.command ("cp -r " ^ path ^ " results/" ^ (escaped folder) ^ "/" ^ id ^ "/"));
        ignore (Sys.command ("rm -r " ^ path));
        let oc = open_out_gen [Open_append; Open_text; Open_creat] 0o640 ("results/" ^ folder ^ "/sentences.txt") in
        output_string oc str_query;
        close_out oc) in *)
  print_endline "process_conll_corpus 1";
  let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
  print_endline "process_conll_corpus 2";
  (* let corpus = [List.hd corpus] in *)
  Xlist.iter corpus (fun query ->
    (* parsed := 1; *)
    let id = process_id (get_query_id query) in
    let path = "results/" ^ id ^ "/" in
    (* The identifier comes from the corpus file, so the path is quoted before
       it reaches the shell: spaces or metacharacters in it can neither split
       the argument nor inject extra commands. *)
    ignore (Sys.command ("mkdir -p " ^ Filename.quote path));
    match query with
    | AltText[Raw,RawText query;CONLL,StructText[
        StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence [dep_paths]]} as p]]],tokens ->
      print_endline (CONLL.string_of_sentence_env CONLL tokens p);
      (* (try *)
      (* NOTE(review): the boolean looks like a first-try switch of the
         converter; both variants are kept as alternatives - confirm against
         CONLL_adapter. *)
      let paths1 = CONLL_adapter.convert_dep_tree "" true dep_paths tokens in
      let paths2 = CONLL_adapter.convert_dep_tree "" false dep_paths tokens in
      print_endline ("\nPróba sparsowania zdania:\n" ^ text ^ "\n");
      (* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
      let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence [paths1;paths2]]
        (*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
      (* From here on [text] and [tokens] are the subsyntax results. *)
      let text,tokens = ENIAMsubsyntax.parse_text_tokens false false tokens query in
      let sentences =
        (match text with
         | AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
         | _ -> failwith "process_conll_corpus 1") in
      (* Put the ENIAM and CONLL analyses side by side in one paragraph. *)
      let text = AltText[Raw,RawText query; Struct, StructText([
        AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
      (* print_endline (ENIAMsubsyntaxStringOf.token_extarray tokens);
         print_endline "";
         print_endline (ENIAMsubsyntaxStringOf.text "" tokens text);
         print_endline ""; *)
      let lex_sems = ENIAMlexSemantics.assign tokens text in
      (* ignore(parse_text id 1 tokens lex_sems text); *)
      let text = ENIAMexec.translate_text text in
      (* NOTE(review): 30. is presumably a timeout in seconds; the meaning of
         the integers 1 and 2 is not visible here - confirm against ENIAMexec. *)
      let text = ENIAMexec.parse 30. 1 2 rules dep_rules tokens lex_sems text in
      let statuses = ENIAMexecTypes.fold_text ENIAMexecTypes.Struct [] (fun _mode acc -> function
          ENIAMexecTypes.ENIAMSentence parse_result -> parse_result.ENIAMexecTypes.status :: acc
        | _ -> acc) text in
      List.iter (fun x -> print_string (ENIAMvisualization.string_of_status x ^ " ")) statuses;
      print_newline ()
      (* if !parsed = 1
         then sort_results "Parsed" path id str_query
         else sort_results "Not_parsed" path id str_query *)
      (* with
         Failure e -> (sort_results ("Failure_" ^ (escaped e)) path id str_query;
           print_endline ("Failure " ^ e))
         | e -> (sort_results (escaped @@ Printexc.to_string e) path id str_query;
           print_endline (Printexc.to_string e))) *)
    | _ -> failwith "process_conll_corpus 2")
(* [process_conll_corpus2 filename] reads the CONLL corpus in [filename],
   parses every sentence using the CONLL analysis only, and renders all
   results together with [ENIAMvisualization.print_html_text] under the
   directory "results/" and the name "parsed_corpus".

   For each query the function:
   - derives an identifier with [process_id (get_query_id query)] and creates
     the directory results/<id>/ (best effort, the exit status is ignored);
   - prints the CONLL sentence, converts its dependency tree twice with
     [CONLL_adapter.convert_dep_tree] (flag [true], then [false]) and stores
     both conversions as alternatives of one [DepSentence];
   - wraps them in a text, prints that text and the token array, assigns
     lexical semantics with [ENIAMdomainLexSemantics.assign2] and runs
     [ENIAMexec.parse];
   - keeps the single paragraph of the parsed text.
   The input order of the queries is preserved in the final document.

   Raises [Failure "process_conll_corpus2 2"] when a query does not have the
   expected Raw + CONLL single-sentence shape, [Failure "process_conll_corpus2 3"]
   when the parse result is not a Raw + Struct text with one paragraph, and
   whatever [get_query_id] and [process_id] raise.

   NOTE(review): the corpus is loaded with [ENIAM_CONLL.load_corpus] but
   matched with [CONLL.match_corpus]; confirm that mixing the two modules is
   intended. *)
let process_conll_corpus2 filename =
  print_endline "process_conll_corpus2 1";
  let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in
  print_endline "process_conll_corpus2 2";
  (* let corpus = [List.hd corpus] in *)
  let corpus = List.rev (Xlist.rev_map corpus (fun query ->
    (* parsed := 1; *)
    let id = process_id (get_query_id query) in
    let path = "results/" ^ id ^ "/" in
    (* The identifier comes from the corpus file, so the path is quoted before
       it reaches the shell: spaces or metacharacters in it can neither split
       the argument nor inject extra commands. *)
    ignore (Sys.command ("mkdir -p " ^ Filename.quote path));
    match query with
    | AltText[Raw,RawText query;CONLL,StructText[
        StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence [dep_paths]]} as p]]],tokens ->
      (* let text,tokens,msg =
           if !sentence_split = Full then ENIAMsubsyntax.catch_parse_text true !par_names text
           else ENIAMsubsyntax.catch_parse_text false !par_names text in
         let text,msg =
           if msg <> "" || not !perform_integration then text,msg else
           ENIAMpreIntegration.catch_parse_text ENIAMsubsyntaxTypes.Struct tokens text in
         let lex_sems,msg =
           if msg <> "" then ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem, msg
           else ENIAMlexSemantics.catch_assign tokens text in *)
      print_endline (CONLL.string_of_sentence_env CONLL tokens p);
      (* NOTE(review): the boolean looks like a first-try switch of the
         converter; both variants are kept as alternatives - confirm against
         CONLL_adapter. *)
      let paths1 = CONLL_adapter.convert_dep_tree "" true dep_paths tokens in
      let paths2 = CONLL_adapter.convert_dep_tree "" false dep_paths tokens in
      print_endline ("\nPróba sparsowania zdania:\n" ^ text ^ "\n");
      let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence [paths1;paths2]])}] in
      (* let text,tokens = ENIAMsubsyntax.parse_text_tokens false false tokens query in
         let sentences = match text with
             AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
           | _ -> failwith "process_conll_corpus 1" in *)
      (* Only the CONLL analysis is placed in the paragraph; the ENIAM
         alternative is switched off. *)
      let text = AltText[Raw,RawText query; Struct, StructText([
        AltParagraph[Raw,RawParagraph query; (*ENIAM, StructParagraph sentences;*) CONLL, conll]])] in
      print_endline (ENIAMsubsyntaxStringOf.text "" tokens text);
      print_endline (ENIAMsubsyntaxStringOf.token_extarray tokens);
      (* let lex_sems = ENIAMlexSemantics.assign tokens text in *)
      let lex_sems = ENIAMdomainLexSemantics.assign2 tokens text in
      let text = ENIAMexec.translate_text text in
      (* NOTE(review): 30. is presumably a timeout in seconds; the meaning of
         the integers 1 and 2 is not visible here - confirm against ENIAMexec. *)
      let text = ENIAMexec.parse 30. 1 2 rules dep_rules tokens lex_sems text in
      (match text with
       | ENIAMexecTypes.AltText[ENIAMexecTypes.Raw,ENIAMexecTypes.RawText _; ENIAMexecTypes.Struct,ENIAMexecTypes.StructText[parsed_paragraph]] -> parsed_paragraph
       | _ -> failwith "process_conll_corpus2 3")
    | _ -> failwith "process_conll_corpus2 2")) in
  (* Gather all parsed paragraphs into one document with an empty raw text. *)
  let corpus = ENIAMexecTypes.AltText[ENIAMexecTypes.Raw,ENIAMexecTypes.RawText ""; ENIAMexecTypes.Struct,ENIAMexecTypes.StructText corpus] in
  (* NOTE(review): the visualiser receives a fresh one-element token array of
     [empty_token_env], not the tokens of the parsed sentences - confirm this
     is enough for the HTML output. *)
  ENIAMvisualization.print_html_text "results/" "parsed_corpus" corpus 1 1 (ExtArray.make 1 ENIAMtokenizerTypes.empty_token_env)
(* Program entry point.
   Prints numbered progress markers, switches on backtrace recording, sets up
   the ENIAM resources used below and then runs [process_conll_corpus2] on one
   hard-coded test corpus.  The commented-out calls are alternative corpora
   and an older statistics facility ([LCGfields]). *)
let _ =
print_endline "test_conll 1";
(* Record backtraces from here on. *)
Printexc.record_backtrace true;
print_endline "test_conll 2";
(* ENIAMlexSemantics.initialize (); *)
(* Initialisation: turn the user ontology flag on, initialise the categories,
   load the semantic lexicon from data/sem-lexicon.dic and initialise the
   domain lexical semantics.  The order of these calls is kept as written. *)
ENIAMsemTypes.user_ontology_flag := true;
ENIAMcategoriesPL.initialize ();
ENIAMsemLexicon.sem_lexicon := ENIAMsemLexicon.load_lexicon "data/sem-lexicon.dic";
ENIAMdomainLexSemantics.initialize2 ();
print_endline "test_conll 3";
(* LCGfields.reset (); *)
(* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
(* process_conll_corpus "resultsFF1/Failure_find_father1/sentences.txt"; *)
(* process_conll_corpus2 "../testy/skladnica-test1.conll"; *)
(* The only corpus that is currently processed. *)
process_conll_corpus2 "../testy/skladnica-test1b.conll";
(* process_conll_corpus "../testy/skladnica-test1-Not_found.conll"; *)
(* process_conll_corpus "../testy/zdania_generujace_blady1/sentences5.txt";*)
(* LCGfields.print_results () *)