test_conll.ml
(*
* ENIAMcorpora is a library that integrates ENIAM with corpora in CONLL format
* Copyright (C) 2016 Daniel Oklesinski <oklesinski dot daniel atSPAMfree gmail dot com>
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGlexiconTypes
open ENIAM_LCGtypes
open ENIAMsubsyntaxTypes
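(* Lexicon rules loaded once at start-up: [rules] is used when building the
   constituency chart and [dep_rules] (created with the flag set) when
   building the dependency chart. *)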
let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename
let dep_rules = ENIAM_LCGlexicon.make_rules true ENIAM_LCGlexiconTypes.rules_filename
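(* Polish test sentences as (label, text) pairs; the commented-out entries
   are alternative test cases kept for reference. *)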
let examples = [
(* "Szpak","Szpak śpiewa.";*)
(* "miał","Miałem miał."; *)
(* "Ala","Ala ma kota.";
"Ale","Ale mają kota:"; *)
(* "zima","Szpak frunie zimą.";*)
(* "październik","Kot miauczy w październiku."; *)
(* "Szpak-Kot","Szpak frunie. Kot miauczy.";
"powiedział","Szpak powiedział: „Frunę. Kiszę.”";*)
"teraz","Teraz frunie jakiś szpak.";
"chłopcy","Chłopcy mają ulicę kwiatami.";
(* "arabia","Arabia Saudyjska biegnie.";*)
(* "Tom","Tom idzie."; *)
]
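(* Expands a token into its grammatical categories: every interpretation of a
   Lemma or Proper token is passed to ENIAMcategoriesPL.clarify_categories
   (with the proper-name flag set for Proper), Interp tokens are treated as
   the "interp" part of speech, and all other tokens yield no categories. *)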
let clarify_categories senses token =
match token.ENIAMtokenizerTypes.token with
ENIAMtokenizerTypes.Lemma(lemma,pos,interp) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp)))
| ENIAMtokenizerTypes.Proper(lemma,pos,interp,_) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp)))
| ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[])
| _ -> []
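(* Builds the LCG chart for a constituency parse: for each path entry
   (token id, left node, right node) the lexicon entries generated for that
   token are inserted into the chart between lnode and rnode. *)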
let create_chart tokens lex_sems paths last =
ENIAM_LCGrenderer.reset_variable_numbers ();
let chart = ENIAM_LCGchart.make last in
let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
let t = ExtArray.get tokens id in
let s = ExtArray.get lex_sems id in
ENIAM_LCGrenderer.reset_variable_names ();
ENIAM_LCGrenderer.add_variable_numbers ();
let cats = clarify_categories ["X"] t in
let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
chart
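(* Partitions the dependents of a head into those with a smaller id (returned
   in descending order) and those with a larger id (returned in ascending
   order). *)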
let rec split_sons left id right = function
[] -> List.rev (List.sort compare left), List.sort compare right
| x :: l -> if x < id then split_sons (x :: left) id right l else split_sons left id (x :: right) l
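(* Recursively assembles the DepNode tree rooted at the given CONLL id,
   building the subtrees for its left and right dependents. *)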
let rec dep_create_rec nodes sons conll_id =
let node = IntMap.find nodes conll_id in
let l = try IntMap.find sons conll_id with Not_found -> [] in
let left,right = split_sons [] conll_id [] l in
(* Printf.printf "dep_create_rec [%s] %d [%s]\n" (String.concat ";" (Xlist.map left string_of_int)) conll_id (String.concat ";" (Xlist.map right string_of_int)); *)
DepNode(conll_id, Xlist.map left (dep_create_rec nodes sons), node, Xlist.map right (dep_create_rec nodes sons))
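(* Builds the input structure for the dependency parser: [sons] maps each
   head position to its dependents, [nodes] maps each position to the lexicon
   entries of its token (built with [dep_rules]), and the tree is assembled
   starting from the artificial root at position 0. *)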
let create_dep_chart tokens lex_sems paths =
(* print_endline "create_dep_chart 1"; *)
let sons = Int.fold 1 (Array.length paths - 1) IntMap.empty (fun sons i ->
let _,super,_ = paths.(i) in
IntMap.add_inc sons super [i] (fun l -> i :: l)) in
(* print_endline "create_dep_chart 2"; *)
let nodes = Int.fold 0 (Array.length paths - 1) IntMap.empty (fun nodes i ->
let id,_,_ = paths.(i) in
let t = ExtArray.get tokens id in
let s = ExtArray.get lex_sems id in
ENIAM_LCGrenderer.reset_variable_names ();
ENIAM_LCGrenderer.add_variable_numbers ();
let cats = clarify_categories ["X"] t in
let l = ENIAM_LCGlexicon.create_entries dep_rules id t.ENIAMtokenizerTypes.orth cats s.ENIAMlexSemanticsTypes.schemata in
IntMap.add nodes i l) in
(* print_endline "create_dep_chart 3"; *)
let x = dep_create_rec nodes sons 0 in
(* print_endline "create_dep_chart 4"; *)
x
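(* Constituency-parsing test for a single sentence: builds the chart,
   lazifies and parses it with a time limit, reduces the resulting term and
   writes LaTeX and graph dumps after each stage; prints "not parsed" or
   "not reduced" when a stage fails. *)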
let test_example path id tokens lex_sems paths last =
ENIAM_LCGreductions.reset_variant_label ();
let chart = create_chart tokens lex_sems paths last in
ENIAM_LCGlatexOf.print_chart path (id^"1_chart") "a1" chart;
let chart,references = ENIAM_LCGchart.lazify chart in
ENIAM_LCGlatexOf.print_chart path (id^"2_chart") "a4" chart;
ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references;
let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* note: [references] is modified imperatively as a side effect *)
ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart;
ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references;
if ENIAM_LCGchart.is_parsed chart then (
let term = ENIAM_LCGchart.get_parsed_term chart in
Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
Xlatex.latex_compile_and_clean path (id^"4_term");
let dependency_tree = ENIAM_LCGreductions.reduce term references in
ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree;
if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
ENIAM_LCGreductions.assign_labels dependency_tree; (* note: [dependency_tree] is modified imperatively as a side effect *)
ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree;
ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: [dependency_tree] is modified imperatively as a side effect *)
ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree;
ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree;
ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree;
())
else print_endline "not reduced")
else print_endline "not parsed"
let test_dep_example path id tokens lex_sems paths =
try
ENIAM_LCGreductions.reset_variant_label ();
print_endline "test_dep_example 1";
let paths = CONLL_adapter.convert_dep_tree id (*first_try*) true paths tokens in
print_endline "test_dep_example 2";
(* ENIAMsubsyntaxHTMLof.print_dep_sentence path (id^"1_paths") tokens paths; *)
let chart = create_dep_chart tokens lex_sems paths in
(* ENIAM_LCGlatexOf.print_dep_chart path (id^"1_chart") "a1" chart; *)
let chart,references = ENIAM_LCGchart.dep_lazify chart in
(* ENIAM_LCGlatexOf.print_dep_chart path (id^"2_chart") "a4" chart; *)
(* ENIAM_LCGlatexOf.print_references path (id^"2_references") "a4" references; *)
let chart = ENIAM_LCGchart.dep_parse chart references 30. Sys.time in (* note: [references] is modified imperatively as a side effect *)
(* ENIAM_LCGlatexOf.print_chart path (id^"3_chart") "a4" chart; *)
(* ENIAM_LCGlatexOf.print_references path (id^"3_references") "a4" references; *)
if ENIAM_LCGchart.is_dep_parsed chart then (
let term = ENIAM_LCGchart.get_dep_parsed_term chart in
(* Xlatex.latex_file_out path (id^"4_term") "a4" false (fun file ->
Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
Xlatex.latex_compile_and_clean path (id^"4_term"); *)
let dependency_tree = ENIAM_LCGreductions.reduce term references in
(* ENIAM_LCGlatexOf.print_dependency_tree path (id^"4_dependency_tree") "a0" dependency_tree; *)
if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
ENIAM_LCGreductions.assign_labels dependency_tree; (* note: [dependency_tree] is modified imperatively as a side effect *)
(* ENIAM_LCGlatexOf.print_dependency_tree path (id^"5_dependency_tree") "a4" dependency_tree; *)
ENIAM_LCGreductions.remove_cuts dependency_tree; (* note: [dependency_tree] is modified imperatively as a side effect *)
(* ENIAM_LCGlatexOf.print_dependency_tree path (id^"6_dependency_tree") "a4" dependency_tree; *)
(* ENIAM_LCGgraphOf.print_dependency_tree path (id^"6_dependency_tree") dependency_tree; *)
(* ENIAM_LCGgraphOf.print_simplified_dependency_tree path (id^"6_simple_dependency_tree") dependency_tree; *)
())
else print_endline "not reduced")
else print_endline "not parsed"
with NotDepParsed(id_ndp,left,l,right) -> (
print_endline "not parsed 2";
ENIAM_LCGlatexOf.print_not_parsed_dep_chart path (id^"3_not_parsed_chart") "a2" (id_ndp,left,l,right))
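(* parse_sentence, parse_paragraph and parse_text below walk the parsed text
   structure, run the dependency test on every DepSentence and thread a
   numeric counter used to prefix the names of the result files. *)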
let rec parse_sentence name id tokens lex_sems = function
RawSentence s -> id
| StructSentence(paths,last) ->
(* test_example ("results/" ^ name^"/") (string_of_int id ^ "_") tokens lex_sems paths last; *)
id + 1
| DepSentence(paths) ->
test_dep_example ("results/" ^ name ^ "/") (string_of_int id ^ "_") tokens lex_sems paths;
id + 1
| QuotedSentences sentences ->
Xlist.fold sentences id (fun id p ->
parse_sentence name id tokens lex_sems p.sentence)
| AltSentence l ->
Xlist.fold l id (fun id (mode,sentence) ->
parse_sentence name id tokens lex_sems sentence)
let rec parse_paragraph name id tokens lex_sems = function
RawParagraph s -> id
| StructParagraph sentences ->
Xlist.fold sentences id (fun id p ->
parse_sentence name id tokens lex_sems p.sentence)
| AltParagraph l ->
Xlist.fold l id (fun id (mode,paragraph) ->
parse_paragraph name id tokens lex_sems paragraph)
let rec parse_text name id tokens lex_sems = function
RawText s -> id
| StructText paragraphs ->
Xlist.fold paragraphs id (fun id paragraph ->
parse_paragraph name id tokens lex_sems paragraph)
| AltText l ->
Xlist.fold l id (fun id (mode,text) ->
parse_text name id tokens lex_sems text)
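(* Identifier handling: get_id generates a fallback "ID_n" identifier and
   get_query_id extracts the paragraph id from a CONLL query, falling back to
   get_id when the paragraph carries no id. *)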
let id_counter = ref 0
let get_id () =
incr id_counter;
"ID_" ^ (string_of_int !id_counter)
let get_query_id = function
AltText[_;CONLL,StructText[StructParagraph[p]]],_ -> if p.id = "" then get_id () else p.id
| AltText[CONLL,StructText[StructParagraph[p]]],_ -> if p.id = "" then get_id () else p.id
| _ -> failwith "get_query_id"
let process_id s =
if Xstring.check_prefix "ID_" s then s else
let a,b,c = match Xstring.split_delim "/" s with
[a;b;c] -> a,b,c
| _ -> failwith ("process_id: " ^ s) in
if Xstring.check_prefix "NKJP_1M_" a && Xstring.check_prefix "morph_" b && Xstring.check_sufix "-p" b &&
Xstring.check_prefix "morph_" c && Xstring.check_sufix "-s" c then
Xstring.cut_prefix "NKJP_1M_" a ^ "." ^ Xstring.cut_sufix "-s" (Xstring.cut_prefix "morph_" c)
else failwith ("process_id: " ^ s)
let process_conll_corpus filename =
let corpus = File.file_in filename (fun file -> CONLL.match_corpus (ENIAM_CONLL.load_corpus file)) in
print_endline "process_conll_corpus";
(* let corpus = [List.hd corpus] in *)
Xlist.iter corpus (fun query -> try
let id = process_id (get_query_id query) in
let path = "results/" ^ id ^ "/" in
ignore (Sys.command ("mkdir -p " ^ path));
match query with
| AltText[Raw,RawText query;CONLL,StructText[
StructParagraph[{sentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]]],tokens ->
print_endline ("\nPróba sparsowania zdania:\n" ^ text ^ "\n");
(* let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in *)
let conll = StructParagraph[{p with sentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
(*@ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else []*))}] in
let text,tokens = ENIAMsubsyntax.parse_text_tokens tokens query in
let sentences = match text with
AltText[Raw,RawText _; Struct,StructText[AltParagraph[Raw,RawParagraph _; Struct,StructParagraph sentences]]] -> sentences
| _ -> failwith "process_conll_corpus 1" in
let text = AltText[Raw,RawText query; Struct, StructText([
AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]])] in
let lex_sems = ENIAMlexSemantics.assign tokens text in
ignore(parse_text id 1 tokens lex_sems text)
| _ -> failwith "process_conll_corpus 2"
with
Failure e -> print_endline ("Failure " ^ e)
| e -> print_endline (Printexc.get_backtrace () ^ "\n" ^ (Printexc.to_string e)))
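(* Entry point: enable backtraces and process the selected test corpus. *)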
let _ =
Printexc.record_backtrace true;
(* LCGfields.reset (); *)
(* process_conll_corpus "../../NLP resources/skladnica_zaleznosciowa.conll"; *)
(* process_conll_corpus "../testy/skladnica-test1.conll"; *)
process_conll_corpus "../testy/skladnica-test1-Failure.conll";
(* LCGfields.print_results () *)