ENIAMpreIntegration.ml
14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
(*
* ENIAMintegration, a library that integrates ENIAM with other parsers.
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>, Jan Lupa, Daniel Oklesiński
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
let concraft_enabled = ref false
let mate_parser_enabled = ref false
let swigra_enabled = ref false
let polfie_enabled = ref false
let concraft_model_filename = "../tools/concraft/nkjp-model-0.2.gz"
let concraft_server_pid = ref (-1)
let mate_parser_path = "../tools/mate-tools/"
(* let mate_model_filename = "examples/160622_Polish_MateParser.mdl" *)
let mate_model_filename = "examples/180322_PDBMate.mdl"
let mate_in = ref stdin
let mate_out = ref stdout
let swigra_path = "../tools/swigra/parser"
let swigra_in = ref stdin
let swigra_out = ref stdout
let swigra_err = ref stdin
let polfie_path = "../tools/polfie"
let polfie_results = "results/polfie"
let polfie_in = ref stdin
let polfie_out = ref stdout
let polfie_err = ref stdin
let concraft_exists () =
let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
let close_check () = Unix.close_process_full (check_in, check_out, check_err) in
try
ignore @@ input_line check_in;
ignore @@ close_check ();
true
with End_of_file -> ignore @@ close_check (); false
let wait_for_concraft_server () =
let rec wait s a =
try Unix.connect s a
with e -> Unix.sleep 1; wait s a in
let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in
let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in
wait s a;
Unix.shutdown s Unix.SHUTDOWN_SEND;
Unix.close s
let start_concraft_server m =
let client_out, server_out = Unix.pipe () in
let client_err, server_err = Unix.pipe () in
let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; m|]
Unix.stdin server_out server_err in
List.iter Unix.close [client_out; server_out; client_err; server_err];
wait_for_concraft_server ();
pid
let stop_concraft_server pid =
Unix.kill pid Sys.sigint
let start_swigra_server dir =
let serv_in, serv_out, serv_err = Unix.open_process_full ("cd " ^ dir ^ "; ./swigra -w")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
ignore @@ input_line serv_in;
serv_in, serv_out, serv_err
let stop_swigra_server (serv_in, serv_out, serv_err) =
output_string serv_out "halt.\n";
ignore @@ Unix.close_process_full (serv_in, serv_out, serv_err)
(*
let rec read_until regexp in_channel success_msg timeout = (* FIXME: obecnie zawiesi się jeśli nie natknie się na linię matchującą do regexp *)
let line = input_line in_channel in
prerr_endline line;
if Str.string_match regexp line 0
then prerr_endline success_msg
else read_until regexp in_channel success_msg timeout
let start_polfie_server dir =
let serv_out = Unix.open_process_out ("cd " ^ dir ^ ";
export XLEPATH=/opt/xle-20130507;
export PATH=$HOME/bin:$XLEPATH/bin:/opt/morfeusz/bin:$PATH;
export LD_LIBRARY_PATH=$XLEPATH/lib:$LD_LIBRARY_PATH;
export DYLD_LIBRARY_PATH=$XLEPATH/lib:$DYLD_LIBRARY_PATH;
xle -noTk") in
serv_out
let stop_polfie_server (serv_in, serv_out, serv_err) =
output_string serv_out "exit\n";
ignore @@ Unix.close_process_out serv_out
let rec read_until regexp in_channel success_msg timeout = (* FIXME: obecnie zawiesi się jeśli nie natknie się na linię matchującą do regexp *)
let line = input_line in_channel in
prerr_endline line;
if Str.string_match regexp line 0
then prerr_endline success_msg
else read_until regexp in_channel success_msg timeout
let start_polfie_server dir =
let serv_in, serv_out, serv_err = Unix.open_process_full ("cd " ^ dir ^ ";
export XLEPATH=/opt/xle-20130507;
export PATH=$HOME/bin:$XLEPATH/bin:/opt/morfeusz/bin:$PATH;
export LD_LIBRARY_PATH=$XLEPATH/lib:$LD_LIBRARY_PATH;
export DYLD_LIBRARY_PATH=$XLEPATH/lib:$DYLD_LIBRARY_PATH;
xle -noTk")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
ignore @@ read_until (Str.regexp_string "/home/wjaworski/Dokumenty/ENIAM3/tools/polfie/.xlerc loaded.") serv_err "Polfie ready!" 0;
serv_in, serv_out, serv_err
let stop_polfie_server (serv_in, serv_out, serv_err) =
output_string serv_out "exit\n";
ignore @@ Unix.close_process_full (serv_in, serv_out, serv_err)
*)
let start_polfie_server dir = stdin, stdout, stdin
let stop_polfie_server (_,_,_) = ()
let initialize () =
if !concraft_enabled then (
if not (concraft_exists ()) then failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly." else
if not (Sys.file_exists concraft_model_filename) then failwith "Concraft model file does not exist." else
print_endline "Starting Concraft Server";
concraft_server_pid := start_concraft_server concraft_model_filename;
print_endline "Server started");
if !mate_parser_enabled then (
let m_in, m_out =
Unix.open_process ("java -jar " ^ mate_parser_path ^ "dist/anna-3.5.jar -model " ^
mate_parser_path ^ mate_model_filename ^ " -test") in
mate_in := m_in;
mate_out := m_out);
if !swigra_enabled then (
let a,b,c = start_swigra_server swigra_path in
swigra_in := a;
swigra_out := b;
swigra_err := c);
if !polfie_enabled then (
let a,b,c = start_polfie_server polfie_path in
polfie_in := a;
polfie_out := b;
polfie_err := c)
let stop_servers () =
if !concraft_enabled then stop_concraft_server !concraft_server_pid;
if !swigra_enabled then stop_swigra_server (!swigra_in, !swigra_out, !swigra_err);
if !polfie_enabled then stop_polfie_server (!polfie_in, !polfie_out, !polfie_err)
let read_whole_channel c =
let r = ref [] in
try
while true do
r := (input_line c) :: !r
done;
!r
with End_of_file -> List.rev (!r)
let concraft_parse s =
let concraft_in, concraft_out, concraft_err =
Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
let result = read_whole_channel concraft_in in
ignore (Unix.close_process_full (concraft_in, concraft_out, concraft_err));
if err_msg <> "" then failwith err_msg else
result
let rec process_concraft_result orth lemma interp others rev = function
[] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
| "" :: l -> process_concraft_result orth lemma interp others rev l
| line :: l ->
(match Xstring.split_delim "\t" line with
[orth2;s] when s = "none" || s = "space" ->
if orth = "" then process_concraft_result orth2 lemma interp others rev l
else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l
| ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l
| ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
| _ -> failwith ("process_concraft_result: " ^ line))
let make_token (orth,l) =
if l = [] then failwith "make_token 1" else
let lemma,interp = List.hd l in
let cat,interp = match Xstring.split ":" interp with
cat :: l -> cat, [Xlist.map l (fun tag -> [tag])]
| _ -> failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ interp) in
{empty_token_env with orth = orth; token = Lemma(lemma,cat,interp)}
let parse_mate tokens pbeg s =
let result = concraft_parse s in
let l = process_concraft_result "" "" "" [] [] result in
let l = Xlist.map l make_token in
let l = {empty_token_env with token = Interp "<conll_root>"} :: l in
let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
let _ = ENIAM_CONLL.establish_lengths pbeg s l tokens in
let dep_paths = Array.of_list l in
(* parse_conll tokens dep_paths; *)
dep_paths
let get_paths old_paths = function
{sentence=DepSentence[paths]},_ ->
Int.iter 0 (Array.length paths - 1) (fun i ->
let id,_,_ = old_paths.(i) in
let _,super,label = paths.(i) in
paths.(i) <- id,super,label);
paths
| _ -> failwith "get_paths"
let rec parse_mate_sentence tokens pbeg s =
let paths = parse_mate tokens pbeg s in
print_endline "parse_mate2 1";
(* print_endline (Visualization.html_of_dep_sentence tokens paths); *)
let conll = ENIAM_CONLL.string_of_paths Mate tokens paths in
print_endline "parse_mate2 2";
(* printf "|%s|\n" conll; *)
Printf.fprintf !mate_out "%s%!" conll;
print_endline "parse_mate2 3";
let new_paths = get_paths paths (ENIAM_CONLL.load_sentence !mate_in) in
print_endline "parse_mate2 4";
(* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *)
let paths1 = CONLL_adapter.convert_dep_tree "" true new_paths tokens in
let paths2 = CONLL_adapter.convert_dep_tree "" false new_paths tokens in
print_endline "parse_mate2 5";
DepSentence[paths1;paths2]
let curl_swigra s =
let curl_in, curl_out, curl_err = Unix.open_process_full ("curl 'http://localhost:3333/swigra' --data-urlencode 'q=" ^ s ^ "'")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
try
while true do
ignore @@ input_line curl_in
done
with End_of_file -> ignore @@ Unix.close_process_full (curl_in, curl_out, curl_err)
(* let print_swigra_xml dir =
let xml_in = open_in @@ dir ^ "/httpd/forest-disamb.xml" in
try
while true do
print_endline @@ input_line xml_in
done
with End_of_file -> close_in xml_in *)
let parse_swigra_sentence tokens pbeg s =
(* print_endline (ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n"); *)
curl_swigra s;
(* print_endline "s1"; *)
let xml = Xml.parse_file (swigra_path ^ "/httpd/forest-disamb.xml") in
(* print_endline "s2"; *)
(* try
*) let conll = SkladnicaXmlToConll.parse xml in
(* print_endline "s3"; *)
(* print_swigra_xml swigra_path; *)
(* print_endline (conll ^ "\n"); *)
let sentence = ENIAM_CONLL.parse_sentence conll in
let _, conll_tokens = match sentence with
{sentence=DepSentence[paths]}, tokens_from_conll -> paths, tokens_from_conll
| _ -> failwith "parse_swigra_sentence" in
(* print_endline (ENIAMsubsyntaxStringOf.token_extarray conll_tokens ^ "\n"); *)
let l = List.rev @@ Int.fold 0 (ExtArray.size conll_tokens) [] (fun acc i ->
ExtArray.get conll_tokens i :: acc) in
let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
let _ = ENIAM_CONLL.establish_lengths pbeg s l tokens in
let paths = Array.of_list l in
(* print_endline (ENIAMsubsyntaxStringOf.dep_sentence "" tokens paths ^ "\n"); *)
(* print_endline (ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n"); *)
let new_paths = get_paths paths sentence in
(* print_endline (ENIAMsubsyntaxStringOf.dep_sentence "" tokens new_paths ^ "\n"); *)
let paths1 = CONLL_adapter.convert_dep_tree "" true new_paths tokens in
let paths2 = CONLL_adapter.convert_dep_tree "" false new_paths tokens in
DepSentence[paths1;paths2]
(* with
| _ -> RawSentence s
*)
let parse_polfie_sentence s =
(* let return_directory = Sys.getcwd() in
Sys.chdir(polfie_path);
File.file_out "sentences" (fun oc -> output_string oc s; flush oc);
prerr_endline s;
(* ignore @@ Sys.command("echo " ^ s ^ " > sentences; cat sentence");
ignore @@ Sys.command("python morf2xle-Walenty.py sentence sentences morfeusz_interactive_dict"); *)
output_string !polfie_out ("parse-testfile sentences -outputPrefix " ^ return_directory ^ "/" ^ polfie_results ^ "/fstructure-\n");
(* output_string !polfie_out ("parse {" ^ s ^ "}\n"); *)
flush !polfie_out;
Sys.chdir(return_directory);
read_until (Str.regexp_string "statistics printed on sentences.stats.") !polfie_in "Sentence parsed by POLFIE!" 0;
*) let fstructure = File.load_file (polfie_results ^ "/fstructure-1.pl") in
(* print_bytes fstructure; *)
RawSentence fstructure
let compare_mode (x,_) (y,_) = compare_mode x y
let rec parse_sentence mode tokens pbeg = function
RawSentence s ->
[Raw,RawSentence s] @
(if !mate_parser_enabled && !concraft_enabled then [Mate,parse_mate_sentence tokens pbeg s] else []) @
(if !swigra_enabled then [Swigra,parse_swigra_sentence tokens pbeg s] else []) @
(if !polfie_enabled then [POLFIE,parse_polfie_sentence s] else [])
| StructSentence(paths,last) -> [mode,StructSentence(paths,last)]
| DepSentence(paths) -> [mode,DepSentence paths]
| QuotedSentences sentences ->
let sentences = Xlist.rev_map sentences (fun p ->
let sentence = parse_sentence mode tokens p.beg p.sentence in (* FIXME: p.pbeg czy pbeg *)
let sentence = match sentence with
[_,s] -> s
| _ -> failwith "ENIAMpreIntegration.parse_sentence" in
{p with sentence=sentence}) in
[mode,QuotedSentences(List.rev sentences)]
| AltSentence l ->
let l = List.flatten (Xlist.rev_map l (fun (mode,sentence) ->
parse_sentence mode tokens pbeg sentence)) in
[mode,AltSentence(Xlist.sort l compare_mode)]
| ErrorSentence s -> [mode,ErrorSentence s]
let rec parse_paragraph mode tokens = function
RawParagraph s -> RawParagraph s
| StructParagraph sentences ->
let sentences = Xlist.rev_map sentences (fun p ->
let sentence = parse_sentence mode tokens p.beg p.sentence in
let sentence = match sentence with
[_,s] -> s
| _ -> failwith "ENIAMpreIntegration.parse_paragraph" in
{p with sentence=sentence}) in
StructParagraph(List.rev sentences)
| AltParagraph l ->
let l = Xlist.rev_map l (fun (mode,paragraph) ->
mode, parse_paragraph mode tokens paragraph) in
AltParagraph(List.rev l)
| ErrorParagraph s -> ErrorParagraph s
let rec parse_text mode tokens = function
RawText s -> RawText s
| StructText paragraphs ->
let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
parse_paragraph mode tokens paragraph) in
StructText(List.rev paragraphs)
| AltText l -> AltText(Xlist.map l (fun (mode,text) ->
mode, parse_text mode tokens text))
| ErrorText s -> ErrorText s
let catch_parse_text mode tokens text =
try
parse_text mode tokens text,""
with e ->
text, Printexc.to_string e