ENIAMpreIntegration.ml
8.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
(*
* ENIAMintegration, a library that integrates ENIAM with other parsers.
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>, Jan Lupa, Daniel Oklesiński
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
let concraft_enabled = ref false
let mate_parser_enabled = ref false
let swigra_enabled = ref false
let polfie_enabled = ref false
let concraft_model_filename = "../tools/concraft/nkjp-model-0.2.gz"
let concraft_server_pid = ref (-1)
let mate_parser_path = "../tools/mate-tools/"
let mate_in = ref stdin
let mate_out = ref stdout
let concraft_exists () =
let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
let close_check () = Unix.close_process_full (check_in, check_out, check_err) in
try
ignore @@ input_line check_in;
ignore @@ close_check ();
true
with End_of_file -> ignore @@ close_check (); false
let wait_for_server () =
let rec wait s a =
try Unix.connect s a
with e -> Unix.sleep 1; wait s a in
let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in
let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in
wait s a;
Unix.shutdown s Unix.SHUTDOWN_SEND;
Unix.close s
let start_server m =
let client_out, server_out = Unix.pipe () in
let client_err, server_err = Unix.pipe () in
let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; m|]
Unix.stdin server_out server_err in
List.iter Unix.close [client_out; server_out; client_err; server_err];
wait_for_server ();
pid
let stop_server pid =
Unix.kill pid Sys.sigint
let initialize () =
if !concraft_enabled then (
if not (concraft_exists ()) then failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly." else
if not (Sys.file_exists concraft_model_filename) then failwith "Concraft model file does not exist." else
print_endline "Starting Concraft Server";
concraft_server_pid := start_server concraft_model_filename;
print_endline "Server started");
if !mate_parser_enabled then (
let m_in, m_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*)
Unix.open_process ("java -jar " ^ mate_parser_path ^ "dist/anna-3.5.jar -model " ^
mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test") in
mate_in := m_in;
mate_out := m_out)
let read_whole_channel c =
let r = ref [] in
try
while true do
r := (input_line c) :: !r
done;
!r
with End_of_file -> List.rev (!r)
let concraft_parse s =
let concraft_in, concraft_out, concraft_err =
Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
[|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
let result = read_whole_channel concraft_in in
ignore (Unix.close_process_full (concraft_in, concraft_out, concraft_err));
if err_msg <> "" then failwith err_msg else
result
let rec process_concraft_result orth lemma interp others rev = function
[] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
| "" :: l -> process_concraft_result orth lemma interp others rev l
| line :: l ->
(match Xstring.split_delim "\t" line with
[orth2;s] when s = "none" || s = "space" ->
if orth = "" then process_concraft_result orth2 lemma interp others rev l
else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l
| ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l
| ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
| _ -> failwith ("process_concraft_result: " ^ line))
let make_token (orth,l) =
if l = [] then failwith "make_token 1" else
let lemma,interp = List.hd l in
let cat,interp = match Xstring.split ":" interp with
cat :: l -> cat, [Xlist.map l (fun tag -> [tag])]
| _ -> failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ interp) in
{empty_token_env with orth = orth; token = Lemma(lemma,cat,interp)}
let parse_mate tokens pbeg s =
let result = concraft_parse s in
let l = process_concraft_result "" "" "" [] [] result in
let l = Xlist.map l make_token in
let l = {empty_token_env with token = Interp "<conll_root>"} :: l in
let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
let _ = ENIAM_CONLL.establish_lengths pbeg s l tokens in
let dep_paths = Array.of_list l in
(* parse_conll tokens dep_paths; *)
dep_paths
let get_paths old_paths = function
{sentence=DepSentence(paths)},_ ->
Int.iter 0 (Array.length paths - 1) (fun i ->
let id,_,_ = old_paths.(i) in
let _,super,label = paths.(i) in
paths.(i) <- id,super,label);
paths
| _ -> failwith "get_paths"
let rec parse_mate_sentence tokens pbeg s =
let paths = parse_mate tokens pbeg s in
print_endline "parse_mate2 1";
(* print_endline (Visualization.html_of_dep_sentence tokens paths); *)
let conll = ENIAM_CONLL.string_of_paths Mate tokens paths in
print_endline "parse_mate2 2";
(* printf "|%s|\n" conll; *)
Printf.fprintf !mate_out "%s%!" conll;
print_endline "parse_mate2 3";
let new_paths = get_paths paths (ENIAM_CONLL.load_sentence !mate_in) in
print_endline "parse_mate2 4";
(* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *)
DepSentence new_paths
let compare_mode (x,_) (y,_) = compare_mode x y
let rec parse_sentence mode tokens pbeg = function
RawSentence s ->
[Raw,RawSentence s] @
(if !mate_parser_enabled && !concraft_enabled then [Mate,parse_mate_sentence tokens pbeg s] else []) @
(if !swigra_enabled then [Swigra,RawSentence s] else []) @
(if !polfie_enabled then [POLFIE,RawSentence s] else [])
| StructSentence(paths,last) -> [mode,StructSentence(paths,last)]
| DepSentence(paths) -> [mode,DepSentence paths]
| QuotedSentences sentences ->
let sentences =Xlist.rev_map sentences (fun p ->
let sentence = parse_sentence mode tokens p.beg p.sentence in (* FIXME: p.pbeg czy pbeg *)
let sentence = match sentence with
[_,s] -> s
| _ -> failwith "ENIAMpreIntegration.parse_sentence" in
{p with sentence=sentence}) in
[mode,QuotedSentences(List.rev sentences)]
| AltSentence l ->
let l = List.flatten (Xlist.rev_map l (fun (mode,sentence) ->
parse_sentence mode tokens pbeg sentence)) in
[mode,AltSentence(Xlist.sort l compare_mode)]
let rec parse_paragraph mode tokens = function
RawParagraph s -> RawParagraph s
| StructParagraph sentences ->
let sentences = Xlist.rev_map sentences (fun p ->
let sentence = parse_sentence mode tokens p.beg p.sentence in
let sentence = match sentence with
[_,s] -> s
| _ -> failwith "ENIAMpreIntegration.parse_paragraph" in
{p with sentence=sentence}) in
StructParagraph(List.rev sentences)
| AltParagraph l ->
let l = Xlist.rev_map l (fun (mode,paragraph) ->
mode, parse_paragraph mode tokens paragraph) in
AltParagraph(List.rev l)
let rec parse_text mode tokens = function
RawText s -> RawText s
| StructText paragraphs ->
let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
parse_paragraph mode tokens paragraph) in
StructText(List.rev paragraphs)
| AltText l -> AltText(Xlist.map l (fun (mode,text) ->
mode, parse_text mode tokens text))
let catch_parse_text mode tokens text =
try
parse_text mode tokens text,""
with e ->
text, Printexc.to_string e