(* ENIAMpreIntegration.ml *)
(*
* ENIAMintegration, a library that integrates ENIAM with other parsers.
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>, Jan Lupa, Daniel Oklesiński
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
(** Run-time switches for the optional external tools. All start disabled. *)

(** When set, [parse_mate_sentence] builds a [DepSentence] with [parse_mate];
    otherwise it returns the raw sentence unchanged. *)
let concraft_enabled = ref false

(** When set, [parse_sentence] adds a [Mate] alternative for raw sentences. *)
let mate_parser_enabled = ref false

(** When set, [parse_sentence] adds a [Swigra] alternative for raw sentences. *)
let swigra_enabled = ref false

(** When set, [parse_sentence] adds a [POLFIE] alternative for raw sentences. *)
let polfie_enabled = ref false
(*
NOTE: To use Concraft, first start its server from the command line:
concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz
*)
(** [read_whole_channel c] reads lines from the input channel [c] until
    [End_of_file] and returns them in the order they were read.
    The channel is not closed. Exceptions other than [End_of_file]
    raised by [input_line] propagate to the caller. *)
let read_whole_channel c =
  let rec collect acc =
    (* Turn the end-of-file exception into an option so that the
       recursive call stays in tail position. *)
    let next = try Some (input_line c) with End_of_file -> None in
    match next with
    | Some line -> collect (line :: acc)
    | None -> List.rev acc
  in
  collect []
(** [process_concraft_result orth lemma interp others rev lines] folds the
    output lines of concraft into a list of [(orth, interpretations)] pairs.

    [orth], [lemma], [interp] and [others] describe the token currently being
    assembled; [rev] holds the finished tokens in reverse order. Each
    non-empty line is split on tabs and handled according to its shape:
    - two fields, the second being ["none"] or ["space"]: a new token starts;
      the pending token is stored first unless its [orth] is still empty;
    - three fields with an empty first field: an additional
      [(lemma, interp)] pair is prepended to [others];
    - four fields with an empty first field and ["disamb"] last: the pair
      becomes the current [lemma] and [interp].
    Empty lines are skipped. At the end of input the pending token is always
    emitted, with the current [(lemma, interp)] pair first in its list.

    @raise Failure on a line of any other shape. *)
let rec process_concraft_result orth lemma interp others rev lines =
  (* The accumulator extended with the token under construction. *)
  let with_current () = (orth, (lemma, interp) :: others) :: rev in
  match lines with
  | [] -> List.rev (with_current ())
  | "" :: rest -> process_concraft_result orth lemma interp others rev rest
  | line :: rest ->
    begin match Xstring.split_delim "\t" line with
    | [next_orth; ("none" | "space")] ->
      if orth = "" then
        (* Nothing is pending yet, so only the orth is updated. *)
        process_concraft_result next_orth lemma interp others rev rest
      else
        process_concraft_result next_orth "" "" [] (with_current ()) rest
    | [""; alt_lemma; alt_interp] ->
      process_concraft_result orth lemma interp
        ((alt_lemma, alt_interp) :: others) rev rest
    | [""; chosen_lemma; chosen_interp; "disamb"] ->
      process_concraft_result orth chosen_lemma chosen_interp others rev rest
    | _ -> failwith ("process_concraft_result: " ^ line)
    end
(** [concraft_parse s] pipes the text [s], followed by a newline, into
    [concraft-pl client] through a shell and converts the lines printed on
    its standard output with [process_concraft_result].

    The text is shell-quoted with [Filename.quote] and emitted through the
    printf utility with a plain string format, so double quotes, dollar
    signs, backquotes and backslashes in [s] are passed on verbatim instead
    of being interpreted by the shell.

    The child process is always released with [Unix.close_process_full],
    also when reading fails, so descriptors and child processes do not
    accumulate over repeated calls.

    @raise Failure with the collected text when anything was written to the
    standard error of the command.
    @raise Not_found when the PATH environment variable is not set. *)
let concraft_parse s =
  let command =
    "printf '%s\\n' " ^ Filename.quote s ^ " | concraft-pl client" in
  let environment = [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  let (concraft_in, _, concraft_err) as channels =
    Unix.open_process_full command environment in
  let result, err_msg =
    try
      (* Standard output is drained first: it carries the bulk of the data,
         and waiting for the end of stderr while stdout fills its pipe
         buffer would block both sides. *)
      let result = read_whole_channel concraft_in in
      let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
      result, err_msg
    with e ->
      (* Release the process before re-raising; a failure of the cleanup
         itself must not mask the original exception. *)
      (try ignore (Unix.close_process_full channels) with _ -> ());
      raise e in
  ignore (Unix.close_process_full channels);
  if err_msg <> "" then failwith err_msg else
  process_concraft_result "" "" "" [] [] result
(*let rec load_concraft_sentence white orth rev ic =
(* print_endline "load_concraft_sentence 1"; *)
(* print_endline ("concraft error message: " ^ input_line concraft_err); *)
let s = input_line ic in
(* print_endline ("load_concraft_sentence: " ^ s); *)
if s = "" then List.rev rev else
match Xstring.split_delim "\t" s with
[""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic
| [""; lemma; interp] -> load_concraft_sentence white orth rev ic
| [orth; white] -> load_concraft_sentence white orth rev ic
| _ -> failwith ("load_concraft_sentence: " ^ s)*)
(** [make_token (orth, interpretations)] builds a token record from the first
    [(lemma, tags)] pair of [interpretations]; the remaining pairs are
    ignored. The tag string is split on [":"]: the first part becomes the
    category and every further part is wrapped in a singleton list. The
    result is [empty_token_env] with [orth] and a [Lemma] token filled in.

    @raise Failure ["make_token 1"] when [interpretations] is empty.
    @raise Failure when splitting the tag string yields no parts. *)
let make_token (orth, interpretations) =
  match interpretations with
  | [] -> failwith "make_token 1"
  | (lemma, tag_string) :: _ ->
    begin match Xstring.split ":" tag_string with
    | cat :: tags ->
      let interp = [Xlist.map tags (fun tag -> [tag])] in
      {empty_token_env with orth = orth; token = Lemma (lemma, cat, interp)}
    | [] ->
      failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ tag_string)
    end
(** [parse_mate tokens pbeg s] tags the text [s] with [concraft_parse], turns
    every result into a token with [make_token], puts an
    [Interp "<conll_root>"] token in front, and adds all of them to [tokens]
    with [ExtArray.add]. Each added token yields a triple of the value
    returned by [ExtArray.add], [-1] and [""]
    (NOTE(review): presumably token id, head and dependency label - confirm).
    [ENIAM_CONLL.establish_lengths] is then called on the triples and its
    result discarded. The triples are returned as an array. *)
let parse_mate tokens pbeg s =
  (* print_endline ("parse_mate: " ^ s); *)
  (* Printf.fprintf concraft_out "%s\n\n%!" s;
     let l = load_concraft_sentence "" "" [] concraft_in in *)
  let word_tokens = Xlist.map (concraft_parse s) make_token in
  let root = {empty_token_env with token = Interp "<conll_root>"} in
  let entries =
    Xlist.map (root :: word_tokens) (fun t -> ExtArray.add tokens t, -1, "") in
  let _ = ENIAM_CONLL.establish_lengths pbeg s entries tokens in
  (* parse_conll tokens dep_paths; *)
  Array.of_list entries
(** [parse_mate_sentence tokens pbeg s] returns [DepSentence] built by
    [parse_mate] when [concraft_enabled] is set, and [RawSentence s]
    otherwise. *)
let parse_mate_sentence tokens pbeg s =
  if !concraft_enabled then DepSentence (parse_mate tokens pbeg s)
  else RawSentence s
(** Orders two pairs by their first components only, using the
    [compare_mode] that was in scope before this definition; the second
    components are ignored. *)
let compare_mode left right = compare_mode (fst left) (fst right)
(** [parse_sentence mode tokens pbeg sentence] returns a list of
    [(mode, sentence)] alternatives.

    - [RawSentence]: always yields a [Raw] entry, followed by [Mate],
      [Swigra] and [POLFIE] entries for whichever of the corresponding flags
      are set. Only the [Mate] entry is actually parsed, through
      [parse_mate_sentence]; the others carry the raw sentence.
    - [StructSentence] and [DepSentence]: returned unchanged under [mode].
    - [QuotedSentences]: every nested record is parsed recursively and must
      produce exactly one alternative.
    - [AltSentence]: every alternative is parsed under its own mode, the
      results are flattened and sorted with [compare_mode].

    @raise Failure ["ENIAMpreIntegration.parse_sentence"] when a quoted
    sentence does not produce exactly one alternative. *)
let rec parse_sentence mode tokens pbeg sentence =
  match sentence with
  | RawSentence s ->
    let raw = RawSentence s in
    let mate =
      if !mate_parser_enabled then [Mate, parse_mate_sentence tokens pbeg s]
      else [] in
    let swigra = if !swigra_enabled then [Swigra, raw] else [] in
    let polfie = if !polfie_enabled then [POLFIE, raw] else [] in
    (Raw, raw) :: List.concat [mate; swigra; polfie]
  | StructSentence (paths, last) -> [mode, StructSentence (paths, last)]
  | DepSentence paths -> [mode, DepSentence paths]
  | QuotedSentences sentences ->
    let parsed_rev = Xlist.rev_map sentences (fun p ->
      (* FIXME: unclear whether p.beg or the enclosing pbeg belongs here *)
      match parse_sentence mode tokens p.beg p.sentence with
      | [_, parsed] -> {p with sentence = parsed}
      | _ -> failwith "ENIAMpreIntegration.parse_sentence") in
    [mode, QuotedSentences (List.rev parsed_rev)]
  | AltSentence l ->
    let alternatives = Xlist.rev_map l (fun (alt_mode, alt) ->
      parse_sentence alt_mode tokens pbeg alt) in
    let flattened = List.flatten alternatives in
    [mode, AltSentence (Xlist.sort flattened compare_mode)]
(** [parse_paragraph mode tokens paragraph] applies [parse_sentence] to the
    sentences of a paragraph.

    - [RawParagraph]: returned as is.
    - [StructParagraph]: every sentence record is parsed with its own [beg]
      as the position argument and must produce exactly one alternative,
      which replaces the [sentence] field; the order of records is kept.
    - [AltParagraph]: every alternative is processed recursively under its
      own mode; the order of alternatives is kept.

    @raise Failure ["ENIAMpreIntegration.parse_paragraph"] when a sentence
    does not produce exactly one alternative. *)
let rec parse_paragraph mode tokens paragraph =
  match paragraph with
  | RawParagraph s -> RawParagraph s
  | StructParagraph sentences ->
    let parsed_rev = Xlist.rev_map sentences (fun p ->
      match parse_sentence mode tokens p.beg p.sentence with
      | [_, parsed] -> {p with sentence = parsed}
      | _ -> failwith "ENIAMpreIntegration.parse_paragraph") in
    StructParagraph (List.rev parsed_rev)
  | AltParagraph l ->
    let parsed_rev = Xlist.rev_map l (fun (alt_mode, alt) ->
      alt_mode, parse_paragraph alt_mode tokens alt) in
    AltParagraph (List.rev parsed_rev)
(** [parse_text mode tokens text] applies [parse_paragraph] to the paragraphs
    of a text.

    - [RawText]: returned as is.
    - [StructText]: every paragraph is processed with [parse_paragraph]
      under [mode]; the order of paragraphs is kept.
    - [AltText]: every alternative is processed recursively under its own
      mode. *)
let rec parse_text mode tokens text =
  match text with
  | RawText s -> RawText s
  | StructText paragraphs ->
    let parsed_rev =
      Xlist.rev_map paragraphs (parse_paragraph mode tokens) in
    StructText (List.rev parsed_rev)
  | AltText l ->
    let parsed = Xlist.map l (fun (alt_mode, alt) ->
      alt_mode, parse_text alt_mode tokens alt) in
    AltText parsed