ENIAMsentences.ml
11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Printf
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
open Xstd
let find_bracket_begs tokens chart last beg_selector =
Int.fold 0 last [] (fun found lnode ->
Xlist.fold chart.(lnode) found (fun found (id,rnode) ->
if beg_selector tokens id then (id,lnode,rnode) :: found else found))
let rec find_bracket_ends tokens chart mid_selector end_selector found nodes =
if IntSet.is_empty nodes then found else
let lnode = IntSet.min_elt nodes in
let nodes = IntSet.remove nodes lnode in
let found,nodes = Xlist.fold chart.(lnode) (found,nodes) (fun (found,nodes) (id,rnode) ->
if end_selector tokens id then (id,lnode,rnode) :: found, nodes else
if mid_selector tokens id then found, IntSet.add nodes rnode else
found, nodes) in
find_bracket_ends tokens chart mid_selector end_selector found nodes
let rec find_bracket_paths tokens chart mid_selector rnode map nodes =
if IntSet.is_empty nodes then failwith "find_bracket_paths" else
let lnode = IntSet.min_elt nodes in
let ids = try IntMap.find map lnode with Not_found -> IntSet.empty in
if lnode = rnode then IntSet.to_list ids else
let nodes = IntSet.remove nodes lnode in
let map,nodes = Xlist.fold chart.(lnode) (map,nodes) (fun (map,nodes) (id,rnode) ->
if not (mid_selector tokens id) then map, nodes else
let ids = IntSet.add ids id in
let map = IntMap.add_inc map rnode ids (fun set -> IntSet.union set ids) in
let nodes = IntSet.add nodes rnode in
map,nodes) in
find_bracket_paths tokens chart mid_selector rnode map nodes
let get_raw_sentence a beg len =
let next = beg + len in
let beg = beg / factor + (if beg mod factor < 50 then 0 else 1) in
let next = next / factor + (if next mod factor < 50 then 0 else 1) in
(* if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else *)
let buf = Buffer.create 512 in
(* Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i -> *)
Int.iter beg (next - 1) (fun i ->
(* printf "%d" i; printf " %s\n%!" a.(i); *)
Buffer.add_string buf a.(i)(*try a.(i) with _ -> "<" ^ string_of_int i ^ ">"*));
Buffer.contents buf
let parse_bracket_rule paragraph tokens chart last beg_selector mid_selector end_selector command =
let begs = find_bracket_begs tokens chart last beg_selector in
let found = Xlist.fold begs [] (fun found (beg_id,beg_l,beg_r) ->
let ends = find_bracket_ends tokens chart mid_selector end_selector [] (IntSet.singleton beg_r) in
(* if ends = [] then failwith "parse_bracket_rule: end not found" else *)
Xlist.fold ends found (fun found (end_id,end_l,end_r) ->
let ids = find_bracket_paths tokens chart mid_selector end_l IntMap.empty (IntSet.singleton beg_r) in
(beg_id,ids,end_id,beg_l,end_r) :: found)) in
Xlist.iter found (fun (beg_id,ids,end_id,lnode,rnode) ->
let beg_t = ExtArray.get tokens beg_id in
let end_t = ExtArray.get tokens end_id in
let beg = beg_t.beg in
let len = end_t.beg+end_t.len-beg_t.beg in
let id = ExtArray.add tokens {empty_token_env with orth=get_raw_sentence paragraph beg len;
beg=beg; len=len; next=end_t.next;
token=command (*tokens*) (beg_id::end_id::ids)} in
chart.(lnode) <- (id,rnode) :: chart.(lnode))
let find_or_sentence paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "<or>")
(fun tokens id -> false)
(fun tokens id -> (ExtArray.get tokens id).token = Interp "<sentence>")
(fun ids -> Interp "<or-sentence>")
let find_slash_or_sentence paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "</sentence>")
(fun tokens id -> false)
(fun tokens id -> (ExtArray.get tokens id).token = Interp "</or>")
(fun ids -> Interp "</or-sentence>")
let find_sentence paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id ->
let t = (ExtArray.get tokens id).token in
t = Interp "<sentence>" || t = Interp "<or-sentence>")
(fun tokens id ->
let t = (ExtArray.get tokens id).token in
t <> Interp "<sentence>" && t <> Interp "<or-sentence>" && t <> Interp "</sentence>")
(fun tokens id -> (ExtArray.get tokens id).token = Interp "</sentence>")
(fun ids -> Tokens("sentence",ids))
let find_quoted_sentences paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "„s")
(fun tokens id ->
match (ExtArray.get tokens id).token with
Tokens("sentence",_) -> true
| _ -> false)
(fun tokens id -> (ExtArray.get tokens id).token = Interp "”s")
(fun ids -> Tokens("quoted_sentences",ids))
let find_query paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
(fun tokens id ->
match (ExtArray.get tokens id).token with
Tokens("sentence",_) -> true
| Tokens("quoted_sentences",_) -> true
| _ -> false)
(fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
(fun ids -> Tokens("query",ids))
let find_tokens_in_chart tokens chart lnode rnode cat =
let found = Xlist.fold chart.(lnode) [] (fun found (id,rnode2) ->
if rnode = rnode2 then
let t = ExtArray.get tokens id in
match t.token with
Tokens(cat2,ids) -> if cat = cat2 then ids :: found else found
| _ -> found
else found) in
match found with
[x] -> x
| _ -> failwith "Unable to extract sentences. Check puntuation."
(*let find_tokens_in_chart_id tokens chart lnode rnode cat =
let found = Int.fold 0 last [] (fun ids lnode ->
Xlist.fold chart.(lnode) ids (fun ids (id,rnode) ->
id
(id,lnode,rnode) :: paths)) in*)
let rec add_struct_sentence_ids_rec pid n sentences =
Xlist.fold sentences ([],n) (fun (l,n) -> function
{sentence=AltSentence[Raw,s;Struct,QuotedSentences sentences]} as p ->
let sentences, n = add_struct_sentence_ids_rec pid n sentences in
{p with sentence=AltSentence[Raw,s;Struct,QuotedSentences (List.rev sentences)]} :: l, n
| p -> {p with file_prefix=pid ^ string_of_int n} :: l, n+1)
let add_struct_sentence_ids pid sentences =
match sentences with
[{sentence=AltSentence[Raw,_;Struct,QuotedSentences _]}] -> List.rev (fst (add_struct_sentence_ids_rec pid 1 sentences))
| [p] -> [{p with file_prefix=pid}]
| _ -> List.rev (fst (add_struct_sentence_ids_rec pid 1 sentences))
let prepare_indexes paths =
let set = Xlist.fold paths IntSet.empty (fun set (_,beg,next) ->
IntSet.add (IntSet.add set beg) next) in
let map,last = Xlist.fold (Xlist.sort (IntSet.to_list set) compare) (IntMap.empty,0) (fun (map,n) x ->
IntMap.add map x n, n+1) in
List.rev (Xlist.rev_map paths (fun (id,beg,next) ->
(id,IntMap.find map beg,IntMap.find map next))), last - 1
let make_paths tokens ids =
let paths = Xlist.map ids (fun id ->
let t = ExtArray.get tokens id in
id,t.beg,t.next) in
prepare_indexes paths
let par_compare (s:sentence_env) (t:sentence_env) = compare (s.beg,s.next) (t.beg,t.next)
let rec extract_sentences_rec tokens id =
let t = ExtArray.get tokens id in
match t.token with
Tokens("sentence",ids) ->
let paths,last = make_paths tokens ids in
[{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix="";
sentence=AltSentence([Raw,RawSentence t.orth; ENIAM,StructSentence(paths,last)])}]
| Tokens("quoted_sentences",ids) ->
[{id=string_of_int id; beg=t.beg; len=t.len; next=t.next; file_prefix="";
sentence=AltSentence[Raw,RawSentence t.orth;
Struct,QuotedSentences(List.sort par_compare (List.flatten (Xlist.rev_map ids (extract_sentences_rec tokens))))]}]
| _ -> []
let extract_sentences pid tokens chart last =
let ids = find_tokens_in_chart tokens chart 0 last "query" in
let sentences = List.sort par_compare (List.flatten (Xlist.rev_map ids (fun id ->
extract_sentences_rec tokens id))) in
add_struct_sentence_ids pid sentences
(* let paths = Int.fold 0 last [] (fun paths lnode ->
Xlist.fold chart.(lnode) paths (fun paths (id,rnode) ->
(id,lnode,rnode) :: paths)) in
[{pid=string_of_int "xx"; pbeg=0; plen=0;
psentence=AltSentence[Raw,RawSentence paragraph;
ENIAM,StructSentence("",paths,last)]}]*)
(*
let is_sentence = function
Sentence _ -> true
| _ -> false
let get_sentence t =
match t.token with
Sentence(paths,last) -> paths,last
| _ -> failwith "get_sentence"
let rec find_query2 found map = function
[] -> found
| t :: l ->
if not (IntMap.mem map t.beg) then find_query2 found map l else
if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
if not (is_sentence t.token) then find_query2 found map l else
let tokens = IntSet.add (IntMap.find map t.beg) t.id in
find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l
let rec find_query found = function
[] -> failwith "find_query"
| t :: l ->
if t.beg = 0 && t.token = Interp "<query>" then
find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
else
if t.beg > 0 then found else find_query found l
let extract_sentences par (paths,last) =
let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
let paths,last = PrePaths.sort (paths,last) in
let found = find_query [] paths in
let pars = Xlist.fold found [] (fun pars set ->
Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
let paths,last = get_sentence t in
{pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
ENIAM,StructSentence("",paths,(*last*)10)]} :: sentences else sentences) :: pars) in (* FIXME: (*last*)10 !!!! *)
match pars with
[sentences] -> add_struct_sentence_ids sentences
| _ -> failwith "extract_sentences"
*)
let make_ids tokens paths =
Xlist.rev_map paths (fun t ->
let n = ExtArray.add tokens t in
n,t.beg,t.next)
let make_chart paths last =
let chart = Array.make (last+1) [] in
Xlist.iter paths (fun (id,beg,next) ->
chart.(beg) <- (id,next) :: chart.(beg));
chart
let split_into_sentences pid paragraph tokens paths =
let paths = make_ids tokens paths in
let paths,last = prepare_indexes paths in
let chart = make_chart paths last in
let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in
find_or_sentence par tokens chart last;
find_slash_or_sentence par tokens chart last;
find_sentence par tokens chart last;
find_quoted_sentences par tokens chart last;
find_query par tokens chart last;
extract_sentences pid tokens chart last