ENIAMsubsyntax.ml
17.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
open Xstd
let load_lemma_frequencies filename map =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
Xlist.fold l map (fun map line ->
if String.length line = 0 then map else
if String.get line 0 = '#' then map else
match Str.split_delim (Str.regexp "\t") line with
[count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
| _ -> failwith ("load_lemma_frequencies: " ^ line))
let lemma_frequencies = ref (StringMap.empty : float StringMap.t)
let modify_weights paths =
List.rev (Xlist.fold paths [] (fun paths t ->
let w = Xlist.fold t.attrs t.weight (fun w -> function
TokNotFound -> w -. 25.
| LemmNotVal -> w -. 20.
| NotValProper -> w -. 1.
| LemmLowercase -> w -. 0.1
| _ -> w) in
let w = match t.token with
Lemma(lemma,cat,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
| Proper(lemma,cat,_,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
| _ -> w in
{t with weight = w} :: paths))
let translate_digs paths =
Xlist.map paths (fun t ->
match t.token with
Dig(lemma,"dig") -> t
| Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]])}
| Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]])}
| Dig(lemma,"year") -> {t with token=Proper(lemma,"year",[[]],["rok"])}
| Dig(lemma,"month") -> t (*{t with token=Proper(lemma,"month",[[]],["miesiąc"])}*)
| Dig(lemma,"hour") -> {t with token=Proper(lemma,"hour",[[]],["godzina"])}
| Dig(lemma,"day") -> {t with token=Proper(lemma,"day",[[]],["dzień"])}
| Dig(lemma,"minute") -> t (*{t with token=Proper(lemma,"minute",[[]],["minuta"])}*)
| Dig(lemma,"2dig") -> t
| Dig(lemma,"3dig") -> t
| Dig(lemma,"pref3dig") -> t
| RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]]); attrs=t.attrs}
| RomanDig(lemma,"month") -> t (*{t with token=Proper(lemma,"symbol",[[]],["month"]); attrs="roman" :: t.attrs}*)
| Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Proper(d ^ "." ^ m,"day-month",[[]],["data"])}
| Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Proper(h ^ ":" ^ m,"hour-minute",[[]],["godzina"])}
| Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Proper(x ^ ":" ^ y,"match-result",[[]],["rezultat"])}
| Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]])}
| Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]]); attrs=t.attrs}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]],["interwał"])}
| Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]],["interwał"])}
| Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Proper(d1 ^ "-" ^ d2,"day-interval",[[]],["interwał"])}
| Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"])}
| Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"]); attrs=Roman :: t.attrs}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
| Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
{t with token=Proper(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]],["interwał"])}
| Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Proper(h1 ^ "-" ^ h2,"hour-interval",[[]],["interwał"])}
| Compound("minute-interval",[Dig(m1,"minute");Dig(m2,"minute")]) -> t (*{t with token=Proper(m1 ^ "-" ^ m2,"minute-interval",[[]],["interwał"])}*)
| Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
| Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
| Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
| Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat)
| RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat)
| Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
| _ -> t)
(**********************************************************************************)
module OrderedStringList = struct
type t = string list
let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end
module OrderedStringListList = struct
type t = string list list
let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end
module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)
type tree = T of tree StringListMap.t | S of StringSet.t
let single_tags = function
[_] :: _ -> true
| _ -> false
let rec make_tree interp =
if single_tags interp then S (StringSet.of_list (List.flatten (List.flatten interp))) else
let map = Xlist.fold interp StringListMap.empty (fun map tags ->
StringListMap.add_inc map (List.hd tags) [List.tl tags] (fun l -> (List.tl tags) :: l)) in
T(StringListMap.map map make_tree)
let is_s_tree map =
StringListListMap.fold map false (fun b _ -> function
S _ -> true
| T _ -> b)
let rec fold_tree_rec rev s f = function
S set -> f s (List.rev rev) set
| T map -> StringListMap.fold map s (fun s tag tree ->
fold_tree_rec (tag :: rev) s f tree)
let fold_tree tree s f = fold_tree_rec [] s f tree
let rec combine_interps_rec map =
if is_s_tree map then
StringListListMap.fold map [] (fun interp tail_tags -> function
S tag -> ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) :: interp
| _ -> failwith "combine_interps_rec")
else
let map = StringListListMap.fold map StringListListMap.empty (fun map tail_tags tree ->
fold_tree tree map (fun map head_tags tag ->
StringListListMap.add_inc map ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) [head_tags] (fun l -> head_tags :: l))) in
combine_interps_rec (StringListListMap.map map make_tree)
let combine_interp interp =
let map = StringListListMap.add StringListListMap.empty [] (make_tree interp) in
combine_interps_rec map
let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
"inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]
let combine_interps paths =
List.rev (Xlist.rev_map paths (fun t ->
match t.token with
Lemma(lemma,pos,interp) ->
(* Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
let interp =
if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
else interp in
let interp =
if StringSet.mem combine_pos pos then combine_interp interp else
StringListListSet.to_list (StringListListSet.of_list interp) in
{t with token=Lemma(lemma,pos,interp)}
| Proper(lemma,pos,interp,cat) ->
(* Printf.printf "%s %s %s\n" lemma pos (PreTokenizer.string_of_interps interp); *)
let interp =
if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
else interp in
let interp =
if StringSet.mem combine_pos pos then combine_interp interp else
StringListListSet.to_list (StringListListSet.of_list interp) in
{t with token=Proper(lemma,pos,interp,cat)}
| _ -> t))
(**********************************************************************************)
let select_tokens paths =
List.rev (Xlist.fold paths [] (fun paths t ->
match t.token with
(* RomanDig(v,cat) -> {t with token=Lemma(v,cat,[[]])} :: paths
| Interp orth -> {t with token=Lemma(orth,"interp",[[]])} :: paths
| Dig(value,cat) -> {t with token=Lemma(value,cat,[[]])} :: paths
| Other2 orth -> {t with token=Lemma(orth,"unk",[[]])} :: paths
| Lemma(lemma,cat,interp) -> t :: paths
| Proper _ -> failwith "select_tokens"
| Compound _ -> t :: paths*)
(* RomanDig(v,cat) -> t :: paths *)
| Interp orth -> t :: paths
(* | Dig(value,cat) -> t :: paths *)
| Other orth -> t :: paths
| Lemma(lemma,pos,interp) -> if pos = "brev" then paths else t :: paths
| Proper(lemma,pos,interp,cat) -> if pos = "brev" then paths else t :: paths
(* | Compound _ -> t :: paths *)
| _ -> paths))
let add_token paths (q,t,n) =
let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
let map = IntMap.add_inc map t.next [q,t,n] (fun l -> (q,t,n) :: l) in
IntMap.add paths t.beg map
let rec select_tokens2_rec last paths nodes map =
let node = IntSet.min_elt nodes in
if node = last then try snd (IntMap.find map node) with Not_found -> failwith "select_tokens2_rec" else
let nodes = IntSet.remove nodes node in
if not (IntMap.mem map node) then select_tokens2_rec last paths nodes map else
let qselected,selected = IntMap.find map node in
let map2 = try IntMap.find paths node with Not_found -> IntMap.empty in
let map = IntMap.fold map2 map (fun map next l ->
Xlist.fold l map (fun map (q,t,n) ->
let selected = IntSet.add selected n in
let qselected = qselected+q in
IntMap.add_inc map t.next (qselected,selected) (fun (qselected2,selected2) ->
if qselected2 > qselected then qselected2,selected2 else
if qselected2 < qselected then qselected,selected else
qselected,IntSet.union selected selected2))) in
select_tokens2_rec last paths nodes map
let rec calculate_quality q = function
CS :: l -> calculate_quality q l
| MaybeCS :: l -> calculate_quality q l
| ReqValLemm :: l -> calculate_quality q l
| MWE :: l -> calculate_quality (q+6) l
| LemmNotVal :: l -> calculate_quality (q-5) l
| TokNotFound :: l -> calculate_quality (q-10) l
| NotValProper :: l -> calculate_quality (q-1) l
| LemmLowercase :: l -> calculate_quality q l
| Roman :: l -> calculate_quality q l
| [] -> q
let select_tokens2 paths =
let beg,last = Xlist.fold paths (max_int,-1) (fun (beg,last) t ->
min beg t.beg, max last t.next) in
let nodes = Xlist.fold paths IntSet.empty (fun nodes t ->
IntSet.add (IntSet.add nodes t.beg) t.next) in
let paths2,_ = Xlist.fold paths ([],1) (fun (paths2,n) t ->
(* Printf.printf "%3d %s\n" (calculate_quality 0 t.attrs) (ENIAMtokens.string_of_token_env t); *)
(calculate_quality 0 t.attrs, t, n) :: paths2, n+1) in
let paths2 = Xlist.fold paths2 IntMap.empty add_token in
let selected = select_tokens2_rec last paths2 nodes (IntMap.add IntMap.empty beg (0,IntSet.empty)) in
(* print_endline (String.concat " " (StringSet.to_list selected)); *)
IntMap.fold paths2 [] (fun paths _ map ->
IntMap.fold map paths (fun paths _ l ->
Xlist.fold l paths (fun paths (q,t,n) ->
if IntSet.mem selected n then t :: paths else paths)))
let load_proper_name proper = function
[lemma; types] ->
let types = Str.split (Str.regexp "|") types in
StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
| l -> failwith ("proper_names: " ^ String.concat " " l)
let load_proper_names filename proper =
File.fold_tab filename proper load_proper_name
let load_proper_names () =
let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in
let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in
let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in
proper
let proper_names = ref (StringMap.empty : string list StringMap.t)
let remove l s =
Xlist.fold l [] (fun l t ->
if s = t then l else t :: l)
let find_proper_names t =
match t.token with
Lemma(lemma,pos,interp) ->
if StringMap.mem !proper_names lemma then
{t with token=Proper(lemma,pos,interp,StringMap.find !proper_names lemma);
attrs=remove t.attrs NotValProper} else
if Xlist.mem t.attrs NotValProper then
{t with token=Proper(lemma,pos,interp,[])}
else t
| _ -> t
let initialize () =
ENIAMtokenizer.initialize ();
ENIAMinflexion.initialize ();
let mwe_dict,mwe_dict2 = ENIAM_MWE.load_mwe_dicts () in
ENIAM_MWE.mwe_dict := mwe_dict;
ENIAM_MWE.mwe_dict2 := mwe_dict2;
lemma_frequencies := File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty;
proper_names := load_proper_names ()
let parse query =
let l = ENIAMtokenizer.parse query in
(* print_endline "a6"; *)
let paths = ENIAMpaths.translate_into_paths l in
(* print_endline "a7"; *)
let paths = ENIAMpaths.lemmatize paths in
(* print_endline "a8"; *)
let paths,_ = ENIAM_MWE.process paths in
(* print_endline "a12"; *)
(* let paths = find_proper_names paths in*)
let paths = List.rev (Xlist.rev_map paths find_proper_names) in
(* print_endline "a13"; *)
let paths = modify_weights paths in
let paths = translate_digs paths in
(* let paths = assign_senses paths in
(* print_endline "a14"; *)
let paths = assign_valence paths in*)
(* print_endline "a15"; *)
let paths = combine_interps paths in
(* print_endline "a16"; *)
(* let paths = disambiguate_senses paths in
let paths = assign_simplified_valence paths in
let paths = PreSemantics.assign_semantics paths in*)
(* print_endline "a16"; *)
let paths = select_tokens paths in
(* print_endline "a17"; *)
let paths = select_tokens2 paths in
let paths = Xlist.sort paths ENIAMpaths.compare_token_record in
(* print_endline "a18"; *)
paths(*, next_id*)
let parse_text_tokens tokens query =
(* print_endline ("parse_text_tokens: " ^ query); *)
let paragraphs = Xstring.split "\n\\|\r" query in
let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
let n = if Xlist.size paragraphs = 1 then 0 else 1 in
let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) paragraph ->
let paths = parse paragraph in
(* print_endline "parse_text 1"; *)
let pid = if n = 0 then "" else string_of_int n ^ "_" in
let sentences = ENIAMsentences.split_into_sentences pid paragraph tokens paths in
(AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) :: paragraphs, n+1) in
AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
let parse_text query =
(* print_endline ("parse_text: " ^ query); *)
let tokens = ExtArray.make 100 empty_token_env in
let _ = ExtArray.add tokens empty_token_env in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *)
parse_text_tokens tokens query
let catch_parse text =
try
let tokens = parse text in tokens,""
with e -> [], Printexc.to_string e
let catch_parse_text text =
try
let text,tokens = parse_text text in text,tokens,""
with e ->
RawText text,
ExtArray.make 0 empty_token_env,
Printexc.to_string e