ENIAMsubsyntax.ml
30 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
(*
* ENIAMsubsyntax: MWE, abbreviation and sentence detecion for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMsubsyntaxTypes
open ENIAMtokenizerTypes
open Xstd
(** [load_lemma_frequencies filename map] extends [map] with log-scaled
    lemma frequencies read from [filename].  Each data line has the form
    "count<TAB>lemma<TAB>cat"; empty lines and lines whose first
    character is ['#'] are skipped.  The stored weight is
    [log10 (count + 1)], keyed by "lemma<TAB>cat".
    @raise Failure on any malformed data line. *)
let load_lemma_frequencies filename map =
  let lines = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  let add_line acc line =
    if line = "" || String.get line 0 = '#' then acc
    else
      match Str.split_delim (Str.regexp "\t") line with
      | [count; lemma; cat] ->
          StringMap.add acc (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
      | _ -> failwith ("load_lemma_frequencies: " ^ line) in
  Xlist.fold lines map add_line

(* Global frequency table; filled by [initialize], read by [modify_weights]. *)
let lemma_frequencies = ref (StringMap.empty : float StringMap.t)
(** [modify_weights paths] recomputes the weight of every token record:
    attribute-based penalties are applied first, then a lemma-frequency
    bonus is added for [Lemma] tokens present in [!lemma_frequencies].
    The list order of [paths] is preserved.
    NOTE(review): when the lemma is absent from the frequency table the
    penalized weight itself is used as the "frequency", so it is counted
    twice in the sum — presumably intentional; confirm before changing. *)
let modify_weights paths =
  (* Per-attribute weight penalty; attributes not listed are neutral. *)
  let penalty acc = function
      TokNotFound -> acc -. 25.
    | LemmNotVal -> acc -. 20.
    | NotValProper -> acc -. 1.
    | LemmLowercase -> acc -. 0.1
    | _ -> acc in
  let rescore acc t =
    let w = Xlist.fold t.attrs t.weight penalty in
    let freq =
      match t.token with
        Lemma(lemma,cat,_) ->
          (try StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat)
           with Not_found -> w)
      | _ -> w in
    {t with weight = w +. freq; lemma_frequency = freq} :: acc in
  List.rev (Xlist.fold paths [] rescore)
(** [translate_digs paths] rewrites numeric/symbolic tokens ([Dig],
    [RomanDig], [Compound]) into [Lemma] tokens with a semantic category
    ([cat]), e.g. dates, hours, intervals, URLs.  Tokens with no mapping
    are passed through unchanged; unknown [Dig]/[RomanDig]/[Compound]
    subtypes raise [Failure]. *)
let translate_digs paths = (* FIXME: missing initial, postal-code and possibly others *)
Xlist.map paths (fun t ->
(* if not !recognize_proper_names then *)
match t.token with
(* Plain numbers and number-like digit strings. *)
Dig(lemma,"dig") -> t
| Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]]); cat="Number"}
| Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]]); cat="Number"}
| Dig(lemma,"year") -> {t with token=Lemma(lemma,"year",[[]]); cat="YearNumber"}
| Dig(lemma,"month") -> t (*{t with token=Lemma(lemma,"month",[[]])}*)
| Dig(lemma,"hour") -> {t with token=Lemma(lemma,"hour",[[]]); cat="HourNumber"}
| Dig(lemma,"day") -> {t with token=Lemma(lemma,"day",[[]]); cat="DayNumber"}
| Dig(lemma,"minute") -> t (*{t with token=Lemma(lemma,"minute",[[])}*)
| Dig(lemma,"2dig") -> t
| Dig(lemma,"3dig") -> t
| Dig(lemma,"4dig") -> t
| Dig(lemma,"pref3dig") -> t
| RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]]); cat="OrdNumber"}
| RomanDig(lemma,"month") -> t (*{t with token=Lemma(lemma,"symbol",[[]]); attrs="roman" :: t.attrs}*)
| Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
(* Compound tokens: dates, times, score results and their intervals;
   the component lemmas are glued with ".", ":" or "-" separators. *)
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Lemma(d ^ "." ^ m ^ "." ^ y,"date",[[]]); cat="DayNumber"}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Lemma(d ^ "." ^ m ^ "." ^ y,"date",[[]]); cat="DayNumber"}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Lemma(d ^ "." ^ m ^ "." ^ y,"date",[[]]); cat="DayNumber"}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Lemma(d ^ "." ^ m ^ "." ^ y,"date",[[]]); cat="DayNumber"}
| Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Lemma(d ^ "." ^ m,"day-month",[[]]); cat="DayNumber"}
| Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Lemma(h ^ ":" ^ m,"hour-minute",[[]]); cat="HourNumber"}
| Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ ":" ^ y,"match-result",[[]]); cat="X"}
| Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]]); cat="Number"}
| Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]]); cat="OrdNumber"}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]]); cat="Number"}
| Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]]); cat="Number"}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]]); cat="Number"}
| Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) -> {t with token=Lemma(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]]); cat="DayNumber"}
| Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) -> {t with token=Lemma(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]]); cat="DayNumber"}
| Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Lemma(d1 ^ "-" ^ d2,"day-interval",[[]]); cat="DayNumber"}
| Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Lemma(m1 ^ "-" ^ m2,"month-interval",[[]]); cat="Month"}
| Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Lemma(m1 ^ "-" ^ m2,"month-interval",[[]]); attrs=Roman :: t.attrs; cat="Month"}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Lemma(y1 ^ "-" ^ y2,"year-interval",[[]]); cat="YearNumber"}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Lemma(y1 ^ "-" ^ y2,"year-interval",[[]]); cat="YearNumber"}
| Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
{t with token=Lemma(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]]); cat="HourNumber"}
| Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Lemma(h1 ^ "-" ^ h2,"hour-interval",[[]]); cat="HourNumber"}
| Compound("minute-interval",[Dig(m1,"minute");Dig(m2,"minute")]) -> t (*{t with token=Lemma(m1 ^ "-" ^ m2,"minute-interval",[[]])}*)
(* Web/markup artifacts are kept but downgraded to category "X". *)
| Dig(lemma,"url") -> {t with token=Lemma(lemma,"url",[[]]); cat="X"}
| Dig(lemma,"email") -> {t with token=Lemma(lemma,"email",[[]]); cat="X"}
| Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]]); cat="X"}
| Dig(lemma,"list-item") -> {t with token=Lemma(lemma,"list-item",[[]]); cat="X"}
| Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ cat)
(* Punctuation: "," additionally becomes a conjunction lemma. *)
| Interp "." -> {t with cat="Interp"}
| Interp "," -> {t with token=Lemma(",","conj",[[]]); cat="Conj"}
| Interp "</query>" -> {t with cat="Interp"}
| Interp "<query>" -> {t with cat="Interp"}
(* | Interp "</sentence>" -> {t with cat="Interp"}
| Interp "<sentence>" -> {t with cat="Interp"}
| Interp "</clause>" -> {t with cat="Interp"}
| Interp "<clause>" -> {t with cat="Interp"}
| Interp s -> Printf.printf "translate_digs: „%s”\n" s; t*)
| Interp "(" -> {t with cat="Interp"}
| Interp ")" -> {t with cat="Interp"}
| Interp "-" -> {t with cat="Interp"}
(* | Interp "α" -> {t with cat="Interp"} *)
| RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ cat)
| Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
| _ -> t
(* else
match t.token with
Dig(lemma,"dig") -> t
| Dig(lemma,"intnum") -> {t with token=Lemma(lemma,"intnum",[[]])}
| Dig(lemma,"realnum") -> {t with token=Lemma(lemma,"realnum",[[]])}
| Dig(lemma,"year") -> {t with token=Proper(lemma,"year",[[]],["rok"])}
| Dig(lemma,"month") -> t (*{t with token=Proper(lemma,"month",[[]],["miesiąc"])}*)
| Dig(lemma,"hour") -> {t with token=Proper(lemma,"hour",[[]],["godzina"])}
| Dig(lemma,"day") -> {t with token=Proper(lemma,"day",[[]],["dzień"])}
| Dig(lemma,"minute") -> t (*{t with token=Proper(lemma,"minute",[[]],["minuta"])}*)
| Dig(lemma,"2dig") -> t
| Dig(lemma,"3dig") -> t
| Dig(lemma,"4dig") -> t
| Dig(lemma,"pref3dig") -> t
| RomanDig(lemma,"roman") -> {t with token=Lemma(lemma,"roman",[[]]); attrs=t.attrs}
| RomanDig(lemma,"month") -> t (*{t with token=Proper(lemma,"symbol",[[]],["month"]); attrs="roman" :: t.attrs}*)
| Dig(lemma,"ordnum") -> {t with token=Lemma(lemma,"ordnum",[[]])}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"year")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");Dig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("date",[Dig(d,"day");RomanDig(m,"month");Dig(y,"2dig")]) -> {t with token=Proper(d ^ "." ^ m ^ "." ^ y,"date",[[]],["data"])}
| Compound("day-month",[Dig(d,"day");Dig(m,"month")]) -> {t with token=Proper(d ^ "." ^ m,"day-month",[[]],["data"])}
| Compound("hour-minute",[Dig(h,"hour");Dig(m,"minute")]) -> {t with token=Proper(h ^ ":" ^ m,"hour-minute",[[]],["godzina"])}
| Compound("match-result",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Proper(x ^ ":" ^ y,"match-result",[[]],["rezultat"])}
| Compound("intnum-interval",[Dig(x,"intnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"intnum-interval",[[]])}
| Compound("roman-interval",[RomanDig(x,"roman");RomanDig(y,"roman")]) -> {t with token=Lemma(x ^ "-" ^ y,"roman-interval",[[]]); attrs=t.attrs}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("realnum-interval",[Dig(x,"intnum");Dig(y,"realnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("realnum-interval",[Dig(x,"realnum");Dig(y,"intnum")]) -> {t with token=Lemma(x ^ "-" ^ y,"realnum-interval",[[]])}
| Compound("date-interval",[Compound("date",[Dig(d1,"day");Dig(m1,"month");Dig(y1,"year")]);
Compound("date",[Dig(d2,"day");Dig(m2,"month");Dig(y2,"year")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "." ^ y1 ^ "-" ^ d2 ^ "." ^ m2 ^ "." ^ y2,"date-interval",[[]],["interwał"])}
| Compound("day-month-interval",[Compound("day-month",[Dig(d1,"day");Dig(m1,"month")]);
Compound("day-month",[Dig(d2,"day");Dig(m2,"month")])]) -> {t with token=Proper(d1 ^ "." ^ m1 ^ "-" ^ d2 ^ "." ^ m2,"day-month-interval",[[]],["interwał"])}
| Compound("day-interval",[Dig(d1,"day");Dig(d2,"day")]) -> {t with token=Proper(d1 ^ "-" ^ d2,"day-interval",[[]],["interwał"])}
| Compound("month-interval",[Dig(m1,"month");Dig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"])}
| Compound("month-interval",[RomanDig(m1,"month");RomanDig(m2,"month")]) -> {t with token=Proper(m1 ^ "-" ^ m2,"month-interval",[[]],["interwał"]); attrs=Roman :: t.attrs}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"year")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
| Compound("year-interval",[Dig(y1,"year");Dig(y2,"2dig")]) -> {t with token=Proper(y1 ^ "-" ^ y2,"year-interval",[[]],["interwał"])}
| Compound("hour-minute-interval",[Compound("hour-minute",[Dig(h1,"hour");Dig(m1,"minute")]);Compound("hour-minute",[Dig(h2,"hour");Dig(m2,"minute")])]) ->
{t with token=Proper(h1 ^ ":" ^ m1 ^ "-" ^ h2 ^ ":" ^ m2,"hour-minute-interval",[[]],["interwał"])}
| Compound("hour-interval",[Dig(h1,"hour");Dig(h2,"hour")]) -> {t with token=Proper(h1 ^ "-" ^ h2,"hour-interval",[[]],["interwał"])}
| Compound("minute-interval",[Dig(m1,"minute");Dig(m2,"minute")]) -> t (*{t with token=Proper(m1 ^ "-" ^ m2,"minute-interval",[[]],["interwał"])}*)
| Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])}
| Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])}
| Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])}
| Dig(lemma,"list-item") -> {t with token=Lemma(lemma,"list-item",[[]])}
| Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ cat)
| RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ cat)
| Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t)
| _ -> t)*))
(**********************************************************************************)
(* Ordered-list key modules: lists are compared after sorting, so two
   lists with the same elements in different order are equal as keys. *)
module OrderedStringList = struct
type t = string list
let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end
module OrderedStringListList = struct
type t = string list list
let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)
end
(* Maps/sets keyed by order-insensitive lists of tags (and lists thereof). *)
module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)
(* Trie over tag positions: inner nodes [T] branch on the first tag
   position, leaves [S] collect the values of the last position. *)
type tree = T of tree StringListMap.t | S of StringSet.t
(** [single_tags interp] is [true] when the first interpretation's first
    tag position holds exactly one value (i.e. only one tag slot is left
    to merge). *)
let single_tags interp =
  match interp with
  | [_] :: _ -> true
  | _ -> false
(** [make_tree interp] builds a [tree] from a list of interpretations
    (each a list of tag positions): when only one tag position remains a
    leaf [S] is produced, otherwise interpretations are grouped by their
    first tag position and the construction recurses on the tails.
    NOTE(review): [List.hd]/[List.tl] assume every interpretation is
    non-empty here — confirm callers guarantee that. *)
let rec make_tree interp =
if single_tags interp then S (StringSet.of_list (List.flatten (List.flatten interp))) else
let map = Xlist.fold interp StringListMap.empty (fun map tags ->
StringListMap.add_inc map (List.hd tags) [List.tl tags] (fun l -> (List.tl tags) :: l)) in
T(StringListMap.map map make_tree)
(** [is_s_tree map] is [true] iff at least one value in [map] is a leaf
    [S] node. *)
let is_s_tree map =
StringListListMap.fold map false (fun b _ -> function
S _ -> true
| T _ -> b)
(* [fold_tree_rec rev s f tree] walks [tree], accumulating the path of
   branch tags in [rev] (reversed); at each leaf it calls
   [f acc path leaf_set] with the path restored to root-to-leaf order. *)
let rec fold_tree_rec rev s f = function
S set -> f s (List.rev rev) set
| T map -> StringListMap.fold map s (fun s tag tree ->
fold_tree_rec (tag :: rev) s f tree)
(** [fold_tree tree s f] folds [f] over every root-to-leaf path of [tree],
    starting from accumulator [s]. *)
let fold_tree tree s f = fold_tree_rec [] s f tree
(* [combine_interps_rec map] iteratively collapses the tag trees in [map]
   from the leaves upwards: once all values are leaves the combined
   interpretations are emitted; otherwise each leaf level is merged into
   the key (sorted tag set) and the recursion continues on rebuilt trees. *)
let rec combine_interps_rec map =
if is_s_tree map then
StringListListMap.fold map [] (fun interp tail_tags -> function
S tag -> ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) :: interp
| _ -> failwith "combine_interps_rec")
else
let map = StringListListMap.fold map StringListListMap.empty (fun map tail_tags tree ->
fold_tree tree map (fun map head_tags tag ->
StringListListMap.add_inc map ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) [head_tags] (fun l -> head_tags :: l))) in
combine_interps_rec (StringListListMap.map map make_tree)
(** [combine_interp interp] compresses a list of interpretations into an
    equivalent, factored form by merging shared tag positions.
    NOTE(review): the catch-all [with e] rewraps *any* exception as
    [Failure], losing the original backtrace — kept as-is. *)
let combine_interp interp =
try
let map = StringListListMap.add StringListListMap.empty [] (make_tree interp) in
combine_interps_rec map
with e -> failwith ("combine_interp: " ^ Printexc.to_string e)
(* Parts of speech whose interpretations are factored via [combine_interp]. *)
let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
"inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]
(** [combine_subst_tags tags] expands the tag positions of a "subst"
    interpretation into the cross-product of single values; the 4-slot
    form fuses gender and collectivity into one "g:col" value.
    @raise Failure on any other shape. *)
let combine_subst_tags = function
[n;c;g] -> Xlist.map (Xlist.multiply_list [n;c;g]) (fun l -> Xlist.map l (fun x -> [x]))
| [n;c;[g];[col]] -> Xlist.map (Xlist.multiply_list [n;c;[g ^ ":" ^ col]]) (fun l -> Xlist.map l (fun x -> [x]))
| _ -> failwith "combine_subst_tags"
(** [combine_interps paths] normalizes the interpretation lists of all
    [Lemma] tokens: "subst" tags are expanded, "ppron12" gets a "_"
    filler for 4-slot variants, and parts of speech in [combine_pos] are
    factored via [combine_interp] (others are merely deduplicated).
    @raise Failure when a [combine_pos] lemma carries an empty interp. *)
let combine_interps paths =
List.rev (Xlist.rev_map paths (fun t ->
match t.token with
Lemma(lemma,pos,interp) ->
(* Printf.printf "%s %s %s\n" lemma pos (ENIAMtagset.render interp); *)
if StringSet.mem combine_pos pos && interp = [[]] then failwith ("combine_interps: interp=[[]] for " ^ lemma ^ ":" ^ pos) else
let interp =
if pos = "subst" then List.flatten (Xlist.map interp combine_subst_tags) else
if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
else interp in
let interp =
if StringSet.mem combine_pos pos then combine_interp interp else
StringListListSet.to_list (StringListListSet.of_list interp) in
{t with token=Lemma(lemma,pos,interp)}
(* | Proper(lemma,pos,interp,cat) ->
(* Printf.printf "%s %s %s\n" lemma pos (ENIAMtagset.render interp); *)
if StringSet.mem combine_pos pos && interp = [[]] then failwith ("combine_interps: interp=[[]] for " ^ lemma ^ ":" ^ pos) else
let interp =
if pos = "subst" then List.flatten (Xlist.map interp combine_subst_tags) else
if pos = "ppron12" then Xlist.map interp (fun tags -> if Xlist.size tags = 4 then tags @ [["_"]] else tags)
else interp in
let interp =
if StringSet.mem combine_pos pos then combine_interp interp else
StringListListSet.to_list (StringListListSet.of_list interp) in
{t with token=Proper(lemma,pos,interp,cat)}*)
| _ -> t))
(**********************************************************************************)
(** [select_tokens paths] keeps punctuation ([Interp]), unknown words
    ([Other]) and non-abbreviation lemmas; "brev" lemmas and all other
    token kinds are dropped.  Input order is preserved. *)
let select_tokens paths =
  let keep t =
    match t.token with
    | Interp _ -> true
    | Other _ -> true
    | Lemma(_,pos,_) -> pos <> "brev"
    | _ -> false in
  List.filter keep paths
(* [add_token paths (q,t,n)] inserts the scored token triple into the
   two-level graph index: outer key is the token's start node [t.beg],
   inner key its end node [t.next]; parallel edges accumulate in a list. *)
let add_token paths (q,t,n) =
let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in
let map = IntMap.add_inc map t.next [q,t,n] (fun l -> (q,t,n) :: l) in
IntMap.add paths t.beg map
(* [select_tokens2_rec last paths nodes map] computes a best-scoring
   path cover of the token graph by dynamic programming over nodes in
   ascending order: [map] holds, per reached node, the best quality so
   far and the set of token ids on ties-merged best paths.  Returns the
   selected token-id set at [last].
   @raise Failure if [last] is unreachable (disconnected graph). *)
let rec select_tokens2_rec last paths nodes map =
let node = IntSet.min_elt nodes in
if node = last then try snd (IntMap.find map node) with Not_found -> failwith "select_tokens2_rec: token graph is not connected" else
let nodes = IntSet.remove nodes node in
if not (IntMap.mem map node) then select_tokens2_rec last paths nodes map else
let qselected,selected = IntMap.find map node in
let map2 = try IntMap.find paths node with Not_found -> IntMap.empty in
let map = IntMap.fold map2 map (fun map next l ->
Xlist.fold l map (fun map (q,t,n) ->
let selected = IntSet.add selected n in
let qselected = qselected+q in
(* Keep the higher score; on equal scores merge the token sets. *)
IntMap.add_inc map t.next (qselected,selected) (fun (qselected2,selected2) ->
if qselected2 > qselected then qselected2,selected2 else
if qselected2 < qselected then qselected,selected else
qselected,IntSet.union selected selected2))) in
select_tokens2_rec last paths nodes map
(** [calculate_quality q attrs] adds the quality delta of every token
    attribute to the base score [q]: sentence-splitting hazards and
    lemmatization failures are penalized, multi-word expressions and
    capitalics rewarded; all other attributes are neutral. *)
let rec calculate_quality q = function
    [] -> q
  | attr :: rest ->
      let delta = match attr with
          FC | CS -> -2
        | MWE -> 6
        | Capitalics -> 1
        | LemmNotVal -> -5
        | TokNotFound -> -10
        | NotValProper -> -1
        | MaybeCS | HasAglSuffix | LemmLowercase | Roman
        | SentBeg | SentBegEnd | SentEnd
        | BrevLemma _ | Disamb _ -> 0 in
      calculate_quality (q + delta) rest
(** [added_quality t] gives a flat bonus of 11 to closed-class parts of
    speech (prepositions, conjunctions, particles, pronouns, adverbs,
    etc.) and 0 to everything else. *)
let added_quality t =
  let pos = ENIAMtokens.get_pos t in
  if List.mem pos
      ["prep"; "conj"; "qub"; "interj"; "adv"; "adja"; "comp"; "ppron12"; "ppron3"]
  then 11
  else 0
(** [select_tokens2 paths] performs strong disambiguation: every token
    is scored ([added_quality] + attribute quality), the scored tokens
    are indexed as a graph, and only tokens lying on a best-scoring path
    from the first to the last node are kept. *)
let select_tokens2 paths =
(* print_endline "select_tokens2" ; *)
let beg,last = Xlist.fold paths (max_int,-1) (fun (beg,last) t ->
min beg t.beg, max last t.next) in
let nodes = Xlist.fold paths IntSet.empty (fun nodes t ->
IntSet.add (IntSet.add nodes t.beg) t.next) in
(* Number tokens 1..n so the DP can identify them in its selected set. *)
let paths2,_ = Xlist.fold paths ([],1) (fun (paths2,n) t ->
(* Printf.printf "%3d %3d %s\n%!" (added_quality t.token) (calculate_quality 0 t.attrs) (ENIAMtokens.string_of_token_env t); *)
(added_quality t.token + calculate_quality 0 t.attrs, t, n) :: paths2, n+1) in
let paths2 = Xlist.fold paths2 IntMap.empty add_token in
let selected = select_tokens2_rec last paths2 nodes (IntMap.add IntMap.empty beg (0,IntSet.empty)) in
(* print_endline (String.concat " " (StringSet.to_list selected)); *)
IntMap.fold paths2 [] (fun paths _ map ->
IntMap.fold map paths (fun paths _ l ->
Xlist.fold l paths (fun paths (q,t,n) ->
if IntSet.mem selected n then t :: paths else paths)))
(*let load_proper_name proper = function
[lemma; types] ->
let types = Str.split (Str.regexp "|") types in
StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
| l -> failwith ("proper_names: " ^ String.concat " " l)
let load_proper_names filename proper =
File.fold_tab filename proper load_proper_name
let load_proper_names () =
let proper = File.catch_no_file (load_proper_names proper_names_filename) StringMap.empty in
let proper = File.catch_no_file (load_proper_names proper_names_filename2) proper in
let proper = File.catch_no_file (load_proper_names proper_names_filename3) proper in
let proper = File.catch_no_file (load_proper_names proper_names_filename4) proper in
let proper = File.catch_no_file (load_proper_names simc_filename) proper in
let proper = File.catch_no_file (load_proper_names terc_filename) proper in
proper
let proper_names = ref (StringMap.empty : string list StringMap.t)*)
(** [remove l s] drops every element structurally equal to [s] from [l].
    The surviving elements come back in reverse order (fold-accumulated),
    matching the original behavior. *)
let remove l s =
  Xlist.fold l [] (fun acc item ->
    if item = s then acc else item :: acc)
(*let find_proper_names t =
match t.token with
Lemma(lemma,pos,interp) ->
if (pos="subst" || pos="depr" || pos="fin" || pos="inf") && StringMap.mem !proper_names lemma then
{t with token=Proper(lemma,pos,interp,StringMap.find !proper_names lemma);
attrs=remove (remove t.attrs NotValProper) LemmNotVal} else
if Xlist.mem t.attrs NotValProper then
{t with token=Proper(lemma,pos,interp,[])}
else t
| _ -> t*)
(** [initialize ()] sets up the subsyntax layer: initializes the
    tokenizer, loads the MWE dictionaries into their global refs, and
    loads lemma frequencies (silently empty if the file is missing). *)
let initialize () =
ENIAMtokenizer.initialize ();
let mwe_dict,mwe_dict2 = ENIAM_MWE.load_mwe_dicts () in
ENIAM_MWE.mwe_dict := mwe_dict;
ENIAM_MWE.mwe_dict2 := mwe_dict2;
lemma_frequencies := File.catch_no_file (load_lemma_frequencies lemma_frequencies_filename) StringMap.empty
(* proper_names := load_proper_names () *)
(** [parse query] runs the full subsyntax pipeline on one paragraph:
    tokenize, build the token path graph, recognize MWEs, weight and
    translate numeric tokens, normalize interpretations, filter tokens,
    prune inaccessible ones, and optionally tag/disambiguate with
    Concraft.  Returns the sorted token path list. *)
let parse query =
let l = ENIAMtokenizer.parse query in
(* print_endline "a6"; *)
let paths = ENIAMpaths.translate_into_paths l in
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a7"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
(* let paths = ENIAMpaths.lemmatize paths in *)
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a8"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list (fst paths)); *)
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a9"; *)
let paths,last = ENIAM_MWE.process paths in
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a12"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
(* let paths = if !recognize_proper_names then List.rev (Xlist.rev_map paths find_proper_names) else paths in *)
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a13"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
let paths = modify_weights paths in
let paths = translate_digs paths in
(* print_endline "a14"; *)
let paths = combine_interps paths in
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a16"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
let paths = select_tokens paths in
let paths = Xlist.sort paths ENIAMpaths.compare_token_record in
let paths = ENIAMpaths.remove_inaccessible_tokens paths 0 last in
let paths = Xlist.sort paths ENIAMpaths.compare_token_record in
let paths = if !concraft_enabled then ENIAMconcraft.process_paths paths else paths in
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a16"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
let paths = if !concraft_disambiguate then ENIAMconcraft.disambiguate paths last else paths in
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a17"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
(* let paths = if !strong_disambiguate_flag then select_tokens2 paths else paths in (* This procedure cuts out needed tokens *) *)
(* let paths = ENIAMpaths.process_interpunction paths in *)
let paths = Xlist.sort paths ENIAMpaths.compare_token_record in
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a18"; *)
(* print_endline (ENIAMsubsyntaxStringOf.token_list paths); *)
(* print_endline "XXXXXXXXXXXXXXXXXXXXXXXXX a19"; *)
paths(*, next_id*)
(** [parse_text_tokens sentence_split_flag par_names_flag tokens query]
    splits [query] into paragraphs (on newlines), optionally peels a
    leading "name | id<TAB>" prefix off each paragraph
    ([par_names_flag]), parses each paragraph, and optionally splits it
    into sentences ([sentence_split_flag]).  A paragraph that raises is
    wrapped as [ErrorParagraph] instead of aborting the whole text.
    Returns the structured text and the shared [tokens] array. *)
let parse_text_tokens sentence_split_flag par_names_flag tokens query =
(* print_endline ("parse_text_tokens 1: " ^ query); *)
let paragraphs = Xstring.split "\n\\|\r" query in
let paragraphs = List.rev (Xlist.fold paragraphs [] (fun l -> function "" -> l | s -> s :: l)) in
(* print_endline ("parse_text_tokens 2: " ^ query); *)
let paragraphs = List.rev (Xlist.rev_map paragraphs (fun paragraph ->
if par_names_flag then
match Xstring.split "\t" paragraph with
[name; paragraph] ->
(match Xstring.split " | " name with
[name; id] -> name, id, paragraph
| _ -> name,"", paragraph)
| _ -> failwith ("parse_text_tokens: " ^ paragraph)
else "", "", paragraph)) in
(* print_endline ("parse_text_tokens 3: " ^ query); *)
let n = if Xlist.size paragraphs = 1 then 0 else 1 in (* FIXME: the above should be moved to a separate procedure *)
let paragraphs,_ = Xlist.fold paragraphs ([],n) (fun (paragraphs,n) (name,id,paragraph) ->
try
(* print_endline paragraph; *)
let paths = parse paragraph in
(* print_endline "parse_text 1"; *)
let pid = if n = 0 then "" else string_of_int n ^ "_" in
let sentences =
if sentence_split_flag then ENIAMsentences.split_into_sentences pid paragraph tokens paths
else ENIAMsentences.no_split_into_sentences pid paragraph tokens paths in
(AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ (if id = "" then [] else [Identifier,RawParagraph id]) @
[Raw,RawParagraph paragraph; Struct,StructParagraph sentences])) :: paragraphs, n+1
with e ->
(AltParagraph ((if par_names_flag then [Name,RawParagraph name] else []) @ (if id = "" then [] else [Identifier,RawParagraph id]) @
[Raw,RawParagraph paragraph; Error,ErrorParagraph (Printexc.to_string e)])) :: paragraphs, n+1) in
AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs)], tokens
(** [parse_text sentence_split_flag par_names_flag query] parses [query]
    with a freshly allocated token array; see [parse_text_tokens]. *)
let parse_text sentence_split_flag par_names_flag query =
(* print_endline ("parse_text: " ^ query); *)
let tokens = ExtArray.make 100 empty_token_env in
let _ = ExtArray.add tokens empty_token_env in (* id=0 is reserved for pro; FIXME: is this still valid? *)
parse_text_tokens sentence_split_flag par_names_flag tokens query
(** [catch_parse text] runs [parse], returning [(paths, "")] on success
    and [([], message)] on failure.
    NOTE(review): the catch-all [with e] also swallows programming
    errors (e.g. [Stack_overflow]) — kept as-is. *)
let catch_parse text =
try
let tokens = parse text in tokens,""
with e -> [], Printexc.to_string e
(** [catch_parse_text sentence_split_flag par_names_flag text] runs
    [parse_text], wrapping any exception as an [ErrorText] alternative
    with an empty token array instead of propagating it. *)
let catch_parse_text sentence_split_flag par_names_flag text =
try
parse_text sentence_split_flag par_names_flag text
with e ->
AltText[Raw,RawText text; Error,ErrorText (Printexc.to_string e)],
ExtArray.make 0 empty_token_env
(** [is_parsed tokens paths last] is [true] when node [last] is
    reachable from node 0 through path edges whose token category is not
    "X".  Relies on [paths] being ordered so that an edge's left node is
    only tested after all edges into it were processed. *)
let is_parsed tokens paths last =
(* Printf.printf "is_parsed: last=%d\n" last; *)
let set = Xlist.fold paths (IntSet.singleton 0) (fun set (id,lnode,rnode) ->
let t = ExtArray.get tokens id in
(* Printf.printf "is_parsed: lnode=%d rnode=%d orth=%s token=%s cat=%s\n" lnode rnode t.orth (ENIAMtokens.string_of_token t.token) t.cat; *)
if IntSet.mem set lnode && t.cat <> "X" then IntSet.add set rnode else set) in
(* if IntSet.mem set last then Printf.printf "is_parsed: true\n" else Printf.printf "is_parsed: false\n"; *)
IntSet.mem set last
(** [select_not_parsed_sentence mode tokens sentence] keeps only the
    sentence variants that are NOT fully parsed: a parsed
    [StructSentence] yields [[]], everything else is passed through
    (recursively for quoted/alternative sentences).
    NOTE(review): the [List.hd sentence] in the [QuotedSentences] branch
    raises if the nested selection returns [] (i.e. the inner sentence
    was parsed) — confirm callers only reach it with unparsed input. *)
let rec select_not_parsed_sentence mode tokens = function
| QuotedSentences sentences ->
let sentences = Xlist.rev_map sentences (fun p ->
let sentence = select_not_parsed_sentence mode tokens p.sentence in
{p with sentence=List.hd sentence}) in
[QuotedSentences(List.rev sentences)]
| AltSentence l ->
let l = List.flatten (Xlist.rev_map l (fun (mode,sentence) ->
Xlist.rev_map (select_not_parsed_sentence mode tokens sentence) (fun s -> mode, s))) in
[AltSentence(List.rev l)]
| RawSentence s -> (*Printf.printf "select_not_parsed_sentence: %s\n" s;*) [RawSentence s]
| StructSentence(paths,last) -> if is_parsed tokens paths last then [] else [StructSentence(paths,last)]
| DepSentence _ -> failwith "select_not_parsed_sentence"
| ErrorSentence s -> (*Printf.printf "select_not_parsed_sentence: %s\n" s;*) [ErrorSentence s]
(** [select_not_parsed_paragraph mode tokens paragraph] maps
    [select_not_parsed_sentence] over every sentence of a paragraph.
    NOTE(review): [List.hd sentence] raises if the selection returned []
    (fully parsed sentence) — same hazard as in
    [select_not_parsed_sentence]. *)
let rec select_not_parsed_paragraph mode tokens = function
RawParagraph s -> RawParagraph s
| StructParagraph sentences ->
let sentences = Xlist.rev_map sentences (fun p ->
let sentence = select_not_parsed_sentence mode tokens p.sentence in
{p with sentence=List.hd sentence}) in
StructParagraph(List.rev sentences)
| AltParagraph l ->
let l = Xlist.rev_map l (fun (mode,paragraph) ->
mode, select_not_parsed_paragraph mode tokens paragraph) in
AltParagraph(List.rev l)
| ErrorParagraph s -> ErrorParagraph s
(** [select_not_parsed_text mode tokens text] maps
    [select_not_parsed_paragraph] over every paragraph of a text,
    preserving the text's structure. *)
let rec select_not_parsed_text mode tokens = function
RawText s -> RawText s
| StructText paragraphs ->
let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
select_not_parsed_paragraph mode tokens paragraph) in
StructText(List.rev paragraphs)
| AltText l -> AltText(Xlist.map l (fun (mode,text) ->
mode, select_not_parsed_text mode tokens text))
| ErrorText s -> ErrorText s
(** [select_not_parsed tokens text] entry point: strips fully parsed
    sentences from [text], starting in [Struct] mode. *)
let select_not_parsed tokens text =
select_not_parsed_text Struct tokens text