dict.ml
55.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
open Xstd
open Printf
open Types
let get_form e =
match e.forms with
[form] -> form
| _ -> failwith "get_form"
let load_tab filename =
File.load_tab filename (function
orth :: lemma :: interp :: _ ->
{empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
| line -> failwith ("load_tab: " ^ (String.concat "\t" line)))
let load_tab_full filename =
File.load_tab filename (function
[orth; lemma; interp] ->
{empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
| [orth; lemma; interp; proper_type] ->
{empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]; proper_type=proper_type}
| [orth; lemma; interp; proper_type; genre] ->
{empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; genre=genre}]; proper_type=proper_type}
| line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))
let load_lu dict id path =
let filename = path ^ "morf_rel_" ^ string_of_int id ^ "_lu.tab" in
File.fold_tab filename dict (fun dict -> function
[lemma1; lemma2] ->
{lemma1=lemma1; lemma2=lemma2; rel_id=id; lu_stem=""; lu_validated=false;validated1=false;validated2=false} :: dict
| line -> failwith ("load_lu: " ^ (String.concat "\t" line)))
let pos = StringSet.of_list [
"subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
"brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
"adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
"winien";"siebie";"numcomp"
]
let rec find_pos rev = function
s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l
| [] -> failwith "find_pos"
let split_lemma_interp s =
let l = Xstring.split_delim ":" s in
let lemma,interp = find_pos [List.hd l] (List.tl l) in
String.concat ":" lemma, String.concat ":" interp
let rec remove_empties = function
"" :: l -> remove_empties l
| l -> l
let split_freq_orth s =
match remove_empties (Xstring.split " " s) with
freq :: l -> int_of_string freq, String.concat " " l
| _ -> failwith "split_freq_orth"
let load_freq_tab filename =
File.load_tab filename (function
[freq_orth; lemma_interp] ->
let freq,orth = split_freq_orth freq_orth in
let lemma,interp = split_lemma_interp lemma_interp in
{empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; freq=freq}]}
| line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line)))
let proper_type_selector e = e.proper_type
let genre_selector e = (get_form e).genre
let interp_selector e = (get_form e).interp
let freq_selector e = (get_form e).freq
let print_quantities out_filename selector dict =
let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry ->
StringQMap.add qmap (selector entry)) in
File.file_out out_filename (fun file ->
StringQMap.iter qmap (fun k v ->
fprintf file "%6d\t%s\n" v k))
(**********************************************************************************)
let load_dict_as_set filename =
let l = load_tab filename in
List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
let form = get_form entry in
StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]))))
let load_dict_as_set_full filename =
let l = load_tab_full filename in
List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
let form = get_form entry in
StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]))))
let rec compare_dicts_rec file = function
[],[] -> ()
| [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
| a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
| a :: la, b :: lb ->
if a = b then compare_dicts_rec file (la,lb) else
if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
(fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))
let compare_dicts filename1 filename2 filename_out =
let dict1 = load_dict_as_set filename1 in
let dict2 = load_dict_as_set filename2 in
File.file_out filename_out (fun file ->
compare_dicts_rec file (dict1,dict2))
let compare_dicts_full filename1 filename2 filename_out =
let dict1 = load_dict_as_set_full filename1 in
let dict2 = load_dict_as_set_full filename2 in
File.file_out filename_out (fun file ->
compare_dicts_rec file (dict1,dict2))
(**********************************************************************************)
let find_entry_cat entry =
let form = get_form entry in
let cat,tags = match Xstring.split ":" form.interp with
cat :: tags -> cat,tags
| _ -> failwith ("find_entry_type: " ^ form.interp) in
if cat = "praet" then
let t = match tags with
_ :: _ :: t :: _ -> t
| _ -> failwith ("find_entry_cat: " ^ form.interp) in
if t = "pri" || t = "sec" || t = "ter" then "cond" else "verb" else
if cat = "subst" || cat = "depr" then "noun" else
if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else
if cat = "adv" then "adv" else
if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else
if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" ||
cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "numcomp" || cat = "" then "other" else
if cat = "cond" then "cond" else
failwith ("find_entry_cat: " ^ cat)
let assign_entry_cat dict =
Xlist.rev_map dict (fun entry ->
{entry with cat = find_entry_cat entry})
let split_dict in_path filename out_path =
let dict = load_tab (in_path ^ filename) in
let dict = List.rev (assign_entry_cat dict) in
let filename = if Xstring.check_sufix ".gz" filename then
Xstring.cut_sufix ".gz" filename else filename in
File.file_out (out_path ^ "noun_" ^ filename) (fun noun_file ->
File.file_out (out_path ^ "adj_" ^ filename) (fun adj_file ->
File.file_out (out_path ^ "adv_" ^ filename) (fun adv_file ->
File.file_out (out_path ^ "verb_" ^ filename) (fun verb_file ->
File.file_out (out_path ^ "other_" ^ filename) (fun other_file ->
Xlist.iter dict (fun entry ->
let form = get_form entry in
try
let file = match entry.cat with
"noun" -> noun_file
| "adj" -> adj_file
| "adv" -> adv_file
| "verb" -> verb_file
| "other" -> other_file
| "cond" -> raise Not_found
| _ -> failwith "split_dict" in
fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
with Not_found -> ()))))))
let merge_entries dict =
let dict = assign_entry_cat dict in
let map = Xlist.fold dict StringMap.empty (fun map entry ->
let form = get_form entry in
let key =
if entry.cat = "noun" then
let gender = match Xstring.split ":" form.interp with
["depr";_;_;"m2"] -> "m1"
| "depr" :: _ -> failwith ("merge_entries: " ^ form.interp)
| [_;_;_;gender] -> gender
| [_;_;_;gender;col] -> gender ^ ":" ^ col
| _ -> failwith ("merge_entries: " ^ form.interp) in
entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender
else entry.lemma ^ "|" ^ entry.cat in
StringMap.add_inc map key entry (fun e ->
if entry.proper_type <> e.proper_type then
failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else
{e with forms = form :: e.forms})) in
StringMap.fold map [] (fun dict _ e -> e :: dict)
let remove_cat cat dict =
Xlist.fold dict [] (fun dict entry ->
if entry.cat = cat then dict
else entry :: dict)
let rec get_aspect lemma = function
(f : form) :: l ->
(match Xstring.split ":" f.interp with
["inf";a] -> a
| ["ger";_;_;_;a;_] -> a
| _ -> get_aspect lemma l)
| [] -> failwith ("get_aspect: " ^ lemma)
let get_lemma_suf lemma =
let lemma_suf =
if lemma = "" then "" else
List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string (Stem.simplify_lemma lemma))) in
match lemma_suf with
"a" -> "a"
| "e" -> "e"
| "o" -> "o"
| "y" -> "y"
| "i" -> "y"
| "ę" -> "ę"
| _ -> "ε"
let get_orth_suf orth =
let orth_suf =
if orth = "" then "" else
List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string orth)) in
match orth_suf with
"j" -> "j"
| "e" -> "e"
| _ -> "ε"
let merge_interps lemma forms =
let lemma_suf = get_lemma_suf lemma in
let map = Xlist.fold forms StringMap.empty (fun map form ->
(* printf "merge_interps 1: %s %s\n%!" form.orth form.interp; *)
StringMap.add_inc map form.orth (StringSet.singleton form.interp) (fun set -> StringSet.add set form.interp)) in
StringMap.fold map [] (fun forms orth set ->
(* printf "merge_interps 2: %s %s\n%!" orth (String.concat " " (StringSet.to_list set)); *)
let orth_suf = get_orth_suf orth in
match lemma_suf, Xlist.sort (StringSet.to_list set) compare with
_,["adv"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms
| _,["adv";"adv:pos"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms
| _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";"adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms
| _,["adj:sg:acc:m1.m2:pos";"adj:sg:gen:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:pos|adj:sg:acc:m1.m2:pos"} :: forms
| _,["adj:sg:dat:f:pos";"adj:sg:gen:f:pos";"adj:sg:loc:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:pos"} :: forms
| _,["adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms
| _,["adj:pl:acc:m1:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:pos|adj:pl:acc:m1:pos"} :: forms
| _,["adj:pl:dat:m1.m2.m3.f.n:pos";"adj:sg:inst:m1.m2.m3.n:pos";"adj:sg:loc:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:pos|adj:pl:dat:m1.m2.m3.f.n:pos"} :: forms
| _,["adj:sg:acc:f:pos";"adj:sg:inst:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:pos"} :: forms
| _,["adj:pl:nom.voc:m1:pos";"adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos|adj:pl:nom.voc:m1:pos"} :: forms
| _,["adj:sg:acc:m3:pos";"adj:sg:nom:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms
| _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms
| _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos"} :: forms
| _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adja"} :: forms
| _,["adj:pl:nom:m2.m3.f.n:pos";"adj:sg:nom:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:n:pos|adj:pl:nom:m2.m3.f.n:pos"} :: forms
| _,["adj:pl:acc:m2.m3.f.n:sup";"adj:pl:nom.voc:m2.m3.f.n:sup";"adj:sg:acc:n:sup";"adj:sg:nom.voc:n:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:sup|adj:pl:nom.acc.voc:m2.m3.f.n:sup"} :: forms
| _,["adj:sg:acc:m1.m2:sup";"adj:sg:gen:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:sup|adj:sg:acc:m1.m2:sup"} :: forms
| _,["adj:sg:dat:f:sup";"adj:sg:gen:f:sup";"adj:sg:loc:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:sup"} :: forms
| _,["adj:sg:acc:m3:sup";"adj:sg:nom.voc:m1.m2.m3:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:sup|adj:sg:acc:m3:sup"} :: forms
| _,["adj:pl:acc:m1:sup";"adj:pl:gen:m1.m2.m3.f.n:sup";"adj:pl:loc:m1.m2.m3.f.n:sup"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:sup|adj:pl:acc:m1:sup"} :: forms
| _,["adj:pl:dat:m1.m2.m3.f.n:sup";"adj:sg:inst:m1.m2.m3.n:sup";"adj:sg:loc:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:sup|adj:pl:dat:m1.m2.m3.f.n:sup"} :: forms
| _,["adj:sg:acc:f:sup";"adj:sg:inst:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:sup"} :: forms
| _,["adj:pl:acc:m2.m3.f.n:com";"adj:pl:nom.voc:m2.m3.f.n:com";"adj:sg:acc:n:com";"adj:sg:nom.voc:n:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:com|adj:pl:nom.acc.voc:m2.m3.f.n:com"} :: forms
| _,["adj:sg:acc:m1.m2:com";"adj:sg:gen:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:com|adj:sg:acc:m1.m2:com"} :: forms
| _,["adj:sg:dat:f:com";"adj:sg:gen:f:com";"adj:sg:loc:f:com"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:com"} :: forms
| _,["adj:sg:acc:m3:com";"adj:sg:nom.voc:m1.m2.m3:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:com|adj:sg:acc:m3:com"} :: forms
| _,["adj:pl:acc:m1:com";"adj:pl:gen:m1.m2.m3.f.n:com";"adj:pl:loc:m1.m2.m3.f.n:com"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:com|adj:pl:acc:m1:com"} :: forms
| _,["adj:pl:dat:m1.m2.m3.f.n:com";"adj:sg:inst:m1.m2.m3.n:com";"adj:sg:loc:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:com|adj:pl:dat:m1.m2.m3.f.n:com"} :: forms
| _,["adj:sg:acc:f:com";"adj:sg:inst:f:com"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:com"} :: forms
| _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
"adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
"adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
"adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
"adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos"} :: forms
| _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
"adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
"adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
"adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
"adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos|adja"} :: forms
| _,["ger:pl:nom.acc:n:imperf.perf:aff";"ger:sg:gen:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:aff|ger:pl:nom.acc:n:imperf.perf:aff"} :: forms
| _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff";"ppas:sg:nom.acc.voc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:aff|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff"} :: forms
| _,["ppas:sg:acc:m1.m2:imperf.perf:aff";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff|ppas:sg:acc:m1.m2:imperf.perf:aff"} :: forms
| _,["ppas:sg:acc:m3:imperf.perf:aff";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff|ppas:sg:acc:m3:imperf.perf:aff"} :: forms
| _,["ppas:pl:acc:m1:imperf.perf:aff";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff|ppas:pl:acc:m1:imperf.perf:aff"} :: forms
| _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff"} :: forms
| _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff";"pact:sg:nom.acc.voc:n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:aff|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff"} :: forms
| _,["pact:sg:acc:m1.m2:imperf:aff";"pact:sg:gen:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:aff|pact:sg:acc:m1.m2:imperf:aff"} :: forms
| _,["pact:pl:nom.voc:m1:imperf:aff";"pact:sg:acc:m3:imperf:aff";"pact:sg:nom.voc:m1.m2.m3:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:aff|pact:sg:acc:m3:imperf:aff|pact:pl:nom.voc:m1:imperf:aff"} :: forms
| _,["pact:pl:acc:m1:imperf:aff";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff|pact:pl:acc:m1:imperf:aff"} :: forms
| _,["pact:pl:dat:m1.m2.m3.f.n:imperf:aff";"pact:sg:inst.loc:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:aff|pact:pl:dat:m1.m2.m3.f.n:imperf:aff"} :: forms
| _,["ger:pl:nom.acc:n:imperf.perf:neg";"ger:sg:gen:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:neg|ger:pl:nom.acc:n:imperf.perf:neg"} :: forms
| _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg";"ppas:sg:nom.acc.voc:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:neg|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg"} :: forms
| _,["ppas:sg:acc:m1.m2:imperf.perf:neg";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg|ppas:sg:acc:m1.m2:imperf.perf:neg"} :: forms
| _,["ppas:sg:acc:m3:imperf.perf:neg";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg|ppas:sg:acc:m3:imperf.perf:neg"} :: forms
| _,["ppas:pl:acc:m1:imperf.perf:neg";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg|ppas:pl:acc:m1:imperf.perf:neg"} :: forms
| _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg"} :: forms
| _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg";"pact:sg:nom.acc.voc:n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:neg|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg"} :: forms
| _,["pact:sg:acc:m1.m2:imperf:neg";"pact:sg:gen:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:neg|pact:sg:acc:m1.m2:imperf:neg"} :: forms
| _,["pact:pl:nom.voc:m1:imperf:neg";"pact:sg:acc:m3:imperf:neg";"pact:sg:nom.voc:m1.m2.m3:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:neg|pact:sg:acc:m3:imperf:neg|pact:pl:nom.voc:m1:imperf:neg"} :: forms
| _,["pact:pl:acc:m1:imperf:neg";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg|pact:pl:acc:m1:imperf:neg"} :: forms
| _,["pact:pl:dat:m1.m2.m3.f.n:imperf:neg";"pact:sg:inst.loc:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:neg|pact:pl:dat:m1.m2.m3.f.n:imperf:neg"} :: forms
| _,["ger:pl:gen:n:imperf.perf:aff";"inf:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="inf:imperf.perf"} :: forms
| _,["praet:sg:m1.m2.m3:imperf.perf";"praet:sg:m1.m2.m3:imperf.perf:nagl"] -> {empty_form with orth=orth; interp="praet:sg:m1.m2.m3:imperf.perf:nagl"} :: forms
| _,["fin:sg:ter:imperf.perf";"ger:sg:nom.acc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:sg:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ger:sg:nom.acc:n:imperf.perf:aff"} :: forms
| _,["ger:pl:gen:n:imperf.perf:aff";"impt:sg:sec:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="impt:sg:sec:imperf.perf"} :: forms
| _,["fin:pl:ter:imperf.perf";"ppas:sg:acc.inst:f:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:pl:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ppas:sg:acc.inst:f:imperf.perf:aff"} :: forms
| "a",["subst:sg:dat.loc:f";"subst:sg:gen:f"] ->
if orth_suf = "j" then {empty_form with orth=orth; interp="subst:sg:gen.dat.loc:f"} :: forms
else {empty_form with orth=orth; interp="subst:sg:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: forms
(* | "a",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f"] -> (*print_endline lemma;*) {empty_form with orth=orth; interp="subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: {empty_form with orth=orth; interp="subst:sg:gen:f"} :: forms *)
| "a",["subst:pl:gen:f";"subst:pl:loc:f"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:f"} :: forms
| "ε",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
| "ε",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
| "ε",["subst:sg:loc:m1";"subst:sg:voc:m1"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
| "ε",["subst:sg:loc:m2";"subst:sg:voc:m2"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m2"} :: forms
| "ε",["subst:sg:loc:m3";"subst:sg:voc:m3"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: forms
| "ε",["subst:sg:gen:m3";"subst:sg:loc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: {empty_form with orth=orth; interp="subst:sg:gen:m3"} :: forms
| "ε",["subst:sg:acc:f";"subst:sg:nom:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc:f"} :: forms
| "ε",["subst:pl:gen:f";"subst:pl:nom.acc.voc:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:pl:nom.acc.voc:f"} :: forms
| "ε",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: forms
| "y",["subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: forms
| "y",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
| "y",["subst:pl:gen.acc:m1";"subst:pl:loc:m1"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m1"} :: forms
| "y",["subst:pl:dat:m1";"subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1|subst:pl:dat:m1"} :: forms
| "y",["subst:sg:nom:m2";"subst:sg:voc:m2"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m2"} :: forms
| "y",["subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: forms
| "y",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
| "y",["subst:pl:gen:m2";"subst:pl:loc:m2"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m2"} :: forms
| "y",["subst:pl:dat:m2";"subst:sg:inst:m2";"subst:sg:loc:m2"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m2|subst:pl:dat:m2"} :: forms
| "y",["subst:sg:nom.acc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc.voc:m3"} :: forms
| "y",["subst:pl:gen:m3";"subst:pl:loc:m3"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:m3"} :: forms
| "y",["subst:pl:dat:m3";"subst:sg:inst:m3";"subst:sg:loc:m3"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m3|subst:pl:dat:m3"} :: forms
| "y",["subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:nom:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.loc.voc:f"} :: forms
| "e",["depr:pl:nom.acc.voc:m2";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
| "e",["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
| "e",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
| "e",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1"} :: forms
| "e",["subst:pl:dat:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: {empty_form with orth=orth; interp="subst:pl:dat:n:ncol"} :: forms
| "e",["subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: forms
| "e",["subst:pl:gen:n:ncol";"subst:pl:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:n:ncol"} :: forms
| "e",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms
| "e",["subst:pl:nom.acc.voc:n:ncol";"subst:sg:gen:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:gen:n:ncol|subst:pl:nom.acc.voc:n:ncol"} :: forms
| "o",["subst:sg:nom.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
| "o",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
| "o",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: {empty_form with orth=orth; interp="depr:pl:nom.acc.voc:m2"} :: forms
| "o",["subst:sg:dat.loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:dat:m1"} :: {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: forms
| "o",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
| "o",["subst:pl:dat:m1:pt";"subst:pl:loc:m1:pt"] -> {empty_form with orth=orth; interp="subst:pl:dat.loc:m1:pt"} :: forms
| "ε",["subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms
| "ε",["subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol";"subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms
| "ę",["subst:sg:dat:n:col";"subst:sg:loc:n:col"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:col"} :: forms
| "ę",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms
| "o",["subst:sg:loc:m1";"subst:sg:voc:m1"] ->
if orth_suf = "e" then {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
else {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
| _,["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
| _,[interp] -> {empty_form with orth=orth; interp=interp} :: forms
| _,interps ->
(* print_endline ("merge_interps: " (*^ lemma_suf*) ^ " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *)
Xlist.fold interps forms (fun forms interp ->
{empty_form with orth=orth; interp=interp} :: forms))
let process_interps dict =
Xlist.rev_map dict (fun entry ->
if entry.cat = "verb" then
let aspect = get_aspect entry.lemma entry.forms in
let forms = Xlist.rev_map entry.forms (fun f ->
let interp = match (Xstring.split ":" f.interp) with
["fin";n;p;_] -> String.concat ":" ["fin";n;p;"imperf.perf"]
| ["impt";n;p;_] -> String.concat ":" ["impt";n;p;"imperf.perf"]
| "pcon" :: _ -> f.interp
| "pacta" :: _ -> f.interp
| "pact" :: _ -> f.interp
| ["ger";n;c;g;_;a] -> String.concat ":" ["ger";n;c;g;"imperf.perf";a]
| ["praet";n;g;_] -> String.concat ":" ["praet";n;g;"imperf.perf"]
| ["praet";n;g;_;a] -> String.concat ":" ["praet";n;g;"imperf.perf";a]
| ["inf";_] -> String.concat ":" ["inf";"imperf.perf"]
| ["pant";_] -> String.concat ":" ["pant";"imperf.perf"]
| ["imps";_] -> String.concat ":" ["imps";"imperf.perf"]
| ["ppas";n;c;g;_;a] -> String.concat ":" ["ppas";n;c;g;"imperf.perf";a]
| _ -> print_endline ("merge_interps: " ^ f.interp); f.interp in
{f with interp=interp}) in
let forms = merge_interps entry.lemma forms in
{entry with aspect=aspect; forms=forms} else
{entry with forms=merge_interps entry.lemma entry.forms})
(**********************************************************************************)
(*let mark_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else
let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
StringSet.fold orths qmap StringQMap.add) in
let n = StringMap.size map in
let found = StringQMap.fold qmap [] (fun found orth v ->
if v = n then orth :: found else found) in
match found with
[] -> entry :: dict
| [orth] ->
let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
let dict = {entry with forms=odm} :: dict in
{entry with forms=ndm; ndm=true} :: dict
| _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))
let print_ndm filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
if entry.ndm then
let orth = (List.hd entry.forms).orth in
fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))
let remove_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if entry.ndm then dict
else entry :: dict)
let remove_not_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if not entry.ndm then dict
else entry :: dict)*)
let kolwiek_lemmas = StringSet.of_list [
(* adj *)
"czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
"jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
"któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
(* noun *)
"cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
"cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
"ktoś:s2"; "któż";
(* adv *)
"jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]
let kolwiek_suffixes = [
"żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]
let find_kolwiek_suffixes dict =
Xlist.rev_map dict (fun entry ->
if StringSet.mem kolwiek_lemmas entry.lemma then
{entry with forms=Xlist.map entry.forms (fun form ->
{form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf ->
if Xstring.check_sufix kolwiek_suf orth then
Xstring.cut_sufix kolwiek_suf orth
else orth)})}
else entry)
let exceptional_lemmata = StringSet.of_list ([
(* wiele stemów *)
"Apollo"; "Aujeszky"; "Białystok"; "Gózd"; "Krasnystaw"; "Różanystok"; "Wielkanoc"; "białagłowa";
"deszcz"; "imćpan"; "iściec"; "otrząs"; "rzeczpospolita"; "wilczełyko"; "woleoczko";
"prapraojciec"; "praojciec"; "ojciec"; "współbrat"; "spółbrat"; "półbrat"; "brat";
"półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; "człowiek";
"półdziecko"; "+lecie"; "zimoziele"; "ziele"; "trójziele"; "nasienie";
"ksiądz"; "dech"; "tydzień"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest";
"dziecko"; "ucho:s2"; "oko:s2"; "cześć:s"; "jo-jo"; "Zabłotce"; "tysiąc:s1"; "półmiesiąc"; "miesiąc"; ""; ""; "";
"Pia"; "ręka"; "człek"; "Kozak:s1"; "bóg"; "psubrat"; "pieniądz"; ""; ""; ""; "";
"kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
"co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
"niebiosa"; "Włochy:s1"; "Niemcy"; "Węgry"; "Austro-Węgry"; ""; ""; ""; ""; ""; ""; "";
"zając:s1"; "tysiąc:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; "";
(* "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";*)
"bliscy"; "ojcowie"; "teściowie"; "ichmościowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; "";
"małżonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; "";
"starzy"; "wasi"; "nasi"; "najmłodsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "głównia"; "głownia"; "dźwignia"; ""; ""; ""; "";
"workowiśnia"; "wiśnia"; "sośnia"; "laurowiśnia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; "";
"makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; "";
"Uj"; "PIT"; "ChAT"; "podczłowiek"; "nieczłowiek"; "cześć"; "ktoś"; "ktosik"; ""; ""; ""; "";
"+ówna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coś"; "cośtam"; ""; ""; "";
"zając"; "tysiąc"; "rok"; "roczek"; "oko"; "ucho"; "Włochy"; "niebiosy"; "wici"; ""; ""; "";
"André"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"zły:a"; "dobry:a"; "przymały"; "mały:a"; "duży"; "wielki:a";
"ppoż."; "ppanc."; "pepanc."; "midi:a"; ""; ""; ""; ""; ""; ""; ""; "";
"zwać"; "wiedzieć"; "pójść"; "przejść"; "dojść"; "zsiąść"; "iść"; ""; ""; ""; ""; "";
"być"; "zasłonić"; "słonić"; "przysłonić"; "przesłonić"; "osłonić"; "odsłonić"; ""; ""; ""; ""; "";
(*
(* błąd w słowniku *)
"ówże";
(* wiele stemów *)
"twój:a"; "swój"; "mój:a"; "wszystek";
(* oboczności w stemie *)
"co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
"kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
"Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
"Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
"Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
"drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
"iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
"nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
"ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
"srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
"zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
"człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
(* oboczności w odmianie *)
"niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
"ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
"ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
(* pozostawione *)
"czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
"Pia"; "jo-jo"; "+lecie"; "";
"zagrząźć"; "zrzeć";
(* niepełny paradygmat *)
"zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
"róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
"powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
"krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
"dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
(* wiele stemów *)
"uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
"podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
"dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
(* inne *)
"liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
"roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
"zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
"źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
(*
"moi"; "twoi";
(*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
(*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
"arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
"Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
"Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
"Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
"NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
"WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
"PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
"człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
"burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
"cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
] @ File.load_lines "../morphology/data/obce.tab" @ File.load_lines "../morphology/data/akronimy.tab" @
(* File.load_lines "results/interp_validated_verb.tab" @ *)
(*File.load_lines "results/interp_validated_noun.tab" @ File.load_lines "results/interp_validated_adj.tab" @
File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @
File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *) [])
let remove_exceptional_lemmata dict =
Xlist.fold dict [] (fun dict entry ->
if StringSet.mem exceptional_lemmata entry.lemma then dict
else entry :: dict)
let remove_exceptional_lemmata_gen ex dict =
Xlist.fold dict [] (fun dict entry ->
if StringSet.mem ex entry.lemma then dict
else entry :: dict)
let generate_stem dict =
Xlist.rev_map dict (fun entry ->
{entry with stem=
(* if entry.ndm then (List.hd entry.forms).orth else *)
if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
Stem.generate_stem entry
else ""})
(*let phon_generate_stem dict =
Xlist.rev_map dict (fun entry ->
{entry with phon_stem=
if entry.ndm then (List.hd entry.forms).phon_orth else
if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
Stem.phon_generate_stem entry
else []})*)
let generate_stem_lu dict =
Xlist.rev_map dict (fun entry ->
{entry with lu_stem=Stem.generate_stem_lu entry.lemma1 entry.lemma2})
let lowercase_lu dict =
Xlist.rev_map dict (fun entry ->
{entry with
lemma1=Xunicode.lowercase_utf8_string entry.lemma1;
lemma2=Xunicode.lowercase_utf8_string entry.lemma2})
let fonetic_translation dict =
Xlist.fold dict [] (fun dict e ->
try
let lemma = Stem.simplify_lemma e.lemma in
let phon_lemma = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules lemma in
let phon_stem = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules e.stem in
{e with phon_lemma = phon_lemma; phon_stem=phon_stem;
forms = Xlist.map e.forms (fun f ->
let phon_orth = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules f.orth in
{f with phon_orth = phon_orth})} :: dict
with
Fonetics.NotFound(x,s) -> printf "NF %s %s %s\n%!" e.lemma x s; dict
| Fonetics.NotEqual(x,s,t) -> printf "NE %s %s %s %s\n%!" e.lemma x s t; dict
| Fonetics.MulipleSolutions(x,s,l) -> printf "MS %s %s %s: %s\n%!" e.lemma x s (String.concat " " l); dict
| _ -> dict)
let validate rules dict =
Xlist.rev_map dict (fun entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
let forms = Xlist.rev_map entry.forms (fun form ->
let candidates = Rules.CharTrees.find rules form.orth in
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
if stem ^ rule.set = simple_lemma then (stem,rule) :: candidates else candidates) in
if candidates = [] then {form with validated=false} else {form with validated=true}) in
{entry with forms=forms})
let phon_validate rules dict =
Xlist.rev_map dict (fun entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
let forms = Xlist.rev_map entry.forms (fun form ->
let candidates = Xlist.fold form.phon_orth [] (fun candidates s ->
Xlist.fold (Rules.CharTrees.find rules s) candidates (fun candidates (stem,rule) ->
let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
if candidate_lemma = simple_lemma then (stem,rule) :: candidates else candidates)) in
if candidates = [] then {form with validated=false} else {form with validated=true}) in
{entry with forms=forms})
let validate_lu rules dict =
Xlist.rev_map dict (fun entry ->
let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
let b = Xlist.fold candidates1 false (fun b (stem1,rule1) ->
Xlist.fold candidates2 b (fun b (stem2,rule2) ->
(* Printf.printf "%s %s %s %s\n%!" stem1 stem2 (string_of_rule rule1) (string_of_rule rule1); *)
if stem1 ^ rule1.set = stem2 ^ rule2.set then true else b)) in
(* if b then print_endline "validated"; *)
let b1 = Xlist.fold candidates1 false (fun b (stem1,rule1) -> if stem1 = entry.lu_stem then true else b) in
let b2 = Xlist.fold candidates2 false (fun b (stem2,rule2) -> if stem2 = entry.lu_stem then true else b) in
{entry with lu_validated=b; validated1=b1; validated2=b2})
let validate_interp rules dict =
Xlist.rev_map dict (fun entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
let forms = Xlist.rev_map entry.forms (fun form ->
let candidates = Xlist.fold form.phon_orth [] (fun candidates s ->
Xlist.fold (Rules.CharTrees.find rules s) candidates (fun candidates (stem,rule) ->
(* if rule.star = Ndm && not entry.ndm then candidates else
if rule.star <> Ndm && entry.ndm then candidates else *)
let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
if candidate_lemma = simple_lemma && form.interp = rule.interp then
(stem,rule) :: candidates else candidates)) in
if candidates = [] then {form with validated=false} else {form with validated=true}) in
{entry with forms=forms})
let remove_validated_forms dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then forms else form :: forms) in
if forms = [] then dict else {entry with forms=forms} :: dict)
let remove_validated_entries dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then forms else form :: forms) in
if forms = [] then dict else entry :: dict)
let remove_validated_lu dict =
Xlist.fold dict [] (fun dict entry ->
if entry.lu_validated then dict else entry :: dict)
let remove_not_validated_forms dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then form :: forms else forms) in
if forms = [] then dict else {entry with forms=forms} :: dict)
let remove_not_validated_entries dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then form :: forms else forms) in
if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict)
let print filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
Xlist.iter entry.forms (fun form ->
fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))
let print_lemmata filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
fprintf file "%s\n" entry.lemma))
let remove_sup_neg_forms dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if Xstring.check_sufix ":neg" form.interp || Xstring.check_sufix ":sup" form.interp then
forms else form :: forms) in
if forms = [] then dict else {entry with forms=forms} :: dict)
let generate_rules rules path filename rules_filename =
let dict = load_tab (path ^ filename) in
let dict = merge_entries dict in
let dict = process_interps dict in
(* let dict = mark_ndm dict in (* FIXME: remove_ndm? *) *)
let dict = remove_exceptional_lemmata dict in
let dict = find_kolwiek_suffixes dict in (* FIXME: lematy z kolwiek_suffixes nie są walidowane *)
let dict = generate_stem dict in
let dict = fonetic_translation dict in
let dict = phon_validate rules dict in
let dict = remove_validated_forms dict in
let dict = remove_sup_neg_forms dict in (* FIXME *)
let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
Xlist.fold (RuleGenerator.phon_generate_rules_entry entry) rules (fun rules (key,rule) ->
let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
StringMap.add rules key rules2)) in
File.file_out rules_filename (fun file ->
StringMap.iter rules (fun interp rules2 ->
fprintf file "\n@RULES %s\n" interp;
StringMap.iter rules2 (fun rule (q,l) ->
fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
let generate_rules_lu rules id path rules_filename =
let dict = load_lu [] id path in
let dict = lowercase_lu dict in
let dict = generate_stem_lu dict in
let dict = validate_lu rules dict in
let dict = remove_validated_lu dict in
(* let dict = remove_sup_neg_forms dict in *)
let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) rules (fun rules (key,rule,lemma) ->
let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
StringMap.add rules key rules2)) in
File.file_out rules_filename (fun file ->
StringMap.iter rules (fun interp rules2 ->
fprintf file "\n@RULES %s\n" interp;
StringMap.iter rules2 (fun rule (q,l) ->
fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
let rec get_first n l =
if n = 0 || l = [] then [] else
List.hd l :: (get_first (n-1) (List.tl l))
let generate_interp_rules rules interp_rules selected_tags path filename rules_filename =
let selected_tags = StringSet.of_list selected_tags in
let dict = load_tab (path ^ filename) in
let dict = merge_entries dict in
let dict = process_interps dict in
(* let dict = mark_ndm dict in (* FIXME: remove_ndm? *) *)
let dict = remove_exceptional_lemmata dict in
(* let dict = find_kolwiek_suffixes dict in *)
(* let dict = generate_stem dict in *)
let dict = fonetic_translation dict in
let dict = validate_interp interp_rules dict in
let dict = remove_validated_forms dict in
let interp_rules = Xlist.fold dict StringMap.empty (fun interp_rules entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
Xlist.fold entry.forms interp_rules (fun interp_rules form ->
let candidates = RuleGenerator.phon_generate_interp_rules rules selected_tags simple_lemma form in
Xlist.fold candidates interp_rules (fun interp_rules (v,cand) ->
(* StringMap.add_inc interp_rules cand (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l)))) in *)
StringMap.add_inc interp_rules cand (v,StringSet.singleton entry.lemma) (fun (v,set) -> v,StringSet.add set entry.lemma)))) in
let interp_rules = List.rev (List.sort compare (StringMap.fold interp_rules [] (fun l k (v,set) ->
(v,k,set) :: l))) in
File.file_out rules_filename (fun file ->
Xlist.iter interp_rules (fun (v,k,set)(*q,l*) ->
(* fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) *)
(*if StringSet.size set > 1000 then*) fprintf file "\t%s\t# %d %s\n" k (StringSet.size set) (String.concat " " (get_first 20 (List.rev (StringSet.to_list set))))))
(* let generate_ndm_rules dict =
let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
Xlist.fold entry.forms freq_rules (fun freq_rules form ->
let rule = {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in
Rules.RuleQMap.add freq_rules rule)) in
fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq ->
Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1)) *)
let generate_rule_frequencies rules path filename rules_filename =
let dict = load_tab (path ^ filename) in
let dict = merge_entries dict in
let dict = process_interps dict in
let dict = remove_cat "cond" dict in
(* let dict = mark_ndm dict in
let freq_rules = generate_ndm_rules (remove_not_ndm dict) in
let dict = remove_ndm dict in *)
let dict = remove_exceptional_lemmata dict in
let dict = generate_stem dict in
let freq_rules = Xlist.fold dict Rules.RuleQMap.empty(*freq_rules*) (fun freq_rules entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
Xlist.fold entry.forms freq_rules (fun freq_rules form ->
let candidates = Rules.CharTrees.find rules form.orth in
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
(stem,rule) :: candidates else candidates) in
if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in
File.file_out rules_filename (fun file ->
Rules.RuleQMap.iter freq_rules (fun rule freq ->
fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq=freq})))
let generate_stem_dict rules_filename path filename out_filename =
let rules = Rules.load_freq_rules rules_filename in
let rules = Rules.CharTrees.create rules in
let dict = load_tab (path ^ filename) in
let dict = merge_entries dict in
let dict = process_interps dict in
let dict = remove_cat "cond" dict in
(* let dict = mark_ndm dict in *)
let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
Xlist.fold entry.forms stems (fun stems form ->
let candidates = Rules.CharTrees.find rules form.orth in
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
(* if rule.star = Ndm && not entry.ndm then candidates else
if rule.star <> Ndm && entry.ndm then candidates else *)
if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
(stem,rule) :: candidates else candidates) in
if candidates = [] then stems else
let stem,rule = List.hd candidates in
StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.id] (fun l -> rule.id :: l))) in
File.file_out out_filename (fun file ->
StringMap.iter stems (fun stem ids ->
fprintf file "%s\t%s\n" stem (String.concat " " ids)))