dict.ml
31.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
open Xstd
open Printf
open Types
(* Returns the only form stored in entry [e].
   Raises Failure when [e.forms] does not contain exactly one element. *)
let get_form e =
  match e.forms with
  | [] | _ :: _ :: _ -> failwith "get_form"
  | [only] -> only
(* Loads a tab-separated dictionary through [File.load_tab]. Each row must
   have at least three columns (orth, lemma, interp); extra columns are
   ignored. Every row becomes an entry holding a single form with freq 1.
   Raises Failure for rows with fewer than three columns. *)
let load_tab filename =
  let entry_of_fields fields =
    match fields with
    | orth :: lemma :: interp :: _ ->
      let form = {orth; interp; freq=1; genre=""; validated=false} in
      {lemma; cat=""; forms=[form]; proper_type=""; ndm=false; stem=""}
    | _ -> failwith ("load_tab: " ^ String.concat "\t" fields) in
  File.load_tab filename entry_of_fields
(* Loads a tab-separated dictionary whose rows have exactly three, four or
   five columns: orth, lemma, interp and optionally proper_type and genre.
   Missing optional columns default to the empty string.
   Raises Failure for rows of any other width. *)
let load_tab_full filename =
  let make orth lemma interp proper_type genre =
    let form = {orth; interp; freq=1; genre; validated=false} in
    {lemma; cat=""; forms=[form]; proper_type; ndm=false; stem=""} in
  File.load_tab filename (fun fields ->
    match fields with
    | [orth; lemma; interp] -> make orth lemma interp "" ""
    | [orth; lemma; interp; proper_type] -> make orth lemma interp proper_type ""
    | [orth; lemma; interp; proper_type; genre] -> make orth lemma interp proper_type genre
    | _ -> failwith ("load_tab_full: " ^ String.concat "\t" fields))
(* Reads the two-column relation file for relation [id] located under
   [path] (the file name is appended to [path] verbatim, so [path] must
   already end with a directory separator) and conses one record per row
   onto [dict]. Raises Failure for rows that do not have exactly two
   columns. *)
let load_lu dict id path =
  let filename = String.concat "" [path; "morf_rel_"; string_of_int id; "_lu.tab"] in
  File.fold_tab filename dict (fun acc fields ->
    match fields with
    | [lemma1; lemma2] ->
      {lemma1; lemma2; rel_id=id; lu_stem=""; lu_validated=false;
       validated1=false; validated2=false} :: acc
    | _ -> failwith ("load_lu: " ^ String.concat "\t" fields))
(* Set of tag names that [find_pos] recognises as the first element of the
   interpretation part when splitting a colon-separated lemma:interp
   string. *)
let pos = StringSet.of_list [
"subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
"brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
"adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
"winien";"siebie";"numcomp"
]
(* Scans the list for the first element that belongs to [pos].
   Returns the elements seen before it (prepended by the reverse of [rev])
   together with the remainder starting at that element.
   Raises Failure when no element of the list is in [pos]. *)
let rec find_pos rev = function
  | [] -> failwith "find_pos"
  | s :: _ as rest when StringSet.mem pos s -> List.rev rev, rest
  | s :: l -> find_pos (s :: rev) l
(* Splits a colon-separated string into (lemma, interp). The first segment
   always belongs to the lemma; the interp starts at the first later
   segment that is a member of [pos]. Raises Failure (from [find_pos])
   when no such segment exists. *)
let split_lemma_interp s =
  let parts = Xstring.split_delim ":" s in
  let lemma_parts, interp_parts = find_pos [List.hd parts] (List.tl parts) in
  let join = String.concat ":" in
  join lemma_parts, join interp_parts
(* Drops the leading empty strings of a list; empty strings that occur
   after the first non-empty element are kept. *)
let rec remove_empties l =
  match l with
  | head :: tail when head = "" -> remove_empties tail
  | _ -> l
(* Parses a string made of a frequency followed by an orthographic form,
   separated by spaces. Returns (frequency, orth) where the remaining
   tokens are re-joined with single spaces.
   Raises Failure when no token is left after dropping leading empties,
   or when the first token is not an integer. *)
let split_freq_orth s =
  let tokens = remove_empties (Xstring.split " " s) in
  match tokens with
  | [] -> failwith "split_freq_orth"
  | freq :: rest ->
    let orth = String.concat " " rest in
    int_of_string freq, orth
(* Loads a two-column frequency list: the first column is parsed by
   [split_freq_orth], the second by [split_lemma_interp]. Each row yields
   an entry with a single form carrying the parsed frequency.
   Raises Failure for rows that do not have exactly two columns. *)
let load_freq_tab filename =
  File.load_tab filename (fun fields ->
    match fields with
    | [freq_orth; lemma_interp] ->
      let freq, orth = split_freq_orth freq_orth in
      let lemma, interp = split_lemma_interp lemma_interp in
      let form = {orth; interp; freq; genre=""; validated=false} in
      {lemma; cat=""; forms=[form]; proper_type=""; ndm=false; stem=""}
    | _ -> failwith ("load_freq_tab: " ^ String.concat "\t" fields))
(* Field selectors intended as the [selector] argument of
   [print_quantities]. The form-based selectors go through [get_form] and
   therefore raise Failure on entries that do not hold exactly one form. *)
let proper_type_selector {proper_type; _} = proper_type

let genre_selector e =
  let form = get_form e in
  form.genre

let interp_selector e =
  let form = get_form e in
  form.interp

let freq_selector e =
  let form = get_form e in
  form.freq
(* Applies [selector] to every entry of [dict], accumulates the selected
   keys in a StringQMap and writes one line per key to [out_filename] in
   the form: value (right-aligned, width 6), tab, key. *)
let print_quantities out_filename selector dict =
  let counts =
    Xlist.fold dict StringQMap.empty (fun acc entry ->
      let key = selector entry in
      StringQMap.add acc key) in
  File.file_out out_filename (fun out ->
    StringQMap.iter counts (fun key count ->
      fprintf out "%6d\t%s\n" count key))
(**********************************************************************************)
(* Loads [filename] with [load_tab] and returns the sorted list of distinct
   lines of the form orth, tab, lemma, tab, interp. *)
let load_dict_as_set filename =
  let lines =
    Xlist.fold (load_tab filename) StringSet.empty (fun acc entry ->
      let form = get_form entry in
      let line = String.concat "\t" [form.orth; entry.lemma; form.interp] in
      StringSet.add acc line) in
  List.sort compare (StringSet.to_list lines)
(* Loads [filename] with [load_tab_full] and returns the sorted list of
   distinct lines made of orth, lemma, interp, proper_type and genre joined
   by tabs. *)
let load_dict_as_set_full filename =
  let lines =
    Xlist.fold (load_tab_full filename) StringSet.empty (fun acc entry ->
      let form = get_form entry in
      let columns =
        [form.orth; entry.lemma; form.interp; entry.proper_type; form.genre] in
      StringSet.add acc (String.concat "\t" columns)) in
  List.sort compare (StringSet.to_list lines)
(* Merge-style diff of two sorted string lists written to [file]:
   lines present only in the first list are prefixed with the less-than
   sign, lines present only in the second list with the greater-than sign;
   lines present in both are not printed. *)
let rec compare_dicts_rec file = function
  | [], [] -> ()
  | [], only_right :: rest ->
    fprintf file "> %s\n" only_right;
    compare_dicts_rec file ([], rest)
  | only_left :: rest, [] ->
    fprintf file "< %s\n" only_left;
    compare_dicts_rec file (rest, [])
  | (a :: la as left), (b :: lb as right) ->
    match compare a b with
    | 0 -> compare_dicts_rec file (la, lb)
    | c when c < 0 ->
      fprintf file "< %s\n" a;
      compare_dicts_rec file (la, right)
    | _ ->
      fprintf file "> %s\n" b;
      compare_dicts_rec file (left, lb)
(* Writes to [filename_out] the diff (see [compare_dicts_rec]) of the two
   dictionaries loaded with [load_dict_as_set]. *)
let compare_dicts filename1 filename2 filename_out =
  let dict1 = load_dict_as_set filename1 in
  let dict2 = load_dict_as_set filename2 in
  let write_diff file = compare_dicts_rec file (dict1, dict2) in
  File.file_out filename_out write_diff
(* Same as [compare_dicts] but the compared lines also include the
   proper_type and genre columns (dictionaries are loaded with
   [load_dict_as_set_full]). *)
let compare_dicts_full filename1 filename2 filename_out =
  let dict1 = load_dict_as_set_full filename1 in
  let dict2 = load_dict_as_set_full filename2 in
  let write_diff file = compare_dicts_rec file (dict1, dict2) in
  File.file_out filename_out write_diff
(**********************************************************************************)
(* Maps the first colon-separated tag of the single form of [entry] to a
   coarse category: noun, adj, adv, verb, other or cond.
   An empty first tag is classified as verb (this mirrors the previous
   behaviour, where the later test for the empty tag in the other group
   could never be reached).
   Raises Failure when the interpretation yields no tag at all or the tag
   is not one of the recognised ones; both messages now carry the name of
   this function. *)
let find_entry_cat entry =
  let form = get_form entry in
  let cat = match Xstring.split ":" form.interp with
    | cat :: _ -> cat
    | [] -> failwith ("find_entry_cat: " ^ form.interp) in
  match cat with
  | "subst" | "depr" -> "noun"
  | "adj" | "adja" | "adjc" | "adjp" -> "adj"
  | "adv" -> "adv"
  | "inf" | "praet" | "fin" | "ppas" | "pact" | "pacta"
  | "impt" | "imps" | "pcon" | "pant" | "ger" | "" -> "verb"
  | "bedzie" | "pred" | "prep" | "num" | "aglt" | "winien"
  | "qub" | "brev" | "comp" | "interj" | "burk"
  | "conj" | "ppron12" | "ppron3" | "numcomp" -> "other"
  | "cond" -> "cond"
  | _ -> failwith ("find_entry_cat: " ^ cat)
(* Fills the [cat] field of every entry using [find_entry_cat].
   Uses [Xlist.rev_map]; callers in this file re-reverse with [List.rev]
   when the original order matters. *)
let assign_entry_cat dict =
  let with_cat entry = {entry with cat = find_entry_cat entry} in
  Xlist.rev_map dict with_cat
(* Splits the dictionary [in_path ^ filename] by category into five files
   under [out_path], named by prefixing noun_, adj_, adv_, verb_ and
   other_ to [filename] (with a trailing .gz removed). Entries of category
   cond are skipped. Raises Failure for an unexpected category. *)
let split_dict in_path filename out_path =
  let dict = List.rev (assign_entry_cat (load_tab (in_path ^ filename))) in
  let filename =
    if Xstring.check_sufix ".gz" filename then Xstring.cut_sufix ".gz" filename
    else filename in
  let out_name prefix = out_path ^ prefix ^ filename in
  File.file_out (out_name "noun_") (fun noun_file ->
  File.file_out (out_name "adj_") (fun adj_file ->
  File.file_out (out_name "adv_") (fun adv_file ->
  File.file_out (out_name "verb_") (fun verb_file ->
  File.file_out (out_name "other_") (fun other_file ->
    Xlist.iter dict (fun entry ->
      let form = get_form entry in
      let target = match entry.cat with
        | "noun" -> Some noun_file
        | "adj" -> Some adj_file
        | "adv" -> Some adv_file
        | "verb" -> Some verb_file
        | "other" -> Some other_file
        | "cond" -> None
        | _ -> failwith "split_dict" in
      match target with
      | Some file -> fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
      | None -> ()))))))
(* Groups single-form entries into multi-form entries. The grouping key is
   lemma and category; for nouns the key additionally contains the gender
   read from the interpretation (with an optional fifth segment appended).
   A depr interpretation with gender m2 is keyed as m1.
   Raises Failure when a noun interpretation has an unexpected shape or
   when two merged entries disagree on proper_type. *)
let merge_entries dict =
  let dict = assign_entry_cat dict in
  let gender_of_interp interp =
    match Xstring.split ":" interp with
    | ["depr"; _; _; "m2"] -> "m1"
    | "depr" :: _ -> failwith ("merge_entries: " ^ interp)
    | [_; _; _; gender] -> gender
    | [_; _; _; gender; col] -> gender ^ ":" ^ col
    | _ -> failwith ("merge_entries: " ^ interp) in
  let merged = Xlist.fold dict StringMap.empty (fun acc entry ->
    let form = get_form entry in
    let key =
      if entry.cat = "noun" then
        let gender = gender_of_interp form.interp in
        String.concat "|" [entry.lemma; entry.cat; gender]
      else String.concat "|" [entry.lemma; entry.cat] in
    StringMap.add_inc acc key entry (fun prev ->
      if entry.proper_type <> prev.proper_type then
        failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ prev.proper_type)
      else {prev with forms = form :: prev.forms})) in
  StringMap.fold merged [] (fun acc _ e -> e :: acc)
(* Keeps only the entries whose category differs from [cat]. The result is
   accumulated by consing inside [Xlist.fold]. *)
let remove_cat cat dict =
  Xlist.fold dict [] (fun kept entry ->
    if entry.cat <> cat then entry :: kept else kept)
(**********************************************************************************)
(* For every noun or adj entry, looks for an orth that is attested under
   every distinct interp of the entry. When exactly one such orth exists
   the entry is split in two: one entry with the remaining forms and one
   entry, flagged ndm=true, holding the forms spelled with that orth.
   Entries of other categories, and entries without such an orth, are
   passed through unchanged. Raises Failure when more than one orth
   qualifies. *)
let mark_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else
(* map: interp -> set of orths occurring with that interp *)
let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
(* qmap: each orth is added once per interp it occurs with; presumably
   StringQMap.add counts occurrences - TODO confirm against Xstd *)
let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
StringSet.fold orths qmap StringQMap.add) in
(* n is the number of distinct interps of this entry *)
let n = StringMap.size map in
(* orths whose value in qmap equals the number of distinct interps *)
let found = StringQMap.fold qmap [] (fun found orth v ->
if v = n then orth :: found else found) in
match found with
[] -> entry :: dict
| [orth] ->
(* ndm: forms spelled exactly [orth]; odm: all other forms *)
let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
(* NOTE(review): the remainder entry is added even when odm is empty *)
let dict = {entry with forms=odm} :: dict in
{entry with forms=ndm; ndm=true} :: dict
| _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))
(* Writes one line (orth of the first form, lemma, category; separated by
   tabs) for every entry flagged as ndm. Uses [List.hd], so an ndm entry
   with no forms raises Failure. *)
let print_ndm filename dict =
  File.file_out filename (fun out ->
    Xlist.iter dict (fun entry ->
      if entry.ndm then begin
        let first = List.hd entry.forms in
        fprintf out "%s\t%s\t%s\n" first.orth entry.lemma entry.cat
      end))
(* Keeps only the entries that are not flagged as ndm. *)
let remove_ndm dict =
  Xlist.fold dict [] (fun kept entry ->
    if not entry.ndm then entry :: kept else kept)
(* Keeps only the entries that are flagged as ndm. *)
let remove_not_ndm dict =
  Xlist.fold dict [] (fun kept entry ->
    if entry.ndm then entry :: kept else kept)
(* Lemmata whose forms get their trailing particles stripped by
   [find_kolwiek_suffixes]; grouped below by part of speech.
   NOTE(review): the non-ASCII literals in this copy of the file look
   mis-encoded - verify them against the original UTF-8 source. *)
let kolwiek_lemmas = StringSet.of_list [
(* adj *)
"czyjkolwiek"; "czyjล"; "czyjลผe"; "jakiciล"; "jakikolwiek"; "jakisi"; "jakiล"; "jakiลci";
"jakiลciล"; "jakiลkolwiek"; "jakiลผ"; "jakiลผkolwiek"; "jakowyล"; "kijลผe"; "kiลผ"; "ktรณrykolwiek";
"ktรณryล"; "ktรณryลผ"; "ktรณryลผkolwiek"; "niejakiล"; "takiลผ"; "takowyลผ"; "tenลผe"; "tyliลผ"; "รณwลผe";
(* noun *)
"cokolwiek:s"; "coลkolwiek"; "cรณลผkolwiek"; "ktokolwiek"; "ktoลkolwiek"; "ktรณลผkolwiek";
"cociล"; "cosi"; "cosik"; "cosiล"; "coล:s"; "coลciล"; "coลผe"; "cรณลผ";
"ktoล:s2"; "ktรณลผ";
(* adv *)
"jakkolwiek"; "jakoล"; "maลoลผ"; "niejakkolwiek"; "niejakoล"; (*"niemalลผe";*) ]
(* Suffixes removed from form orths by [find_kolwiek_suffixes]. The list
   order matters: that function tries the suffixes in this order and each
   removal is applied to the result of the previous ones.
   NOTE(review): the non-ASCII literals in this copy of the file look
   mis-encoded - verify them against the original UTF-8 source. *)
let kolwiek_suffixes = [
"ลผkolwiek"; "ลผekolwiek"; "ลkolwiek"; "kolwiek"; "ลผ"; "ลผe"; "ลciล"; "ciล"; "ล"; "ลci"; "sik"; "si"]
(* For entries whose lemma belongs to [kolwiek_lemmas], strips from every
   form orth each suffix of [kolwiek_suffixes] in turn (every check is made
   on the result of the previous removals). Other entries are returned
   unchanged. The dictionary is traversed with [Xlist.rev_map]. *)
let find_kolwiek_suffixes dict =
  let strip_suffixes orth =
    Xlist.fold kolwiek_suffixes orth (fun current suffix ->
      if Xstring.check_sufix suffix current then Xstring.cut_sufix suffix current
      else current) in
  Xlist.rev_map dict (fun entry ->
    if not (StringSet.mem kolwiek_lemmas entry.lemma) then entry
    else
      let forms = Xlist.map entry.forms (fun form ->
        {form with orth = strip_suffixes form.orth}) in
      {entry with forms = forms})
(* Lemmata that [remove_exceptional_lemmata] removes from a dictionary.
   The list contains many empty-string placeholders, which are harmless
   duplicates inside the set, and large commented-out alternatives.
   NOTE(review): the non-ASCII literals in this copy of the file look
   mis-encoded and some are broken across physical lines - verify them
   against the original UTF-8 source. *)
let exceptional_lemmata = StringSet.of_list ([
(* wiele stemรณw *)
"Apollo"; "Aujeszky"; "Biaลystok"; "Gรณzd"; "Krasnystaw"; "Rรณลผanystok"; "Wielkanoc"; "biaลagลowa";
"deszcz"; "imฤpan"; "iลciec"; "otrzฤ
s"; "rzeczpospolita"; "wilczeลyko"; "woleoczko";
"prapraojciec"; "praojciec"; "ojciec"; "wspรณลbrat"; "spรณลbrat"; "pรณลbrat"; "brat";
"pรณลczลowiek"; "przedczลowiek"; "praczลowiek"; "nadczลowiek"; "git-czลowiek"; "czลowiek";
"pรณลdziecko"; "+lecie"; "zimoziele"; "ziele"; "trรณjziele"; "nasienie";
"ksiฤ
dz"; "dech"; "tydzieล"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest";
"dziecko"; "ucho:s2"; "oko:s2"; "czeลฤ:s"; "jo-jo"; "Zabลotce"; "tysiฤ
c:s1"; "pรณลmiesiฤ
c"; "miesiฤ
c"; ""; ""; "";
"Pia"; "rฤka"; "czลek"; "Kozak:s1"; "bรณg"; "psubrat"; "pieniฤ
dz"; ""; ""; ""; "";
"kto"; "ktokolwiek"; "ktoล:s2"; "ktoลkolwiek"; "ktรณลผ"; "ktรณลผkolwiek"; "nikt"; "nic";
"co:s"; "cociล"; "cokolwiek:s"; "cosi"; "cosik"; "cosiล"; "coล:s"; "coลciล"; "coลkolwiek"; "coลผe"; "cรณลผ"; "cรณลผkolwiek";
"niebiosa"; "Wลochy:s1"; "Niemcy"; "Wฤgry"; "Austro-Wฤgry"; ""; ""; ""; ""; ""; ""; "";
"zajฤ
c:s1"; "tysiฤ
c:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";
"bliscy"; "ojcowie"; "teลciowie"; "ichmoลciowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; "";
"maลลผonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; "";
"starzy"; "wasi"; "nasi"; "najmลodsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "gลรณwnia"; "gลownia"; "dลบwignia"; ""; ""; ""; "";
"workowiลnia"; "wiลnia"; "soลnia"; "laurowiลnia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; "";
"makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; "";
"Uj"; "PIT"; "ChAT"; "podczลowiek"; "nieczลowiek"; "czeลฤ"; "ktoล"; "ktosik"; ""; ""; ""; "";
"+รณwna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coล"; "coลtam"; ""; ""; "";
"zajฤ
c"; "tysiฤ
c"; "rok"; "roczek"; "oko"; "ucho"; "Wลochy"; "niebiosy"; "wici"; ""; ""; "";
"Andrรฉ"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"zลy:a"; "dobry:a"; "przymaลy"; "maลy:a"; "duลผy"; "wielki:a";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"zwaฤ"; "wiedzieฤ"; "pรณjลฤ"; "przejลฤ"; "dojลฤ"; "zsiฤ
ลฤ"; "iลฤ"; ""; ""; ""; ""; "";
"byฤ"; "zasลoniฤ"; "sลoniฤ"; "przysลoniฤ"; "przesลoniฤ"; "osลoniฤ"; "odsลoniฤ"; ""; ""; ""; ""; "";
(*
(* bลฤ
d w sลowniku *)
"รณwลผe";
(* wiele stemรณw *)
"twรณj:a"; "swรณj"; "mรณj:a"; "wszystek";
(* obocznoลci w stemie *)
"co:s"; "cociล"; "cokolwiek:s"; "cosi"; "cosik"; "cosiล"; "coล:s"; "coลciล"; "coลkolwiek"; "coลผe"; "cรณลผ"; "cรณลผkolwiek";
"kto"; "ktokolwiek"; "ktoล:s2"; "ktoลkolwiek"; "ktรณลผ"; "ktรณลผkolwiek"; "nikt"; "nic";
"Angel"; "Apollo"; "Biaลystok"; "Bober"; "Dziฤgiel"; "Engel"; "Goลฤ
b:s2"; "Gรณzd"; "Hendel"; "Herschel"; "Jastrzฤ
b";
"Kodrฤ
b:s2"; "Kozioล"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeล"; "Rรณลผanystok"; "Schuster"; "Stฤpieล"; "Sลonim";
"Wielkanoc"; "achtel"; "archiprezbiter"; "arcydziฤgiel"; "bedel"; "ber"; "biaลagลowa"; "biaลodrzew"; "ceter"; "deszcz";
"drama"; "dziรณb:s1"; "dziฤgiel"; "dลผemper"; "falafel"; "grubodziรณb"; "harbajtel"; "harbejtel"; "harmider"; "imฤpan";
"iลciec"; "jarzฤ
b:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureล"; "manczester";
"nadpiersieล"; "osep"; "otrzฤ
s"; "pedel"; "piksel"; "podpiersieล"; "podziem"; "prezbiter"; "protokรณล"; "przedpiersieล";
"ratel"; "rondel:s2"; "rozpiรณr:s1"; "rozpiรณr:s2"; "rzeczpospolita"; "rzep:s2"; "rzepieล"; "rzewieล"; "rฤ
b"; "sosrฤ
b";
"srebrnodrzew"; "swฤ
d"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczeลyko"; "woleoczko"; "wลosieล:s2";
"zew"; "zลotogลรณw"; "ลwireล"; "ลบreb"; "ลผรณลtodziรณb";
"czลowiek"; "pรณลczลowiek"; "przedczลowiek"; "praczลowiek"; "nadczลowiek"; "git-czลowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"przechrzest"; "chrzest"; "pรณลdziecko"; "roczek:s2"; "rok:s1"; "tydzieล"; ""; ""; ""; ""; ""; "";
(* obocznoลci w odmianie *)
"niekaลผdy"; "kaลผdy"; "niektรณry:a"; "ktรณry"; "tenลผe"; "ten"; "tamten"; "kijลผe";
"ucho:s2"; "dziecko"; "oko:s2"; "imiฤ"; "nozdrze";
"ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"byฤ"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
(* pozostawione *)
"czyjลผe"; "czyjล"; "czyjkolwiek"; "kiลผ"; "รณw"; "ow"; "on:a"; "ki";
"Pia"; "jo-jo"; "+lecie"; "";
"zagrzฤ
ลบฤ"; "zrzeฤ";
(* niepeลny paradygmat *)
"zrรณลฤ"; "zlฤ
c"; "zarรณลฤ"; "zaprzฤ
c"; "zaprzysiฤ
c"; "zanieลฤ:v2"; "zalฤ
c"; "wzrรณลฤ"; "wyrรณลฤ"; "wyprzฤ
c"; "wyprzysiฤ
c";
"rรณลฤ"; "sprzysiฤ
c"; "sprzฤ
c"; "ugrzฤ
ลบฤ"; "ulฤ
c"; "upiec:v2"; "uprzฤ
c"; "urรณลฤ"; "wieลฤ:v2"; "wprzฤ
c"; "wrรณลฤ"; "wylฤ
c";
"powieลฤ:v2"; "posiฤ
c"; "przelฤ
c"; "przeprzฤ
c"; "przerรณลฤ"; "przyprzฤ
c"; "przysiฤ
c"; "przyrรณลฤ"; "przฤ
c"; "pรณjลฤ:v2"; "rozprzฤ
c"; "rozrรณลฤ";
"krzywoprzysiฤ
c"; "lฤ
c"; "narรณลฤ"; "obrรณลฤ"; "odprzysiฤ
c"; "odprzฤ
c"; "odrรณลฤ"; "oprzysiฤ
c"; "podrรณลฤ"; "pogrzฤ
ลบฤ"; "poprzysiฤ
c"; "porรณลฤ";
"dojลฤ:v2"; "doprzฤ
c"; "dorรณลฤ"; "dosiฤ
c"; "grzฤ
ลบฤ"; "iลฤ:v2";
(* wiele stemรณw *)
"uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyลผec"; "zawlec"; "zaลผec"; "zewlec"; "zwlec"; "zลผec"; "ลผec";
"podwlec"; "podลผec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeลผec"; "przyoblec"; "przywlec"; "przyลผec"; "rozwlec"; "rozลผec";
"dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeลผreฤ";
(* inne *)
"liลบฤ"; "iลฤ:v1"; "wyniลฤ"; "wynijลฤ"; "wyjลฤ"; "wniลฤ"; "wnijลฤ"; "wejลฤ"; "ujลฤ"; "rozejลฤ"; "pรณjลฤ:v1"; "przyjลฤ"; "przejลฤ:v2"; "przejลฤ:v1"; "podejลฤ"; "odejลฤ"; "obejลฤ:v2"; "obejลฤ:v1"; "najลฤ:v2"; "najลฤ:v1"; "nadejลฤ"; "dojลฤ:v1";
"roztworzyฤ:v2"; "przetworzyฤ:v2"; "otworzyฤ";
"zsiฤ
ลฤ:v2"; "zsiฤ
ลฤ:v1"; "zesiฤ
ลฤ"; "zasiฤ
ลฤ"; "wysiฤ
ลฤ"; "wspรณลposiฤ
ลฤ"; "wsiฤ
ลฤ"; "usiฤ
ลฤ"; "siฤ
ลฤ"; "rozsiฤ
ลฤ"; "przysiฤ
ลฤ"; "przesiฤ
ลฤ"; "powsiฤ
ลฤ"; "posiฤ
ลฤ"; "podsiฤ
ลฤ"; "osiฤ
ลฤ"; "obsiฤ
ลฤ"; "nasiฤ
ลฤ"; "dosiฤ
ลฤ";
"ลบreฤ:v1"; "zniลฤ"; "znijลฤ"; "znajลฤ"; "zejลฤ"; "zejลฤ"; "zajลฤ:v2"; "zajลฤ:v1"; "wzniลฤ"; "wznijลฤ"; "wzejลฤ"
(*
"moi"; "twoi";
(*"AIDS"; "BGลป"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPลป";*) (*"Jokai"; "Jรณkai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
(*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
"arcyksiฤลผna"; "cornflakes"; "ksiฤลผna"; (*"scrabble";*) "sms"; "teลcina";
"Wielkanoc"; "biaลagลowa"; "rzeczpospolita"; "imฤpan";
"Ob"; "podziem"; "Pia"; "woleoczko"; "wilczeลyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziรณb"; "rozpiรณr:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"wลosieล:s2"; "deszcz"; "falafel"; "Krasnystaw";
"Rรณลผanystok"; "Biaลystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"dziecko"; "oko:s2"; "ucho:s2"; "pรณลdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"idea"; "rฤka"; "czeลฤ:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
"NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
"WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
"PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
"czลowiek"; "pรณลczลowiek"; "przedczลowiek"; "praczลowiek"; "nadczลowiek"; "git-czลowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
"burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sฤdzia:s1"; "wicehrabia"; "wspรณลsฤdzia";
"cieลla"; "bibliopola"; "ลwiszczypaลka"; "ลwircaลka"; "ลwierczaลka"; "ลcierciaลka"; "tatka"; "sลuga:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypaลka"; "mฤลผczyzna"; "klecha";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
] (*@ File.load_lines "../morphology/data/obce.tab"*) (* @
File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @
File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *))
(* Drops every entry whose lemma is a member of [exceptional_lemmata]. *)
let remove_exceptional_lemmata dict =
  Xlist.fold dict [] (fun kept entry ->
    let exceptional = StringSet.mem exceptional_lemmata entry.lemma in
    if exceptional then kept else entry :: kept)
(* Fills the [stem] field of every entry: for ndm entries it is the orth of
   the first form; for noun, adj, adv and verb entries it is computed by
   [Stem.generate_stem]; for any other category it is the empty string. *)
let generate_stem dict =
  let stem_of entry =
    if entry.ndm then (List.hd entry.forms).orth
    else match entry.cat with
      | "noun" | "adj" | "adv" | "verb" -> Stem.generate_stem entry
      | _ -> "" in
  Xlist.rev_map dict (fun entry -> {entry with stem = stem_of entry})
(* Fills [lu_stem] of every relation record with the result of
   [Stem.generate_stem_lu] applied to its two lemmata. *)
let generate_stem_lu dict =
  Xlist.rev_map dict (fun entry ->
    let lu_stem = Stem.generate_stem_lu entry.lemma1 entry.lemma2 in
    {entry with lu_stem = lu_stem})
(* Lowercases both lemmata of every relation record with
   [Xunicode.lowercase_utf8_string]. *)
let lowercase_lu dict =
  let lower = Xunicode.lowercase_utf8_string in
  Xlist.rev_map dict (fun entry ->
    {entry with lemma1 = lower entry.lemma1; lemma2 = lower entry.lemma2})
(* Replaces the lemma (after [Stem.simplify_lemma_full], whose suffix part
   is discarded) and every form orth by their [Fonetics.translate] image.
   Entries for which any step raises are silently dropped.
   NOTE(review): the handler is a catch-all, so it also hides unexpected
   exceptions; kept as is because dropping failing entries appears to be
   intended. *)
let fonetic_translation dict =
  let translate s = Fonetics.translate Fonetics.rules s in
  Xlist.fold dict [] (fun acc e ->
    try
      let lemma, _lemma_suf = Stem.simplify_lemma_full e.lemma in
      let translated =
        {e with lemma = translate lemma;
                forms = Xlist.map e.forms (fun f ->
                  {f with orth = translate f.orth})} in
      translated :: acc
    with _ -> acc)
(* Sets [validated] on every form: a form is validated when at least one
   (stem, rule) candidate returned by [Rules.CharTrees.find] for its orth
   satisfies stem ^ rule.set = simplified lemma. Both the entries and the
   forms of each entry are traversed with [Xlist.rev_map]. *)
let validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let confirmed = Xlist.fold candidates false (fun ok (stem, rule) ->
        ok || stem ^ rule.set = simple_lemma) in
      {form with validated = confirmed}) in
    {entry with forms = forms})
(* Validates relation records against [rules]:
   - lu_validated: some candidate of lemma1 and some candidate of lemma2
     rebuild the same string (stem ^ rule.set);
   - validated1 / validated2: some candidate of lemma1 / lemma2 has a stem
     equal to the lu_stem of the record. *)
let validate_lu rules dict =
  Xlist.rev_map dict (fun entry ->
    let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
    let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
    let share_lemma = Xlist.fold candidates1 false (fun ok (stem1, rule1) ->
      Xlist.fold candidates2 ok (fun ok (stem2, rule2) ->
        ok || stem1 ^ rule1.set = stem2 ^ rule2.set)) in
    let has_lu_stem candidates =
      Xlist.fold candidates false (fun ok (stem, _) ->
        ok || stem = entry.lu_stem) in
    {entry with
     lu_validated = share_lemma;
     validated1 = has_lu_stem candidates1;
     validated2 = has_lu_stem candidates2})
(* Like [validate], but a candidate rule only counts when
   - its star is Ndm exactly when the entry is flagged ndm,
   - stem ^ rule.set equals the simplified lemma, and
   - its interp equals the interp of the form. *)
let validate_interp rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let confirmed = Xlist.fold candidates false (fun ok (stem, rule) ->
        ok || ((rule.star = Ndm) = entry.ndm
               && stem ^ rule.set = simple_lemma
               && form.interp = rule.interp)) in
      {form with validated = confirmed}) in
    {entry with forms = forms})
(* Removes validated forms from every entry and drops the entries that are
   left without any form. *)
let remove_validated_forms dict =
  Xlist.fold dict [] (fun acc entry ->
    let pending = Xlist.fold entry.forms [] (fun kept form ->
      if not form.validated then form :: kept else kept) in
    match pending with
    | [] -> acc
    | _ :: _ -> {entry with forms = pending} :: acc)
(* Keeps, unmodified, the entries that have at least one form that is not
   validated; entries whose forms are all validated (or that have no
   forms) are dropped. *)
let remove_validated_entries dict =
  Xlist.fold dict [] (fun acc entry ->
    let has_pending = Xlist.fold entry.forms false (fun pending form ->
      pending || not form.validated) in
    if has_pending then entry :: acc else acc)
(* Keeps only the relation records that are not lu_validated. *)
let remove_validated_lu dict =
  Xlist.fold dict [] (fun kept entry ->
    if not entry.lu_validated then entry :: kept else kept)
(* Keeps only the validated forms of every entry and drops the entries that
   are left without any form. *)
let remove_not_validated_forms dict =
  Xlist.fold dict [] (fun acc entry ->
    let confirmed = Xlist.fold entry.forms [] (fun kept form ->
      if form.validated then form :: kept else kept) in
    match confirmed with
    | [] -> acc
    | _ :: _ -> {entry with forms = confirmed} :: acc)
(* Keeps, unmodified, the entries all of whose forms are validated (an
   entry without forms qualifies); any entry with a non-validated form is
   dropped. *)
let remove_not_validated_entries dict =
  Xlist.fold dict [] (fun acc entry ->
    let all_validated = Xlist.fold entry.forms true (fun ok form ->
      ok && form.validated) in
    if all_validated then entry :: acc else acc)
(* Writes every form of every entry to [filename], one per line, as
   orth, lemma and interp separated by tabs. *)
let print filename dict =
  File.file_out filename (fun out ->
    Xlist.iter dict (fun entry ->
      let lemma = entry.lemma in
      Xlist.iter entry.forms (fun form ->
        fprintf out "%s\t%s\t%s\n" form.orth lemma form.interp)))
(* Writes the lemma of every entry to [filename], one per line. *)
let print_lemmata filename dict =
  let write_all out =
    Xlist.iter dict (fun entry -> fprintf out "%s\n" entry.lemma) in
  File.file_out filename write_all
(* Removes the forms whose interp ends with :neg or :sup and drops the
   entries that are left without any form. *)
let remove_sup_neg_forms dict =
  Xlist.fold dict [] (fun acc entry ->
    let kept = Xlist.fold entry.forms [] (fun kept form ->
      let is_neg = Xstring.check_sufix ":neg" form.interp in
      if is_neg || Xstring.check_sufix ":sup" form.interp then kept
      else form :: kept) in
    match kept with
    | [] -> acc
    | _ :: _ -> {entry with forms = kept} :: acc)
(* Proposes new rules for the forms of [path ^ filename] that the existing
   [rules] do not validate. The dictionary is merged, ndm-marked, cleaned,
   phonetically translated, stemmed and validated; the remaining forms
   (minus :neg / :sup ones) are passed to
   [RuleGenerator.generate_rules_entry]. Output, grouped under @RULES
   headers by key, lists each rule with its count and example lemmata. *)
let generate_rules rules path filename rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> mark_ndm (* FIXME: remove_ndm? *)
    |> find_kolwiek_suffixes
    |> remove_exceptional_lemmata
    |> fonetic_translation
    |> generate_stem
    |> validate rules
    |> remove_validated_forms
    |> remove_sup_neg_forms in
  (* Increments the count; stops collecting examples once the count
     reaches 20. *)
  let add_example lemma (q, l) = q + 1, (if q < 20 then lemma :: l else l) in
  let grouped = Xlist.fold dict StringMap.empty (fun acc entry ->
    Xlist.fold (RuleGenerator.generate_rules_entry entry) acc (fun acc (key, rule) ->
      let per_key = try StringMap.find acc key with Not_found -> StringMap.empty in
      let per_key =
        StringMap.add_inc per_key rule (1, [entry.lemma]) (add_example entry.lemma) in
      StringMap.add acc key per_key)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter grouped (fun interp per_key ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter per_key (fun rule (q, l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
(* Counterpart of [generate_rules] for lemma-pair relations: loads relation
   [id] from [path], lowercases and stems it, removes the pairs validated
   by [rules] and writes the rules proposed by
   [RuleGenerator.generate_rules_lu_entry] in the same output format. *)
let generate_rules_lu rules id path rules_filename =
  let dict =
    load_lu [] id path
    |> lowercase_lu
    |> generate_stem_lu
    |> validate_lu rules
    |> remove_validated_lu in
  (* Increments the count; stops collecting examples once the count
     reaches 20. *)
  let add_example lemma (q, l) = q + 1, (if q < 20 then lemma :: l else l) in
  let grouped = Xlist.fold dict StringMap.empty (fun acc entry ->
    Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) acc (fun acc (key, rule, lemma) ->
      let per_key = try StringMap.find acc key with Not_found -> StringMap.empty in
      let per_key = StringMap.add_inc per_key rule (1, [lemma]) (add_example lemma) in
      StringMap.add acc key per_key)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter grouped (fun interp per_key ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter per_key (fun rule (q, l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
(* Returns the first [n] elements of [l], or all of [l] when it is shorter.
   A negative [n] never reaches zero, so the whole list is returned. *)
let rec get_first n l =
  match l with
  | [] -> []
  | _ when n = 0 -> []
  | x :: rest -> x :: get_first (n - 1) rest
(* Proposes interpretation rules for the forms of [path ^ filename] that
   [interp_rules] do not validate. Candidates come from
   [RuleGenerator.generate_interp_rules]; for each candidate the set of
   lemmata that produced it is collected. Every output line holds the
   candidate, the size of that set and up to 20 lemmata taken from the
   reversed [StringSet.to_list] of the set. *)
let generate_interp_rules rules interp_rules con_flag group_flag lemma_flag path filename rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> mark_ndm (* FIXME: remove_ndm? *)
    |> find_kolwiek_suffixes
    |> remove_exceptional_lemmata
    |> generate_stem
    |> validate_interp interp_rules
    |> remove_validated_forms in
  let proposed = Xlist.fold dict StringMap.empty (fun acc entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms acc (fun acc form ->
      let candidates =
        RuleGenerator.generate_interp_rules
          rules con_flag group_flag lemma_flag simple_lemma form in
      Xlist.fold candidates acc (fun acc cand ->
        StringMap.add_inc acc cand (StringSet.singleton entry.lemma) (fun set ->
          StringSet.add set entry.lemma)))) in
  File.file_out rules_filename (fun file ->
    StringMap.iter proposed (fun cand lemmata ->
      let examples = get_first 20 (List.rev (StringSet.to_list lemmata)) in
      fprintf file "\t%s\t# %d %s\n" cand (StringSet.size lemmata)
        (String.concat " " examples)))
(* Builds a RuleQMap of Ndm rules, one per distinct form interp found in
   [dict] (every other rule field is empty), then re-keys it so that each
   rule gets an id made of the letter N and a running number starting at 1,
   carrying over the value stored for it in the first map. *)
let generate_ndm_rules dict =
  let counted = Xlist.fold dict Rules.RuleQMap.empty (fun acc entry ->
    Xlist.fold entry.forms acc (fun acc form ->
      let ndm_rule =
        {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in
      Rules.RuleQMap.add acc ndm_rule)) in
  let numbered, _next_id =
    Rules.RuleQMap.fold counted (Rules.RuleQMap.empty, 1) (fun (acc, i) rule freq ->
      Rules.RuleQMap.add_val acc {rule with id = "N" ^ string_of_int i} freq, i + 1) in
  numbered
(* Computes rule frequencies over the dictionary [path ^ filename] and
   writes them to [rules_filename], one [Rules.string_of_freq_rule] line
   per rule. Ndm entries contribute rules from [generate_ndm_rules]; for
   each remaining form, the head of the list of candidates that rebuild
   the simplified lemma with a matching interp is counted once. *)
let generate_rule_frequencies rules path filename rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> remove_cat "cond"
    |> mark_ndm in
  let ndm_rules = generate_ndm_rules (remove_not_ndm dict) in
  let dict =
    remove_ndm dict
    |> remove_exceptional_lemmata
    |> generate_stem in
  let freq_rules = Xlist.fold dict ndm_rules (fun acc entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms acc (fun acc form ->
      let matching =
        Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun found (stem, rule) ->
          if stem ^ rule.set = simple_lemma && form.interp = rule.interp
          then (stem, rule) :: found else found) in
      match matching with
      | [] -> acc
      | (_, rule) :: _ -> Rules.RuleQMap.add acc rule)) in
  File.file_out rules_filename (fun file ->
    Rules.RuleQMap.iter freq_rules (fun rule freq ->
      fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq = freq})))
(* Builds a stem dictionary: loads frequency rules from [rules_filename],
   and for every form of [path ^ filename] that some rule explains (same
   ndm status, stem ^ rule.set equal to the simplified lemma, same interp)
   records the id of the head matching rule under the key made of the
   stem, a tab and the lemma suffix. Output lines hold the key, a tab and
   the space-separated rule ids. *)
let generate_stem_dict rules_filename path filename out_filename =
  let rules = Rules.CharTrees.create (Rules.load_freq_rules rules_filename) in
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> remove_cat "cond"
    |> mark_ndm in
  let stems = Xlist.fold dict StringMap.empty (fun acc entry ->
    let simple_lemma, lemma_suf = Stem.simplify_lemma_full entry.lemma in
    Xlist.fold entry.forms acc (fun acc form ->
      let matching =
        Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun found (stem, rule) ->
          let star_agrees = (rule.star = Ndm) = entry.ndm in
          if star_agrees && stem ^ rule.set = simple_lemma && form.interp = rule.interp
          then (stem, rule) :: found else found) in
      match matching with
      | [] -> acc
      | (stem, rule) :: _ ->
        StringMap.add_inc acc (stem ^ "\t" ^ lemma_suf) [rule.id] (fun ids ->
          rule.id :: ids))) in
  File.file_out out_filename (fun file ->
    StringMap.iter stems (fun stem ids ->
      fprintf file "%s\t%s\n" stem (String.concat " " ids)))