dict.ml
25.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
open Xstd
open Printf
open Types
let get_form e =
match e.forms with
[form] -> form
| _ -> failwith "get_form"
let load_tab filename =
File.load_tab filename (function
orth :: lemma :: interp :: _ ->
{lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type="";
ndm=false; stem=""}
| line -> failwith ("load_tab: " ^ (String.concat "\t" line)))
let load_tab_full filename =
File.load_tab filename (function
[orth; lemma; interp] ->
{lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type="";
ndm=false; stem=""}
| [orth; lemma; interp; proper_type] ->
{lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type=proper_type;
ndm=false; stem=""}
| [orth; lemma; interp; proper_type; genre] ->
{lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=genre; validated=false}]; proper_type=proper_type;
ndm=false; stem=""}
(* | orth :: lemma :: interp :: proper_type :: genre -> (orth,lemma,interp,proper_type,String.concat ";" genre) :: l *)
| line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))
let pos = StringSet.of_list [
"subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
"brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
"adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
"winien";"siebie"
]
let rec find_pos rev = function
s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l
| [] -> failwith "find_pos"
let split_lemma_interp s =
let l = Xstring.split_delim ":" s in
let lemma,interp = find_pos [List.hd l] (List.tl l) in
String.concat ":" lemma, String.concat ":" interp
let rec remove_empties = function
"" :: l -> remove_empties l
| l -> l
let split_freq_orth s =
match remove_empties (Xstring.split " " s) with
freq :: l -> (*print_endline (String.concat "|" (freq :: l));*) int_of_string freq, String.concat " " l
| _ -> failwith "split_freq_orth"
let load_freq_tab filename =
File.load_tab filename (function
[freq_orth; lemma_interp] ->
(* print_endline (freq_orth ^ "\t" ^ lemma_interp); *)
let freq,orth = split_freq_orth freq_orth in
let lemma,interp = split_lemma_interp lemma_interp in
{lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=freq; genre=""; validated=false}]; proper_type="";
ndm=false; stem=""}
| line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line)))
let proper_type_selector e = e.proper_type
let genre_selector e = e.genre
let interp_selector e = (get_form e).interp
let freq_selector e = (get_form e).freq
let print_quantities out_filename selector dict =
let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry ->
StringQMap.add qmap (selector entry)) in
File.file_out out_filename (fun file ->
StringQMap.iter qmap (fun k v ->
fprintf file "%6d\t%s\n" v k))
(**********************************************************************************)
let load_dict_as_set filename =
let l = load_tab filename in
List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
let form = get_form entry in
StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]))))
let load_dict_as_set_full filename =
let l = load_tab_full filename in
List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
let form = get_form entry in
StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]))))
let rec compare_dicts_rec file = function
[],[] -> ()
| [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
| a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
| a :: la, b :: lb ->
if a = b then compare_dicts_rec file (la,lb) else
if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
(fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))
let compare_dicts filename1 filename2 filename_out =
let dict1 = load_dict_as_set filename1 in
let dict2 = load_dict_as_set filename2 in
File.file_out filename_out (fun file ->
compare_dicts_rec file (dict1,dict2))
let compare_dicts_full filename1 filename2 filename_out =
let dict1 = load_dict_as_set_full filename1 in
let dict2 = load_dict_as_set_full filename2 in
File.file_out filename_out (fun file ->
compare_dicts_rec file (dict1,dict2))
(**********************************************************************************)
let find_entry_cat entry =
let form = get_form entry in
let cat,tags = match Xstring.split ":" form.interp with
cat :: tags -> cat,tags
| _ -> failwith ("find_entry_type: " ^ form.interp) in
if cat = "subst" || cat = "depr" then "noun" else
if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else
if cat = "adv" then "adv" else
if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else
if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" ||
cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "" then "other" else
if cat = "cond" then "cond" else
failwith ("find_entry_cat: " ^ cat)
let assign_entry_cat dict =
Xlist.rev_map dict (fun entry ->
{entry with cat = find_entry_cat entry})
let split_dict path filename =
let dict = load_tab (path ^ filename) in
let dict = List.rev (assign_entry_cat dict) in
File.file_out (path ^ "noun_" ^ filename) (fun noun_file ->
File.file_out (path ^ "adj_" ^ filename) (fun adj_file ->
File.file_out (path ^ "adv_" ^ filename) (fun adv_file ->
File.file_out (path ^ "verb_" ^ filename) (fun verb_file ->
File.file_out (path ^ "other_" ^ filename) (fun other_file ->
Xlist.iter dict (fun entry ->
let form = get_form entry in
try
let file = match entry.cat with
"noun" -> noun_file
| "adj" -> adj_file
| "adv" -> adv_file
| "verb" -> verb_file
| "other" -> other_file
| "cond" -> raise Not_found
| _ -> failwith "split_dict" in
fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
with Not_found -> ()))))))
let merge_entries dict =
let dict = assign_entry_cat dict in
let map = Xlist.fold dict StringMap.empty (fun map entry ->
let form = get_form entry in
let key =
if entry.cat = "noun" then (* FIXME: problem z depr *)
let gender = match Xstring.split ":" form.interp with
["depr";_;_;"m2"] -> "m1"
| "depr" :: _ -> failwith ("merge_entries: " ^ form.interp)
| [_;_;_;gender] -> gender
| _ -> failwith ("merge_entries: " ^ form.interp) in
entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender
else entry.lemma ^ "|" ^ entry.cat in
StringMap.add_inc map key entry (fun e ->
if entry.proper_type <> e.proper_type then
failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else
{e with forms = form :: e.forms})) in
StringMap.fold map [] (fun dict _ e -> e :: dict)
(**********************************************************************************)
let mark_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else
let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
StringSet.fold orths qmap StringQMap.add) in
let n = StringMap.size map in
let found = StringQMap.fold qmap [] (fun found orth v ->
if v = n then orth :: found else found) in
(* printf "%s\t%s\t%d\t%s\n%!" entry.lemma entry.cat n (String.concat " " found); *)
match found with
[] -> entry :: dict
| [orth] ->
let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
let dict = {entry with forms=odm} :: dict in
{entry with forms=ndm; ndm=true} :: dict
| _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))
let print_ndm filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
if entry.ndm then
let orth = (List.hd entry.forms).orth in
fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))
let kolwiek_lemmas = StringSet.of_list [
(* adj *)
"czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
"jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
"któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
(* noun *)
"cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
"cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
"ktoś:s2"; "któż";
(* adv *)
"jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]
let kolwiek_suffixes = [
"żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]
let find_kolwiek_suffixes dict =
Xlist.rev_map dict (fun entry ->
if StringSet.mem kolwiek_lemmas entry.lemma then
{entry with forms=Xlist.map entry.forms (fun form ->
{form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf ->
if Xstring.check_sufix kolwiek_suf orth then
Xstring.cut_sufix kolwiek_suf orth
else orth)})}
else entry)
let exceptional_lemmata = StringSet.of_list ([
(* błąd w słowniku *)
"ówże";
(* wiele stemów *)
"twój:a"; "swój"; "mój:a"; "wszystek";
(* oboczności w stemie *)
"co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
"kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
"Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
"Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
"Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
"drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
"iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
"nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
"ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
"srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
"zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
"człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
(* oboczności w odmianie *)
"niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
"ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
"ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
(* pozostawione *)
"czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
"Pia"; "jo-jo"; "+lecie"; "";
"zagrząźć"; "zrzeć";
(* niepełny paradygmat *)
"zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
"róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
"powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
"krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
"dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
(* wiele stemów *)
"uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
"podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
"dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
(* inne *)
"liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
"roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
"zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
"źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
(*
"moi"; "twoi";
(*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
(*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
"arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
"Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
"Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
"Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
"NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
"WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
"PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
"człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
"burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
"cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
] @ File.load_lines "data/obce.tab" (* @
File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @
File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab" *))
let remove_exceptional_lemmata dict =
Xlist.fold dict [] (fun dict entry ->
if StringSet.mem exceptional_lemmata entry.lemma then dict
else entry :: dict)
let generate_stem dict =
Xlist.rev_map dict (fun entry ->
{entry with stem=
if entry.ndm then (List.hd entry.forms).orth else
if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
Stem.generate_stem entry
else ""})
let validate dict =
let rules = Rules.compound_rule_trees in
Xlist.rev_map dict (fun entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
let forms = Xlist.rev_map entry.forms (fun form ->
(* printf "E %s\t%s\t%s\n" orth lemma interp; *)
let candidates = Rules.CharTrees.find rules form.orth in
(* printf "S %d\n" (Xlist.size forms); *)
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
(* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *)
if stem ^ rule.Rules.set = simple_lemma then (stem,rule) :: candidates else candidates) in
if candidates = [] then {form with validated=false} else {form with validated=true}) in
{entry with forms=forms})
let validate_interp rules dict =
let rules = Rules.interp_compound_rule_trees in
Xlist.rev_map dict (fun entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
let forms = Xlist.rev_map entry.forms (fun form ->
(* printf "E %s\t%s\t%s\n" orth lemma interp; *)
let candidates = Rules.CharTrees.find rules form.orth in
(* printf "S %d\n" (Xlist.size forms); *)
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
(* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *)
if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then
(stem,rule) :: candidates else candidates) in
if candidates = [] then {form with validated=false} else {form with validated=true}) in
{entry with forms=forms})
let remove_validated_forms dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then forms else form :: forms) in
if forms = [] then dict else {entry with forms=forms} :: dict)
let remove_validated_entries dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then forms else form :: forms) in
if forms = [] then dict else entry :: dict)
let remove_not_validated_forms dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then form :: forms else forms) in
if forms = [] then dict else {entry with forms=forms} :: dict)
let remove_not_validated_entries dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if form.validated then form :: forms else forms) in
if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict)
let print filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
Xlist.iter entry.forms (fun form ->
fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))
let print_lemmata filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
fprintf file "%s\n" entry.lemma))
let remove_sup_neg_forms dict =
Xlist.fold dict [] (fun dict entry ->
let forms = Xlist.fold entry.forms [] (fun forms form ->
if Xstring.check_sufix ":neg" form.interp || Xstring.check_sufix ":sup" form.interp then
forms else form :: forms) in
if forms = [] then dict else {entry with forms=forms} :: dict)
let generate_rules path filename rules_filename =
let dict = load_tab (path ^ filename) in
(* printf "g1 %d\n%!" (Xlist.size dict); *)
let dict = merge_entries dict in
(* printf "g2 %d\n%!" (Xlist.size dict); *)
let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
(* printf "g3 %d\n%!" (Xlist.size dict); *)
let dict = find_kolwiek_suffixes dict in
(* printf "g4 %d\n%!" (Xlist.size dict); *)
let dict = remove_exceptional_lemmata dict in
(* printf "g5 %d\n%!" (Xlist.size dict); *)
let dict = generate_stem dict in
let dict = validate dict in
let dict = remove_validated_forms dict in
(* printf "g6 %d\n%!" (Xlist.size dict); *)
let dict = remove_sup_neg_forms dict in
(* printf "g7 %d\n%!" (Xlist.size dict); *)
let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
Xlist.fold (RuleGenerator.generate_rules_entry entry) rules (fun rules (key,rule) ->
let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
StringMap.add rules key rules2)) in
File.file_out rules_filename (fun file ->
StringMap.iter rules (fun interp rules2 ->
fprintf file "\n@RULES %s\n" interp;
StringMap.iter rules2 (fun rule (q,l) ->
fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
let generate_interp_rules con_flag group_flag lemma_flag path filename rules_filename =
let dict = load_tab (path ^ filename) in
(* printf "g1 %d\n%!" (Xlist.size dict); *)
let dict = merge_entries dict in
(* printf "g2 %d\n%!" (Xlist.size dict); *)
let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
(* printf "g3 %d\n%!" (Xlist.size dict); *)
let dict = find_kolwiek_suffixes dict in
(* printf "g4 %d\n%!" (Xlist.size dict); *)
let dict = remove_exceptional_lemmata dict in
(* printf "g5 %d\n%!" (Xlist.size dict); *)
let dict = generate_stem dict in
let dict = validate_interp Rules.interp_compound_rule_trees dict in
let dict = remove_validated_forms dict in
(* printf "g6 %d\n%!" (Xlist.size dict); *)
(* printf "g7 %d\n%!" (Xlist.size dict); *)
let interp_rules = Xlist.fold dict StringMap.empty (fun interp_rules entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
Xlist.fold entry.forms interp_rules (fun interp_rules form ->
let candidates = RuleGenerator.generate_interp_rules con_flag group_flag lemma_flag simple_lemma form in
Xlist.fold candidates interp_rules (fun interp_rules cand ->
StringMap.add_inc interp_rules cand (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l)))) in
File.file_out rules_filename (fun file ->
StringMap.iter interp_rules (fun k (q,l) ->
fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l)))
let generate_rule_frequencies path filename rules_filename =
let rules = Rules.interp_compound_rule_trees in
let dict = load_tab (path ^ filename) in
(* printf "g1 %d\n%!" (Xlist.size dict); *)
let dict = merge_entries dict in
(* printf "g2 %d\n%!" (Xlist.size dict); *)
let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
(* printf "g3 %d\n%!" (Xlist.size dict); *)
(* printf "g4 %d\n%!" (Xlist.size dict); *)
let dict = remove_exceptional_lemmata dict in
(* printf "g5 %d\n%!" (Xlist.size dict); *)
let dict = generate_stem dict in
(* printf "g6 %d\n%!" (Xlist.size dict); *)
(* printf "g7 %d\n%!" (Xlist.size dict); *)
let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
let simple_lemma = Stem.simplify_lemma entry.lemma in
Xlist.fold entry.forms freq_rules (fun freq_rules form ->
(* printf "E %s\t%s\t%s\n" orth lemma interp; *)
let candidates = Rules.CharTrees.find rules form.orth in
(* printf "S %d\n" (Xlist.size forms); *)
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
(* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *)
if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then
(stem,rule) :: candidates else candidates) in
if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in
File.file_out rules_filename (fun file ->
Rules.RuleQMap.iter freq_rules (fun rule freq ->
fprintf file "%s\n" (Rules.string_of_freq_rule {rule with Rules.freq=freq})))
let generate_stem_dict rules_filename path filename out_filename =
let rules = Rules.load_freq_rules rules_filename in
let rules = Rules.CharTrees.create rules in
let dict = load_tab (path ^ filename) in
let dict = merge_entries dict in
let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
Xlist.fold entry.forms stems (fun stems form ->
let candidates = Rules.CharTrees.find rules form.orth in
let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then
(stem,rule) :: candidates else candidates) in
if candidates = [] then stems else
let stem,rule = List.hd candidates in
StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.Rules.id] (fun l -> rule.Rules.id :: l))) in
File.file_out out_filename (fun file ->
StringMap.iter stems (fun stem ids ->
fprintf file "%s\t%s\n" stem (String.concat " " ids)))