|
1
2
3
4
5
6
7
8
9
10
11
12
|
open Xstd
open Printf
open Types
(* Returns the single form stored in entry [e].
   Raises [Failure "get_form"] when the entry holds no form or more than one. *)
let get_form e =
  match e.forms with
  | [only_form] -> only_form
  | [] | _ :: _ -> failwith "get_form"
(* Loads a dictionary through [File.load_tab], turning every row into an entry
   that carries exactly one form.  The first three columns of a row are read as
   orth, lemma and interp; any further columns are ignored.
   Raises [Failure "load_tab: <row>"] for a row with fewer than three columns.
   NOTE(review): presumably [File.load_tab] splits each line on tabs - confirm. *)
let load_tab filename =
  File.load_tab filename (function
    | orth :: lemma :: interp :: _ ->
        let form = {empty_form with orth; interp} in
        {empty_entry with lemma; forms = [form]}
    | line -> failwith ("load_tab: " ^ String.concat "\t" line))
(* Loads a dictionary through [File.load_tab], accepting rows with exactly
   three, four or five columns:
     orth, lemma, interp [, proper_type [, genre]].
   Columns that are absent keep the values of [empty_entry] / [empty_form].
   Raises [Failure "load_tab_full: <row>"] for any other row length. *)
let load_tab_full filename =
  File.load_tab filename (function
    | [orth; lemma; interp] ->
        {empty_entry with lemma; forms = [{empty_form with orth; interp}]}
    | [orth; lemma; interp; proper_type] ->
        {empty_entry with lemma; forms = [{empty_form with orth; interp}]; proper_type}
    | [orth; lemma; interp; proper_type; genre] ->
        {empty_entry with lemma; forms = [{empty_form with orth; interp; genre}]; proper_type}
    | line -> failwith ("load_tab_full: " ^ String.concat "\t" line))
|
|
26
27
28
29
30
31
32
|
(* Reads the file [path ^ "morf_rel_<id>_lu.tab"] and prepends one relation
   record per two-column row to [dict].  Every record gets [rel_id = id], an
   empty [lu_stem] and all validation flags set to [false].
   Raises [Failure "load_lu: <row>"] for a row that is not exactly two columns. *)
let load_lu dict id path =
  let filename = path ^ "morf_rel_" ^ string_of_int id ^ "_lu.tab" in
  let add_relation acc = function
    | [lemma1; lemma2] ->
        let relation =
          {lemma1; lemma2; rel_id = id; lu_stem = ""; lu_validated = false;
           validated1 = false; validated2 = false} in
        relation :: acc
    | line -> failwith ("load_lu: " ^ String.concat "\t" line) in
  File.fold_tab filename dict add_relation
|
|
33
34
35
36
|
(* Set of part-of-speech tags.  [find_pos] uses it to recognise the first
   component of an interpretation inside a colon-separated string. *)
let pos = StringSet.of_list [
  "subst"; "adj"; "adv"; "interp"; "num"; "xxx"; "prep"; "fin"; "praet"; "qub"; "inf"; "interj";
  "brev"; "numcol"; "ppas"; "pact"; "adja"; "conj"; "ger"; "pcon"; "pant"; "comp"; "depr";
  "adjp"; "imps"; "impt"; "pred"; "bedzie"; "burk"; "aglt"; "ppron12"; "ppron3"; "adjc";
  "winien"; "siebie"; "numcomp";
]
(* Splits a list of string components at the first element that belongs to
   [pos].  [rev] accumulates, in reverse, the components already skipped.
   Returns (components before the tag, components from the tag onwards).
   Raises [Failure "find_pos"] when no component is a known tag. *)
let rec find_pos rev = function
  | [] -> failwith "find_pos"
  | (s :: _) as rest when StringSet.mem pos s -> List.rev rev, rest
  | s :: l -> find_pos (s :: rev) l
(* Splits a "lemma:interp" string into (lemma, interp).  The first
   colon-separated component always belongs to the lemma; the interpretation
   starts at the first later component recognised by [find_pos].
   Raises [Failure] when the split yields no component or no tag is found. *)
let split_lemma_interp s =
  let components = Xstring.split_delim ":" s in
  let lemma_parts, interp_parts =
    find_pos [List.hd components] (List.tl components) in
  String.concat ":" lemma_parts, String.concat ":" interp_parts
(* Drops the leading empty strings of a list; empty strings that occur after
   the first non-empty element are kept. *)
let rec remove_empties l =
  match l with
  | "" :: rest -> remove_empties rest
  | _ -> l
(* Parses a "<freq> <orth...>" field: after leading empty pieces are removed,
   the first space-separated piece is converted with [int_of_string] and the
   remaining pieces are joined back with single spaces.
   Raises [Failure "split_freq_orth"] when nothing is left after removing the
   empty pieces; [int_of_string] raises [Failure] on a non-numeric frequency. *)
let split_freq_orth s =
  match remove_empties (Xstring.split " " s) with
  | freq :: orth_parts -> int_of_string freq, String.concat " " orth_parts
  | [] -> failwith "split_freq_orth"
(* Loads a two-column frequency list through [File.load_tab].  The first
   column is decoded by [split_freq_orth], the second by [split_lemma_interp];
   each row becomes an entry with one form carrying orth, interp and freq.
   Raises [Failure "load_freq_tab: <row>"] for a row that is not two columns. *)
let load_freq_tab filename =
  File.load_tab filename (function
    | [freq_orth; lemma_interp] ->
        let freq, orth = split_freq_orth freq_orth in
        let lemma, interp = split_lemma_interp lemma_interp in
        {empty_entry with lemma; forms = [{empty_form with orth; interp; freq}]}
    | line -> failwith ("load_freq_tab: " ^ String.concat "\t" line))
(* Selector: the [proper_type] field of an entry. *)
let proper_type_selector {proper_type; _} = proper_type
|
|
67
|
(* Selector: the [genre] of the entry's single form (see [get_form]). *)
let genre_selector e =
  let form = get_form e in
  form.genre
|
|
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
(* Selector: the [interp] of the entry's single form (see [get_form]). *)
let interp_selector e =
  let form = get_form e in
  form.interp
(* Selector: the [freq] of the entry's single form (see [get_form]). *)
let freq_selector e =
  let form = get_form e in
  form.freq
(* Counts how often each value of [selector entry] occurs in [dict] and writes
   one "<count>\t<value>" line per distinct value to [out_filename]. *)
let print_quantities out_filename selector dict =
  let count qmap entry = StringQMap.add qmap (selector entry) in
  let quantities = Xlist.fold dict StringQMap.empty count in
  File.file_out out_filename (fun file ->
    let print_row key quantity = fprintf file "%6d\t%s\n" quantity key in
    StringQMap.iter quantities print_row)
(**********************************************************************************)
(* Loads [filename] with [load_tab] and returns the sorted, duplicate-free list
   of its rows rendered as "orth\tlemma\tinterp". *)
let load_dict_as_set filename =
  let add_row rows entry =
    let form = get_form entry in
    let row = String.concat "\t" [form.orth; entry.lemma; form.interp] in
    StringSet.add rows row in
  let entries = load_tab filename in
  let rows = Xlist.fold entries StringSet.empty add_row in
  List.sort compare (StringSet.to_list rows)
(* Loads [filename] with [load_tab_full] and returns the sorted, duplicate-free
   list of its rows rendered as "orth\tlemma\tinterp\tproper_type\tgenre". *)
let load_dict_as_set_full filename =
  let add_row rows entry =
    let form = get_form entry in
    let row =
      String.concat "\t"
        [form.orth; entry.lemma; form.interp; entry.proper_type; form.genre] in
    StringSet.add rows row in
  let entries = load_tab_full filename in
  let rows = Xlist.fold entries StringSet.empty add_row in
  List.sort compare (StringSet.to_list rows)
(* Writes a line diff of two sorted string lists to [file]: lines found only
   in the left list are printed as "< line", lines found only in the right
   list as "> line"; lines present in both are not printed.  Both lists are
   expected to be sorted in ascending order. *)
let rec compare_dicts_rec file = function
  | [], [] -> ()
  | [], only_right -> List.iter (fun b -> fprintf file "> %s\n" b) only_right
  | only_left, [] -> List.iter (fun a -> fprintf file "< %s\n" a) only_left
  | (a :: la as left), (b :: lb as right) ->
      let order = compare a b in
      if order = 0 then compare_dicts_rec file (la, lb)
      else if order < 0 then begin
        fprintf file "< %s\n" a;
        compare_dicts_rec file (la, right)
      end else begin
        fprintf file "> %s\n" b;
        compare_dicts_rec file (left, lb)
      end
(* Diffs two dictionaries on their (orth, lemma, interp) rows and writes the
   result to [filename_out] in the "<" / ">" format of [compare_dicts_rec]. *)
let compare_dicts filename1 filename2 filename_out =
  let left = load_dict_as_set filename1 in
  let right = load_dict_as_set filename2 in
  File.file_out filename_out (fun out -> compare_dicts_rec out (left, right))
(* Same as [compare_dicts], but the compared rows also include proper_type and
   genre (they are produced by [load_dict_as_set_full]). *)
let compare_dicts_full filename1 filename2 filename_out =
  let left = load_dict_as_set_full filename1 in
  let right = load_dict_as_set_full filename2 in
  File.file_out filename_out (fun out -> compare_dicts_rec out (left, right))
(**********************************************************************************)
(* Maps the part-of-speech tag of the entry's single form to a coarse
   category: "noun", "adj", "adv", "verb", "cond" or "other".
   The tag is the first colon-separated component of the form's interp.
   A "praet" form is "cond" when its third following tag is "pri", "sec" or
   "ter", and "verb" otherwise.
   Raises [Failure "find_entry_cat: ..."] when the interp yields no component,
   when a "praet" interp has fewer than three tags, or when the tag is unknown. *)
let find_entry_cat entry =
  let form = get_form entry in
  match Xstring.split ":" form.interp with
  | [] -> failwith ("find_entry_cat: " ^ form.interp)
  | "praet" :: tags ->
      (match tags with
       | _ :: _ :: ("pri" | "sec" | "ter") :: _ -> "cond"
       | _ :: _ :: _ :: _ -> "verb"
       | _ -> failwith ("find_entry_cat: " ^ form.interp))
  | ("subst" | "depr") :: _ -> "noun"
  | ("adj" | "adja" | "adjc" | "adjp") :: _ -> "adj"
  | "adv" :: _ -> "adv"
  (* NOTE(review): an empty tag is classified as "verb", exactly as before;
     the original also listed "" under "other", where it was unreachable. *)
  | ("inf" | "fin" | "ppas" | "pact" | "pacta" | "impt" | "imps" | "pcon"
    | "pant" | "ger" | "") :: _ -> "verb"
  | ("bedzie" | "pred" | "prep" | "num" | "aglt" | "winien" | "qub" | "brev"
    | "comp" | "interj" | "burk" | "conj" | "ppron12" | "ppron3"
    | "numcomp") :: _ -> "other"
  | "cond" :: _ -> "cond"
  | cat :: _ -> failwith ("find_entry_cat: " ^ cat)
(* Fills the [cat] field of every entry with the result of [find_entry_cat].
   Built with [Xlist.rev_map]; NOTE(review): the result is presumably in
   reverse order of [dict] ([split_dict] reverses it back) - confirm. *)
let assign_entry_cat dict =
  let with_cat entry = {entry with cat = find_entry_cat entry} in
  Xlist.rev_map dict with_cat
|
|
140
141
|
(* Splits the dictionary [in_path ^ filename] by category into five files in
   [out_path], named "noun_", "adj_", "adv_", "verb_" and "other_" followed by
   [filename] (with a trailing ".gz" removed).  Each entry is written as
   "orth\tlemma\tinterp" to the file of its category; entries of category
   "cond" are skipped.
   Raises [Failure "split_dict"] for an entry with any other category. *)
let split_dict in_path filename out_path =
  let dict = load_tab (in_path ^ filename) in
  (* [assign_entry_cat] is built on rev_map, so reverse once more. *)
  let dict = List.rev (assign_entry_cat dict) in
  let filename =
    if Xstring.check_sufix ".gz" filename then Xstring.cut_sufix ".gz" filename
    else filename in
  File.file_out (out_path ^ "noun_" ^ filename) (fun noun_file ->
  File.file_out (out_path ^ "adj_" ^ filename) (fun adj_file ->
  File.file_out (out_path ^ "adv_" ^ filename) (fun adv_file ->
  File.file_out (out_path ^ "verb_" ^ filename) (fun verb_file ->
  File.file_out (out_path ^ "other_" ^ filename) (fun other_file ->
    Xlist.iter dict (fun entry ->
      let form = get_form entry in
      (* [None] marks a category that is deliberately not written out. *)
      let target = match entry.cat with
        | "noun" -> Some noun_file
        | "adj" -> Some adj_file
        | "adv" -> Some adv_file
        | "verb" -> Some verb_file
        | "other" -> Some other_file
        | "cond" -> None
        | _ -> failwith "split_dict" in
      match target with
      | Some file -> fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
      | None -> ()))))))
(* Merges single-form entries that belong to the same lexeme into one entry
   holding all their forms.  Entries are grouped by "lemma|cat"; nouns are
   additionally keyed by a gender component taken from the form's interp.
   Raises [Failure "merge_entries: ..."] when a noun interp has an unexpected
   shape, or when two entries of one group differ in [proper_type]. *)
let merge_entries dict =
  let dict = assign_entry_cat dict in
  (* Gender part of a noun key: the fourth interp component, followed by the
     fifth one when present; the "depr" m2 shape is keyed as "m1". *)
  let noun_gender interp =
    match Xstring.split ":" interp with
    | ["depr"; _; _; "m2"] -> "m1"
    | "depr" :: _ -> failwith ("merge_entries: " ^ interp)
    | [_; _; _; gender] -> gender
    | [_; _; _; gender; col] -> gender ^ ":" ^ col
    | _ -> failwith ("merge_entries: " ^ interp) in
  let map = Xlist.fold dict StringMap.empty (fun map entry ->
    let form = get_form entry in
    let key =
      if entry.cat = "noun" then
        entry.lemma ^ "|" ^ entry.cat ^ "|" ^ noun_gender form.interp
      else entry.lemma ^ "|" ^ entry.cat in
    StringMap.add_inc map key entry (fun e ->
      if entry.proper_type <> e.proper_type then
        failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type)
      else {e with forms = form :: e.forms})) in
  StringMap.fold map [] (fun dict _ e -> e :: dict)
|
|
184
185
186
187
188
|
(* Returns the entries of [dict] whose category differs from [cat].  The
   result is accumulated by consing inside [Xlist.fold]. *)
let remove_cat cat dict =
  let keep_other kept entry =
    if entry.cat = cat then kept else entry :: kept in
  Xlist.fold dict [] keep_other
|
|
189
190
191
192
193
194
195
196
|
(* Finds the aspect of a verb from its forms: the second component of the
   first "inf:<aspect>" interp, or the fifth component of the first
   six-component "ger:..." interp, whichever form comes first in the list.
   Raises [Failure "get_aspect: <lemma>"] when no form has either shape. *)
let rec get_aspect lemma = function
  | [] -> failwith ("get_aspect: " ^ lemma)
  | (f : form) :: rest ->
      (match Xstring.split ":" f.interp with
       | ["inf"; aspect] | ["ger"; _; _; _; aspect; _] -> aspect
       | _ -> get_aspect lemma rest)
|
|
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
|
(* Classifies a lemma by the last UTF-8 character of its simplified form
   ([Stem.simplify_lemma]): "a", "e", "o", "y" and "ę" map to themselves,
   "i" maps to "y", anything else (including the empty lemma) to "ε". *)
let get_lemma_suf lemma =
  let last_char =
    if lemma = "" then ""
    else
      let simplified = Stem.simplify_lemma lemma in
      let chars = Xunicode.utf8_chars_of_utf8_string simplified in
      List.hd (List.rev chars) in
  match last_char with
  | ("a" | "e" | "o" | "y" | "ę") as vowel -> vowel
  | "i" -> "y"
  | _ -> "ε"
(* Classifies an orthographic form by its last UTF-8 character: "j" and "e"
   map to themselves, anything else (including the empty string) to "ε". *)
let get_orth_suf orth =
  let last_char =
    if orth = "" then ""
    else
      let chars = Xunicode.utf8_chars_of_utf8_string orth in
      List.hd (List.rev chars) in
  match last_char with
  | ("j" | "e") as c -> c
  | _ -> "ε"
(* Merges the interpretations of forms that share one orthographic string.
   The forms are grouped by [orth]; for every group the set of interps is
   sorted and looked up in the rule table below, keyed by the lemma suffix
   class ([get_lemma_suf]) and the sorted interp list.  A matching rule
   replaces the group by one or more forms with merged tags (alternatives are
   joined with "|"); a group with no matching rule yields one form per interp.
   The returned forms are built from [empty_form], so only [orth] and
   [interp] are carried over from the input.
   The interp lists in the patterns must stay in sorted order, because they
   are matched against the output of the sort; rules are tried top to bottom. *)
let merge_interps lemma forms =
  let lemma_suf = get_lemma_suf lemma in
  let map = Xlist.fold forms StringMap.empty (fun map form ->
    (* printf "merge_interps 1: %s %s\n%!" form.orth form.interp; *)
    StringMap.add_inc map form.orth (StringSet.singleton form.interp) (fun set -> StringSet.add set form.interp)) in
  StringMap.fold map [] (fun forms orth set ->
    (* printf "merge_interps 2: %s %s\n%!" orth (String.concat " " (StringSet.to_list set)); *)
    let orth_suf = get_orth_suf orth in
    (* Builds an output form for the current orth with the given interp. *)
    let mk interp = {empty_form with orth=orth; interp=interp} in
    match lemma_suf, Xlist.sort (StringSet.to_list set) compare with
      _,["adv"] -> mk "adv:pos" :: forms
    | _,["adv";"adv:pos"] -> mk "adv:pos" :: forms
    | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";"adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> mk "adj:sg:nom.acc.voc:n:pos|adj:pl:nom.acc.voc:m2.m3.f.n:pos" :: forms
    | _,["adj:sg:acc:m1.m2:pos";"adj:sg:gen:m1.m2.m3.n:pos"] -> mk "adj:sg:gen:m1.m2.m3.n:pos|adj:sg:acc:m1.m2:pos" :: forms
    | _,["adj:sg:dat:f:pos";"adj:sg:gen:f:pos";"adj:sg:loc:f:pos"] -> mk "adj:sg:gen.dat.loc:f:pos" :: forms
    | _,["adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> mk "adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos" :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos"] -> mk "adj:pl:gen.loc:m1.m2.m3.f.n:pos|adj:pl:acc:m1:pos" :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:pos";"adj:sg:inst:m1.m2.m3.n:pos";"adj:sg:loc:m1.m2.m3.n:pos"] -> mk "adj:sg:inst.loc:m1.m2.m3.n:pos|adj:pl:dat:m1.m2.m3.f.n:pos" :: forms
    | _,["adj:sg:acc:f:pos";"adj:sg:inst:f:pos"] -> mk "adj:sg:acc.inst:f:pos" :: forms
    | _,["adj:pl:nom.voc:m1:pos";"adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> mk "adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos|adj:pl:nom.voc:m1:pos" :: forms
    | _,["adj:sg:acc:m3:pos";"adj:sg:nom:m1.m2.m3:pos"] -> mk "adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos" :: forms
    | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"] -> mk "adj:pl:nom.acc.voc:m2.m3.f.n:pos" :: forms
    | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> mk "adj:sg:nom.acc.voc:n:pos" :: forms
    | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos";"adja"] -> mk "adj:sg:nom.acc.voc:n:pos|adja" :: forms
    | _,["adj:pl:nom:m2.m3.f.n:pos";"adj:sg:nom:n:pos"] -> mk "adj:sg:nom:n:pos|adj:pl:nom:m2.m3.f.n:pos" :: forms
    | _,["adj:pl:acc:m2.m3.f.n:sup";"adj:pl:nom.voc:m2.m3.f.n:sup";"adj:sg:acc:n:sup";"adj:sg:nom.voc:n:sup"] -> mk "adj:sg:nom.acc.voc:n:sup|adj:pl:nom.acc.voc:m2.m3.f.n:sup" :: forms
    | _,["adj:sg:acc:m1.m2:sup";"adj:sg:gen:m1.m2.m3.n:sup"] -> mk "adj:sg:gen:m1.m2.m3.n:sup|adj:sg:acc:m1.m2:sup" :: forms
    | _,["adj:sg:dat:f:sup";"adj:sg:gen:f:sup";"adj:sg:loc:f:sup"] -> mk "adj:sg:gen.dat.loc:f:sup" :: forms
    | _,["adj:sg:acc:m3:sup";"adj:sg:nom.voc:m1.m2.m3:sup"] -> mk "adj:sg:nom.voc:m1.m2.m3:sup|adj:sg:acc:m3:sup" :: forms
    | _,["adj:pl:acc:m1:sup";"adj:pl:gen:m1.m2.m3.f.n:sup";"adj:pl:loc:m1.m2.m3.f.n:sup"] -> mk "adj:pl:gen.loc:m1.m2.m3.f.n:sup|adj:pl:acc:m1:sup" :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:sup";"adj:sg:inst:m1.m2.m3.n:sup";"adj:sg:loc:m1.m2.m3.n:sup"] -> mk "adj:sg:inst.loc:m1.m2.m3.n:sup|adj:pl:dat:m1.m2.m3.f.n:sup" :: forms
    | _,["adj:sg:acc:f:sup";"adj:sg:inst:f:sup"] -> mk "adj:sg:acc.inst:f:sup" :: forms
    | _,["adj:pl:acc:m2.m3.f.n:com";"adj:pl:nom.voc:m2.m3.f.n:com";"adj:sg:acc:n:com";"adj:sg:nom.voc:n:com"] -> mk "adj:sg:nom.acc.voc:n:com|adj:pl:nom.acc.voc:m2.m3.f.n:com" :: forms
    | _,["adj:sg:acc:m1.m2:com";"adj:sg:gen:m1.m2.m3.n:com"] -> mk "adj:sg:gen:m1.m2.m3.n:com|adj:sg:acc:m1.m2:com" :: forms
    | _,["adj:sg:dat:f:com";"adj:sg:gen:f:com";"adj:sg:loc:f:com"] -> mk "adj:sg:gen.dat.loc:f:com" :: forms
    | _,["adj:sg:acc:m3:com";"adj:sg:nom.voc:m1.m2.m3:com"] -> mk "adj:sg:nom.voc:m1.m2.m3:com|adj:sg:acc:m3:com" :: forms
    | _,["adj:pl:acc:m1:com";"adj:pl:gen:m1.m2.m3.f.n:com";"adj:pl:loc:m1.m2.m3.f.n:com"] -> mk "adj:pl:gen.loc:m1.m2.m3.f.n:com|adj:pl:acc:m1:com" :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:com";"adj:sg:inst:m1.m2.m3.n:com";"adj:sg:loc:m1.m2.m3.n:com"] -> mk "adj:sg:inst.loc:m1.m2.m3.n:com|adj:pl:dat:m1.m2.m3.f.n:com" :: forms
    | _,["adj:sg:acc:f:com";"adj:sg:inst:f:com"] -> mk "adj:sg:acc.inst:f:com" :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
         "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
         "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
         "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
         "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos"] -> mk "adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos" :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
         "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
         "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
         "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
         "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos";"adja"] -> mk "adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos|adja" :: forms
    | _,["ger:pl:nom.acc:n:imperf.perf:aff";"ger:sg:gen:n:imperf.perf:aff"] -> mk "ger:sg:gen:n:imperf.perf:aff|ger:pl:nom.acc:n:imperf.perf:aff" :: forms
    | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff";"ppas:sg:nom.acc.voc:n:imperf.perf:aff"] -> mk "ppas:sg:nom.acc.voc:n:imperf.perf:aff|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff" :: forms
    | _,["ppas:sg:acc:m1.m2:imperf.perf:aff";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff"] -> mk "ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff|ppas:sg:acc:m1.m2:imperf.perf:aff" :: forms
    | _,["ppas:sg:acc:m3:imperf.perf:aff";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff"] -> mk "ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff|ppas:sg:acc:m3:imperf.perf:aff" :: forms
    | _,["ppas:pl:acc:m1:imperf.perf:aff";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff"] -> mk "ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff|ppas:pl:acc:m1:imperf.perf:aff" :: forms
    | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff"] -> mk "ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff" :: forms
    | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff";"pact:sg:nom.acc.voc:n:imperf:aff"] -> mk "pact:sg:nom.acc.voc:n:imperf:aff|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff" :: forms
    | _,["pact:sg:acc:m1.m2:imperf:aff";"pact:sg:gen:m1.m2.m3.n:imperf:aff"] -> mk "pact:sg:gen:m1.m2.m3.n:imperf:aff|pact:sg:acc:m1.m2:imperf:aff" :: forms
    | _,["pact:pl:nom.voc:m1:imperf:aff";"pact:sg:acc:m3:imperf:aff";"pact:sg:nom.voc:m1.m2.m3:imperf:aff"] -> mk "pact:sg:nom.voc:m1.m2.m3:imperf:aff|pact:sg:acc:m3:imperf:aff|pact:pl:nom.voc:m1:imperf:aff" :: forms
    | _,["pact:pl:acc:m1:imperf:aff";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff"] -> mk "pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff|pact:pl:acc:m1:imperf:aff" :: forms
    | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:aff";"pact:sg:inst.loc:m1.m2.m3.n:imperf:aff"] -> mk "pact:sg:inst.loc:m1.m2.m3.n:imperf:aff|pact:pl:dat:m1.m2.m3.f.n:imperf:aff" :: forms
    | _,["ger:pl:nom.acc:n:imperf.perf:neg";"ger:sg:gen:n:imperf.perf:neg"] -> mk "ger:sg:gen:n:imperf.perf:neg|ger:pl:nom.acc:n:imperf.perf:neg" :: forms
    | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg";"ppas:sg:nom.acc.voc:n:imperf.perf:neg"] -> mk "ppas:sg:nom.acc.voc:n:imperf.perf:neg|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg" :: forms
    | _,["ppas:sg:acc:m1.m2:imperf.perf:neg";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg"] -> mk "ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg|ppas:sg:acc:m1.m2:imperf.perf:neg" :: forms
    | _,["ppas:sg:acc:m3:imperf.perf:neg";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg"] -> mk "ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg|ppas:sg:acc:m3:imperf.perf:neg" :: forms
    | _,["ppas:pl:acc:m1:imperf.perf:neg";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg"] -> mk "ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg|ppas:pl:acc:m1:imperf.perf:neg" :: forms
    | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg"] -> mk "ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg" :: forms
    | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg";"pact:sg:nom.acc.voc:n:imperf:neg"] -> mk "pact:sg:nom.acc.voc:n:imperf:neg|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg" :: forms
    | _,["pact:sg:acc:m1.m2:imperf:neg";"pact:sg:gen:m1.m2.m3.n:imperf:neg"] -> mk "pact:sg:gen:m1.m2.m3.n:imperf:neg|pact:sg:acc:m1.m2:imperf:neg" :: forms
    | _,["pact:pl:nom.voc:m1:imperf:neg";"pact:sg:acc:m3:imperf:neg";"pact:sg:nom.voc:m1.m2.m3:imperf:neg"] -> mk "pact:sg:nom.voc:m1.m2.m3:imperf:neg|pact:sg:acc:m3:imperf:neg|pact:pl:nom.voc:m1:imperf:neg" :: forms
    | _,["pact:pl:acc:m1:imperf:neg";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg"] -> mk "pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg|pact:pl:acc:m1:imperf:neg" :: forms
    | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:neg";"pact:sg:inst.loc:m1.m2.m3.n:imperf:neg"] -> mk "pact:sg:inst.loc:m1.m2.m3.n:imperf:neg|pact:pl:dat:m1.m2.m3.f.n:imperf:neg" :: forms
    | _,["ger:pl:gen:n:imperf.perf:aff";"inf:imperf.perf"] -> mk "ger:pl:gen:n:imperf.perf:aff" :: mk "inf:imperf.perf" :: forms
    | _,["praet:sg:m1.m2.m3:imperf.perf";"praet:sg:m1.m2.m3:imperf.perf:nagl"] -> mk "praet:sg:m1.m2.m3:imperf.perf:nagl" :: forms
    | _,["fin:sg:ter:imperf.perf";"ger:sg:nom.acc:n:imperf.perf:aff"] -> mk "fin:sg:ter:imperf.perf" :: mk "ger:sg:nom.acc:n:imperf.perf:aff" :: forms
    | _,["ger:pl:gen:n:imperf.perf:aff";"impt:sg:sec:imperf.perf"] -> mk "ger:pl:gen:n:imperf.perf:aff" :: mk "impt:sg:sec:imperf.perf" :: forms
    | _,["fin:pl:ter:imperf.perf";"ppas:sg:acc.inst:f:imperf.perf:aff"] -> mk "fin:pl:ter:imperf.perf" :: mk "ppas:sg:acc.inst:f:imperf.perf:aff" :: forms
    | "a",["subst:sg:dat.loc:f";"subst:sg:gen:f"] ->
        (* Merged into one tag only when the orth ends in "j". *)
        if orth_suf = "j" then mk "subst:sg:gen.dat.loc:f" :: forms
        else mk "subst:sg:gen:f" :: mk "subst:sg:dat.loc:f" :: forms
(*    | "a",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f"] -> (*print_endline lemma;*) {empty_form with orth=orth; interp="subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: {empty_form with orth=orth; interp="subst:sg:gen:f"} :: forms *)
    | "a",["subst:pl:gen:f";"subst:pl:loc:f"] -> mk "subst:pl:gen.loc:f" :: forms
    | "ε",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> mk "subst:sg:gen.acc:m1" :: forms
    | "ε",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> mk "subst:sg:gen.acc:m2" :: forms
    | "ε",["subst:sg:loc:m1";"subst:sg:voc:m1"] -> mk "subst:sg:loc.voc:m1" :: forms
    | "ε",["subst:sg:loc:m2";"subst:sg:voc:m2"] -> mk "subst:sg:loc.voc:m2" :: forms
    | "ε",["subst:sg:loc:m3";"subst:sg:voc:m3"] -> mk "subst:sg:loc.voc:m3" :: forms
    | "ε",["subst:sg:gen:m3";"subst:sg:loc:m3";"subst:sg:voc:m3"] -> mk "subst:sg:loc.voc:m3" :: mk "subst:sg:gen:m3" :: forms
    | "ε",["subst:sg:acc:f";"subst:sg:nom:f"] -> mk "subst:sg:nom.acc:f" :: forms
    | "ε",["subst:pl:gen:f";"subst:pl:nom.acc.voc:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> mk "subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f" :: mk "subst:pl:nom.acc.voc:f" :: forms
    | "ε",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> mk "subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f" :: forms
    | "y",["subst:sg:nom:m1";"subst:sg:voc:m1"] -> mk "subst:sg:nom.voc:m1" :: forms
    | "y",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> mk "subst:sg:gen.acc:m1" :: forms
    | "y",["subst:pl:gen.acc:m1";"subst:pl:loc:m1"] -> mk "subst:pl:gen.acc.loc:m1" :: forms
    | "y",["subst:pl:dat:m1";"subst:sg:inst:m1";"subst:sg:loc:m1"] -> mk "subst:sg:inst.loc:m1|subst:pl:dat:m1" :: forms
    | "y",["subst:sg:nom:m2";"subst:sg:voc:m2"] -> mk "subst:sg:nom.voc:m2" :: forms
    | "y",["subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> mk "subst:sg:nom.voc:m1" :: mk "subst:pl:nom.voc:m1" :: forms
    | "y",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> mk "subst:sg:gen.acc:m2" :: forms
    (* NOTE(review): the inputs below are gen and loc only, yet the merged tag
       is "gen.acc.loc" (the m3 counterpart yields "gen.loc") - possibly a
       copy of the m1 rule; kept unchanged, confirm against the tagset. *)
    | "y",["subst:pl:gen:m2";"subst:pl:loc:m2"] -> mk "subst:pl:gen.acc.loc:m2" :: forms
    | "y",["subst:pl:dat:m2";"subst:sg:inst:m2";"subst:sg:loc:m2"] -> mk "subst:sg:inst.loc:m2|subst:pl:dat:m2" :: forms
    | "y",["subst:sg:nom.acc:m3";"subst:sg:voc:m3"] -> mk "subst:sg:nom.acc.voc:m3" :: forms
    | "y",["subst:pl:gen:m3";"subst:pl:loc:m3"] -> mk "subst:pl:gen.loc:m3" :: forms
    | "y",["subst:pl:dat:m3";"subst:sg:inst:m3";"subst:sg:loc:m3"] -> mk "subst:sg:inst.loc:m3|subst:pl:dat:m3" :: forms
    | "y",["subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:nom:f";"subst:sg:voc:f"] -> mk "subst:sg:nom.gen.dat.loc.voc:f" :: forms
    | "e",["depr:pl:nom.acc.voc:m2";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> mk "subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2" :: forms
    | "e",["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> mk "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2" :: mk "subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2" :: forms
    | "e",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> mk "subst:sg:gen.acc:m1" :: forms
    | "e",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> mk "subst:sg:inst.loc:m1" :: forms
    | "e",["subst:pl:dat:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> mk "subst:sg:inst.loc:n:ncol" :: mk "subst:pl:dat:n:ncol" :: forms
    | "e",["subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> mk "subst:sg:inst.loc:n:ncol" :: forms
    | "e",["subst:pl:gen:n:ncol";"subst:pl:loc:n:ncol"] -> mk "subst:pl:gen.loc:n:ncol" :: forms
    | "e",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> mk "subst:sg:dat.loc:n:ncol" :: forms
    | "e",["subst:pl:nom.acc.voc:n:ncol";"subst:sg:gen:n:ncol"] -> mk "subst:sg:gen:n:ncol|subst:pl:nom.acc.voc:n:ncol" :: forms
    | "o",["subst:sg:nom.voc:m1"] -> mk "subst:sg:nom:m1" :: mk "subst:sg:voc:m1" :: forms
    | "o",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> mk "subst:sg:gen.acc:m1" :: forms
    | "o",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> mk "subst:sg:gen.acc:m1" :: mk "subst:pl:nom.voc:m1" :: mk "depr:pl:nom.acc.voc:m2" :: forms
    | "o",["subst:sg:dat.loc:m1"] -> mk "subst:sg:dat:m1" :: mk "subst:sg:loc:m1" :: forms
    | "o",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> mk "subst:sg:gen.acc:m2" :: forms
    | "o",["subst:pl:dat:m1:pt";"subst:pl:loc:m1:pt"] -> mk "subst:pl:dat.loc:m1:pt" :: forms
    | "ε",["subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> mk "subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol" :: forms
    | "ε",["subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol";"subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> mk "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol" :: mk "subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol" :: forms
    | "ę",["subst:sg:dat:n:col";"subst:sg:loc:n:col"] -> mk "subst:sg:dat.loc:n:col" :: forms
    | "ę",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> mk "subst:sg:dat.loc:n:ncol" :: forms
    | "o",["subst:sg:loc:m1";"subst:sg:voc:m1"] ->
        (* Merged into one tag only when the orth ends in "e". *)
        if orth_suf = "e" then mk "subst:sg:loc.voc:m1" :: forms
        else mk "subst:sg:loc:m1" :: mk "subst:sg:voc:m1" :: forms
    | _,["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"] -> mk "subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2" :: forms
    | _,[interp] -> mk interp :: forms
    | _,interps ->
        (* print_endline ("merge_interps: " (*^ lemma_suf*) ^ " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *)
        (* No rule applies: keep every interpretation as a separate form. *)
        Xlist.fold interps forms (fun forms interp ->
          mk interp :: forms))
(* Normalises and merges the interpretations of every entry in [dict].
   For a "verb" entry the aspect is first read from its forms with
   [get_aspect] and stored in the entry; then the aspect component of each
   form's interp is replaced by the neutral "imperf.perf" (interps starting
   with "pcon", "pacta" or "pact" are left as they are) and the forms are
   merged with [merge_interps].  Entries of other categories only go through
   [merge_interps].
   An interp of unexpected shape is reported on stdout and kept unchanged. *)
let process_interps dict =
  let neutral = "imperf.perf" in
  (* Rewrites the aspect component of a verbal interp to the neutral value. *)
  let neutralize_aspect interp =
    match Xstring.split ":" interp with
    | ["fin"; n; p; _] -> String.concat ":" ["fin"; n; p; neutral]
    | ["impt"; n; p; _] -> String.concat ":" ["impt"; n; p; neutral]
    | ("pcon" | "pacta" | "pact") :: _ -> interp
    | ["ger"; n; c; g; _; a] -> String.concat ":" ["ger"; n; c; g; neutral; a]
    | ["praet"; n; g; _] -> String.concat ":" ["praet"; n; g; neutral]
    | ["praet"; n; g; _; a] -> String.concat ":" ["praet"; n; g; neutral; a]
    | ["inf"; _] -> String.concat ":" ["inf"; neutral]
    | ["pant"; _] -> String.concat ":" ["pant"; neutral]
    | ["imps"; _] -> String.concat ":" ["imps"; neutral]
    | ["ppas"; n; c; g; _; a] -> String.concat ":" ["ppas"; n; c; g; neutral; a]
    | _ -> print_endline ("process_interps: " ^ interp); interp in
  Xlist.rev_map dict (fun entry ->
    if entry.cat = "verb" then
      let aspect = get_aspect entry.lemma entry.forms in
      let forms = Xlist.rev_map entry.forms (fun f ->
        {f with interp = neutralize_aspect f.interp}) in
      let forms = merge_interps entry.lemma forms in
      {entry with aspect = aspect; forms = forms}
    else
      {entry with forms = merge_interps entry.lemma entry.forms})
|
|
373
|
|
|
374
375
|
(**********************************************************************************)
|
|
376
|
(*let mark_ndm dict =
|
|
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
|
Xlist.fold dict [] (fun dict entry ->
if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else
let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
StringSet.fold orths qmap StringQMap.add) in
let n = StringMap.size map in
let found = StringQMap.fold qmap [] (fun found orth v ->
if v = n then orth :: found else found) in
match found with
[] -> entry :: dict
| [orth] ->
let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
let dict = {entry with forms=odm} :: dict in
{entry with forms=ndm; ndm=true} :: dict
| _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))
let print_ndm filename dict =
File.file_out filename (fun file ->
Xlist.iter dict (fun entry ->
if entry.ndm then
let orth = (List.hd entry.forms).orth in
fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))
|
|
402
403
404
405
406
407
408
409
|
let remove_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if entry.ndm then dict
else entry :: dict)
let remove_not_ndm dict =
Xlist.fold dict [] (fun dict entry ->
if not entry.ndm then dict
|
|
410
|
else entry :: dict)*)
|
|
411
412
|
|
|
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
|
(* Set of lemmata for which [find_kolwiek_suffixes] strips the particles listed in
   [kolwiek_suffixes] from the orthographic forms. Entries whose lemma is not in
   this set are left untouched by that function. The literals are grouped by
   part of speech; some lemmata carry a homonym marker after a colon. *)
let kolwiek_lemmas = StringSet.of_list [
(* adjectives *)
"czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
"jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
"któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
(* nouns *)
"cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
"cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
"ktoś:s2"; "któż";
(* adverbs *)
"jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]
(* Particles that [find_kolwiek_suffixes] cuts off the end of a form. The list is
   folded over, and every suffix that matches the already shortened form is
   removed, so several cuts can chain and the order of this list matters. *)
let kolwiek_suffixes = [
"żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]
(* Strips the particles from [kolwiek_suffixes] off every form of each entry whose
   lemma is a member of [kolwiek_lemmas]; all other entries are returned as they
   are. Entries are mapped with [Xlist.rev_map], so the output order is the one
   that function produces (presumably reversed). *)
let find_kolwiek_suffixes dict =
  (* Tries every suffix in turn on the progressively shortened string. *)
  let strip_suffixes orth =
    Xlist.fold kolwiek_suffixes orth (fun rest suffix ->
      match Xstring.check_sufix suffix rest with
      | true -> Xstring.cut_sufix suffix rest
      | false -> rest) in
  Xlist.rev_map dict (fun entry ->
    match StringSet.mem kolwiek_lemmas entry.lemma with
    | false -> entry
    | true ->
        {entry with forms = Xlist.map entry.forms (fun form ->
          {form with orth = strip_suffixes form.orth})})
(* Set of lemmata that [remove_exceptional_lemmata] filters out of a dictionary.
   It is built from the literal list below, concatenated with the lines read from
   ../morphology/data/obce.tab and ../morphology/data/akronimy.tab; being a
   top-level value, those files are read when this module is initialised.
   The literal list contains empty-string placeholders, so the empty lemma is a
   member of the set as well. Large parts of the list are commented out. *)
let exceptional_lemmata = StringSet.of_list ([
|
|
439
440
441
442
443
444
445
446
447
448
449
450
451
452
|
(* wiele stemów *)
"Apollo"; "Aujeszky"; "Białystok"; "Gózd"; "Krasnystaw"; "Różanystok"; "Wielkanoc"; "białagłowa";
"deszcz"; "imćpan"; "iściec"; "otrząs"; "rzeczpospolita"; "wilczełyko"; "woleoczko";
"prapraojciec"; "praojciec"; "ojciec"; "współbrat"; "spółbrat"; "półbrat"; "brat";
"półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; "człowiek";
"półdziecko"; "+lecie"; "zimoziele"; "ziele"; "trójziele"; "nasienie";
"ksiądz"; "dech"; "tydzień"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest";
"dziecko"; "ucho:s2"; "oko:s2"; "cześć:s"; "jo-jo"; "Zabłotce"; "tysiąc:s1"; "półmiesiąc"; "miesiąc"; ""; ""; "";
"Pia"; "ręka"; "człek"; "Kozak:s1"; "bóg"; "psubrat"; "pieniądz"; ""; ""; ""; "";
"kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
"co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
"niebiosa"; "Włochy:s1"; "Niemcy"; "Węgry"; "Austro-Węgry"; ""; ""; ""; ""; ""; ""; "";
"zając:s1"; "tysiąc:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; "";
|
|
453
454
|
(* "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";*)
|
|
455
456
457
458
459
460
461
462
463
464
465
466
|
"bliscy"; "ojcowie"; "teściowie"; "ichmościowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; "";
"małżonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; "";
"starzy"; "wasi"; "nasi"; "najmłodsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "głównia"; "głownia"; "dźwignia"; ""; ""; ""; "";
"workowiśnia"; "wiśnia"; "sośnia"; "laurowiśnia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; "";
"makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; "";
"Uj"; "PIT"; "ChAT"; "podczłowiek"; "nieczłowiek"; "cześć"; "ktoś"; "ktosik"; ""; ""; ""; "";
"+ówna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coś"; "cośtam"; ""; ""; "";
"zając"; "tysiąc"; "rok"; "roczek"; "oko"; "ucho"; "Włochy"; "niebiosy"; "wici"; ""; ""; "";
"André"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"zły:a"; "dobry:a"; "przymały"; "mały:a"; "duży"; "wielki:a";
|
|
467
|
"ppoż."; "ppanc."; "pepanc."; "midi:a"; ""; ""; ""; ""; ""; ""; ""; "";
|
|
468
469
470
471
|
"zwać"; "wiedzieć"; "pójść"; "przejść"; "dojść"; "zsiąść"; "iść"; ""; ""; ""; ""; "";
"być"; "zasłonić"; "słonić"; "przysłonić"; "przesłonić"; "osłonić"; "odsłonić"; ""; ""; ""; ""; "";
(*
|
|
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
|
(* błąd w słowniku *)
"ówże";
(* wiele stemów *)
"twój:a"; "swój"; "mój:a"; "wszystek";
(* oboczności w stemie *)
"co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
"kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
"Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
"Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
"Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
"drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
"iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
"nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
"ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
"srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
"zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
"człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
(* oboczności w odmianie *)
"niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
"ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
"ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
"NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
|
|
496
|
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
|
|
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
|
(* pozostawione *)
"czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
"Pia"; "jo-jo"; "+lecie"; "";
"zagrząźć"; "zrzeć";
(* niepełny paradygmat *)
"zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
"róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
"powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
"krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
"dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
(* wiele stemów *)
"uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
"podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
"dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
(* inne *)
"liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
"roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
"zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
"źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
(*
"moi"; "twoi";
(*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
(*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
"arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
"Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
"Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
"Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
"idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
"NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
"WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
"PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
"człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
"szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
"poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
"burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
"cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
|
|
544
|
] @ File.load_lines "../morphology/data/obce.tab" @ File.load_lines "../morphology/data/akronimy.tab" @
|
|
545
546
|
(* File.load_lines "results/interp_validated_verb.tab" @ *)
(*File.load_lines "results/interp_validated_noun.tab" @ File.load_lines "results/interp_validated_adj.tab" @
|
|
547
|
File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @
|
|
548
|
File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *) [])
|
|
549
550
551
552
553
554
|
(* Returns the entries of [dict] whose lemma is not a member of
   [exceptional_lemmata]. Kept entries are prepended to the accumulator while
   folding over [dict]. *)
let remove_exceptional_lemmata dict =
  Xlist.fold dict [] (fun kept entry ->
    match StringSet.mem exceptional_lemmata entry.lemma with
    | true -> kept
    | false -> entry :: kept)
|
|
555
556
557
558
559
|
(* Same filter as [remove_exceptional_lemmata], but the set of lemmata to drop is
   supplied by the caller as [ex]. Kept entries are prepended to the accumulator
   while folding over [dict]. *)
let remove_exceptional_lemmata_gen ex dict =
  Xlist.fold dict [] (fun kept entry ->
    match StringSet.mem ex entry.lemma with
    | true -> kept
    | false -> entry :: kept)
|
|
560
561
562
|
(* Fills the [stem] field of every entry: entries of category noun, adj, adv or
   verb get the result of [Stem.generate_stem]; every other category gets the
   empty string. Entries are mapped with [Xlist.rev_map]. *)
let generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    (* disabled: if entry.ndm then (List.hd entry.forms).orth else *)
    let stem =
      match entry.cat with
      | "noun" | "adj" | "adv" | "verb" -> Stem.generate_stem entry
      | _ -> "" in
    {entry with stem = stem})
|
|
568
|
(*let phon_generate_stem dict =
|
|
569
570
571
572
573
|
Xlist.rev_map dict (fun entry ->
{entry with phon_stem=
if entry.ndm then (List.hd entry.forms).phon_orth else
if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
Stem.phon_generate_stem entry
|
|
574
|
else []})*)
|
|
575
|
|
|
576
577
578
579
580
581
582
583
584
585
|
(* Sets [lu_stem] of every lexical-unit pair to the value computed by
   [Stem.generate_stem_lu] from its two lemmata. Entries are mapped with
   [Xlist.rev_map]. *)
let generate_stem_lu dict =
  Xlist.rev_map dict (fun lu ->
    let lu_stem = Stem.generate_stem_lu lu.lemma1 lu.lemma2 in
    {lu with lu_stem = lu_stem})
(* Replaces both lemmata of every lexical-unit pair with the result of
   [Xunicode.lowercase_utf8_string]. Entries are mapped with [Xlist.rev_map]. *)
let lowercase_lu dict =
  let lower = Xunicode.lowercase_utf8_string in
  Xlist.rev_map dict (fun lu ->
    {lu with lemma1 = lower lu.lemma1; lemma2 = lower lu.lemma2})
|
|
586
|
|
|
587
588
589
|
(* Adds phonetic transcriptions to the dictionary.

   For every entry the simplified lemma, the stem and the [orth] of each form are
   passed through [Fonetics.translate_and_check] and stored in [phon_lemma],
   [phon_stem] and [phon_orth] respectively.

   An entry for which the transcription raises is reported on standard output
   and left out of the result:
   - NF for [Fonetics.NotFound],
   - NE for [Fonetics.NotEqual],
   - MS for [Fonetics.MulipleSolutions],
   - EX for any other exception (previously such entries vanished without any
     trace; they are still dropped, but the exception is now printed).

   Successfully translated entries are prepended to the accumulator while
   folding over [dict]. *)
let fonetic_translation dict =
  let translate s =
    Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules s in
  Xlist.fold dict [] (fun acc e ->
    try
      let lemma = Stem.simplify_lemma e.lemma in
      let phon_lemma = translate lemma in
      let phon_stem = translate e.stem in
      {e with phon_lemma = phon_lemma; phon_stem = phon_stem;
        forms = Xlist.map e.forms (fun f ->
          let phon_orth = translate f.orth in
          {f with phon_orth = phon_orth})} :: acc
    with
    | Fonetics.NotFound(x,s) -> printf "NF %s %s %s\n%!" e.lemma x s; acc
    | Fonetics.NotEqual(x,s,t) -> printf "NE %s %s %s %s\n%!" e.lemma x s t; acc
    | Fonetics.MulipleSolutions(x,s,l) -> printf "MS %s %s %s: %s\n%!" e.lemma x s (String.concat " " l); acc
    (* Keep the best-effort behaviour, but never lose an entry silently. *)
    | exn -> printf "EX %s %s\n%!" e.lemma (Printexc.to_string exn); acc)
|
|
602
|
|
|
603
|
(* Marks each form as validated when at least one (stem, rule) pair returned by
   [Rules.CharTrees.find] for its [orth] reconstructs the simplified lemma of the
   entry, i.e. [stem ^ rule.set] equals that lemma. Both entries and forms are
   mapped with [Xlist.rev_map]. *)
let validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let matching =
        Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun acc (stem,rule) ->
          match stem ^ rule.set = simple_lemma with
          | true -> (stem,rule) :: acc
          | false -> acc) in
      match matching with
      | [] -> {form with validated=false}
      | _ :: _ -> {form with validated=true}) in
    {entry with forms=forms})
|
|
613
614
615
616
617
618
619
620
621
622
623
|
(* Phonetic counterpart of [validate]: for every element of a form's [phon_orth]
   the rule tree is searched, each candidate [stem ^ rule.set] is translated back
   with [Fonetics.translate_single] and compared with the simplified lemma. A form
   is marked validated when at least one candidate matches. *)
let phon_validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let matching = Xlist.fold form.phon_orth [] (fun acc s ->
        Xlist.fold (Rules.CharTrees.find rules s) acc (fun acc (stem,rule) ->
          let candidate_lemma =
            Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          match candidate_lemma = simple_lemma with
          | true -> (stem,rule) :: acc
          | false -> acc)) in
      match matching with
      | [] -> {form with validated=false}
      | _ :: _ -> {form with validated=true}) in
    {entry with forms=forms})
|
|
624
625
626
627
628
629
630
631
632
633
634
635
636
|
(* Validates lexical-unit pairs against the rule tree.
   - [lu_validated]: some candidate of [lemma1] and some candidate of [lemma2]
     yield the same [stem ^ rule.set] string;
   - [validated1] / [validated2]: some candidate of the respective lemma has a
     stem equal to [lu_stem].
   Entries are mapped with [Xlist.rev_map]. *)
let validate_lu rules dict =
  Xlist.rev_map dict (fun entry ->
    let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
    let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
    let same_target =
      Xlist.fold candidates1 false (fun found (stem1,rule1) ->
        Xlist.fold candidates2 found (fun found (stem2,rule2) ->
          stem1 ^ rule1.set = stem2 ^ rule2.set || found)) in
    let has_lu_stem candidates =
      Xlist.fold candidates false (fun found (stem,_) ->
        stem = entry.lu_stem || found) in
    let b1 = has_lu_stem candidates1 in
    let b2 = has_lu_stem candidates2 in
    {entry with lu_validated=same_target; validated1=b1; validated2=b2})
|
|
637
|
(* Like [phon_validate], but a candidate only counts when, in addition to
   reconstructing the simplified lemma, the interpretation of the rule equals the
   interpretation of the form. *)
let validate_interp rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let matching = Xlist.fold form.phon_orth [] (fun acc s ->
        Xlist.fold (Rules.CharTrees.find rules s) acc (fun acc (stem,rule) ->
          (* disabled ndm filter:
             if rule.star = Ndm && not entry.ndm then candidates else
             if rule.star <> Ndm && entry.ndm then candidates else *)
          let candidate_lemma =
            Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          match candidate_lemma = simple_lemma && form.interp = rule.interp with
          | true -> (stem,rule) :: acc
          | false -> acc)) in
      match matching with
      | [] -> {form with validated=false}
      | _ :: _ -> {form with validated=true}) in
    {entry with forms=forms})
(* Keeps only the forms that are not validated. An entry left with no forms is
   dropped altogether. Both forms and entries are accumulated by prepending. *)
let remove_validated_forms dict =
  Xlist.fold dict [] (fun kept entry ->
    let pending = Xlist.fold entry.forms [] (fun pending form ->
      if not form.validated then form :: pending else pending) in
    match pending with
    | [] -> kept
    | _ :: _ -> {entry with forms=pending} :: kept)
(* Drops the entries all of whose forms are validated (including entries with no
   forms at all). An entry with at least one non-validated form is kept
   unchanged, with all of its forms. *)
let remove_validated_entries dict =
  Xlist.fold dict [] (fun kept entry ->
    let has_pending = Xlist.fold entry.forms false (fun seen form ->
      seen || not form.validated) in
    match has_pending with
    | true -> entry :: kept
    | false -> kept)
|
|
663
664
665
666
|
(* Returns the lexical-unit pairs whose [lu_validated] flag is not set. Kept
   pairs are prepended to the accumulator while folding over [dict]. *)
let remove_validated_lu dict =
  Xlist.fold dict [] (fun kept entry ->
    match entry.lu_validated with
    | true -> kept
    | false -> entry :: kept)
|
|
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
|
(* Keeps only the validated forms. An entry left with no forms is dropped
   altogether. Both forms and entries are accumulated by prepending. *)
let remove_not_validated_forms dict =
  Xlist.fold dict [] (fun kept entry ->
    let confirmed = Xlist.fold entry.forms [] (fun confirmed form ->
      match form.validated with
      | true -> form :: confirmed
      | false -> confirmed) in
    match confirmed with
    | [] -> kept
    | _ :: _ -> {entry with forms=confirmed} :: kept)
(* Drops every entry that has at least one non-validated form; an entry is kept
   (unchanged) only when all of its forms are validated, which includes an entry
   with no forms. *)
let remove_not_validated_entries dict =
  Xlist.fold dict [] (fun kept entry ->
    let all_validated = Xlist.fold entry.forms true (fun ok form ->
      ok && form.validated) in
    if all_validated then entry :: kept else kept)
(* Writes the dictionary to [filename], one form per line, as three
   tab-separated columns: orth, lemma, interp. *)
let print filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      let lemma = entry.lemma in
      Xlist.iter entry.forms (fun form ->
        fprintf file "%s\t%s\t%s\n" form.orth lemma form.interp)))
(* Writes the lemma of every entry of [dict] to [filename], one per line. *)
let print_lemmata filename dict =
  let write_all file =
    Xlist.iter dict (fun entry -> fprintf file "%s\n" entry.lemma) in
  File.file_out filename write_all
|
|
689
690
691
692
693
694
695
696
|
(* Removes the forms whose interpretation ends with :neg or :sup. An entry left
   with no forms is dropped. Both forms and entries are accumulated by
   prepending. *)
let remove_sup_neg_forms dict =
  let is_sup_or_neg interp =
    Xstring.check_sufix ":neg" interp || Xstring.check_sufix ":sup" interp in
  Xlist.fold dict [] (fun kept entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      match is_sup_or_neg form.interp with
      | true -> forms
      | false -> form :: forms) in
    match forms with
    | [] -> kept
    | _ :: _ -> {entry with forms=forms} :: kept)
|
|
697
|
(* Proposes new rules for the forms that the existing [rules] cannot explain.

   The dictionary read from [path ^ filename] is merged, its interpretations are
   processed, exceptional lemmata are removed, particles are stripped, stems and
   phonetic transcriptions are generated, and every form already validated by
   [phon_validate] is discarded together with the :sup / :neg forms. For what
   remains, [RuleGenerator.phon_generate_rules_entry] yields (key, rule) pairs;
   they are counted per key and rule, remembering a limited number of example
   lemmata, and written to [rules_filename] in @RULES sections. *)
let generate_rules rules path filename rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> process_interps
    (* mark_ndm is disabled here (FIXME: remove_ndm?) *)
    |> remove_exceptional_lemmata
    (* FIXME: lemmata handled through kolwiek_suffixes are not validated *)
    |> find_kolwiek_suffixes
    |> generate_stem
    |> fonetic_translation
    |> phon_validate rules
    |> remove_validated_forms
    (* FIXME *)
    |> remove_sup_neg_forms in
  let grouped = Xlist.fold dict StringMap.empty (fun grouped entry ->
    Xlist.fold (RuleGenerator.phon_generate_rules_entry entry) grouped (fun grouped (key,rule) ->
      let per_key = try StringMap.find grouped key with Not_found -> StringMap.empty in
      (* Count every occurrence; stop collecting example lemmata once the
         counter seen so far reaches 20. *)
      let per_key = StringMap.add_inc per_key rule (1,[entry.lemma]) (fun (q,l) ->
        (q+1, if q < 20 then entry.lemma :: l else l)) in
      StringMap.add grouped key per_key)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter grouped (fun interp per_key ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter per_key (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
|
|
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
|
(* Proposes new rules for the lexical-unit pairs of relation [id] that the
   existing [rules] do not validate.

   The pairs are loaded with [load_lu], lowercased, given a stem and validated;
   validated pairs are discarded. For the rest,
   [RuleGenerator.generate_rules_lu_entry] yields (key, rule, lemma) triples,
   which are counted per key and rule together with a limited number of example
   lemmata and written to [rules_filename] in @RULES sections. *)
let generate_rules_lu rules id path rules_filename =
  let dict =
    load_lu [] id path
    |> lowercase_lu
    |> generate_stem_lu
    |> validate_lu rules
    |> remove_validated_lu in
  (* remove_sup_neg_forms is not applied to lexical-unit pairs (disabled). *)
  let grouped = Xlist.fold dict StringMap.empty (fun grouped entry ->
    Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) grouped (fun grouped (key,rule,lemma) ->
      let per_key = try StringMap.find grouped key with Not_found -> StringMap.empty in
      let per_key = StringMap.add_inc per_key rule (1,[lemma]) (fun (q,l) ->
        (q+1, if q < 20 then lemma :: l else l)) in
      StringMap.add grouped key per_key)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter grouped (fun interp per_key ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter per_key (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
(* [get_first n l] returns the first [n] elements of [l], or the whole list when
   it has fewer than [n] elements. Only [n = 0] stops the walk, so a negative [n]
   also returns the whole list. *)
let rec get_first n = function
  | [] -> []
  | x :: rest ->
      if n = 0 then [] else x :: get_first (n-1) rest
|
|
742
743
|
(* Proposes interpretation rules for the forms that [interp_rules] cannot
   explain.

   The dictionary read from [path ^ filename] is merged, its interpretations are
   processed, exceptional lemmata are removed and phonetic transcriptions are
   added; forms validated by [validate_interp] are discarded. For every remaining
   form [RuleGenerator.phon_generate_interp_rules] yields (value, candidate)
   pairs. For each candidate the set of lemmata that produced it is collected;
   the candidates are sorted in descending order of (value, candidate, set) and
   written to [rules_filename] with the size of the set and up to 20 example
   lemmata. *)
let generate_interp_rules rules interp_rules selected_tags path filename rules_filename =
  let selected_tags = StringSet.of_list selected_tags in
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> process_interps
    (* mark_ndm is disabled here (FIXME: remove_ndm?) *)
    |> remove_exceptional_lemmata
    (* find_kolwiek_suffixes and generate_stem are disabled in this pipeline *)
    |> fonetic_translation
    |> validate_interp interp_rules
    |> remove_validated_forms in
  let found = Xlist.fold dict StringMap.empty (fun found entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms found (fun found form ->
      let candidates =
        RuleGenerator.phon_generate_interp_rules rules selected_tags simple_lemma form in
      Xlist.fold candidates found (fun found (v,cand) ->
        StringMap.add_inc found cand (v,StringSet.singleton entry.lemma) (fun (v,set) ->
          (v, StringSet.add set entry.lemma))))) in
  let ranked =
    StringMap.fold found [] (fun l k (v,set) -> (v,k,set) :: l)
    |> List.sort compare
    |> List.rev in
  File.file_out rules_filename (fun file ->
    Xlist.iter ranked (fun (v,k,set) ->
      let examples = get_first 20 (List.rev (StringSet.to_list set)) in
      fprintf file "\t%s\t# %d %s\n" k (StringSet.size set) (String.concat " " examples)))
|
|
767
|
|
|
768
|
(* let generate_ndm_rules dict =
|
|
769
770
771
772
773
|
let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
Xlist.fold entry.forms freq_rules (fun freq_rules form ->
let rule = {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in
Rules.RuleQMap.add freq_rules rule)) in
fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq ->
|
|
774
|
Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1)) *)
|
|
775
776
|
(* Counts how often each rule explains a dictionary form.

   The dictionary read from [path ^ filename] is merged, its interpretations are
   processed, the cond category and the exceptional lemmata are removed and stems
   are generated. For every form the rule tree is searched; among the candidates
   whose [stem ^ rule.set] equals the simplified lemma and whose interpretation
   equals that of the form, the one at the head of the accumulated list is
   counted in a [Rules.RuleQMap]. Each rule is then written to [rules_filename]
   with its [freq] field set to the count. *)
let generate_rule_frequencies rules path filename rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> process_interps
    |> remove_cat "cond"
    (* disabled ndm handling:
       let dict = mark_ndm dict in
       let freq_rules = generate_ndm_rules (remove_not_ndm dict) in
       let dict = remove_ndm dict in *)
    |> remove_exceptional_lemmata
    |> generate_stem in
  let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let matching =
        Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun acc (stem,rule) ->
          match stem ^ rule.set = simple_lemma && form.interp = rule.interp with
          | true -> (stem,rule) :: acc
          | false -> acc) in
      match matching with
      | [] -> freq_rules
      | (_,rule) :: _ -> Rules.RuleQMap.add freq_rules rule)) in
  File.file_out rules_filename (fun file ->
    Rules.RuleQMap.iter freq_rules (fun rule freq ->
      fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq=freq})))
|
|
797
798
799
800
801
802
|
(* Builds a stem dictionary.

   Frequency rules are loaded from [rules_filename] and compiled into a rule
   tree. The dictionary read from [path ^ filename] is merged, its
   interpretations are processed and the cond category is removed. For every form
   the rule tree is searched; among the candidates whose [stem ^ rule.set] equals
   the simplified lemma and whose interpretation equals that of the form, the one
   at the head of the accumulated list is taken and its rule id is recorded under
   the key made of the stem, a tab and the lemma suffix returned by
   [Stem.simplify_lemma_full]. The result is written to [out_filename], one key
   per line followed by a tab and the space-separated rule ids. *)
let generate_stem_dict rules_filename path filename out_filename =
  let rules = Rules.CharTrees.create (Rules.load_freq_rules rules_filename) in
  let dict =
    load_tab (path ^ filename)
    |> merge_entries
    |> process_interps
    |> remove_cat "cond" in
  (* mark_ndm is disabled here *)
  let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
    let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
    Xlist.fold entry.forms stems (fun stems form ->
      let matching =
        Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun acc (stem,rule) ->
          (* disabled ndm filter:
             if rule.star = Ndm && not entry.ndm then candidates else
             if rule.star <> Ndm && entry.ndm then candidates else *)
          match stem ^ rule.set = simple_lemma && form.interp = rule.interp with
          | true -> (stem,rule) :: acc
          | false -> acc) in
      match matching with
      | [] -> stems
      | (stem,rule) :: _ ->
          StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.id] (fun l -> rule.id :: l))) in
  File.file_out out_filename (fun file ->
    StringMap.iter stems (fun stem ids ->
      fprintf file "%s\t%s\n" stem (String.concat " " ids)))
|