Blame view

morphology/dict.ml 55.4 KB
Wojciech Jaworski authored
1
2
3
4
5
6
7
8
9
10
11
12
open Xstd
open Printf
open Types

let get_form e =
  match e.forms with
    [form] -> form
  | _ -> failwith "get_form"

let load_tab filename =
  File.load_tab filename (function
      orth :: lemma :: interp :: _ ->
Wojciech Jaworski authored
13
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
Wojciech Jaworski authored
14
15
16
17
18
    | line -> failwith ("load_tab: " ^ (String.concat "\t" line)))

let load_tab_full filename =
  File.load_tab filename (function
      [orth; lemma; interp] ->
Wojciech Jaworski authored
19
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
Wojciech Jaworski authored
20
    | [orth; lemma; interp; proper_type] ->
Wojciech Jaworski authored
21
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]; proper_type=proper_type}
Wojciech Jaworski authored
22
    | [orth; lemma; interp; proper_type; genre] ->
Wojciech Jaworski authored
23
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; genre=genre}]; proper_type=proper_type}
Wojciech Jaworski authored
24
25
    | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))
Wojciech Jaworski authored
26
27
28
29
30
31
32
let load_lu dict id path =
  let filename = path ^ "morf_rel_" ^ string_of_int id ^ "_lu.tab" in
  File.fold_tab filename dict (fun dict -> function
      [lemma1; lemma2] ->
        {lemma1=lemma1; lemma2=lemma2; rel_id=id; lu_stem=""; lu_validated=false;validated1=false;validated2=false} :: dict
    | line -> failwith ("load_lu: " ^ (String.concat "\t" line)))
Wojciech Jaworski authored
33
34
35
36
let pos = StringSet.of_list [
  "subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
  "brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
  "adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
Wojciech Jaworski authored
37
  "winien";"siebie";"numcomp"
Wojciech Jaworski authored
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
  ]

let rec find_pos rev = function
    s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l
  | [] -> failwith "find_pos"

let split_lemma_interp s =
  let l = Xstring.split_delim ":" s in
  let lemma,interp = find_pos [List.hd l] (List.tl l) in
  String.concat ":" lemma, String.concat ":" interp

let rec remove_empties = function
    "" :: l -> remove_empties l
  | l -> l

let split_freq_orth s =
  match remove_empties (Xstring.split " " s) with
Wojciech Jaworski authored
55
    freq :: l -> int_of_string freq, String.concat " " l
Wojciech Jaworski authored
56
57
58
59
60
61
62
  | _ -> failwith "split_freq_orth"

let load_freq_tab filename =
  File.load_tab filename (function
      [freq_orth; lemma_interp] ->
        let freq,orth = split_freq_orth freq_orth in
        let lemma,interp = split_lemma_interp lemma_interp  in
Wojciech Jaworski authored
63
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; freq=freq}]}
Wojciech Jaworski authored
64
65
66
    | line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line)))

let proper_type_selector e = e.proper_type
Wojciech Jaworski authored
67
let genre_selector e = (get_form e).genre
Wojciech Jaworski authored
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
let interp_selector e = (get_form e).interp
let freq_selector e = (get_form e).freq

let print_quantities out_filename selector dict =
  let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry ->
    StringQMap.add qmap (selector entry)) in
  File.file_out out_filename (fun file ->
    StringQMap.iter qmap (fun k v ->
      fprintf file "%6d\t%s\n" v k))

(**********************************************************************************)

let load_dict_as_set filename =
  let l = load_tab filename in
  List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]))))

let load_dict_as_set_full filename =
  let l = load_tab_full filename in
  List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]))))

let rec compare_dicts_rec file = function
    [],[] -> ()
  | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
  | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
  | a :: la, b :: lb ->
       if a = b then compare_dicts_rec file (la,lb) else
       if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
       (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))

let compare_dicts filename1 filename2 filename_out =
  let dict1 = load_dict_as_set filename1 in
  let dict2 = load_dict_as_set filename2 in
  File.file_out filename_out (fun file ->
    compare_dicts_rec file (dict1,dict2))

let compare_dicts_full filename1 filename2 filename_out =
  let dict1 = load_dict_as_set_full filename1 in
  let dict2 = load_dict_as_set_full filename2 in
  File.file_out filename_out (fun file ->
    compare_dicts_rec file (dict1,dict2))

(**********************************************************************************)

let find_entry_cat entry =
  let form = get_form entry in
  let cat,tags = match Xstring.split ":" form.interp with
      cat :: tags -> cat,tags
    | _ -> failwith ("find_entry_type: " ^ form.interp) in
Wojciech Jaworski authored
120
121
122
123
124
  if cat = "praet" then
    let t = match tags with
      _ :: _ :: t :: _ -> t
    | _ -> failwith ("find_entry_cat: " ^ form.interp) in
    if t = "pri" || t = "sec" || t = "ter" then "cond" else "verb" else
Wojciech Jaworski authored
125
126
127
128
129
130
131
  if cat = "subst" || cat = "depr" then "noun" else
  if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else
  if cat = "adv" then "adv" else
  if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
     cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else
  if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
     cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" ||
Wojciech Jaworski authored
132
     cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "numcomp" || cat = "" then "other" else
Wojciech Jaworski authored
133
134
135
136
137
138
139
  if cat = "cond" then "cond" else
  failwith ("find_entry_cat: " ^ cat)

let assign_entry_cat dict =
  Xlist.rev_map dict (fun entry ->
    {entry with cat = find_entry_cat entry})
Wojciech Jaworski authored
140
141
let split_dict in_path filename out_path =
  let dict = load_tab (in_path ^ filename) in
Wojciech Jaworski authored
142
  let dict = List.rev (assign_entry_cat dict) in
Wojciech Jaworski authored
143
144
145
146
147
148
149
  let filename = if Xstring.check_sufix ".gz" filename then
    Xstring.cut_sufix ".gz" filename else filename in
  File.file_out (out_path ^ "noun_" ^ filename) (fun noun_file ->
  File.file_out (out_path ^ "adj_" ^ filename) (fun adj_file ->
  File.file_out (out_path ^ "adv_" ^ filename) (fun adv_file ->
  File.file_out (out_path ^ "verb_" ^ filename) (fun verb_file ->
  File.file_out (out_path ^ "other_" ^ filename) (fun other_file ->
Wojciech Jaworski authored
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
  Xlist.iter dict (fun entry ->
    let form = get_form entry in
    try
      let file = match entry.cat with
          "noun" -> noun_file
        | "adj" -> adj_file
        | "adv" -> adv_file
        | "verb" -> verb_file
        | "other" -> other_file
        | "cond" -> raise Not_found
        | _ -> failwith "split_dict" in
      fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
    with Not_found -> ()))))))

let merge_entries dict =
  let dict = assign_entry_cat dict in
  let map = Xlist.fold dict StringMap.empty (fun map entry ->
    let form = get_form entry in
    let key =
Wojciech Jaworski authored
169
      if entry.cat = "noun" then
Wojciech Jaworski authored
170
171
172
173
        let gender = match Xstring.split ":" form.interp with
            ["depr";_;_;"m2"] -> "m1"
          | "depr" :: _ -> failwith ("merge_entries: " ^ form.interp)
          | [_;_;_;gender] -> gender
Wojciech Jaworski authored
174
          | [_;_;_;gender;col] -> gender ^ ":" ^ col
Wojciech Jaworski authored
175
176
177
178
179
180
181
182
183
          | _ -> failwith ("merge_entries: " ^ form.interp) in
        entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender
      else entry.lemma ^ "|" ^ entry.cat in
    StringMap.add_inc map key entry (fun e ->
      if entry.proper_type <> e.proper_type then
        failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else
      {e with forms = form :: e.forms})) in
  StringMap.fold map [] (fun dict _ e -> e :: dict)
Wojciech Jaworski authored
184
185
186
187
188
let remove_cat cat dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.cat = cat then dict
    else entry :: dict)
Wojciech Jaworski authored
189
190
191
192
193
194
195
196
let rec get_aspect lemma = function
    (f : form) :: l ->
      (match Xstring.split ":" f.interp with
        ["inf";a] -> a
      | ["ger";_;_;_;a;_] -> a
      | _ -> get_aspect lemma l)
  | [] -> failwith ("get_aspect: " ^ lemma)
Wojciech Jaworski authored
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
let get_lemma_suf lemma =
  let lemma_suf =
    if lemma = "" then "" else
    List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string (Stem.simplify_lemma lemma))) in
  match lemma_suf with
    "a" -> "a"
  | "e" -> "e"
  | "o" -> "o"
  | "y" -> "y"
  | "i" -> "y"
  | "ę" -> "ę"
  | _ -> "ε"

let get_orth_suf orth =
  let orth_suf =
    if orth = "" then "" else
    List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string orth)) in
  match orth_suf with
    "j" -> "j"
  | "e" -> "e"
  | _ -> "ε"

let merge_interps lemma forms =
  let lemma_suf = get_lemma_suf lemma in
Wojciech Jaworski authored
221
222
223
224
225
  let map = Xlist.fold forms StringMap.empty (fun map form ->
    (* printf "merge_interps 1: %s %s\n%!" form.orth form.interp; *)
    StringMap.add_inc map form.orth (StringSet.singleton form.interp) (fun set -> StringSet.add set form.interp)) in
  StringMap.fold map [] (fun forms orth set ->
    (* printf "merge_interps 2: %s %s\n%!" orth (String.concat " " (StringSet.to_list set)); *)
Wojciech Jaworski authored
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
    let orth_suf = get_orth_suf orth in
    match lemma_suf, Xlist.sort (StringSet.to_list set) compare with
      _,["adv"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms
    | _,["adv";"adv:pos"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";"adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms
    | _,["adj:sg:acc:m1.m2:pos";"adj:sg:gen:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:pos|adj:sg:acc:m1.m2:pos"} :: forms
    | _,["adj:sg:dat:f:pos";"adj:sg:gen:f:pos";"adj:sg:loc:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:pos"} :: forms
    | _,["adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:pos|adj:pl:acc:m1:pos"} :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:pos";"adj:sg:inst:m1.m2.m3.n:pos";"adj:sg:loc:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:pos|adj:pl:dat:m1.m2.m3.f.n:pos"} :: forms
    | _,["adj:sg:acc:f:pos";"adj:sg:inst:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:pos"} :: forms
    | _,["adj:pl:nom.voc:m1:pos";"adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos|adj:pl:nom.voc:m1:pos"} :: forms
    | _,["adj:sg:acc:m3:pos";"adj:sg:nom:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms
    | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos"} :: forms
    | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adja"} :: forms
    | _,["adj:pl:nom:m2.m3.f.n:pos";"adj:sg:nom:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:n:pos|adj:pl:nom:m2.m3.f.n:pos"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:sup";"adj:pl:nom.voc:m2.m3.f.n:sup";"adj:sg:acc:n:sup";"adj:sg:nom.voc:n:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:sup|adj:pl:nom.acc.voc:m2.m3.f.n:sup"} :: forms
    | _,["adj:sg:acc:m1.m2:sup";"adj:sg:gen:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:sup|adj:sg:acc:m1.m2:sup"} :: forms
    | _,["adj:sg:dat:f:sup";"adj:sg:gen:f:sup";"adj:sg:loc:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:sup"} :: forms
    | _,["adj:sg:acc:m3:sup";"adj:sg:nom.voc:m1.m2.m3:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:sup|adj:sg:acc:m3:sup"} :: forms
    | _,["adj:pl:acc:m1:sup";"adj:pl:gen:m1.m2.m3.f.n:sup";"adj:pl:loc:m1.m2.m3.f.n:sup"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:sup|adj:pl:acc:m1:sup"} :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:sup";"adj:sg:inst:m1.m2.m3.n:sup";"adj:sg:loc:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:sup|adj:pl:dat:m1.m2.m3.f.n:sup"} :: forms
    | _,["adj:sg:acc:f:sup";"adj:sg:inst:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:sup"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:com";"adj:pl:nom.voc:m2.m3.f.n:com";"adj:sg:acc:n:com";"adj:sg:nom.voc:n:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:com|adj:pl:nom.acc.voc:m2.m3.f.n:com"} :: forms
    | _,["adj:sg:acc:m1.m2:com";"adj:sg:gen:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:com|adj:sg:acc:m1.m2:com"} :: forms
    | _,["adj:sg:dat:f:com";"adj:sg:gen:f:com";"adj:sg:loc:f:com"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:com"} :: forms
    | _,["adj:sg:acc:m3:com";"adj:sg:nom.voc:m1.m2.m3:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:com|adj:sg:acc:m3:com"} :: forms
    | _,["adj:pl:acc:m1:com";"adj:pl:gen:m1.m2.m3.f.n:com";"adj:pl:loc:m1.m2.m3.f.n:com"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:com|adj:pl:acc:m1:com"} :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:com";"adj:sg:inst:m1.m2.m3.n:com";"adj:sg:loc:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:com|adj:pl:dat:m1.m2.m3.f.n:com"} :: forms
    | _,["adj:sg:acc:f:com";"adj:sg:inst:f:com"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:com"} :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
Wojciech Jaworski authored
258
259
260
261
       "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
       "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
       "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
       "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos"} :: forms
Wojciech Jaworski authored
262
    | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
Wojciech Jaworski authored
263
264
265
266
       "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
       "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
       "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
       "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos|adja"} :: forms
Wojciech Jaworski authored
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
    | _,["ger:pl:nom.acc:n:imperf.perf:aff";"ger:sg:gen:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:aff|ger:pl:nom.acc:n:imperf.perf:aff"} :: forms
    | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff";"ppas:sg:nom.acc.voc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:aff|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff"} :: forms
    | _,["ppas:sg:acc:m1.m2:imperf.perf:aff";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff|ppas:sg:acc:m1.m2:imperf.perf:aff"} :: forms
    | _,["ppas:sg:acc:m3:imperf.perf:aff";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff|ppas:sg:acc:m3:imperf.perf:aff"} :: forms
    | _,["ppas:pl:acc:m1:imperf.perf:aff";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff|ppas:pl:acc:m1:imperf.perf:aff"} :: forms
    | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff"} :: forms
    | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff";"pact:sg:nom.acc.voc:n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:aff|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff"} :: forms
    | _,["pact:sg:acc:m1.m2:imperf:aff";"pact:sg:gen:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:aff|pact:sg:acc:m1.m2:imperf:aff"} :: forms
    | _,["pact:pl:nom.voc:m1:imperf:aff";"pact:sg:acc:m3:imperf:aff";"pact:sg:nom.voc:m1.m2.m3:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:aff|pact:sg:acc:m3:imperf:aff|pact:pl:nom.voc:m1:imperf:aff"} :: forms
    | _,["pact:pl:acc:m1:imperf:aff";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff|pact:pl:acc:m1:imperf:aff"} :: forms
    | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:aff";"pact:sg:inst.loc:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:aff|pact:pl:dat:m1.m2.m3.f.n:imperf:aff"} :: forms
    | _,["ger:pl:nom.acc:n:imperf.perf:neg";"ger:sg:gen:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:neg|ger:pl:nom.acc:n:imperf.perf:neg"} :: forms
    | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg";"ppas:sg:nom.acc.voc:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:neg|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg"} :: forms
    | _,["ppas:sg:acc:m1.m2:imperf.perf:neg";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg|ppas:sg:acc:m1.m2:imperf.perf:neg"} :: forms
    | _,["ppas:sg:acc:m3:imperf.perf:neg";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg|ppas:sg:acc:m3:imperf.perf:neg"} :: forms
    | _,["ppas:pl:acc:m1:imperf.perf:neg";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg|ppas:pl:acc:m1:imperf.perf:neg"} :: forms
    | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg"} :: forms
    | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg";"pact:sg:nom.acc.voc:n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:neg|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg"} :: forms
    | _,["pact:sg:acc:m1.m2:imperf:neg";"pact:sg:gen:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:neg|pact:sg:acc:m1.m2:imperf:neg"} :: forms
    | _,["pact:pl:nom.voc:m1:imperf:neg";"pact:sg:acc:m3:imperf:neg";"pact:sg:nom.voc:m1.m2.m3:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:neg|pact:sg:acc:m3:imperf:neg|pact:pl:nom.voc:m1:imperf:neg"} :: forms
    | _,["pact:pl:acc:m1:imperf:neg";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg|pact:pl:acc:m1:imperf:neg"} :: forms
    | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:neg";"pact:sg:inst.loc:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:neg|pact:pl:dat:m1.m2.m3.f.n:imperf:neg"} :: forms
    | _,["ger:pl:gen:n:imperf.perf:aff";"inf:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="inf:imperf.perf"} :: forms
    | _,["praet:sg:m1.m2.m3:imperf.perf";"praet:sg:m1.m2.m3:imperf.perf:nagl"] -> {empty_form with orth=orth; interp="praet:sg:m1.m2.m3:imperf.perf:nagl"} :: forms
    | _,["fin:sg:ter:imperf.perf";"ger:sg:nom.acc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:sg:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ger:sg:nom.acc:n:imperf.perf:aff"} :: forms
    | _,["ger:pl:gen:n:imperf.perf:aff";"impt:sg:sec:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="impt:sg:sec:imperf.perf"} :: forms
    | _,["fin:pl:ter:imperf.perf";"ppas:sg:acc.inst:f:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:pl:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ppas:sg:acc.inst:f:imperf.perf:aff"} :: forms
    | "a",["subst:sg:dat.loc:f";"subst:sg:gen:f"] ->
          if orth_suf = "j" then {empty_form with orth=orth; interp="subst:sg:gen.dat.loc:f"} :: forms
          else {empty_form with orth=orth; interp="subst:sg:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: forms
    (* | "a",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f"] -> (*print_endline lemma;*) {empty_form with orth=orth; interp="subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: {empty_form with orth=orth; interp="subst:sg:gen:f"} :: forms *)
    | "a",["subst:pl:gen:f";"subst:pl:loc:f"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:f"} :: forms
    | "ε",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "ε",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
    | "ε",["subst:sg:loc:m1";"subst:sg:voc:m1"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
    | "ε",["subst:sg:loc:m2";"subst:sg:voc:m2"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m2"} :: forms
    | "ε",["subst:sg:loc:m3";"subst:sg:voc:m3"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: forms
    | "ε",["subst:sg:gen:m3";"subst:sg:loc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: {empty_form with orth=orth; interp="subst:sg:gen:m3"} :: forms
    | "ε",["subst:sg:acc:f";"subst:sg:nom:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc:f"} :: forms
    | "ε",["subst:pl:gen:f";"subst:pl:nom.acc.voc:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:pl:nom.acc.voc:f"} :: forms
    | "ε",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: forms
    | "y",["subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: forms
    | "y",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "y",["subst:pl:gen.acc:m1";"subst:pl:loc:m1"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m1"} :: forms
    | "y",["subst:pl:dat:m1";"subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1|subst:pl:dat:m1"} :: forms
    | "y",["subst:sg:nom:m2";"subst:sg:voc:m2"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m2"} :: forms
    | "y",["subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: forms
    | "y",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
    | "y",["subst:pl:gen:m2";"subst:pl:loc:m2"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m2"} :: forms
    | "y",["subst:pl:dat:m2";"subst:sg:inst:m2";"subst:sg:loc:m2"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m2|subst:pl:dat:m2"} :: forms
    | "y",["subst:sg:nom.acc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc.voc:m3"} :: forms
    | "y",["subst:pl:gen:m3";"subst:pl:loc:m3"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:m3"} :: forms
    | "y",["subst:pl:dat:m3";"subst:sg:inst:m3";"subst:sg:loc:m3"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m3|subst:pl:dat:m3"} :: forms
    | "y",["subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:nom:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.loc.voc:f"} :: forms
    | "e",["depr:pl:nom.acc.voc:m2";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
Wojciech Jaworski authored
322
    | "e",["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
Wojciech Jaworski authored
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
    | "e",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "e",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1"} :: forms
    | "e",["subst:pl:dat:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: {empty_form with orth=orth; interp="subst:pl:dat:n:ncol"} :: forms
    | "e",["subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: forms
    | "e",["subst:pl:gen:n:ncol";"subst:pl:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:n:ncol"} :: forms
    | "e",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms
    | "e",["subst:pl:nom.acc.voc:n:ncol";"subst:sg:gen:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:gen:n:ncol|subst:pl:nom.acc.voc:n:ncol"} :: forms
    | "o",["subst:sg:nom.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
    | "o",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "o",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: {empty_form with orth=orth; interp="depr:pl:nom.acc.voc:m2"} :: forms
    | "o",["subst:sg:dat.loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:dat:m1"} :: {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: forms
    | "o",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
    | "o",["subst:pl:dat:m1:pt";"subst:pl:loc:m1:pt"] -> {empty_form with orth=orth; interp="subst:pl:dat.loc:m1:pt"} :: forms
    | "ε",["subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms
    | "ε",["subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol";"subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms
    | "ę",["subst:sg:dat:n:col";"subst:sg:loc:n:col"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:col"} :: forms
    | "ę",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms
    | "o",["subst:sg:loc:m1";"subst:sg:voc:m1"] ->
          if orth_suf = "e" then {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
          else {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
Wojciech Jaworski authored
343
    | _,["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
Wojciech Jaworski authored
344
345
    | _,[interp] -> {empty_form with orth=orth; interp=interp} :: forms
    | _,interps ->
Wojciech Jaworski authored
346
        (* print_endline ("merge_interps: " (*^ lemma_suf*) ^ " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *)
Wojciech Jaworski authored
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
        Xlist.fold interps forms (fun forms interp ->
          {empty_form with orth=orth; interp=interp} :: forms))

let process_interps dict =
  Xlist.rev_map dict (fun entry ->
    if entry.cat = "verb" then
      let aspect = get_aspect entry.lemma entry.forms in
      let forms = Xlist.rev_map entry.forms (fun f ->
        let interp = match (Xstring.split ":" f.interp) with
            ["fin";n;p;_] -> String.concat ":" ["fin";n;p;"imperf.perf"]
          | ["impt";n;p;_] -> String.concat ":" ["impt";n;p;"imperf.perf"]
          | "pcon" :: _ -> f.interp
          | "pacta" :: _ -> f.interp
          | "pact" :: _ -> f.interp
          | ["ger";n;c;g;_;a] -> String.concat ":" ["ger";n;c;g;"imperf.perf";a]
          | ["praet";n;g;_] -> String.concat ":" ["praet";n;g;"imperf.perf"]
          | ["praet";n;g;_;a] -> String.concat ":" ["praet";n;g;"imperf.perf";a]
          | ["inf";_] -> String.concat ":" ["inf";"imperf.perf"]
          | ["pant";_] -> String.concat ":" ["pant";"imperf.perf"]
          | ["imps";_] -> String.concat ":" ["imps";"imperf.perf"]
          | ["ppas";n;c;g;_;a] -> String.concat ":" ["ppas";n;c;g;"imperf.perf";a]
          | _ -> print_endline ("merge_interps: " ^ f.interp); f.interp in
        {f with interp=interp}) in
Wojciech Jaworski authored
370
      let forms = merge_interps entry.lemma forms in
Wojciech Jaworski authored
371
      {entry with aspect=aspect; forms=forms} else
Wojciech Jaworski authored
372
    {entry with forms=merge_interps entry.lemma entry.forms})
Wojciech Jaworski authored
373
Wojciech Jaworski authored
374
375
(**********************************************************************************)
Wojciech Jaworski authored
376
(*let mark_ndm dict =
Wojciech Jaworski authored
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
  Xlist.fold dict [] (fun dict entry ->
    if entry.cat <> "noun" &&  entry.cat <> "adj" then entry :: dict else
    let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
      StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
    let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
      StringSet.fold orths qmap StringQMap.add) in
    let n = StringMap.size map in
    let found = StringQMap.fold qmap [] (fun found orth v ->
      if v = n then orth :: found else found) in
    match found with
      [] -> entry :: dict
    | [orth] ->
        let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
          if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
        let dict = {entry with forms=odm} :: dict in
        {entry with forms=ndm; ndm=true} :: dict
    | _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))

let print_ndm filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      if entry.ndm then
        let orth = (List.hd entry.forms).orth in
        fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))
Wojciech Jaworski authored
402
403
404
405
406
407
408
409
let remove_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.ndm then dict
    else entry :: dict)

let remove_not_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if not entry.ndm then dict
Wojciech Jaworski authored
410
    else entry :: dict)*)
Wojciech Jaworski authored
411
412
Wojciech Jaworski authored
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
let kolwiek_lemmas = StringSet.of_list [
  (* adj *)
  "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
  "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
  "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
  (* noun *)
  "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
  "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
  "ktoś:s2"; "któż";
  (* adv *)
  "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]

let kolwiek_suffixes = [
  "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]

let find_kolwiek_suffixes dict =
  Xlist.rev_map dict (fun entry ->
    if StringSet.mem kolwiek_lemmas entry.lemma then
      {entry with forms=Xlist.map entry.forms (fun form ->
          {form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf ->
            if Xstring.check_sufix kolwiek_suf orth then
              Xstring.cut_sufix kolwiek_suf orth
            else orth)})}
    else entry)

let exceptional_lemmata = StringSet.of_list ([
Wojciech Jaworski authored
439
440
441
442
443
444
445
446
447
448
449
450
451
452
  (* wiele stemów *)
  "Apollo"; "Aujeszky"; "Białystok"; "Gózd"; "Krasnystaw"; "Różanystok"; "Wielkanoc"; "białagłowa";
  "deszcz"; "imćpan"; "iściec"; "otrząs"; "rzeczpospolita"; "wilczełyko"; "woleoczko";

  "prapraojciec"; "praojciec"; "ojciec"; "współbrat"; "spółbrat"; "półbrat"; "brat";
  "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; "człowiek";
  "półdziecko"; "+lecie"; "zimoziele"; "ziele"; "trójziele"; "nasienie";
  "ksiądz"; "dech"; "tydzień"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest";
  "dziecko"; "ucho:s2"; "oko:s2"; "cześć:s"; "jo-jo"; "Zabłotce"; "tysiąc:s1"; "półmiesiąc"; "miesiąc"; ""; ""; "";
  "Pia"; "ręka"; "człek"; "Kozak:s1"; "bóg"; "psubrat"; "pieniądz"; ""; ""; ""; "";
  "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
  "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
  "niebiosa"; "Włochy:s1"; "Niemcy"; "Węgry"; "Austro-Węgry"; ""; ""; ""; ""; ""; ""; "";
  "zając:s1"; "tysiąc:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; "";
Wojciech Jaworski authored
453
454
(*  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";*)
Wojciech Jaworski authored
455
456
457
458
459
460
461
462
463
464
465
466
  "bliscy"; "ojcowie"; "teściowie"; "ichmościowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; "";
  "małżonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; "";
  "starzy"; "wasi"; "nasi"; "najmłodsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "głównia"; "głownia"; "dźwignia"; ""; ""; ""; "";
  "workowiśnia"; "wiśnia"; "sośnia"; "laurowiśnia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; "";
  "makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; "";
  "Uj"; "PIT"; "ChAT"; "podczłowiek"; "nieczłowiek"; "cześć"; "ktoś"; "ktosik"; ""; ""; ""; "";
  "+ówna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coś"; "cośtam"; ""; ""; "";
  "zając"; "tysiąc"; "rok"; "roczek"; "oko"; "ucho"; "Włochy"; "niebiosy"; "wici"; ""; ""; "";
  "André"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";

  "zły:a"; "dobry:a"; "przymały"; "mały:a"; "duży"; "wielki:a";
Wojciech Jaworski authored
467
  "ppoż."; "ppanc."; "pepanc."; "midi:a"; ""; ""; ""; ""; ""; ""; ""; "";
Wojciech Jaworski authored
468
469
470
471
  "zwać"; "wiedzieć"; "pójść"; "przejść"; "dojść"; "zsiąść"; "iść"; ""; ""; ""; ""; "";
  "być"; "zasłonić"; "słonić"; "przysłonić"; "przesłonić"; "osłonić"; "odsłonić"; ""; ""; ""; ""; "";

  (*
Wojciech Jaworski authored
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
  (* błąd w słowniku *)
  "ówże";
  (* wiele stemów *)
  "twój:a"; "swój"; "mój:a"; "wszystek";
  (* oboczności w stemie *)
  "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
  "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
  "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
  "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
  "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
  "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
  "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
  "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
  "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
  "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
  "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
  "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
  (* oboczności w odmianie *)
  "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
  "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
Wojciech Jaworski authored
496
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
Wojciech Jaworski authored
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
  (* pozostawione *)
  "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
  "Pia"; "jo-jo"; "+lecie"; "";
  "zagrząźć"; "zrzeć";
  (* niepełny paradygmat *)
  "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
  "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
  "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
  "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
  "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
  (* wiele stemów *)
  "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
  "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
  "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
  (* inne *)
  "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
  "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
  "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
  "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
(*
   "moi"; "twoi";
  (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
  (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
  "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
  "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
  "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
  "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
  "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
  "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
  "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
  "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
  "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
  "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
Wojciech Jaworski authored
544
  ] @ File.load_lines "../morphology/data/obce.tab" @ File.load_lines "../morphology/data/akronimy.tab" @
Wojciech Jaworski authored
545
546
    (* File.load_lines "results/interp_validated_verb.tab" @  *)
    (*File.load_lines "results/interp_validated_noun.tab" @ File.load_lines "results/interp_validated_adj.tab" @
Wojciech Jaworski authored
547
    File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @
Wojciech Jaworski authored
548
    File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *) [])
Wojciech Jaworski authored
549
550
551
552
553
554

let remove_exceptional_lemmata dict =
  Xlist.fold dict [] (fun dict entry ->
    if StringSet.mem exceptional_lemmata entry.lemma then dict
    else entry :: dict)
Wojciech Jaworski authored
555
556
557
558
559
let remove_exceptional_lemmata_gen ex dict =
  Xlist.fold dict [] (fun dict entry ->
    if StringSet.mem ex entry.lemma then dict
    else entry :: dict)
Wojciech Jaworski authored
560
561
562
let generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    {entry with stem=
Wojciech Jaworski authored
563
      (* if entry.ndm then (List.hd entry.forms).orth else *)
Wojciech Jaworski authored
564
565
566
567
      if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
        Stem.generate_stem entry
      else ""})
Wojciech Jaworski authored
568
(*let phon_generate_stem dict =
Wojciech Jaworski authored
569
570
571
572
573
  Xlist.rev_map dict (fun entry ->
    {entry with phon_stem=
      if entry.ndm then (List.hd entry.forms).phon_orth else
      if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
        Stem.phon_generate_stem entry
Wojciech Jaworski authored
574
      else []})*)
Wojciech Jaworski authored
575
Wojciech Jaworski authored
576
577
578
579
580
581
582
583
584
585
let generate_stem_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with lu_stem=Stem.generate_stem_lu entry.lemma1 entry.lemma2})

let lowercase_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with
      lemma1=Xunicode.lowercase_utf8_string entry.lemma1;
      lemma2=Xunicode.lowercase_utf8_string entry.lemma2})
Wojciech Jaworski authored
586
Wojciech Jaworski authored
587
588
589
let fonetic_translation dict =
  Xlist.fold dict [] (fun dict e ->
    try
Wojciech Jaworski authored
590
591
592
593
      let lemma = Stem.simplify_lemma e.lemma in
      let phon_lemma = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules lemma in
      let phon_stem = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules e.stem in
      {e with phon_lemma = phon_lemma; phon_stem=phon_stem;
Wojciech Jaworski authored
594
        forms = Xlist.map e.forms (fun f ->
Wojciech Jaworski authored
595
596
597
598
599
600
601
          let phon_orth = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules f.orth in
          {f with phon_orth = phon_orth})} :: dict
    with
      Fonetics.NotFound(x,s) -> printf "NF %s %s %s\n%!" e.lemma x s; dict
    | Fonetics.NotEqual(x,s,t) -> printf "NE %s %s %s %s\n%!" e.lemma x s t; dict
    | Fonetics.MulipleSolutions(x,s,l) -> printf "MS %s %s %s: %s\n%!" e.lemma x s (String.concat " " l); dict
    | _ -> dict)
Wojciech Jaworski authored
602
Wojciech Jaworski authored
603
let validate rules dict =
Wojciech Jaworski authored
604
605
606
607
608
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
Wojciech Jaworski authored
609
        if stem ^ rule.set = simple_lemma then (stem,rule) :: candidates else candidates) in
Wojciech Jaworski authored
610
611
612
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})
Wojciech Jaworski authored
613
614
615
616
617
618
619
620
621
622
623
let phon_validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Xlist.fold form.phon_orth [] (fun candidates s ->
        Xlist.fold (Rules.CharTrees.find rules s) candidates (fun candidates (stem,rule) ->
          let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          if candidate_lemma = simple_lemma then (stem,rule) :: candidates else candidates)) in
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})
Wojciech Jaworski authored
624
625
626
627
628
629
630
631
632
633
634
635
636
let validate_lu rules dict =
  Xlist.rev_map dict (fun entry ->
    let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
    let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
    let b = Xlist.fold candidates1 false (fun b (stem1,rule1) ->
      Xlist.fold candidates2 b (fun b (stem2,rule2) ->
        (* Printf.printf "%s %s %s %s\n%!" stem1 stem2 (string_of_rule rule1) (string_of_rule rule1); *)
        if stem1 ^ rule1.set = stem2 ^ rule2.set then true else b)) in
    (* if b then print_endline "validated"; *)
    let b1 = Xlist.fold candidates1 false (fun b (stem1,rule1) -> if stem1 = entry.lu_stem then true else b) in
    let b2 = Xlist.fold candidates2 false (fun b (stem2,rule2) -> if stem2 = entry.lu_stem then true else b) in
    {entry with lu_validated=b; validated1=b1; validated2=b2})
Wojciech Jaworski authored
637
let validate_interp rules dict =
Wojciech Jaworski authored
638
639
640
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
Wojciech Jaworski authored
641
642
      let candidates = Xlist.fold form.phon_orth [] (fun candidates s ->
        Xlist.fold (Rules.CharTrees.find rules s) candidates (fun candidates (stem,rule) ->
Wojciech Jaworski authored
643
644
          (* if rule.star = Ndm && not entry.ndm then candidates else
          if rule.star <> Ndm && entry.ndm then candidates else *)
Wojciech Jaworski authored
645
646
647
          let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          if candidate_lemma = simple_lemma && form.interp = rule.interp then
            (stem,rule) :: candidates else candidates)) in
Wojciech Jaworski authored
648
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
Wojciech Jaworski authored
649
650
651
652
653
654
655
656
657
658
659
660
661
662
    {entry with forms=forms})

let remove_validated_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then forms else form :: forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let remove_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then forms else form :: forms) in
    if forms = [] then dict else entry :: dict)
Wojciech Jaworski authored
663
664
665
666
let remove_validated_lu dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.lu_validated then dict else entry :: dict)
Wojciech Jaworski authored
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
let remove_not_validated_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then form :: forms else forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let remove_not_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then form :: forms else forms) in
    if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict)

let print filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      Xlist.iter entry.forms (fun form ->
        fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))

let print_lemmata filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      fprintf file "%s\n" entry.lemma))
Wojciech Jaworski authored
689
690
691
692
693
694
695
696

let remove_sup_neg_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if Xstring.check_sufix ":neg" form.interp || Xstring.check_sufix ":sup" form.interp then
        forms else form :: forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)
Wojciech Jaworski authored
697
let generate_rules rules path filename rules_filename =
Wojciech Jaworski authored
698
699
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
Wojciech Jaworski authored
700
  let dict = process_interps dict in
Wojciech Jaworski authored
701
  (* let dict = mark_ndm dict in (* FIXME: remove_ndm? *) *)
Wojciech Jaworski authored
702
  let dict = remove_exceptional_lemmata dict in
Wojciech Jaworski authored
703
704
  let dict = find_kolwiek_suffixes dict in (* FIXME: lematy z kolwiek_suffixes nie są walidowane *)
  let dict = generate_stem dict in
Wojciech Jaworski authored
705
  let dict = fonetic_translation dict in
Wojciech Jaworski authored
706
  let dict = phon_validate rules dict in
Wojciech Jaworski authored
707
  let dict = remove_validated_forms dict in
Wojciech Jaworski authored
708
  let dict = remove_sup_neg_forms dict in (* FIXME *)
Wojciech Jaworski authored
709
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
Wojciech Jaworski authored
710
    Xlist.fold (RuleGenerator.phon_generate_rules_entry entry) rules (fun rules (key,rule) ->
Wojciech Jaworski authored
711
712
713
714
715
716
717
718
719
      let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
      let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
      StringMap.add rules key rules2)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
Wojciech Jaworski authored
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
let generate_rules_lu rules id path rules_filename =
  let dict = load_lu [] id path in
  let dict = lowercase_lu dict in
  let dict = generate_stem_lu dict in
  let dict = validate_lu rules dict in
  let dict = remove_validated_lu dict in
  (* let dict = remove_sup_neg_forms dict in *)
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) rules (fun rules (key,rule,lemma) ->
      let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
      let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
      StringMap.add rules key rules2)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))

let rec get_first n l =
  if n = 0 || l = [] then [] else
  List.hd l :: (get_first (n-1) (List.tl l))
Wojciech Jaworski authored
742
743
let generate_interp_rules rules interp_rules selected_tags path filename rules_filename =
  let selected_tags = StringSet.of_list selected_tags in
Wojciech Jaworski authored
744
745
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
Wojciech Jaworski authored
746
  let dict = process_interps dict in
Wojciech Jaworski authored
747
  (* let dict = mark_ndm dict in (* FIXME: remove_ndm? *) *)
Wojciech Jaworski authored
748
  let dict = remove_exceptional_lemmata dict in
Wojciech Jaworski authored
749
750
751
  (* let dict = find_kolwiek_suffixes dict in *)
  (* let dict = generate_stem dict in *)
  let dict = fonetic_translation dict in
Wojciech Jaworski authored
752
  let dict = validate_interp interp_rules dict in
Wojciech Jaworski authored
753
754
755
756
  let dict = remove_validated_forms dict in
  let interp_rules = Xlist.fold dict StringMap.empty (fun interp_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms interp_rules (fun interp_rules form ->
Wojciech Jaworski authored
757
758
      let candidates = RuleGenerator.phon_generate_interp_rules rules selected_tags simple_lemma form in
      Xlist.fold candidates interp_rules (fun interp_rules (v,cand) ->
Wojciech Jaworski authored
759
        (* StringMap.add_inc interp_rules cand (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l)))) in *)
Wojciech Jaworski authored
760
761
762
        StringMap.add_inc interp_rules cand (v,StringSet.singleton entry.lemma) (fun (v,set) -> v,StringSet.add set entry.lemma)))) in
  let interp_rules = List.rev (List.sort compare (StringMap.fold interp_rules [] (fun l k (v,set) ->
    (v,k,set) :: l))) in
Wojciech Jaworski authored
763
  File.file_out rules_filename (fun file ->
Wojciech Jaworski authored
764
    Xlist.iter interp_rules (fun (v,k,set)(*q,l*) ->
Wojciech Jaworski authored
765
      (* fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) *)
Wojciech Jaworski authored
766
      (*if StringSet.size set > 1000 then*) fprintf file "\t%s\t# %d %s\n" k (StringSet.size set) (String.concat " " (get_first 20 (List.rev (StringSet.to_list set))))))
Wojciech Jaworski authored
767
Wojciech Jaworski authored
768
(* let generate_ndm_rules dict =
Wojciech Jaworski authored
769
770
771
772
773
  let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let rule = {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in
      Rules.RuleQMap.add freq_rules rule)) in
   fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq ->
Wojciech Jaworski authored
774
     Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1)) *)
Wojciech Jaworski authored
775
776

let generate_rule_frequencies rules path filename rules_filename =
Wojciech Jaworski authored
777
778
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
Wojciech Jaworski authored
779
  let dict = process_interps dict in
Wojciech Jaworski authored
780
  let dict = remove_cat "cond" dict in
Wojciech Jaworski authored
781
  (* let dict = mark_ndm dict in
Wojciech Jaworski authored
782
  let freq_rules = generate_ndm_rules (remove_not_ndm dict) in
Wojciech Jaworski authored
783
  let dict = remove_ndm dict in *)
Wojciech Jaworski authored
784
785
  let dict = remove_exceptional_lemmata dict in
  let dict = generate_stem dict in
Wojciech Jaworski authored
786
  let freq_rules = Xlist.fold dict Rules.RuleQMap.empty(*freq_rules*) (fun freq_rules entry ->
Wojciech Jaworski authored
787
788
789
790
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
Wojciech Jaworski authored
791
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
Wojciech Jaworski authored
792
793
794
795
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in
  File.file_out rules_filename (fun file ->
    Rules.RuleQMap.iter freq_rules (fun rule freq ->
Wojciech Jaworski authored
796
      fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq=freq})))
Wojciech Jaworski authored
797
798
799
800
801
802

let generate_stem_dict rules_filename path filename out_filename =
  let rules = Rules.load_freq_rules rules_filename in
  let rules = Rules.CharTrees.create rules in
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
Wojciech Jaworski authored
803
  let dict = process_interps dict in
Wojciech Jaworski authored
804
  let dict = remove_cat "cond" dict in
Wojciech Jaworski authored
805
  (* let dict = mark_ndm dict in *)
Wojciech Jaworski authored
806
807
808
809
810
  let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
    let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
    Xlist.fold entry.forms stems (fun stems form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
Wojciech Jaworski authored
811
812
        (* if rule.star = Ndm && not entry.ndm then candidates else
        if rule.star <> Ndm && entry.ndm then candidates else *)
Wojciech Jaworski authored
813
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
Wojciech Jaworski authored
814
815
816
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then stems else
      let stem,rule = List.hd candidates in
Wojciech Jaworski authored
817
      StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.id] (fun l -> rule.id :: l))) in
Wojciech Jaworski authored
818
819
820
  File.file_out out_filename (fun file ->
    StringMap.iter stems (fun stem ids ->
      fprintf file "%s\t%s\n" stem (String.concat " " ids)))