Commit aadb59c0639f7698b8020ad96ab4ed7d8ae02100
1 parent
54ddc22b
dodanie guessera
Showing
6 changed files
with
2891 additions
and
0 deletions
guesser/generate.ml
0 → 100644
1 | +open Xstd | |
2 | +open Printf | |
3 | + | |
(* Root directories holding the language resources ("zasoby") on the two
   development machines; the matching one is selected by host name below. *)
let nexus_path = "/home/yacheu/Dokumenty/Badania/Jezyk i Umysl/Przetwarzanie Języka Naturalnego/zasoby/"
let toshiba_ub_path = "/home/wjaworski/Dokumenty/zasoby/"
6 | + | |
(** Return this machine's host name by running [uname -n].
    Fix: the original never closed the process channel, leaking a file
    descriptor and a zombie child per call; we now reap the child with
    [Unix.close_process_in] on both the success and failure paths. *)
let get_host_name () =
  let chan = Unix.open_process_in "uname -n" in
  let name =
    try input_line chan
    with e ->
      (* Still reap the child if the pipe yields no line. *)
      ignore (Unix.close_process_in chan);
      raise e
  in
  ignore (Unix.close_process_in chan);
  name
10 | + | |
(* Resource directory for the current machine, chosen at module
   initialization time; add a new arm here when deploying elsewhere. *)
let zasoby_path =
  match get_host_name () with
    "nexus" -> nexus_path
  | "toshiba-UB" -> toshiba_ub_path
(* | "mozart" -> "." *)
  | s -> failwith ("unknown host: " ^ s)
17 | + | |
(* Morfeusz dictionary directory and the two dictionary snapshots
   (SGJP and PoliMorf, 2015-10-20 vs 2016-05-08) compared below. *)
let morfeusz_path = zasoby_path ^ "morfeusz/"
let sgjp_filename2015 = "sgjp-20151020.tab"
let polimorf_filename2015 = "polimorf-20151020.tab"
let sgjp_filename = "sgjp-20160508.tab"
let polimorf_filename = "polimorf-20160508.tab"
23 | + | |
24 | +(**********************************************************************************) | |
25 | + | |
(* Ordered-type argument for the Xmap functor below: string lists are
   compared order-insensitively (both operands are sorted before the
   polymorphic compare, so element order within a list is irrelevant). *)
module OrderedStringList = struct

  type t = string list

  let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)

end
33 | + | |
(* Same idea for lists of string lists: only the OUTER list is sorted
   before comparison; the inner lists are compared as-is. *)
module OrderedStringListList = struct

  type t = string list list

  let compare x y = compare (Xlist.sort x compare) (Xlist.sort y compare)

end
41 | + | |
(* Maps/sets keyed by the order-insensitive list types above. *)
module StringListMap = Xmap.Make(OrderedStringList)
module StringListListMap = Xmap.Make(OrderedStringListList)
module StringListListSet = Xset.Make(OrderedStringListList)

(* Trie over tag groups: an inner node [T] branches on the next tag
   group; a leaf [S] holds the set of remaining single tags. *)
type tree = T of tree StringListMap.t | S of StringSet.t
47 | + | |
(** [single_tags interp] is [true] exactly when the head of [interp]
    is a one-element list (i.e. the first interpretation is down to a
    single tag group). An empty [interp] yields [false]. *)
let single_tags interp =
  match interp with
  | [_] :: _ -> true
  | _ -> false
51 | + | |
(* Build a [tree] from a list of interpretations (each interpretation
   is a list of tag groups): once every interpretation is reduced to a
   single-tag group, flatten them all into one leaf set; otherwise
   branch on the first tag group and recurse on the tails. *)
let rec make_tree interp =
  if single_tags interp then S (StringSet.of_list (List.flatten (List.flatten interp))) else
  let map = Xlist.fold interp StringListMap.empty (fun map tags ->
    StringListMap.add_inc map (List.hd tags) [List.tl tags] (fun l -> (List.tl tags) :: l)) in
  T(StringListMap.map map make_tree)
57 | + | |
(* True when at least one value in [map] is a leaf ([S _]).
   NOTE(review): a map mixing [S] and [T] values also yields [true]. *)
let is_s_tree map =
  StringListListMap.fold map false (fun b _ -> function
      S _ -> true
    | T _ -> b)
62 | + | |
(* Depth-first fold over a [tree]: [f] receives the accumulator, the
   path of tag groups from the root (in root-to-leaf order) and the
   leaf's tag set. [rev] carries the path reversed during descent. *)
let rec fold_tree_rec rev s f = function
    S set -> f s (List.rev rev) set
  | T map -> StringListMap.fold map s (fun s tag tree ->
      fold_tree_rec (tag :: rev) s f tree)

(* Fold [f] over every root-to-leaf path of [tree], seeded with [s]. *)
let fold_tree tree s f = fold_tree_rec [] s f tree
69 | + | |
(* Iteratively collapse a map from suffix tag-lists to trees into fully
   combined interpretations: when every tree is a leaf, emit the
   (sorted leaf :: suffix) lists; otherwise peel one level off every
   tree, regroup the tails by the extended suffix, rebuild trees from
   the tails and recurse. *)
let rec combine_interps_rec map =
  if is_s_tree map then
    StringListListMap.fold map [] (fun interp tail_tags -> function
        S tag -> ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) :: interp
      | _ -> failwith "combine_interps_rec")
  else
    let map = StringListListMap.fold map StringListListMap.empty (fun map tail_tags tree ->
      fold_tree tree map (fun map head_tags tag ->
        StringListListMap.add_inc map ((Xlist.sort (StringSet.to_list tag) compare) :: tail_tags) [head_tags] (fun l -> head_tags :: l))) in
    combine_interps_rec (StringListListMap.map map make_tree)
80 | + | |
(* Compress a list of interpretations by factoring shared tag groups:
   seed the worklist with the whole [interp] tree under an empty suffix. *)
let combine_interp interp =
  let map = StringListListMap.add StringListListMap.empty [] (make_tree interp) in
  combine_interps_rec map
84 | + | |
(* Parts of speech whose interpretations are factored by [combine_interp];
   interpretations of other categories are only deduplicated. *)
let combine_pos = StringSet.of_list ["subst"; "depr"; "ppron12"; "ppron3"; "siebie"; "adj"; "num"; "ger"; "praet"; "fin"; "impt"; "imps"; "pcon"; "ppas"; "pact";
  "inf"; "bedzie"; "aglt"; "winien"; "pant"; "prep"]
87 | + | |
(* Canonical rank of each tag value within its tag group (number, case,
   gender, degree); used by [tag_compare] to order tags for display. *)
let tag_map = Xlist.fold [
  "sg",1;
  "pl",2;
  "nom",1;
  "gen",2;
  "dat",3;
  "acc",4;
  "inst",5;
  "loc",6;
  "voc",7;
  "m1",1;
  "m2",2;
  "m3",3;
  "n1",4;
  "n2",5;
  "f",6;
  "p1",7;
  "p2",8;
  "p3",9;
  "pos",1;
  "com",2;
  "sup",3
  ] StringMap.empty (fun map (k,v) -> StringMap.add map k v)
111 | + | |
(** Order two morphological tags by their canonical rank in [tag_map].
    Raises [Failure] when either tag has no assigned rank. *)
let tag_compare x y =
  match StringMap.find tag_map x, StringMap.find tag_map y with
  | rank_x, rank_y -> compare rank_x rank_y
  | exception Not_found -> failwith ("tag_compare: " ^ x ^ " " ^ y)
115 | + | |
(* Parse a list of interpretation strings (colon-separated tag groups,
   each group a dot-separated alternative set), group them by part of
   speech, sanity-check the expected group count per category, factor
   the interpretations of categories in [combine_pos], and render the
   result as a single '|'-joined, canonically sorted string.
   NOTE(review): only subst/depr (3 groups), adj (4) and adja (0) pass
   the arity check here; any other category is a hard failure. *)
let combine_interps interps =
  let map = Xlist.fold interps StringMap.empty (fun map interp ->
    match Xlist.map (Str.split (Str.regexp ":") interp) (Str.split (Str.regexp "\\.")) with
      [cat] :: tags -> StringMap.add_inc map cat [tags] (fun l -> tags :: l)
    | _ -> failwith "combine_interps") in
  let map = StringMap.mapi map (fun cat interp ->
    Xlist.map interp (fun tags ->
      match cat,Xlist.size tags with
        "subst",3 -> tags
      | "depr",3 -> tags
      | "adj",4 -> tags
      | "adja",0 -> tags
      | _ -> failwith ("combine_interps: " ^ cat))) in
  let l = StringMap.fold map [] (fun l cat interp ->
    let interp = if StringSet.mem combine_pos cat then combine_interp interp else
      StringListListSet.to_list (StringListListSet.of_list interp) in
    (Xlist.map interp (fun tags -> [cat] :: tags)) @ l) in
  String.concat "|" (Xlist.map l (fun tags ->
    String.concat ":" (Xlist.map tags (fun l ->
      String.concat "." (Xlist.sort l tag_compare)))))
136 | + | |
137 | +(**********************************************************************************) | |
138 | + | |
(** Load a tab-separated dictionary file, keeping the first three
    columns of each row as (orth, lemma, interp). Extra columns are
    ignored; rows with fewer than three columns are fatal. *)
let load_tab filename =
  File.load_tab filename (fun fields ->
    match fields with
    | orth :: lemma :: interp :: _ -> orth, lemma, interp
    | _ -> failwith ("load_tab: " ^ String.concat "\t" fields))
150 | + | |
(** Load a tab-separated dictionary file keeping up to five columns:
    (orth, lemma, interp, class, class2); missing trailing columns
    default to "". Rows with fewer than three or more than five
    columns are fatal. *)
let load_tab_full filename =
  File.load_tab filename (fun fields ->
    match fields with
    | [orth; lemma; interp] -> orth, lemma, interp, "", ""
    | [orth; lemma; interp; cl] -> orth, lemma, interp, cl, ""
    | [orth; lemma; interp; cl; cl2] -> orth, lemma, interp, cl, cl2
    | _ -> failwith ("load_tab_full: " ^ String.concat "\t" fields))
166 | + | |
(** Load a dictionary and return its rows re-joined with tabs,
    deduplicated and sorted — the canonical form used for diffing. *)
let load_dict_as_set filename =
  let rows = load_tab filename in
  let uniq =
    Xlist.fold rows StringSet.empty (fun acc (orth, lemma, interp) ->
      StringSet.add acc (String.concat "\t" [orth; lemma; interp]))
  in
  List.sort compare (StringSet.to_list uniq)
171 | + | |
(** Five-column variant of [load_dict_as_set]: deduplicated, sorted,
    tab-joined rows including both classification columns. *)
let load_dict_as_set_full filename =
  let rows = load_tab_full filename in
  let uniq =
    Xlist.fold rows StringSet.empty (fun acc (orth, lemma, interp, cl, cl2) ->
      StringSet.add acc (String.concat "\t" [orth; lemma; interp; cl; cl2]))
  in
  List.sort compare (StringSet.to_list uniq)
176 | + | |
(** Merge-walk two sorted string lists and write a unix-diff-style
    report to [file]: "< x" for lines only in the first list,
    "> x" for lines only in the second; common lines are silent. *)
let rec compare_dicts_rec file lists =
  match lists with
  | [], [] -> ()
  | [], b :: rest ->
      fprintf file "> %s\n" b;
      compare_dicts_rec file ([], rest)
  | a :: rest, [] ->
      fprintf file "< %s\n" a;
      compare_dicts_rec file (rest, [])
  | (a :: la) as left, ((b :: lb) as right) ->
      if a = b then compare_dicts_rec file (la, lb)
      else if a < b then begin
        fprintf file "< %s\n" a;
        compare_dicts_rec file (la, right)
      end else begin
        fprintf file "> %s\n" b;
        compare_dicts_rec file (left, lb)
      end
185 | + | |
(** Diff two three-column dictionary files under [path], writing the
    unix-diff-style report to [filename_out]. Both inputs are loaded
    before the output file is opened. *)
let compare_dicts path filename1 filename2 filename_out =
  let dict1 = load_dict_as_set (path ^ filename1) in
  let dict2 = load_dict_as_set (path ^ filename2) in
  File.file_out filename_out (fun out ->
    compare_dicts_rec out (dict1, dict2))
191 | + | |
(** Five-column variant of [compare_dicts]: diffs the full rows
    (including both classification columns). *)
let compare_dicts_full path filename1 filename2 filename_out =
  let dict1 = load_dict_as_set_full (path ^ filename1) in
  let dict2 = load_dict_as_set_full (path ^ filename2) in
  File.file_out filename_out (fun out ->
    compare_dicts_rec out (dict1, dict2))
197 | + | |
(* Comparison of dictionary versions. All invocations are kept
   disabled; uncomment the one you need.
   Fix: [let () = ...] instead of [let _ = ...] so the compiler checks
   that the body really has type unit. *)
let () =
(* compare_dicts_full morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp_full.out"; *)
(* compare_dicts_full morfeusz_path polimorf_filename2015 polimorf_filename "results/comparition_polimorf_full.out"; *)
(* compare_dicts morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp.out"; *)
  ()
204 | + | |
(** Split a dictionary file into per-POS files (noun_/adj_/adv_/verb_/
    other_ prefixes under [path]); "cond" entries are deliberately
    dropped, and any unrecognized category is reported on stdout.
    Fix: the original tested [cat = ""] in both the verb chain and the
    "other" chain — the verb test ran first, so the "other" occurrence
    was unreachable dead code; the duplicate is removed and the long
    if-chains replaced by category lists, preserving behavior. *)
let split_dict path filename =
  let dict = load_tab (path ^ filename) in
  (* Category inventories for each output file.  NOTE(review): the
     empty category "" routing to the verb file looks like a
     placeholder — confirm it is intended. *)
  let noun_cats = ["subst"; "depr"] in
  let adj_cats = ["adj"; "adja"; "adjc"; "adjp"] in
  let verb_cats =
    ["inf"; "praet"; "fin"; "ppas"; "pact"; "pacta"; "impt"; "imps";
     "pcon"; "pant"; "ger"; ""] in
  let other_cats =
    ["bedzie"; "pred"; "prep"; "num"; "aglt"; "winien"; "qub"; "brev";
     "comp"; "interj"; "burk"; "conj"; "ppron12"; "ppron3"] in
  File.file_out (path ^ "noun_" ^ filename) (fun noun_file ->
  File.file_out (path ^ "adj_" ^ filename) (fun adj_file ->
  File.file_out (path ^ "adv_" ^ filename) (fun adv_file ->
  File.file_out (path ^ "verb_" ^ filename) (fun verb_file ->
  File.file_out (path ^ "other_" ^ filename) (fun other_file ->
    Xlist.iter dict (fun (orth, lemma, interp) ->
      (* The category is the first colon-separated field of the interp. *)
      let cat =
        match Str.split (Str.regexp ":") interp with
        | cat :: _ -> cat
        | [] -> failwith ("split_dict: " ^ interp) in
      let emit file = fprintf file "%s\t%s\t%s\n" orth lemma interp in
      if List.mem cat noun_cats then emit noun_file
      else if List.mem cat adj_cats then emit adj_file
      else if cat = "adv" then emit adv_file
      else if List.mem cat verb_cats then emit verb_file
      else if List.mem cat other_cats then emit other_file
      else if cat = "cond" then ()          (* conditionals dropped on purpose *)
      else print_endline cat))))))          (* report anything unrecognized *)
230 | + | |
231 | + | |
(* Dictionary split by part of speech (disabled by default).
   Fix: [let () = ...] instead of [let _ = ...] for a checked unit. *)
let () =
(* split_dict morfeusz_path sgjp_filename; *)
  ()
236 | + | |
(** Index dictionary rows by lemma: each lemma maps to the list of its
    (orth, interp) pairs, most recently seen first. *)
let map_of_tab data =
  Xlist.fold data StringMap.empty (fun acc (orth, lemma, interp) ->
    let entry = orth, interp in
    StringMap.add_inc acc lemma [entry] (fun entries -> entry :: entries))
240 | + | |
(* The full feminine singular+plural noun paradigm; used by
   [extract_ndm] to detect entries that are indeclinable in their
   feminine part only. *)
let feminine = StringSet.of_list ["subst:pl:acc:f"; "subst:pl:dat:f"; "subst:pl:gen:f"; "subst:pl:inst:f"; "subst:pl:loc:f";
  "subst:pl:nom:f"; "subst:pl:voc:f"; "subst:sg:acc:f"; "subst:sg:dat:f"; "subst:sg:gen:f";
  "subst:sg:inst:f"; "subst:sg:loc:f"; "subst:sg:nom:f"; "subst:sg:voc:f"]
244 | + | |
245 | + | |
(* Split a dictionary into indeclinable entries (written, with combined
   interpretations, to "ndm_<filename>") and regularly inflected ones
   (written row-by-row to "odm_<filename>").  A lemma is indeclinable
   when one orth covers every interpretation; failing that, the same
   test is retried on the feminine sub-paradigm only, and any leftover
   interpretations go to the inflected file. *)
let extract_ndm path filename =
  let dict = load_tab (path ^ filename) in
  let dict = map_of_tab dict in
  File.file_out (path ^ "ndm_" ^ filename) (fun ndm_file ->
    File.file_out (path ^ "odm_" ^ filename) (fun odm_file ->
      StringMap.iter dict (fun lemma l ->
        (* interp -> set of orths realizing it *)
        let map = Xlist.fold l StringMap.empty (fun map (orth,interp) ->
          StringMap.add_inc map interp (StringSet.singleton orth) (fun set -> StringSet.add set orth)) in
        (* orth -> number of interps it appears in *)
        let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
          StringSet.fold orths qmap StringQMap.add) in
        let n = StringMap.size map in
        (* orths covering ALL interpretations of this lemma *)
        let found = StringQMap.fold qmap [] (fun found orth v ->
          if v = n then orth :: found else found) in
        match found with
          [] ->
            (* No universal orth: retry on the feminine sub-paradigm. *)
            let fmap = StringMap.fold map StringMap.empty (fun fmap interp orths ->
              if StringSet.mem feminine interp then StringMap.add fmap interp orths else fmap) in
            let fqmap = StringMap.fold fmap StringQMap.empty (fun fqmap interp orths ->
              StringSet.fold orths fqmap StringQMap.add) in
            let fn = StringMap.size fmap in
            let ffound = StringQMap.fold fqmap [] (fun ffound orth v ->
              if v = fn then orth :: ffound else ffound) in
            (match ffound with
              [] -> Xlist.iter l (fun (orth,interp) -> fprintf odm_file "%s\t%s\t%s\n" orth lemma interp)
            | [orth] ->
                let interps = combine_interps(*String.concat "|" (List.sort compare*) (StringMap.fold fmap [] (fun l interp _ -> interp :: l)) in
                fprintf ndm_file "%s\t%s\t%s\n" orth lemma interps;
                if StringQMap.size fqmap > 1 then failwith ("extract_ndm ni: " ^ orth);
                (* Non-feminine remainder is inflected normally. *)
                let map = StringMap.fold map StringMap.empty (fun map interp orths ->
                  if StringSet.mem feminine interp then map else StringMap.add map interp orths) in
                StringMap.iter map (fun interp orths ->
                  StringSet.iter orths (fun orth ->
                    fprintf odm_file "%s\t%s\t%s\n" orth lemma interp))
            | _ -> failwith ("extract_ndm: " ^ (String.concat " " ffound)))
        | [orth] ->
            (* One orth covers the whole paradigm: emit it as ndm. *)
            let interps = combine_interps(*String.concat "|" (List.sort compare*) (StringMap.fold map [] (fun l interp _ -> interp :: l)) in
            fprintf ndm_file "%s\t%s\t%s\n" orth lemma interps;
            if StringQMap.size qmap > 1 then
              (* Other orths still inflect; keep them in the odm file. *)
              StringMap.iter map (fun interp orths ->
                let orths = if StringSet.size orths = 1 then orths else StringSet.remove orths orth in
                StringSet.iter orths (fun orth ->
                  fprintf odm_file "%s\t%s\t%s\n" orth lemma interp))
        | _ -> failwith ("extract_ndm: " ^ (String.concat " " found)))))
289 | + | |
(* Extraction of indeclinable entries (disabled by default).
   Fix: [let () = ...] instead of [let _ = ...] for a checked unit. *)
let () =
(* extract_ndm morfeusz_path ("adj_" ^ sgjp_filename); *)
(* extract_ndm morfeusz_path ("noun_" ^ sgjp_filename); *)
  ()
295 | + | |
296 | +(**********************************************************************************) | |
297 | + | |
(* Lemmata formed from a pronoun/adjective/adverb base plus a detachable
   particle (-kolwiek, -ś, -ż/-że, ...); their particle is stripped
   before stemming in [find_kolwiek_suffixes]. *)
let kolwiek_lemmas = StringSet.of_list [
  (* adj *)
  "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
  "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
  "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
  (* noun *)
  "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
  "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
  "ktoś:s2"; "któż";
  (* adv *)
  "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]
309 | + | |
(* Particle suffixes to strip, most specific (longest) first so e.g.
   "żkolwiek" is tried before the bare "ż". *)
let kolwiek_suffixes = [
  "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]
312 | + | |
(* For lemmas listed in [kolwiek_lemmas], strip matching particle
   suffixes from each orth; all other entries pass through unchanged.
   NOTE(review): the fold applies the suffix list in sequence to the
   already-cut orth, so several suffixes can be removed cumulatively —
   confirm that stacking is intended. *)
let find_kolwiek_suffixes morfs =
  StringMap.mapi morfs (fun lemma interps ->
    if StringSet.mem kolwiek_lemmas lemma then
      Xlist.map interps (fun (orth,interp) ->
        (Xlist.fold kolwiek_suffixes orth (fun orth kolwiek_suf ->
          if Rules.check_sufix kolwiek_suf orth then
            Rules.cut_sufix kolwiek_suf orth
          else orth)), interp)
    else interps)
322 | + | |
323 | + | |
(* Lemmata excluded from automatic rule generation: entries whose
   inflection cannot be captured by simple suffix rules (dictionary
   errors, multi-stem paradigms, stem or inflection alternations,
   incomplete paradigms), extended with externally validated lists
   loaded from data files.  The empty strings are placeholder slots
   (deduplicated by the set). *)
let exceptional_lemmata = StringSet.of_list ([
  (* error in the source dictionary *)
  "ówże";
  (* multiple stems *)
  "twój:a"; "swój"; "mój:a"; "wszystek";
  (* stem alternations *)
  "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
  "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
  "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
  "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
  "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
  "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
  "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
  "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
  "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
  "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
  "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
  "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
  (* inflection alternations *)
  "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
  "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  (* left in place *)
  "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
  "Pia"; "jo-jo"; "+lecie"; "";
  "zagrząźć"; "zrzeć";
  (* incomplete paradigm *)
  "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
  "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
  "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
  "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
  "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
  (* multiple stems *)
  "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
  "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
  "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
  (* other *)
  "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
  "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
  "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
  "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
(*
  "moi"; "twoi";
  (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
  (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
  "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
  "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
  "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
  "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
  "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
  "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
  "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
  "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
  "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
  "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  ] @ File.load_lines "data/obce.tab" @ File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @ File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab")
398 | + | |
(** Drop every lemma listed in [exceptional_lemmata] from [dict]. *)
let remove_exceptional_lemmata dict =
  StringMap.fold dict StringMap.empty (fun acc lemma interps ->
    if StringSet.mem exceptional_lemmata lemma then acc
    else StringMap.add acc lemma interps)
403 | + | |
(** Dry-run stem generation over a cleaned dictionary: loads the file,
    indexes by lemma, removes exceptional lemmata, strips particle
    suffixes, and invokes [Stem.generate_stem] on every entry for its
    side effects / failures only. *)
let check_stem_generation stem_sel path filename =
  let dict =
    load_tab (path ^ filename)
    |> map_of_tab
    |> remove_exceptional_lemmata
    |> find_kolwiek_suffixes
  in
  StringMap.iter dict (fun lemma forms ->
    ignore (Stem.generate_stem stem_sel lemma forms))
412 | + | |
(* Sanity check of stem generation (disabled by default).
   Fix: [let () = ...] instead of [let _ = ...] for a checked unit. *)
let () =
(* check_stem_generation Stem.adj_stem_sel morfeusz_path ("odm_adj_" ^ sgjp_filename); *)
(* check_stem_generation Stem.noun_stem_sel morfeusz_path ("odm_noun_" ^ sgjp_filename); *)
  ()
418 | + | |
(** Keep only entries whose interpretation does not end in ":com" or
    ":sup" (comparative/superlative degree), preserving input order. *)
let remove_com_sup dict =
  List.rev (Xlist.fold dict [] (fun acc ((_, _, interp) as entry) ->
    let graded =
      Rules.check_sufix ":com" interp || Rules.check_sufix ":sup" interp in
    if graded then acc else entry :: acc))
422 | + | |
(** Derive positive-degree adjective rules from [dict] and dump them,
    grouped by interpretation under "@RULES" headers, to
    [rules_filename]; each rule line carries its count and up to a few
    example lemmata. *)
let generate_adj_pos_rules rules_filename dict =
  let dict =
    dict
    |> map_of_tab
    |> find_kolwiek_suffixes
    |> remove_exceptional_lemmata
  in
  let rules =
    StringMap.fold dict StringMap.empty
      (RuleGenerator.generate_rules_entry Stem.adj_stem_sel)
  in
  File.file_out rules_filename (fun out ->
    StringMap.iter rules (fun interp per_interp ->
      fprintf out "\n@RULES %s\n" interp;
      StringMap.iter per_interp (fun rule (count, examples) ->
        fprintf out "\t%s\t# %d %s\n" rule count (String.concat " " examples))))
433 | + | |
(** Group dictionary rows by the group that [interp_sel] assigns to
    their interpretation; an interpretation missing from [interp_sel]
    is fatal.
    NOTE(review): the tuple is destructured here as (lemma,orth,interp)
    while [load_tab] produces (orth,lemma,interp); only [interp] is
    consulted and the tuple is repacked unchanged, so behavior is
    unaffected — but the first two names look swapped. *)
let split_into_groups interp_sel dict =
  Xlist.fold dict StringMap.empty (fun acc (lemma, orth, interp) ->
    let group =
      match StringMap.find interp_sel interp with
      | g -> g
      | exception Not_found -> failwith ("split_into_groups: " ^ interp)
    in
    let entry = lemma, orth, interp in
    StringMap.add_inc acc group [entry] (fun entries -> entry :: entries))
440 | + | |
(** Load the interpretation-selector table: each row is
    (group, interp, label); the result maps each interpretation to its
    group (the label column is currently unused). *)
let load_interp_sel filename =
  File.fold_tab filename StringMap.empty (fun acc fields ->
    match fields with
    | [group; interp; _label] -> StringMap.add acc interp group
    | _ -> failwith ("load_interp_sel: " ^ String.concat "\t" fields))
451 | + | |
(** Load an adjective dictionary, split it into POS groups via
    data/interps.tab, verify exactly three groups are present, and
    generate the positive-degree rules for the "adj" group. *)
let generate_adj_rules path filename adj_pos_rules_filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let dict = load_tab (path ^ filename) in
  let dict = split_into_groups interp_sel dict in
  if StringMap.size dict <> 3 then
    failwith ("generate_adj_rules: " ^
      String.concat " " (StringMap.fold dict [] (fun acc group _ -> group :: acc)))
  else
    generate_adj_pos_rules adj_pos_rules_filename (StringMap.find dict "adj")
460 | + | |
(** Derive comparative-degree adjective rules from the dictionary file
    and dump them, grouped by interpretation, to
    [adj_com_rules_filename]. *)
let generate_adj_com_rules path filename adj_com_rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> map_of_tab
    |> find_kolwiek_suffixes
  in
  (* remove_exceptional_lemmata is intentionally NOT applied here. *)
  let rules =
    StringMap.fold dict StringMap.empty
      (RuleGenerator.generate_rules_com_entry Stem.adj_stem_sel)
  in
  File.file_out adj_com_rules_filename (fun out ->
    StringMap.iter rules (fun interp per_interp ->
      fprintf out "\n@RULES %s\n" interp;
      StringMap.iter per_interp (fun rule (count, examples) ->
        fprintf out "\t%s\t# %d %s\n" rule count (String.concat " " examples))))
472 | + | |
(** Lemmata of [dict] whose form sets pass [Rules.validate_entry]. *)
let find_validated_lemata_adj_pos dict =
  let by_lemma = map_of_tab dict in
  StringMap.fold by_lemma [] (fun acc lemma forms ->
    if Rules.validate_entry lemma forms then lemma :: acc else acc)
477 | + | |
(** Load an adjective dictionary, split it into POS groups, verify that
    exactly three groups are present, and return the validated lemmata
    of the "adj" group.
    Fix: the failure message said "generate_adj_rules" — a copy-paste
    from the function above — which would misattribute the error; it
    now names this function. *)
let find_validated_lemata_adj path filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let dict = load_tab (path ^ filename) in
  let dict = split_into_groups interp_sel dict in
  if StringMap.size dict <> 3 then
    failwith ("find_validated_lemata_adj: " ^
      String.concat " " (StringMap.fold dict [] (fun acc group _ -> group :: acc)))
  else
    find_validated_lemata_adj_pos (StringMap.find dict "adj")
485 | + | |
(** Derive positive-degree adverb rules (lemma used as stem) from
    [dict] and dump them, grouped by interpretation, to
    [rules_filename]. *)
let generate_adv_pos_rules rules_filename dict =
  let dict =
    dict
    |> map_of_tab
    |> find_kolwiek_suffixes
    |> remove_exceptional_lemmata
  in
  let rules =
    StringMap.fold dict StringMap.empty
      (RuleGenerator.generate_rules_entry_lemma_as_stem Stem.adv_stem_sel StringMap.empty)
  in
  File.file_out rules_filename (fun out ->
    StringMap.iter rules (fun interp per_interp ->
      fprintf out "\n@RULES %s\n" interp;
      StringMap.iter per_interp (fun rule (count, examples) ->
        fprintf out "\t%s\t# %d %s\n" rule count (String.concat " " examples))))
496 | + | |
(** Load an adverb dictionary, split it into POS groups via
    data/interps.tab, verify exactly three groups are present, and
    generate the positive-degree rules for the "adv" group. *)
let generate_adv_rules path filename adv_pos_rules_filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let dict = load_tab (path ^ filename) in
  let dict = split_into_groups interp_sel dict in
  if StringMap.size dict <> 3 then
    failwith ("generate_adv_rules: " ^
      String.concat " " (StringMap.fold dict [] (fun acc group _ -> group :: acc)))
  else
    generate_adv_pos_rules adv_pos_rules_filename (StringMap.find dict "adv")
505 | + | |
(** Derive comparative-degree adverb rules (lemma used as stem) from
    the dictionary file and dump them, grouped by interpretation, to
    [adv_com_rules_filename]. *)
let generate_adv_com_rules path filename adv_com_rules_filename =
  let dict =
    load_tab (path ^ filename)
    |> map_of_tab
    |> find_kolwiek_suffixes
    |> remove_exceptional_lemmata
  in
  let rules =
    StringMap.fold dict StringMap.empty
      (RuleGenerator.generate_rules_com_entry_lemma_as_stem Stem.adv_stem_sel)
  in
  File.file_out adv_com_rules_filename (fun out ->
    StringMap.iter rules (fun interp per_interp ->
      fprintf out "\n@RULES %s\n" interp;
      StringMap.iter per_interp (fun rule (count, examples) ->
        fprintf out "\t%s\t# %d %s\n" rule count (String.concat " " examples))))
517 | + | |
518 | + | |
(* Derive noun rules from [dict] and dump them, grouped by
   interpretation under "@RULES" headers, to [rules_filename].  The
   large comment below preserves an earlier hand-rolled rule generator
   (noun classification + prefix-based rule codes) that was superseded
   by [RuleGenerator.generate_rules_entry]. *)
let generate_noun_rules2 rules_filename dict =
  let dict = map_of_tab dict in
  let dict = find_kolwiek_suffixes dict in
  let dict = remove_exceptional_lemmata dict in
  let rules = StringMap.fold dict StringMap.empty (RuleGenerator.generate_rules_entry Stem.noun_stem_sel) in
(* let rules = StringMap.fold dict StringMap.empty (fun rules lemma l ->
  let interps = Xlist.fold l StringMap.empty (fun map (orth,interp) ->
    StringMap.add_inc map interp [orth] (fun l -> orth :: l)) in
  let stem(*,_*) = generate_stem (*0*) interps lemma noun_stem_sel in
  let cl = classify_noun lemma stem interps noun_classes in
  if cl <> "A" && cl <> "II" (*&& cl <> "Ę" && cl <> "Ą"*) then rules else
  if has_known_inflexion_noun stem interps then rules else
  let interps = select_inflexion rules_a stem interps in
  let stem2 = cut_stem_sufix stem in
  StringMap.fold interps rules (fun rules interp orths ->
    Xlist.fold orths rules (fun rules orth ->
      let n = find_common_prefix_length [stem2;orth] in
      let rules2 = try StringMap.find rules interp with Not_found -> StringMap.empty in
      let a = cut_prefixn n orth in
      let b = cut_prefixn n stem in
      let c,f = rule_code (a,b) in
(* let rule = sprintf "%s\t%s\t%s\t%s" cl c a b in *)
      let rule = cl ^ "\t" ^ if f then "\t" ^ c else sprintf "%s\t%s\t%s" c a b in
      let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
      StringMap.add rules interp rules2))) in*)
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
549 | + | |
(* Load [path ^ filename], split it into interpretation groups and generate
   noun rules into [noun_rules_filename].  Fails (listing the group names)
   when the file contains anything besides the single "noun" group. *)
let generate_noun_rules path filename noun_rules_filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let groups = split_into_groups interp_sel (load_tab (path ^ filename)) in
  if StringMap.size groups <> 1 then
    failwith ("generate_noun_rules: " ^
      String.concat " " (StringMap.fold groups [] (fun acc name _ -> name :: acc)))
  else generate_noun_rules2 noun_rules_filename (StringMap.find groups "noun");
  ()
558 | + | |
(* Derive verb guesser rules from the tab-dictionary [dict] (the lemma itself
   is used as the stem) and write them to [rules_filename], one "@RULES"
   section per interpretation. *)
let generate_verb_rules2 rules_filename dict =
  let entries = remove_exceptional_lemmata (map_of_tab dict) in
  let rules =
    StringMap.fold entries StringMap.empty
      (RuleGenerator.generate_rules_entry_lemma_as_stem Stem.verb_stem_sel Stem.verb_stem_sel2) in
  File.file_out rules_filename (fun out ->
    StringMap.iter rules (fun interp by_rule ->
      fprintf out "\n@RULES %s\n" interp;
      StringMap.iter by_rule (fun rule (count,examples) ->
        fprintf out "\t%s\t# %d %s\n" rule count (String.concat " " examples))))
568 | + | |
(* Generate verb rules from [path ^ filename] into [rules_filename].
   The dictionary must split into exactly two interpretation groups;
   only the "verb" group is consumed. *)
let generate_verb_rules path filename rules_filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let groups = split_into_groups interp_sel (load_tab (path ^ filename)) in
  if StringMap.size groups <> 2 then
    failwith ("generate_verb_rules: " ^
      String.concat " " (StringMap.fold groups [] (fun acc name _ -> name :: acc)))
  else generate_verb_rules2 rules_filename (StringMap.find groups "verb");
  ()
577 | + | |
(* Return the lemmata in [path ^ filename] whose whole paradigm is
   validated by the rules (Rules.validate_entry). *)
let find_validated_lemata_noun path filename =
  let entries = map_of_tab (load_tab (path ^ filename)) in
  StringMap.fold entries [] (fun acc lemma forms ->
    if Rules.validate_entry lemma forms then lemma :: acc else acc)
583 | + | |
(* For every entry of [path ^ filename], print the forms whose tag lookup is
   ambiguous (number of matching tag sets <> 1), one indented "k=v ..." tag
   set per line. *)
let find_tags_lemata_noun path filename =
  let entries = map_of_tab (load_tab (path ^ filename)) in
  StringMap.iter entries (fun lemma forms ->
    Xlist.iter (Rules.find_tags_entry lemma forms) (fun (orth,found,_interp) ->
      if Xlist.size found <> 1 then
        let show (_,attrs) =
          String.concat " " (Xlist.map attrs (fun (k,v) -> k ^ "=" ^ v)) in
        printf "%s\t%s\t%d\n %s\n" orth lemma (Xlist.size found)
          (String.concat "\n " (Xlist.map found show))))
593 | + | |
(* Collect the lemmata of the tab-dictionary [dict] whose paradigm passes
   Rules.validate_entry. *)
let find_validated_lemata_verb2 dict =
  let entries = map_of_tab dict in
  StringMap.fold entries [] (fun acc lemma forms ->
    match Rules.validate_entry lemma forms with
    | true -> lemma :: acc
    | false -> acc)
598 | + | |
(* Return the validated lemmata of the "verb" group of [path ^ filename].
   The dictionary must split into exactly two interpretation groups;
   otherwise the group names are reported.
   @raise Failure when the group count is not 2. *)
let find_validated_lemata_verb path filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let dict = load_tab (path ^ filename) in
  let dict = split_into_groups interp_sel dict in
  (* Bug fix: the failure prefix was copy-pasted as "generate_verb_rules",
     which misattributed the error to the wrong function. *)
  if StringMap.size dict <> 2 then failwith ("find_validated_lemata_verb: " ^
    String.concat " " (StringMap.fold dict [] (fun l s _ -> s :: l))) else
  find_validated_lemata_verb2 (StringMap.find dict "verb")
606 | + | |
(* Print, for each entry of the tab-dictionary [dict], the forms whose tag
   lookup is ambiguous (matching tag-set count <> 1), one indented tag set
   per line. *)
let find_tags_lemata_verb2 dict =
  let entries = map_of_tab dict in
  StringMap.iter entries (fun lemma forms ->
    Xlist.iter (Rules.find_tags_entry lemma forms) (fun (orth,found,_interp) ->
      if Xlist.size found <> 1 then
        let show (_,attrs) =
          String.concat " " (Xlist.map attrs (fun (k,v) -> k ^ "=" ^ v)) in
        printf "%s\t%s\t%d\n %s\n" orth lemma (Xlist.size found)
          (String.concat "\n " (Xlist.map found show))))
615 | + | |
(* Report ambiguous tag lookups for the "verb" group of [path ^ filename].
   The dictionary must split into exactly two interpretation groups;
   otherwise the group names are reported.
   @raise Failure when the group count is not 2. *)
let find_tags_lemata_verb path filename =
  let interp_sel = load_interp_sel "data/interps.tab" in
  let dict = load_tab (path ^ filename) in
  let dict = split_into_groups interp_sel dict in
  (* Bug fix: the failure prefix was copy-pasted as "generate_verb_rules",
     which misattributed the error to the wrong function. *)
  if StringMap.size dict <> 2 then failwith ("find_tags_lemata_verb: " ^
    String.concat " " (StringMap.fold dict [] (fun l s _ -> s :: l))) else
  find_tags_lemata_verb2 (StringMap.find dict "verb")
623 | + | |
(* Entry points.  Most pipelines are commented out and re-enabled ad hoc
   for a particular generation run. *)

(* Rule-file generation runs (per part of speech). *)
let _ =
(* generate_adj_rules morfeusz_path ("odm_adj_" ^ sgjp_filename) "rules/ADJ-FLEX6.dic"; *)
(* generate_noun_rules morfeusz_path ("odm_noun_" ^ sgjp_filename) "rules/NOUN-FLEX6.dic"; *)
(* generate_adj_com_rules morfeusz_path ("odm_adj_" ^ sgjp_filename) "rules/ADJ-FLEX-COM6.dic"; *)
(* generate_adv_rules morfeusz_path ("adv_" ^ sgjp_filename) "rules/ADV-FLEX6.dic"; *)
(* generate_adv_com_rules morfeusz_path ("adv_" ^ sgjp_filename) "rules/ADV-FLEX-COM6.dic"; *)
(* generate_verb_rules morfeusz_path ("verb_" ^ sgjp_filename) "rules/VERB-FLEX6.dic"; *)
(* generate_verb_rules "data/" "verbs_ex.tab" "rules/VERB-FLEX6.dic"; *)
  ()

(* Validation runs: list lemmata whose paradigms are covered by the rules. *)
let _ =
(* let l = find_validated_lemata_adj morfeusz_path ("odm_adj_" ^ sgjp_filename) in *)
(* let l = find_validated_lemata_noun morfeusz_path ("odm_noun_" ^ sgjp_filename) in *)
(* let l = find_validated_lemata_verb morfeusz_path ("verb_" ^ sgjp_filename) in *)
(* let l = find_validated_lemata_noun "data/" "nouns_ex.tab" in *)
(* let l = find_validated_lemata_verb "data/" "verbs_ex.tab" in *)
(* Xlist.iter l print_endline; *)
  ()

(* Currently active run: report ambiguous tag lookups for the sample nouns. *)
let _ =
(* find_tags_lemata_verb "data/" "verbs_ex.tab"; *)
  find_tags_lemata_noun "data/" "nouns_ex.tab";
  ()

let _ =
(* Rules.print "results/rules/"; *)
  ()
651 | + | |
652 | + | |
653 | + | |
654 | +(*** | |
655 | +let expand_tags tags = | |
656 | + if tags = "" then [] else | |
657 | + List.flatten (Xlist.map (Str.split (Str.regexp "|") tags) (fun tags -> | |
658 | + let tags = Xlist.map (Str.split (Str.regexp ":") tags) (Str.split (Str.regexp "\\.")) in | |
659 | + Xlist.map (Xlist.multiply_list tags) (String.concat ":"))) | |
660 | + | |
661 | +let prepare_rules l = | |
662 | + Xlist.fold l [] (fun rules rule_set_name -> | |
663 | + let rule_set = StringMap.find rule_map rule_set_name in | |
664 | + Xlist.fold rule_set rules (fun rules (alternation_name, sufix, tags) -> | |
665 | + let alternation = StringMap.find alternation_map alternation_name in | |
666 | + Xlist.fold alternation rules (fun rules (a,b) -> | |
667 | + (a ^ sufix, b, expand_tags tags) :: rules))) | |
668 | + | |
669 | +let prepare_rules_simple l = | |
670 | + Xlist.fold l [] (fun rules rule_set_name -> | |
671 | + let rule_set = StringMap.find rule_map rule_set_name in | |
672 | + Xlist.fold rule_set rules (fun rules (alternation_name, sufix, tags) -> | |
673 | + let alternation = StringMap.find alternation_map alternation_name in | |
674 | + Xlist.fold alternation rules (fun rules (a,b) -> | |
675 | + (a ^ sufix, b, [tags]) :: rules))) | |
676 | + | |
677 | +let rules_adj_flex = prepare_rules_simple ["ADJ-FLEX"] | |
678 | +let rules_adj_lemma = prepare_rules ["ADJ-LEMMA"] | |
679 | + | |
680 | +let rules_a = prepare_rules ["NOUN-FLEX-GENERAL";"NOUN-FLEX-A"] | |
681 | +let rules_noun_as_adj = prepare_rules ["NOUN-FLEX-GENERAL";"NOUN-ADJ-FLEX"] | |
682 | +let rules_noun_lemma = prepare_rules ["NOUN-LEMMA"] | |
683 | + | |
684 | +let is_applicable_rule (a,_,_) s = check_sufix a s | |
685 | + | |
686 | +let apply_rule (a,b,_) s = | |
687 | + (cut_sufix a s) ^ b | |
688 | + | |
689 | +let match_interp (_,_,l) s = | |
690 | + Xlist.mem l s | |
691 | + | |
692 | +let get_interps (_,_,l) = l | |
693 | + | |
694 | +let apply_rules rules s = | |
695 | + Xlist.fold rules [] (fun l rule -> | |
696 | + if is_applicable_rule rule s then | |
697 | + (apply_rule rule s, get_interps rule) :: l | |
698 | + else l) | |
699 | + | |
700 | +let check_inflexion rules stem interps = | |
701 | + StringMap.fold interps true (fun b interp orths -> | |
702 | + Xlist.fold orths b (fun b orth -> | |
703 | + let c = Xlist.fold rules false (fun c rule -> | |
704 | + if is_applicable_rule rule orth && match_interp rule interp then | |
705 | + if apply_rule rule orth = stem then true else c | |
706 | + else c) in | |
707 | + if c then b else false)) | |
708 | + | |
709 | +let has_known_inflexion_noun stem interps = | |
710 | + let b1 = check_inflexion rules_a stem interps in | |
711 | + let b2 = check_inflexion rules_noun_as_adj stem interps in | |
712 | + b1 || b2 | |
713 | + | |
714 | +let has_known_inflexion_adj stem interps = | |
715 | + let b = check_inflexion rules_adj_flex stem interps in | |
716 | + b | |
717 | + | |
718 | +let select_inflexion rules stem interps = | |
719 | + StringMap.fold interps StringMap.empty (fun interps interp orths -> | |
720 | + let orths = Xlist.fold orths [] (fun orths orth -> | |
721 | + let c = Xlist.fold rules false (fun c rule -> | |
722 | + if is_applicable_rule rule orth && match_interp rule interp then | |
723 | + if apply_rule rule orth = stem then true else c | |
724 | + else c) in | |
725 | + if c then orths else orth :: orths) in | |
726 | + if orths = [] then interps else StringMap.add interps interp orths) | |
727 | +***) | |
... | ... |
guesser/makefile
0 → 100755
# Build rules for the guesser tools.
OCAMLC=ocamlc
OCAMLOPT=ocamlopt
OCAMLDEP=ocamldep
INCLUDES=-I +xml-light -I +xlib -I ../../9WalLCGslo
OCAMLFLAGS=$(INCLUDES) -g
OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa
#OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa tokenizer.cmxa

MODS= morf.ml morfeusz.ml
MODS2= morf.ml inflexionConv.ml
MODS3= morf.ml inflexion.ml
MODS4= morf.ml morphemes.ml

.PHONY: all lib clean

all:
	$(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) schemata.ml rules.ml stem.ml ruleGenerator.ml generate.ml
#	$(OCAMLOPT) -o morfeusz $(OCAMLOPTFLAGS) $(MODS)
#	$(OCAMLOPT) -o inflexion $(OCAMLOPTFLAGS) $(MODS2)
#	$(OCAMLOPT) -o inflexion_test $(OCAMLOPTFLAGS) $(MODS3)
#	$(OCAMLOPT) -o morphemes $(OCAMLOPTFLAGS) $(MODS4)

lib:
	$(OCAMLOPT) -linkall -a -o inflexion.cmxa $(INCLUDES) $(MODS3)


.SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx

.mll.ml:
	ocamllex $<

.mly.mli:
	ocamlyacc $<

.mly.ml:
	ocamlyacc $<

.ml.cmo:
	$(OCAMLC) $(OCAMLFLAGS) -c $<

# Bug fix: was $(OCAMLFALGS) (typo), so .mli files were compiled without
# the include paths and debug flag.
.mli.cmi:
	$(OCAMLC) $(OCAMLFLAGS) -c $<

.ml.cmx:
	$(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<

clean:
	rm -f *~ *.cm[oix] *.o generate morfeusz inflexion inflexion_test morphemes
... | ... |
guesser/ruleGenerator.ml
0 → 100644
1 | +open Xstd | |
2 | +open Printf | |
3 | + | |
(* Alias for the alternation table defined in Rules. *)
let alternation_map = Rules.alternation_map

(* [rule_types] maps a concrete "orth-suffix\tstem-suffix" pair to its
   abstract rule code (e.g. "{x}ych\t{x}").  Every entry is currently
   commented out, so the map is empty and each lookup in [rule_code] falls
   through to the synthesized fallback code.  On a duplicate key the
   previously inserted code is kept and the clash is printed. *)
let rule_types = Xlist.fold [
(* Xlist.map (StringMap.find alternation_map "obce_ch") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{x}ych\t{x}";
  Xlist.map (StringMap.find alternation_map "obce_ch") (fun (_,s,t) -> sprintf "%smi\t%s" s t), "{x}ymi\t{x}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_iy") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{'}y\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_iy") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{'}ych\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_iy") (fun (_,s,t) -> sprintf "%sm\t%s" s t), "{'}ym\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_iy") (fun (_,s,t) -> sprintf "%smi\t%s" s t), "{'}ymi\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%se\t%s" s t), "{'}e\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sego\t%s" s t), "{'}ego\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sej\t%s" s t), "{'}ej\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%semu\t%s" s t), "{'}emu\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sa\t%s" s t), "{'}a\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%są\t%s" s t), "{'}ą\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%so\t%s" s t), "{'}o\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sę\t%s" s t), "{'}ę\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%su\t%s" s t), "{'}u\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sów\t%s" s t), "{'}ów\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%som\t%s" s t), "{'}om\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sami\t%s" s t), "{'}ami\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sach\t%s" s t), "{'}ach\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sowi\t%s" s t), "{'}owi\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sowie\t%s" s t), "{'}owie\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sum\t%s" s t), "{'}um\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ae") (fun (_,s,t) -> sprintf "%sem\t%s" s t), "{'}em\t{'}";
(* Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_ii") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{'}ii\t{'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_yj") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{'}yj\t{'}";*)
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_wyglos") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{'}ε\t{'}";
(* Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{v'}y\t{v'}"; *)
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{v'}ych\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sm\t%s" s t), "{v'}ym\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%smi\t%s" s t), "{v'}ymi\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%se\t%s" s t), "{v'}e\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sego\t%s" s t), "{v'}ego\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sej\t%s" s t), "{v'}ej\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%semu\t%s" s t), "{v'}emu\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sa\t%s" s t), "{v'}a\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%są\t%s" s t), "{v'}ą\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%so\t%s" s t), "{v'}o\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sę\t%s" s t), "{v'}ę\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%su\t%s" s t), "{v'}u\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sów\t%s" s t), "{v'}ów\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%som\t%s" s t), "{v'}om\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sami\t%s" s t), "{v'}ami\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sach\t%s" s t), "{v'}ach\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sowi\t%s" s t), "{v'}owi\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sowie\t%s" s t), "{v'}owie\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sum\t%s" s t), "{v'}um\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe") (fun (_,s,t) -> sprintf "%sem\t%s" s t), "{v'}em\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_miekkie_nowe_wyglos") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{v'}ε\t{v'}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_y") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{}y\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_y") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{}ych\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_y") (fun (_,s,t) -> sprintf "%sm\t%s" s t), "{}ym\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_y") (fun (_,s,t) -> sprintf "%smi\t%s" s t), "{}ymi\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_e") (fun (_,s,t) -> sprintf "%se\t%s" s t), "{}e\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_e") (fun (_,s,t) -> sprintf "%sego\t%s" s t), "{}ego\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_e") (fun (_,s,t) -> sprintf "%sej\t%s" s t), "{}ej\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_e") (fun (_,s,t) -> sprintf "%semu\t%s" s t), "{}emu\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sa\t%s" s t), "{}a\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%są\t%s" s t), "{}ą\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%so\t%s" s t), "{}o\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sę\t%s" s t), "{}ę\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%su\t%s" s t), "{}u\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sów\t%s" s t), "{}ów\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%som\t%s" s t), "{}om\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sami\t%s" s t), "{}ami\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sach\t%s" s t), "{}ach\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sowi\t%s" s t), "{}owi\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sowie\t%s" s t), "{}owie\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_a") (fun (_,s,t) -> sprintf "%sum\t%s" s t), "{}um\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_e") (fun (_,s,t) -> sprintf "%sem\t%s" s t), "{}em\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_i") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{}'i\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_ie") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{}'ie\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_wyglos") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{}ε\t{}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe_y") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{v}y\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe_y") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{v}ych\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe_y") (fun (_,s,t) -> sprintf "%sm\t%s" s t), "{v}ym\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe_y") (fun (_,s,t) -> sprintf "%smi\t%s" s t), "{v}ymi\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%se\t%s" s t), "{v}e\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sego\t%s" s t), "{v}ego\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sej\t%s" s t), "{v}ej\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%semu\t%s" s t), "{v}emu\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sa\t%s" s t), "{v}a\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%są\t%s" s t), "{v}ą\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%so\t%s" s t), "{v}o\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sę\t%s" s t), "{v}ę\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%su\t%s" s t), "{v}u\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sów\t%s" s t), "{v}ów\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%som\t%s" s t), "{v}om\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sami\t%s" s t), "{v}ami\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sach\t%s" s t), "{v}ach\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sowi\t%s" s t), "{v}owi\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sowie\t%s" s t), "{v}owie\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe") (fun (_,s,t) -> sprintf "%sum\t%s" s t), "{v}um\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe_ie") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{v}'ie\t{v}";
  Xlist.map (StringMap.find alternation_map "funkcjonalnie_twarde_nowe_wyglos") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{v}ε\t{v}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_y") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{-}y\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_y") (fun (_,s,t) -> sprintf "%sch\t%s" s t), "{-}ych\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_y") (fun (_,s,t) -> sprintf "%sm\t%s" s t), "{-}ym\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_y") (fun (_,s,t) -> sprintf "%smi\t%s" s t), "{-}ymi\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%se\t%s" s t), "{-}e\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sego\t%s" s t), "{-}ego\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sej\t%s" s t), "{-}ej\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%semu\t%s" s t), "{-}emu\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sa\t%s" s t), "{-}a\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%są\t%s" s t), "{-}ą\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%so\t%s" s t), "{-}o\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sę\t%s" s t), "{-}ę\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%su\t%s" s t), "{-}u\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sów\t%s" s t), "{-}ów\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%som\t%s" s t), "{-}om\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sem\t%s" s t), "{-}em\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sami\t%s" s t), "{-}ami\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sach\t%s" s t), "{-}ach\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sowi\t%s" s t), "{-}owi\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sowie\t%s" s t), "{-}owie\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_a") (fun (_,s,t) -> sprintf "%sum\t%s" s t), "{-}um\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_ie") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{-}'ie\t{-}";
  Xlist.map (StringMap.find alternation_map "kapitaliki_wyglos") (fun (_,s,t) -> sprintf "%s\t%s" s t), "{-}ε\t{-}";*)
  ] StringMap.empty (fun map (l,code) ->
  (* On duplicate keys keep the existing code and report the clash. *)
  Xlist.fold l map (fun map rule -> StringMap.add_inc map rule code (fun code2 ->
    print_endline ("rule_types: " ^ rule ^ " " ^ code ^ " " ^ code2); code2)))
127 | + | |
(* [cut_prefix_list c ll] removes the leading character [c] from every
   character list in [ll].
   @raise Not_found if some list is empty or starts with a different
   character.
   Fix: dropped the spurious [rec] — the function never recurses, and the
   needless flag triggers the unused-rec-flag warning. *)
let cut_prefix_list c ll =
  Xlist.map ll (function
      [] -> raise Not_found
    | x :: l -> if x = c then l else raise Not_found)
132 | + | |
(* [find_common_prefix_length_rec n ll] — [ll] holds one UTF-8 character
   list per input string and [n] is the byte length matched so far.  Walks
   the first list and, via [cut_prefix_list], checks that every other list
   starts with the same character; stops at the first mismatch or when the
   first string runs out.  Returns the common prefix length in BYTES
   (String.length of each matched multi-byte character is added).
   Clause order matters: "[] :: _" must be tried before the character case.
   @raise Failure when called with an empty list of strings. *)
let rec find_common_prefix_length_rec n = function
    [] :: _ -> n
  | (c :: l) :: ll ->
      (try
        let ll = cut_prefix_list c ll in
        find_common_prefix_length_rec (n + String.length c) (l :: ll)
      with Not_found -> n)
  | [] -> failwith "find_common_prefix_length_rec"
141 | + | |
(* Byte length of the longest common prefix of the strings in [l],
   compared UTF-8 character by character. *)
let find_common_prefix_length l =
  find_common_prefix_length_rec 0
    (Xlist.map l Xunicode.utf8_chars_of_utf8_string)
145 | + | |
(* [cut_prefixn i s] drops the first [i] bytes of [s]; returns "" when [i]
   reaches past the end of the string.
   @raise Failure on an invalid index (e.g. negative [i]). *)
let cut_prefixn i s =
  let len = String.length s in
  if i >= len then ""
  else
    try String.sub s i (len - i)
    with _ -> failwith ("cut_prefixn: " ^ s ^ " " ^ string_of_int i)
150 | + | |
(* [rule_code (a,b)] returns the rule code for the pair (orth suffix [a],
   stem suffix [b]) plus a flag telling whether it came from the canonical
   [rule_types] table (true) or was synthesized (false).  The synthesized
   code joins [a] with the reversed UTF-8 characters of [b]; "???" marks a
   pair where [b] is not a prefix of [a]. *)
let rule_code (a,b) =
  let key = sprintf "%s\t%s" a b in
  try StringMap.find rule_types key, true
  with Not_found ->
    if not (Rules.check_prefix b a) then "???", false
    else
      let suf = Rules.cut_prefix b a in
      let rev_chars = List.rev (Xunicode.utf8_chars_of_utf8_string b) in
      suf ^ "_" ^ String.concat "_" rev_chars, false
158 | + | |
(* Build the textual rule transforming [orth] into [stem]: strip their
   common prefix (computed against [stem_pref]) and encode the remaining
   suffix pair via [rule_code].  A canonical code is emitted alone; a
   synthesized one is followed by both raw suffixes. *)
let generate_rule stem stem_pref orth =
  let prefix_len = find_common_prefix_length [stem_pref;orth] in
  let orth_suf = cut_prefixn prefix_len orth in
  let stem_suf = cut_prefixn prefix_len stem in
  match rule_code (orth_suf,stem_suf) with
  | code, true -> "\t" ^ code
  | code, false -> sprintf "%s\t%s\t%s" code orth_suf stem_suf
165 | + | |
(* [classify_entry lemma stem forms class_sel] assigns the entry to the
   first class in [class_sel] that matches, or "X" when none does.
   A triple (class_interp, suf, cl) matches when some form carrying exactly
   interpretation [class_interp] ends in suffix [suf].
   [lemma] and [stem] are unused by the active code (kept for the legacy
   variants below and for signature stability). *)
let rec classify_entry lemma stem forms = function
    (class_interp,suf,cl) :: class_sel ->
      (* Orths of the forms carrying the class-defining interpretation. *)
      let l = Xlist.fold forms [] (fun l (orth,interp) ->
        if interp = class_interp then orth :: l else l) in
      (* Does any of them end in the class-defining suffix? *)
      let b = Xlist.fold l false (fun b orth ->
        if Rules.check_sufix suf orth then true else b) in
      if b then cl else classify_entry lemma stem forms class_sel
(* Legacy stem-prefix-based variants, kept for reference: *)
(* let l = StringSet.to_list (Xlist.fold l StringSet.empty (fun set orth ->
    if check_prefix stem orth then
      StringSet.add set (cut_prefix stem orth)
    else set)) in
  if Xlist.mem l suf then cl else classify_noun lemma stem interps class_sel
  let l = StringSet.to_list (Xlist.fold l StringSet.empty (fun set orth ->
    if check_prefix stem orth then
      StringSet.add set (cut_prefix stem orth)
    else set)) in
  if Xlist.mem l suf then cl else classify_noun lemma stem interps class_sel*)
(* (match l with
    [] -> classify_noun lemma stem interps class_sel
  | [s] -> if s = suf then cl else classify_noun lemma stem interps class_sel
  | _ -> print_endline ("classify_noun multiple class: " ^ lemma ^ " " ^ String.concat " " l);
      classify_noun lemma stem interps class_sel)*)
  | [] -> (*print_endline ("classify_noun unknown class: " ^ lemma);*) "X"
189 | + | |
(* Class-selection table for [classify_entry] on nouns:
   (interpretation, orth suffix, class code) triples, tried in order.
   E.g. a pl:gen form in -ii/-ji/-yj puts the lemma in class "II";
   a sg:nom form in -a gives "A"; sg:nom neuter in -um gives "UM". *)
let entry_classes =
  List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f";"p1";"p2";"p3"] (fun gender ->
    Xlist.map ["ii";"ji";"yj"] (fun sufix ->
      "subst:pl:gen:" ^ gender, sufix,"II"))) @
  List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f"] (fun gender ->
    Xlist.map ["a"] (fun sufix ->
      "subst:sg:nom:" ^ gender, sufix,"A"))) @
  List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f"] (fun gender ->
    Xlist.map ["ę"] (fun sufix ->
      "subst:sg:acc:" ^ gender, sufix,"Ę"))) @
  List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f"] (fun gender ->
    Xlist.map ["ą"] (fun sufix ->
      "subst:sg:inst:" ^ gender, sufix,"Ą"))) @
(* List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f";"p1";"p2";"p3"] (fun gender ->
    Xlist.map ["ym";"im";"m"] (fun sufix ->
      "subst:pl:dat:" ^ gender, sufix,"ADJ"))) @
  List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f"] (fun gender ->
    Xlist.map ["a","A";"o","O";"e","E"] (fun (sufix,s) ->
      "subst:sg:nom:" ^ gender, sufix,s))) @*)
[
  "subst:sg:nom:n2","um","UM";
  ]
212 | + | |
(* [generate_rules_entry stem_sel rules lemma forms] folds every
   not-yet-validated form of [lemma] into the accumulator [rules]
   (interp -> rule string -> (count, sample lemmas)); at most 20 sample
   lemmas are kept per rule.  The rule string is prefixed with the
   entry's class. *)
let generate_rules_entry stem_sel rules lemma forms =
  let stem = Stem.generate_stem stem_sel lemma forms in
  let stem_pref = Stem.cut_stem_sufix stem in
  let cl = classify_entry lemma stem forms entry_classes in
  Xlist.fold (Rules.select_not_validated lemma forms) rules
    (fun acc (orth, interp) ->
      let rule = cl ^ "\t" ^ generate_rule stem stem_pref orth in
      let per_interp =
        try StringMap.find acc interp with Not_found -> StringMap.empty in
      let per_interp =
        StringMap.add_inc per_interp rule (1, [lemma])
          (fun (q, ls) -> q + 1, (if q < 20 then lemma :: ls else ls)) in
      StringMap.add acc interp per_interp)
223 | + | |
(* Like [generate_rules_entry], but restricted to interpretations carrying
   the ":com" suffix; the class column of the generated rule stays empty. *)
let generate_rules_com_entry stem_sel rules lemma forms =
  let stem = Stem.generate_stem stem_sel lemma forms in
  let stem_pref = Stem.cut_stem_sufix stem in
  Xlist.fold (Rules.select_not_validated lemma forms) rules
    (fun acc (orth, interp) ->
      if not (Rules.check_sufix ":com" interp) then acc
      else
        let rule = "\t" ^ generate_rule stem stem_pref orth in
        let per_interp =
          try StringMap.find acc interp with Not_found -> StringMap.empty in
        let per_interp =
          StringMap.add_inc per_interp rule (1, [lemma])
            (fun (q, ls) -> q + 1, (if q < 20 then lemma :: ls else ls)) in
        StringMap.add acc interp per_interp)
235 | + | |
(* Like [generate_rules_entry], but first tries to derive the stem from the
   lemma itself ([stem_sel]) and falls back to the form-based stem
   ([stem_sel2]); gives up — returning [rules] unchanged — when both
   attempts yield "". *)
let generate_rules_entry_lemma_as_stem stem_sel stem_sel2 rules lemma forms =
  let stem = Stem.generate_stem_lemma_as_stem stem_sel lemma in
  let stem =
    if stem = "" then Stem.generate_stem stem_sel2 lemma forms else stem in
  if stem = "" then rules else
  let stem_pref = Stem.cut_stem_sufix stem in
  let cl = classify_entry lemma stem forms entry_classes in
  Xlist.fold (Rules.select_not_validated lemma forms) rules
    (fun acc (orth, interp) ->
      let rule = cl ^ "\t" ^ generate_rule stem stem_pref orth in
      let per_interp =
        try StringMap.find acc interp with Not_found -> StringMap.empty in
      let per_interp =
        StringMap.add_inc per_interp rule (1, [lemma])
          (fun (q, ls) -> q + 1, (if q < 20 then lemma :: ls else ls)) in
      StringMap.add acc interp per_interp)
248 | + | |
(* Like [generate_rules_com_entry], but derives the stem directly from the
   lemma.  NOTE(review): unlike [generate_rules_entry_lemma_as_stem] there
   is no fallback or early exit when the stem comes back "" — confirm this
   is intended. *)
let generate_rules_com_entry_lemma_as_stem stem_sel rules lemma forms =
  let stem = Stem.generate_stem_lemma_as_stem stem_sel lemma in
  let stem_pref = Stem.cut_stem_sufix stem in
  Xlist.fold (Rules.select_not_validated lemma forms) rules
    (fun acc (orth, interp) ->
      if not (Rules.check_sufix ":com" interp) then acc
      else
        let rule = "\t" ^ generate_rule stem stem_pref orth in
        let per_interp =
          try StringMap.find acc interp with Not_found -> StringMap.empty in
        let per_interp =
          StringMap.add_inc per_interp rule (1, [lemma])
            (fun (q, ls) -> q + 1, (if q < 20 then lemma :: ls else ls)) in
        StringMap.add acc interp per_interp)
260 | + | |
261 | + | |
262 | + | |
263 | + | |
264 | + | |
265 | + | |
266 | + | |
267 | + | |
268 | + | |
0 | 269 | \ No newline at end of file |
... | ... |
guesser/rules.ml
0 → 100644
1 | +open Xstd | |
2 | +open Printf | |
3 | + | |
(* [check_prefix pat s] is true iff [s] starts with [pat]. *)
let check_prefix pat s =
  let plen = String.length pat in
  plen <= String.length s && String.sub s 0 plen = pat
8 | + | |
(* [cut_prefix pat s] drops the first [String.length pat] characters of
   [s]; yields "" whenever [s] is not strictly longer than [pat].
   Note: does not verify that [s] actually begins with [pat]. *)
let cut_prefix pat s =
  let plen = String.length pat and slen = String.length s in
  if plen >= slen then ""
  else
    try String.sub s plen (slen - plen)
    with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int plen)
14 | + | |
(* [check_sufix pat s] is true iff [s] ends with [pat]. *)
let check_sufix pat s =
  let plen = String.length pat and slen = String.length s in
  plen <= slen && String.sub s (slen - plen) plen = pat
20 | + | |
(* [cut_sufix pat s] drops the last [String.length pat] characters of [s].
   Fails when [pat] is longer than [s]; does not verify that [s] actually
   ends with [pat]. *)
let cut_sufix pat s =
  let keep = String.length s - String.length pat in
  try String.sub s 0 keep with _ -> failwith ("cut_sufix: " ^ s)
25 | + | |
26 | + | |
27 | + | |
(* Swaps the two suffix columns of each alternation triple, turning a
   lemma->form alternation table into a form->lemma one. *)
let revert_alternations l =
  Xlist.map l (fun (cls, lhs, rhs) -> cls, rhs, lhs)
30 | + | |
(* Alternation-set name -> its list of (class, lemma_suf, form_suf)
   triples, built from [Schemata.alternations]. *)
let alternation_map =
  Xlist.fold Schemata.alternations StringMap.empty
    (fun acc (name, alts) -> StringMap.add acc name alts)
33 | + | |
(* Same index as [alternation_map], but with each alternation's suffix
   columns swapped (see [revert_alternations]). *)
let rev_alternation_map =
  Xlist.fold Schemata.alternations StringMap.empty
    (fun acc (name, alts) -> StringMap.add acc name (revert_alternations alts))
36 | + | |
(* Converts a colon-split morphological tag list into an attribute-value
   list.  Fails on any unrecognised category. *)
let translate_tags = function
    ["adj"; num; cas; gen; grd] ->
      ["cat","adj"; "number",num; "case",cas; "gender",gen; "grad",grd]
  | ["adja"] -> ["cat","adja"]
  | ["adjc"] -> ["cat","adjc"]
  | ["adjp"] -> ["cat","adjp"]
  | ["subst"; num; cas; gen] ->
      ["cat","subst"; "number",num; "case",cas; "gender",gen]
  (* NOTE(review): "depr" uses key "depr" (value "subst") instead of "cat";
     [retranslate_tags] mirrors this, so the round trip stays consistent —
     confirm the asymmetry is intended. *)
  | ["depr"; num; cas; gen] ->
      ["depr","subst"; "number",num; "case",cas; "gender",gen]
  | l -> failwith ("translate_tags: " ^ String.concat ":" l)
45 | + | |
(* Inverse of [translate_tags]: turns an attribute-value list back into
   the colon-split tag list.  The attribute order must match exactly what
   [translate_tags] produces; anything else fails. *)
let retranslate_tags = function
    ["cat","adj"; "number",num; "case",cas; "gender",gen; "grad",grd] ->
      ["adj"; num; cas; gen; grd]
  | ["cat","adja"] -> ["adja"]
  | ["cat","adjc"] -> ["adjc"]
  | ["cat","adjp"] -> ["adjp"]
  | ["cat","subst"; "number",num; "case",cas; "gender",gen] ->
      ["subst"; num; cas; gen]
  | ["depr","subst"; "number",num; "case",cas; "gender",gen] ->
      ["depr"; num; cas; gen]
  | l ->
      failwith ("retranslate_tags: " ^
                String.concat " " (Xlist.map l (fun (k,v) -> k ^ "=" ^ v)))
54 | + | |
(* [expand_tags x spec] expands one tag specification into a list of
   attribute-value lists:
   - [Schemata.M tags]: a compact tag string where "|" separates variants,
     ":" separates positions and "." separates alternative values at one
     position; every combination is translated via [translate_tags];
   - [Schemata.T (k, v)]: a single fixed attribute;
   - [Schemata.A k]: attribute [k] bound to the supplied value [x]
     (the alternation class passed by the caller). *)
let expand_tags x = function
    Schemata.M tags ->
      List.flatten (Xlist.map (Str.split (Str.regexp "|") tags) (fun tags ->
        let tags = Xlist.map (Str.split (Str.regexp ":") tags) (Str.split (Str.regexp "\\.")) in
        Xlist.map (Xlist.multiply_list tags) translate_tags))
  | Schemata.T(k,v) -> [[k,v]]
  | Schemata.A k -> [[k,x]]
(* | Schemata.N -> [[]] *)
63 | + | |
(* [expand_tag_list x l] expands every tag spec in [l] (see [expand_tags])
   and returns all cross-combinations, each flattened into a single
   attribute-value list. *)
let expand_tag_list x l =
  let expanded = Xlist.map l (expand_tags x) in
  Xlist.map (Xlist.multiply_list expanded) List.flatten
66 | + | |
(* Compiles rule descriptions (alternation name, suffix, tag specs) into
   concrete rules (form_sufix, lemma_sufix, expanded tag lists), one per
   alternation triple of the named set. *)
let prepare_rules l =
  Xlist.fold l [] (fun acc (alternation_name, sufix, tags) ->
    let alternation =
      try StringMap.find alternation_map alternation_name
      with Not_found -> failwith ("prepare_rules " ^ alternation_name) in
    Xlist.fold alternation acc (fun acc (cls, lemma_suf, form_suf) ->
      (lemma_suf ^ sufix, form_suf, expand_tag_list cls tags) :: acc))
72 | + | |
(* Like [prepare_rules], but uses the reverted alternations (suffix columns
   swapped) and appends [sufix] to the replacement side rather than the
   match side. *)
let prepare_rev_rules l =
  Xlist.fold l [] (fun rules (alternation_name, sufix, tags) ->
    (* Fix: the Not_found message used to say "prepare_rules" (copy-paste),
       hiding which of the two preparation functions actually failed. *)
    let alternation =
      try StringMap.find rev_alternation_map alternation_name
      with Not_found -> failwith ("prepare_rev_rules " ^ alternation_name) in
    Xlist.fold alternation rules (fun rules (c,a,b) ->
      (a, b ^ sufix, expand_tag_list c tags) :: rules))
78 | + | |
(* Rule-set name -> compiled rules, combining the forward rule sets with
   the reverse ones in a single map. *)
let rule_map =
  let compile prepare acc (name, spec) = StringMap.add acc name (prepare spec) in
  let forward = Xlist.fold Schemata.rules StringMap.empty (compile prepare_rules) in
  Xlist.fold Schemata.rev_rules forward (compile prepare_rev_rules)
82 | + | |
(* A compiled rule is (form_sufix, lemma_sufix, tag lists). *)

(* True iff [s] ends with the rule's form suffix. *)
let is_applicable_rule (form_suf, _, _) s = check_sufix form_suf s

(* Rewrites [s] by replacing the rule's form suffix with its lemma suffix. *)
let apply_rule (form_suf, lemma_suf, _) s =
  cut_sufix form_suf s ^ lemma_suf

(* The expanded tag lists attached to a rule. *)
let get_tags (_, _, tags) = tags
89 | + | |
(* [extract_tag s rev l] looks up the first binding of key [s] in the
   attribute-value list [l] and returns its value together with the list
   minus that binding, original order preserved ([rev] accumulates the
   already-scanned pairs, reversed).  Returns ("", whole list) when [s]
   is absent. *)
let rec extract_tag s rev = function
    [] -> "", List.rev rev
  | (k, v) :: rest ->
      if k = s then v, List.rev_append rev rest
      else extract_tag s ((k, v) :: rev) rest
93 | + | |
(* [apply_rules orth] runs every schema (a pipeline of rule-set names) on
   the form [orth].  Each stage applies every applicable rule of its set
   to every intermediate candidate, prepending the rule's tags; candidates
   from all schemata are concatenated.  Finally any "suf" tag collected
   along the way is appended back onto the rewritten orth. *)
let apply_rules orth =
  let found = Xlist.fold Schemata.schemata [] (fun found schema ->
    let interps = Xlist.fold schema [orth,[]] (fun interps rule_set_name ->
      let rules = try StringMap.find rule_map rule_set_name with Not_found -> failwith ("apply_rules: " ^ rule_set_name) in
      (* One pipeline stage: rewrite every surviving candidate with every
         applicable rule; candidates with no applicable rule are dropped. *)
      Xlist.fold interps [] (fun interps (orth,tags) ->
        Xlist.fold rules interps (fun interps rule ->
          if is_applicable_rule rule orth then
            let orth = apply_rule rule orth in
            Xlist.fold (get_tags rule) interps (fun interps new_tags ->
              (orth, new_tags @ tags) :: interps)
          else interps))) in
    interps @ found) in
  Xlist.rev_map found (fun (orth,tags) ->
    let suf,tags = extract_tag "suf" [] tags in
    orth ^ suf, tags)
109 | + | |
(* [expand_tags tags] expands a compact tag string ("|" separates variants,
   ":" positions, "." alternative values) into the list of all concrete
   colon-joined tag strings; "" expands to [].
   NOTE(review): this shadows the earlier [expand_tags] that works on
   [Schemata] specs — later code in this file sees only this version. *)
let expand_tags tags =
  if tags = "" then [] else
  List.flatten (Xlist.map (Str.split (Str.regexp "|") tags) (fun tags ->
    let tags = Xlist.map (Str.split (Str.regexp ":") tags) (Str.split (Str.regexp "\\.")) in
    Xlist.map (Xlist.multiply_list tags) (String.concat ":")))
115 | + | |
(* [validate_form orth lemma interp] is true when some guessing rule maps
   the form [orth] back to [lemma].  [interp] is currently unused: the
   commented-out variant below additionally required the rules' tags to
   cover every expansion of [interp]. *)
let validate_form orth lemma interp =
(* printf "form %s %s %s%!" orth lemma interp; *)
  let found = apply_rules orth in
  (* Plain membership test: does any rule output equal the lemma? *)
  let b = Xlist.fold found false (fun b (orth,_) ->
    if orth = lemma then true else b) in
(* let found = StringSet.of_list (Xlist.fold found [] (fun found (orth,tags) ->
    if lemma = orth then (String.concat ":" (retranslate_tags tags)) :: found else found)) in
  let b = Xlist.fold (expand_tags interp) true (fun b interp ->
    if StringSet.mem found interp then b else false) in*)
(* if b then printf " validated\n%!" else printf " not validated\n%!"; *)
  b
127 | + | |
(* Strips an optional ":qualifier" from a lemma (e.g. "pies:S" -> "pies").
   Fails when more than two colon-separated segments are present. *)
let simplify_lemma s =
  match Str.split (Str.regexp ":") s with
    [lemma] -> lemma
  | [lemma; _] -> lemma
  | _ -> failwith "simplify_lemma"
133 | + | |
(* [validate_entry lemma forms] is true when every (orth, interp) form of
   the entry is validated against the simplified lemma.
   Fix: [List.for_all] short-circuits at the first failing form, where the
   previous fold-with-flag always scanned the whole list; [validate_form]
   has no side effects (its prints are commented out), so early exit is
   safe. *)
let validate_entry lemma forms =
  let lemma = simplify_lemma lemma in
  List.for_all (fun (orth, interp) -> validate_form orth lemma interp) forms
140 | + | |
(* Returns the forms of the entry that no guessing rule validates, in
   reverse of their input order. *)
let select_not_validated lemma forms =
  let lemma = simplify_lemma lemma in
  Xlist.fold forms [] (fun kept (orth, interp) ->
    if validate_form orth lemma interp then kept
    else (orth, interp) :: kept)
145 | + | |
(* All (orth, tags) results of the guessing rules on [orth] whose rewritten
   orth equals [lemma].  [interp] is accepted for interface symmetry but
   not consulted. *)
let find_tags_form orth lemma interp =
  Xlist.fold (apply_rules orth) [] (fun matched (orth, tags) ->
    if orth = lemma then (orth, tags) :: matched else matched)
151 | + | |
(* For every form of the entry, pairs it with the rule results that map it
   to the (simplified) lemma and with its original interpretation.  The
   result list is in reverse of the input order. *)
let find_tags_entry lemma forms =
  let lemma = simplify_lemma lemma in
  Xlist.fold forms [] (fun acc (orth, interp) ->
    (orth, find_tags_form orth lemma interp, interp) :: acc)
159 | + | |
(* Canonical rendering of an attribute-value list: pairs sorted, printed
   as "k=v" and joined with single spaces. *)
let string_of_tags tags =
  let sorted = List.sort compare tags in
  String.concat " " (Xlist.map sorted (fun (k, v) -> k ^ "=" ^ v))
162 | + | |
(* [select_tag tag rev l] finds the first binding of [tag] in [l] and
   returns its value together with the remaining pairs.  Unlike
   [extract_tag], the already-scanned pairs come back in REVERSED order
   (callers sort the result anyway).  Returns ("", scanned pairs) when
   [tag] is absent. *)
let rec select_tag tag rev = function
    [] -> "", rev
  | (k, v) :: rest ->
      if k = tag then v, rev @ rest
      else select_tag tag ((k, v) :: rev) rest
166 | + | |
(* [print path] dumps the compiled rules to disk: one "<name>.dic" file per
   rule set — rules grouped under "@RULES <tags>" headers by their printable
   tag signature, with the "con" attribute pulled out into a per-rule
   condition column — plus a "SCHEMATA.dic" listing the rule-set pipelines. *)
let print path =
  StringMap.iter rule_map (fun name rules ->
    File.file_out (path ^ name ^ ".dic") (fun file ->
      (* Group rules by their tag signature; "con" is kept separately so it
         can be emitted as an optional "con=..." column. *)
      let map = Xlist.fold rules StringMap.empty (fun map (a,b,tagsl) ->
        Xlist.fold tagsl map (fun map tags ->
          let con,tags = select_tag "con" [] tags in
          StringMap.add_inc map (string_of_tags tags) [a,b,con] (fun l -> (a,b,con) :: l))) in
      StringMap.iter map (fun tags l ->
        fprintf file "@RULES\t%s\n" tags;
        Xlist.iter l (fun (a,b,con) ->
          if con = "" then fprintf file "\t%s\t%s\t#\n" a b
          else fprintf file "\t%s\t%s\tcon=%s\t#\n" a b con);
        fprintf file "\n")));
  File.file_out (path ^ "SCHEMATA.dic") (fun file ->
    Xlist.iter Schemata.schemata (fun schema ->
      fprintf file "@SCHEMA\t%s\n\n" (String.concat " " schema)))
183 | + | |
0 | 184 | \ No newline at end of file |
... | ... |
guesser/schemata.ml
0 → 100644
1 | +let alternations = [ | |
2 | + "dowolne", ["", "", ""]; | |
3 | + | |
4 | + "funkcjonalnie_miekkie_iy", [ | |
5 | + "b'", "bi", "b'"; "ć", "ci", "ć"; "dź", "dzi", "dź"; "f'", "fi", "f'"; "ḿ", "mi", "ḿ"; "ń", "ni", "ń"; "ṕ", "pi", "ṕ"; "ś", "si", "ś"; "ẃ", "wi", "ẃ"; "ź", "zi", "ź"; | |
6 | + "l", "li", "l"; | |
7 | + "c", "cy", "c"; "cz", "czy", "cz"; "dz", "dzy", "dz"; "dż", "dży", "dż"; "rz", "rzy", "rz"; "sz", "szy", "sz"; "ż", "ży", "ż"; | |
8 | + "aj", "ai", "aj"; "ej", "ei", "ej"; "ij", "ii", "ij"; "oj", "oi", "oj"; "ój", "ói", "ój"; "uj", "ui", "uj"; "yj", "yi", "yj"; | |
9 | + ]; | |
10 | + "funkcjonalnie_miekkie_ae", [ | |
11 | + "b'", "bi", "b'"; "ć", "ci", "ć"; "dź", "dzi", "dź"; "f'", "fi", "f'"; "ḿ", "mi", "ḿ"; "ń", "ni", "ń"; "ṕ", "pi", "ṕ"; "ś", "si", "ś"; "ẃ", "wi", "ẃ"; "ź", "zi", "ź"; | |
12 | + "l", "l", "l"; | |
13 | + "c", "c", "c"; "cz", "cz", "cz"; "dz", "dz", "dz"; "dż", "dż", "dż"; "rz", "rz", "rz"; "sz", "sz", "sz"; "ż", "ż", "ż"; | |
14 | + "aj", "aj", "aj"; "ej", "ej", "ej"; "ij", "ij", "ij"; "oj", "oj", "oj"; "ój", "ój", "ój"; "uj", "uj", "uj"; "yj", "yj", "yj"; | |
15 | + "c->cz", "cz", "c"; | |
16 | + ]; | |
17 | + "funkcjonalnie_miekkie_wyglos", [ | |
18 | + "b'", "b", "b'"; "b'", "ąb", "ęb'"; "b'", "ób", "ob'"; | |
19 | + "ć", "ć", "ć"; "ć", "óć", "oć"; "ć", "eć", "ć"; "ć", "ieć", "ć"; | |
20 | + "dź", "dź", "dź"; "dź", "ódź", "odź"; "dź", "ądź", "ędź"; "dź", "óźdź", "oźdź"; | |
21 | + "f'", "f", "f'"; | |
22 | + "ḿ", "m", "ḿ"; | |
23 | + "ń", "ń", "ń"; "ń", "eń", "ń"; "ń", "ień", "ń"; "ń", "cień", "tń"; "ń", "dzień", "dń"; "ń", "sień", "śń"; "ń", "zień", "źń"; "ń", "dzień", "edń"; | |
24 | + "ṕ", "p", "ṕ"; | |
25 | + "ś", "ś", "ś"; "ś", "ieś", "ś"; | |
26 | + "ẃ", "w", "ẃ"; "ẃ", "ew", "ẃ"; "ẃ", "iew", "ẃ"; "ẃ", "ów", "oẃ"; | |
27 | + "ź", "ź", "ź"; "ź", "óź", "oź"; "ź", "ąź", "ęź"; | |
28 | + "l", "l", "l"; "l", "el", "l"; "l", "iel", "l"; "l", "ól", "ol"; "l", "ódl", "odl"; | |
29 | + "c", "c", "c"; "c", "ec", "c"; "c", "iec", "c"; "c", "niec", "ńc"; "c", "rzec", "rc"; "c", "siec", "śc"; "c", "ciec", "ćc"; "c", "dziec", "dc"; "c", "dziec", "dźc"; "c", "niec", "ieńc"; "c", "ziec", "źc"; | |
30 | + "cz", "cz", "cz"; "cz", "ecz", "cz"; "cz", "ócz", "ocz"; | |
31 | + "dz", "dz", "dz"; "dz", "ódz", "odz"; | |
32 | + "dż", "dż", "dż"; | |
33 | + "rz", "rz", "rz"; "rz", "erz", "rz"; "rz", "ierz", "rz"; "rz", "órz", "orz"; "rz", "ójrz", "ojrz"; | |
34 | + "sz", "sz", "sz"; | |
35 | + "ż", "ż", "ż"; "ż", "eż", "ż"; "ż", "óż", "oż"; "ż", "ąż", "ęż"; | |
36 | + "aj", "aj", "aj"; | |
37 | + "ej", "ej", "ej"; | |
38 | + "ij", "ij", "ij"; | |
39 | + "oj", "oj", "oj"; "oj", "ój", "oj"; | |
40 | + "ój", "ój", "ój"; | |
41 | + "uj", "uj", "uj"; | |
42 | + "yj", "yj", "yj"; | |
43 | + ]; | |
44 | + | |
45 | + "funkcjonalnie_twarde_y", [ | |
46 | + "b", "by", "b"; "ch", "chy", "ch"; "d", "dy", "d"; "f", "fy", "f"; "h", "hy", "h"; "ł", "ły", "ł"; "m", "my", "m"; "n", "ny", "n"; | |
47 | + "p", "py", "p"; "r", "ry", "r"; "s", "sy", "s"; "sz", "szy", "sz"; "t", "ty", "t"; "v", "vy", "v"; "w", "wy", "w"; "z", "zy", "z"; | |
48 | + "g", "gi", "g"; "k", "ki", "k"; | |
49 | + "a", "ay", "a"; "e", "ey", "e"; "o", "oy", "o"; "u", "uy", "u"; | |
50 | + ]; | |
51 | + "funkcjonalnie_twarde_e", [ | |
52 | + "b", "b", "b"; "ch", "ch", "ch"; "d", "d", "d"; "f", "f", "f"; "h", "h", "h"; "ł", "ł", "ł"; "m", "m", "m"; "n", "n", "n"; | |
53 | + "p", "p", "p"; "r", "r", "r"; "s", "s", "s"; "sz", "sz", "sz"; "t", "t", "t"; "v", "v", "v"; "w", "w", "w"; "z", "z", "z"; | |
54 | + "g", "gi", "g"; "k", "ki", "k"; | |
55 | + "a", "a", "a"; "e", "e", "e"; "o", "o", "o"; "u", "u", "u"; | |
56 | + ]; | |
57 | + "funkcjonalnie_twarde_a", [ | |
58 | + "b", "b", "b"; "ch", "ch", "ch"; "d", "d", "d"; "f", "f", "f"; "h", "h", "h"; "ł", "ł", "ł"; "m", "m", "m"; "n", "n", "n"; | |
59 | + "p", "p", "p"; "r", "r", "r"; "s", "s", "s"; "sz", "sz", "sz"; "t", "t", "t"; "v", "v", "v"; "w", "w", "w"; "z", "z", "z"; | |
60 | + "g", "g", "g"; "k", "k", "k"; | |
61 | + "a", "a", "a"; "e", "e", "e"; "o", "o", "o"; "u", "u", "u"; | |
62 | + ]; | |
63 | + "funkcjonalnie_twarde_i", [ | |
64 | + "b", "bi", "b"; "ch", "si", "ch"; "d", "dzi", "d"; "d", "edzi", "ad"; "f", "fi", "f"; "h", "zi", "h"; | |
65 | + "ł", "li", "ł"; "ł", "eli", "oł"; "ł", "śli", "sł"; "ł", "źli", "zł"; "ł", "rźli", "rzł"; | |
66 | + "m", "mi", "m"; "m", "śmi", "sm"; | |
67 | + "n", "ni", "n"; "n", "eni", "on"; "n", "eni", "ion"; "n", "śni", "sn"; "n", "źni", "zn"; | |
68 | + "p", "pi", "p"; "r", "rzy", "r"; "s", "si", "s"; "sz", "si", "sz"; | |
69 | + "t", "ci", "t"; "t", "ści", "st"; "t", "eci", "ot"; | |
70 | + "v", "vi", "v"; "w", "wi", "w"; "z", "zi", "z"; "ż", "zi", "ż"; | |
71 | + "g", "dzy", "g"; "k", "cy", "k"; | |
72 | + "a", "ai", "a"; "e", "ei", "e"; "o", "oi", "o"; "u", "ui", "u"; | |
73 | +(* "", "rzy", "er";*) | |
74 | + ]; | |
75 | + | |
76 | + "funkcjonalnie_twarde_ie", [ | |
77 | + "b", "bie", "b"; "ch", "sze", "ch"; | |
78 | + "d", "dzie", "d"; "d", "ździe", "zd"; "d", "edzie", "ad"; "d", "edzie", "od"; "d", "eździe", "azd"; | |
79 | + "f", "fie", "f"; "h", "sze", "h"; "h", "że", "h"; | |
80 | + "ł", "le", "ł"; "ł", "śle", "sł"; "ł", "źle", "zł"; "ł", "ele", "ał"; "ł", "ele", "oł"; "ł", "etle", "atł"; "ł", "lle", "łł"; | |
81 | + "m", "mie", "m"; "m", "śmie", "sm"; | |
82 | + "n", "nie", "n"; "n", "enie", "on"; "n", "śnie", "sn"; "n", "źnie", "zn"; | |
83 | + "p", "pie", "p"; | |
84 | + "r", "rze", "r"; "r", "erze", "ar"; "r", "etrze", "atr"; "r", "rze", "rr"; | |
85 | + "s", "sie", "s"; "s", "esie", "as"; | |
86 | + "t", "cie", "t"; "t", "ecie", "at"; "t", "ecie", "ot"; "t", "ście", "st"; "t", "eście", "ast"; | |
87 | + "v", "vie", "v"; "w", "wie", "w"; "z", "zie", "z"; | |
88 | + "g", "dze", "g"; "k", "ce", "k"; | |
89 | + ]; | |
90 | + | |
91 | + "funkcjonalnie_twarde_wyglos", [ | |
92 | + "b", "b", "b"; "b", "eb", "b"; "b", "ób", "ob"; "b", "ąb", "ęb"; "b", "óśb", "ośb"; "b", "óźb", "oźb"; | |
93 | + "ch","ch","ch";"ch","ech","ch"; | |
94 | + "d", "d", "d"; "d", "ed", "d"; "d", "ód", "od"; "d", "ąd", "ęd"; | |
95 | + "f", "f", "f"; | |
96 | + "h", "h", "h"; | |
97 | + "ł", "ł", "ł"; "ł", "eł", "ł"; "ł", "ieł", "ł"; "ł", "el", "oł"; "ł", "ół", "oł"; "ł", "ioł", "ł"; "ł", "rzeł", "rł"; "ł", "cieł", "tł"; "ł", "cioł", "tł"; | |
98 | + "m", "m", "m"; "m", "em", "m"; "m", "ciem", "ćm"; | |
99 | + "n", "n", "n"; "n", "en", "n"; "n", "ien", "n"; "n", "dzien", "dn"; "n", "zien", "źn"; "n", "cien", "tn"; "n", "sien", "śn"; | |
100 | + "p", "p", "p"; "p", "ep", "p"; "p", "iep", "p"; "p", "óp", "op"; | |
101 | + "r", "r", "r"; "r", "er", "r"; "r", "ier", "r"; "r", "ór", "or"; "r", "cer", "kr"; "r", "óbr", "obr"; "r", "óstr", "ostr"; | |
102 | + "s", "s", "s"; "s", "ies", "s"; | |
103 | + "sz","sz","sz";"sz","esz","sz"; | |
104 | + "t", "t", "t"; "t", "et", "t"; "t", "ót", "ot"; "t", "ąt", "ęt"; | |
105 | + "v", "v", "v"; | |
106 | + "w", "w", "w"; "w", "ew", "w"; "w", "iew", "w"; "w", "ów", "ow"; | |
107 | + "x", "x", "ks"; | |
108 | + "z", "z", "z"; "z", "ez", "z"; "z", "iez", "z"; "z", "óz", "oz"; "z", "ąz", "ęz"; | |
109 | + "g", "g", "g"; "g", "eg", "g"; "g", "óg", "og"; "g", "órg", "org"; "g", "ąg", "ęg"; | |
110 | + "k", "k", "k"; "k", "ek", "k"; "k", "ciek", "ćk"; "k", "dziek", "dźk"; "k", "niek", "ńk"; "k", "siek", "śk"; "k", "ziek", "źk"; "k", "ąk", "ęk"; | |
111 | + "a", "a", "a"; "e", "e", "e"; "o", "o", "o"; "u", "u", "u"; | |
112 | + ]; | |
113 | + | |
114 | + "funkcjonalnie_miekkie_ii", [ | |
115 | + "ai", "ai", "ai"; | |
116 | + "bi", "bi", "bi"; | |
117 | +(* "ci", "ci", "ci"; *) | |
118 | + "chi", "chi", "chi"; | |
119 | +(* "czi", "czi", "czi"; *) | |
120 | + "di", "di", "di"; | |
121 | + "dżi", "dżi", "dżi"; | |
122 | + "fi", "fi", "fi"; | |
123 | + "gi", "gi", "gi"; | |
124 | + "ki", "ki", "ki"; | |
125 | + "li", "li", "li"; | |
126 | + "mi", "mi", "mi"; | |
127 | + "ni", "ni", "ni"; | |
128 | + "pi", "pi", "pi"; | |
129 | + "qui", "qui", "qui"; | |
130 | + "ri", "ri", "ri"; | |
131 | + "ti", "ti", "ti"; | |
132 | + "vi", "vi", "vi"; | |
133 | + "wi", "wi", "wi"; | |
134 | + "xi", "xi", "xi"; | |
135 | + "cj", "cj", "cj"; | |
136 | + "czj", "czj", "czj"; | |
137 | + "sj", "sj", "sj"; | |
138 | + "szj", "szj", "szj"; | |
139 | + "zj", "zj", "zj"; | |
140 | + "żi", "żi", "żi"; | |
141 | + ]; | |
142 | + | |
143 | + "funkcjonalnie_miekkie_ii_wyglos", [ | |
144 | + "bi", "bij", "bi"; | |
145 | + "ci", "cyj", "ci"; | |
146 | + "chi", "chij", "chi"; | |
147 | +(* "czi", "czi", "czi"; *) | |
148 | + "di", "dyj", "di"; | |
149 | + "dżi", "dżij", "dżi"; | |
150 | + "fi", "fij", "fi"; | |
151 | + "gi", "gij", "gi"; | |
152 | + "ki", "kij", "ki"; | |
153 | + "li", "lij", "li"; | |
154 | + "mi", "mij", "mi"; | |
155 | + "ni", "nij", "ni"; | |
156 | + "pi", "pij", "pi"; | |
157 | + "qui", "quij", "qui"; | |
158 | + "ri", "ryj", "ri"; | |
159 | + "ti", "tyj", "ti"; | |
160 | + "vi", "vij", "vi"; | |
161 | + "wi", "wij", "wi"; | |
162 | + "cj", "cyj", "cj"; | |
163 | + "czj", "czyj", "czj"; | |
164 | + "sj", "syj", "sj"; | |
165 | + "szj", "szyj", "szj"; | |
166 | + "zj", "zyj", "zj"; | |
167 | + ]; | |
168 | + | |
169 | + "kontrakcje", [ | |
170 | + "t", "ę", "ęt"; | |
171 | + "t", "ęci", "ęt"; | |
172 | + "t", "ęci", "ążęt"; | |
173 | + "n", "ę", "on"; | |
174 | + "n", "eni", "on"; | |
175 | + "di", "dion", "di"; | |
176 | + "n", "anin", "an"; | |
177 | + "n", "o", "on"; | |
178 | + "t", "a", "at"; | |
179 | + ]; | |
180 | + | |
181 | + "kapitaliki_y", [ | |
182 | + "B", "B-y", "B"; "C", "C-i", "C"; "D", "D-y", "D"; "F", "F-y", "F"; "G", "G-i", "G"; "H", "H-y", "H"; | |
183 | + "I", "I-i", "I"; "J", "J-i", "J"; "J", "J-oty", "J"; "K", "K-i", "K"; "L", "L-i", "L"; "M", "M-y", "M"; "N", "N-y", "N"; "P", "P-y", "P"; "R", "R-y", "R"; "S", "S-y", "S"; | |
184 | + "T", "T-y", "T"; "V", "V-y", "V"; "W", "W-y", "W"; "X", "X-y", "X"; "Z", "Z-y", "Z"; "Z", "Z-ety", "Z"; | |
185 | + "z", "z-ety", "z"; "Ż", "Ż-ety", "Ż"; "f", "f-y", "f"; "m", "m-y", "m"; "r", "r-y", "r"; | |
186 | + "s", "s-y", "s"; "z", "z-y", "z"; "Ł", "Ł-y", "Ł"; "Ś", "Ś-y", "Ś"; "Ż", "Ż-y", "Ż"; "l", "l-i", "l"; | |
187 | + ]; | |
188 | + | |
189 | + "kapitaliki_a", [ | |
190 | + "B", "B-", "B"; "C", "C-", "C"; "D", "D-", "D"; "E", "E-", "E"; "F", "F-", "F"; "G", "G-", "G"; "H", "H-", "H"; "I", "I-", "I"; | |
191 | + "J", "J-", "J"; "J", "J-ot", "J"; "K", "K-", "K"; "L", "L-", "L"; "M", "M-", "M"; "N", "N-", "N"; "P", "P-", "P"; "R", "R-", "R"; "S", "S-", "S"; | |
192 | + "T", "T-", "T"; "V", "V-", "V"; "W", "W-", "W"; "X", "X-", "X"; "Z", "Z-", "Z"; "Z", "Z-et", "Z"; | |
193 | + "z", "z-et", "z"; "Ż", "Ż-et", "Ż"; "f", "f-", "f"; "l", "l-", "l"; "m", "m-", "m"; "r", "r-", "r"; | |
194 | + "s", "s-", "s"; "z", "z-", "z"; "Ł", "Ł-", "Ł"; "Ś", "Ś-", "Ś"; "Ż", "Ż-", "Ż"; | |
195 | + ]; | |
196 | + | |
197 | + "kapitaliki_e", [ | |
198 | + "B", "B-", "B"; "C", "C-", "C"; "C", "C-i", "C"; "D", "D-", "D"; "E", "E-", "E"; "F", "F-", "F"; "G", "G-i", "G"; "H", "H-", "H"; "I", "I-", "I"; | |
199 | + "J", "J-", "J"; "J", "J-ot", "J"; "K", "K-i", "K"; "L", "L-", "L"; "M", "M-", "M"; "N", "N-", "N"; "P", "P-", "P"; "R", "R-", "R"; "S", "S-", "S"; | |
200 | + "T", "T-", "T"; "V", "V-", "V"; "W", "W-", "W"; "X", "X-", "X"; "Z", "Z-", "Z"; "Z", "Z-et", "Z"; | |
201 | + "z", "z-et", "z"; "Ż", "Ż-et", "Ż"; "f", "f-", "f"; "l", "l-", "l"; "m", "m-", "m"; "r", "r-", "r"; | |
202 | + "s", "s-", "s"; "z", "z-", "z"; "Ł", "Ł-", "Ł"; "Ś", "Ś-", "Ś"; "Ż", "Ż-", "Ż"; | |
203 | + ]; | |
204 | + | |
205 | + "kapitaliki_ie", [ | |
206 | + "B", "B-ie", "B"; "D", "D-zie", "D"; "F", "F-ie", "F"; "J", "J-ocie", "J"; "M", "M-ie", "M"; "N", "N-ie", "N"; "P", "P-ie", "P"; "R", "R-ze", "R"; "S", "S-ie", "S"; | |
207 | + "T", "-cie", "T"; "V", "V-ie", "V"; "W", "W-ie", "W"; "X", "X-ie", "X"; "Z", "Z-ie", "Z"; "Z", "Z-ecie", "Z"; | |
208 | + "z", "z-ecie", "z"; "Ż", "Ż-ecie", "Ż"; "f", "f-ie", "f"; "m", "m-ie", "m"; "s", "s-ie", "s"; "r", "r-ze", "r"; | |
209 | + ]; | |
210 | + | |
211 | + "kapitaliki_wyglos", [ | |
212 | + "B", "B", "B"; "C", "C", "C"; "D", "D", "D"; "E", "E", "E"; "F", "F", "F"; "G", "G", "G"; "H", "H", "H"; "I", "I", "I"; | |
213 | + "J", "J", "J"; "K", "K", "K"; "L", "L", "L"; "M", "M", "M"; "N", "N", "N"; "P", "P", "P"; "R", "R", "R"; "S", "S", "S"; | |
214 | + "T", "T", "T"; "V", "V", "V"; "W", "W", "W"; "X", "X", "X"; "Z", "Z", "Z"; | |
215 | + "Ł", "Ł", "Ł"; "Ś", "Ś", "Ś"; "Ż", "Ż", "Ż"; | |
216 | + (*"z", "z", "z"; "f", "f", "f"; "l", "l", "l"; "m", "m", "m"; "r", "r", "r"; | |
217 | + "s", "s", "s"; "z", "z", "z";*) | |
218 | + ]; | |
219 | + | |
220 | +(* | |
221 | + "funkcjonalnie_miekkie_nowe", [ | |
222 | + "chi", "chi"; "czi", "czi"; "di", "di"; "dżi", "dżi"; "gi", "gi"; | |
223 | + "ki", "ki"; "li", "li"; "ri", "ri"; "ti", "ti"; "vi", "vi"; | |
224 | + "cj", "cj"; "czj", "czj"; "sj", "sj"; | |
225 | + "szj", "szj"; "zj", "zj"; "ui", "ui"; "ai", "ai"; | |
226 | + "żi", "żi"; (*"nj", "nj"; "lj", "lj"; "pj", "pj";*) | |
227 | + (*"dhi", "dhi";*) "xi", "xi"; (*"yi", "yi";*) (*"ři", "ři"; "şi", "şi"; *) | |
228 | + (*"í", "í";*) (*"", ""; | |
229 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; *) | |
230 | + ]; | |
231 | + | |
232 | + | |
233 | + "funkcjonalnie_miekkie_nowe_wyglos", [ | |
234 | + "chi", "chi"; | |
235 | + "czi", "czi"; | |
236 | + "di", "di"; | |
237 | +(* "dhi", "dhi"; *) | |
238 | + "dżi", "dżi"; | |
239 | + "li", "li"; | |
240 | + "ri", "ri"; | |
241 | + "ti", "ti"; | |
242 | + "vi", "vi"; | |
243 | + "xi", "xi"; | |
244 | + "yi", "yi"; | |
245 | +(* "ři", "ři"; | |
246 | + "şi", "şi"; | |
247 | + "ij", "ij"; | |
248 | + "t", "ti"; *) | |
249 | + ]; | |
250 | + | |
251 | + "samogloski", [ | |
252 | + "a", "a"; "e", "e"; "o", "o"; "u", "u"; | |
253 | + ]; | |
254 | + | |
255 | + "funkcjonalnie_twarde_nowe", [ | |
256 | + (*"dh", "dh";*) "dź", "dź"; | |
257 | + (*"rh", "rh";*) "v", "v"; (*"gh", "gh"; "nh", "nh"; *) | |
258 | + (*"q", "q";*) "x", "x"; (*"", ""; "", ""; "", ""; "", ""; "", ""; "", ""; | |
259 | + "", ""; "", ""; "", ""; "", ""; | |
260 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; | |
261 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; *) | |
262 | + ]; | |
263 | + | |
264 | + "funkcjonalnie_twarde_nowe_y", [ | |
265 | + "ay", "a"; (*"dhy", "dh"; "dźy", "dź";*) "ey", "e"; | |
266 | + "oy", "o"; (*"rhy", "rh";*) "uy", "u"; "vy", "v"; (*"ghi", "gh"; "nhy", "nh"; *) | |
267 | + (*"qy", "q";*) "xy", "x"; (*"ki", "c";*) "ai", "a"; (*"dźi", "dź";*) "ei", "e"; | |
268 | + "oi", "o"; "ui", "u"; | |
269 | + (*"", ""; "", ""; "", ""; "", ""; | |
270 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; | |
271 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; *) | |
272 | + ]; | |
273 | + | |
274 | + "funkcjonalnie_twarde_nowe_ie", [ (* FIXME: przejrzeć czy faktycznie nowe *) | |
275 | + "ksie", "x"; "dzie", "dh"; "dze", "h"; "rsze", "rh"; "rże", "rh"; "dze", "gh"; | |
276 | + "cie", "te"; "nie", "nh"; "rze", "rh"; "cie", "th"; | |
277 | +(* "cie", "tes"; | |
278 | + "cie", "the"; | |
279 | + "cie", "thes"; *) | |
280 | + "cie", "tt"; | |
281 | + "dzie", "de"; | |
282 | + "edzie", "ad"; | |
283 | +(* "fie", "ph"; *) | |
284 | +(* "fie", "phe"; *) | |
285 | + "obie", "ób"; | |
286 | + "rze", "er"; | |
287 | + "rze", "re"; | |
288 | +(* "rze", "res"; *) | |
289 | + "rze", "rre"; | |
290 | + "sie", "ce"; | |
291 | +(* "sie", "th"; *) | |
292 | + "ąbie", "ębi"; | |
293 | + "ście", "ste"; | |
294 | +(* "ście", "stes"; *) | |
295 | + "śnie", "sne"; | |
296 | + "ecie", "at"; | |
297 | + "edzie", "od"; | |
298 | + "esie", "as"; | |
299 | + "etrze", "atr"; | |
300 | + "kcie", "ct"; | |
301 | + "ole", "ół"; (* wątpliwe *) | |
302 | + "orze", "ór"; (* wątpliwe *) | |
303 | + "oździe", "ózd"; (* wątpliwe *) | |
304 | + "ębie", "ąb"; | |
305 | + "ędzie", "ąd"; | |
306 | + "ęsie", "ąs"; | |
307 | + "rze", "rs"; | |
308 | + "ele", "ał"; | |
309 | + "etle", "atł"; | |
310 | + "śmie", "sm"; | |
311 | + "vie", "v"; | |
312 | + "rzie", "rz"; | |
313 | + ]; | |
314 | + | |
315 | + "funkcjonalnie_twarde_nowe_wyglos", [ | |
316 | + "x", "ks"; | |
317 | + "x", "s"; | |
318 | +(* "dh", "dh"; | |
319 | + "gh", "gh"; | |
320 | + "nh", "nh"; | |
321 | + "q", "q"; *) | |
322 | + "v", "v"; | |
323 | +(* "rh", "rh"; *) | |
324 | + "dź", "dź"; | |
325 | + ]; | |
326 | + | |
327 | + | |
328 | + "obce_ch", [ | |
329 | +(* "u", "u"; "y", "y"; "ee", "e"; | |
330 | + "die", "di"; "pie", "pi"; "rie", "ri"; "tie", "ti"; "ne", "n"; | |
331 | + "nii", "ni"; "rii", "ri"; "oji", "oj"; "zi", "z"; | |
332 | + "gie", "g"; "kie", "k"; | |
333 | + "sze", "ch"; "cze", "c"; "rze", "r"; "esie", "os"; (*"", ""; "", ""; | |
334 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; | |
335 | + "", ""; "", ""; "", ""; "", ""; "", ""; "", ""; *)*) | |
336 | + ];*) | |
337 | + | |
338 | + "sz", [ | |
339 | + "sz", "sz", "sz"; | |
340 | + ]; | |
341 | + | |
342 | + "sz_i", [ | |
343 | + "sz", "si", "sz"; | |
344 | + ]; | |
345 | + | |
346 | + "adj_grad_sz", [ | |
347 | + "b", "b", "b"; | |
348 | + "k", "b", "bk"; | |
349 | + "k", "b", "bok"; | |
350 | + "c", "ęt", "ąc"; | |
351 | + "d", "d", "d"; | |
352 | + "d", "ed", "ad"; | |
353 | + "k", "d", "dk"; | |
354 | + "h", "h", "h"; | |
355 | + "g", "ż", "g"; | |
356 | + "k", "k", "k"; | |
357 | + "k", "k", "kk"; | |
358 | + "ł", "l", "ł"; | |
359 | + "ł", "el", "oł"; | |
360 | + "ł", "el", "ał"; | |
361 | + "k", "el", "ałk"; | |
362 | + "k", "l", "lek"; | |
363 | + "m", "m", "m"; | |
364 | + "n", "ń", "n"; | |
365 | + "n", "eń", "an"; | |
366 | + "ń", "ń", "ń"; | |
367 | + "k", "ń", "nk"; | |
368 | + "n", "eń", "on"; | |
369 | + "n", "ień", "on"; | |
370 | + "p", "p", "p"; | |
371 | + "ṕ", "p", "ṕ"; | |
372 | + "k", "p", "pk"; | |
373 | + "r", "r", "r"; | |
374 | + "r", "ędr", "ądr"; | |
375 | + "k", "r", "rok"; | |
376 | + "k", "ż", "sk"; | |
377 | + "k", "ż", "sok"; | |
378 | + "k", "ęż", "ąsk"; | |
379 | + "t", "t", "t"; | |
380 | + "k", "t", "tk"; | |
381 | + "k", "t", "ck"; | |
382 | + "w", "w", "w"; | |
383 | + "ż", "ż", "ż"; | |
384 | + "k", "ż", "żk"; | |
385 | + ]; | |
386 | + | |
387 | + "adj_grad_iejsz", [ | |
388 | + "d", "dzi", "d"; | |
389 | + "k", "ci", "k"; | |
390 | + "k", "ż", "ekk"; | |
391 | + "ł", "l", "ł"; | |
392 | + "ł", "śl", "sł"; | |
393 | + "ł", "źl", "zł"; | |
394 | + "m", "mi", "m"; | |
395 | + "n", "ni", "n"; | |
396 | + "n", "śni", "sn"; | |
397 | + "n", "źni", "zn"; | |
398 | + "ń", "ni", "ń"; | |
399 | + "r", "rz", "r"; | |
400 | + "t", "ci", "t"; | |
401 | + "t", "ści", "st"; | |
402 | + "w", "wi", "w"; | |
403 | + ]; | |
404 | + | |
405 | + "adv_grad", [ | |
406 | + "k", "c", "kk"; | |
407 | + "k", "c", "tk"; | |
408 | + "k", "ci", "k"; | |
409 | + "k", "ci", "tk"; | |
410 | + "k", "cz", "k"; | |
411 | + "k", "dz", "dk"; | |
412 | + "k", "dzi", "dk"; | |
413 | + "k", "ż", "żk"; | |
414 | + "k", "ni", "nk"; | |
415 | + "k", "bi", "bok"; | |
416 | + "n", "ieni", "on"; | |
417 | + "k", "l", "lek"; | |
418 | + "k", "rz", "rok"; | |
419 | + "k", "ęzi", "ąsk"; | |
420 | + "c", "ęc", "ąc"; | |
421 | + "k", "ż", "ekk"; | |
422 | + "g", "ż", "g"; | |
423 | + "k", "ż", "sk"; | |
424 | + "k", "ż", "sok"; | |
425 | + "cz", "cz", "cz"; | |
426 | + "ḿ", "mi", "ḿ"; | |
427 | + "ń", "ni", "ń"; | |
428 | + "ṕ", "pi", "ṕ"; | |
429 | + "ż", "ż", "ż"; | |
430 | + ]; | |
431 | + | |
432 | + "ger", [ | |
433 | + "ć", "ci", "ć"; | |
434 | + "ń", "ni", "ń"; | |
435 | + ]; | |
436 | + "ger_wyglos", [ | |
437 | + "ć", "ć", "ć"; | |
438 | + "ń", "ń", "ń"; | |
439 | + ]; | |
440 | + "pact", [ | |
441 | + "c", "c", "c"; | |
442 | + ]; | |
443 | + "ppas", [ | |
444 | + "t", "t", "t"; | |
445 | + "n", "n", "n"; | |
446 | + ]; | |
447 | + "ppas_i", [ | |
448 | + "t", "ci", "t"; | |
449 | + "n", "ni", "n"; | |
450 | +(* "y", "", "y"; *) | |
451 | + "n", "eni", "on"; | |
452 | + ]; | |
453 | + "praet", [ | |
454 | + "ł", "ł", "ł"; | |
455 | + ]; | |
456 | + "praet_i", [ | |
457 | + "ł", "li", "ł"; | |
458 | + "ł", "edli", "adł"; | |
459 | + "ł", "etli", "otł"; | |
460 | + "ł", "eśli", "osł"; | |
461 | + "ł", "eźli", "azł"; | |
462 | + "ł", "eźli", "ozł"; | |
463 | + "ł", "śli", "sł"; | |
464 | + "ł", "źli", "zł"; | |
465 | + "ł", "eli", "ał"; | |
466 | + ]; | |
467 | + "ae", [ | |
468 | + "", "a", "a"; | |
469 | + "", "e", "e"; | |
470 | + ]; | |
471 | + | |
472 | + "verb_inf_ć", [ | |
473 | + "b", "ś", "b"; | |
474 | + "d", "ś", "d"; | |
475 | + "d", "eś", "ad"; | |
476 | + "d", "eś", "od"; | |
477 | + "d", "óś", "od"; | |
478 | + "d", "ąś", "ęd"; | |
479 | + "s", "ś", "s"; | |
480 | + "s", "eś", "os"; | |
481 | + "s", "ąś", "ęs"; | |
482 | + "s", "óś", "os"; | |
483 | + "t", "eś", "ot"; | |
484 | + "z", "ź", "z"; | |
485 | + "z", "eź", "az"; | |
486 | + "z", "eź", "oz"; | |
487 | + "z", "ąź", "ęz"; | |
488 | +(* "n", "nię", "n"; *) | |
489 | +(* "ą", "óś", "osną"; | |
490 | + "ą", "ąź", "ęzną"; | |
491 | + "d", "dną", "d"; | |
492 | + "g", "gną", "g"; | |
493 | + "k", "kną", "k";*) | |
494 | + ]; | |
495 | + "verb_inf_c", [ | |
496 | + "g", "", "g"; | |
497 | + "g", "ó", "og"; | |
498 | + "g", "ą", "ęg"; | |
499 | + "k", "", "k"; | |
500 | + "k", "ą", "ęk"; | |
501 | +(* "ą", "ąc", "ęgną"; | |
502 | + "ą", "ąc", "ękną";*) | |
503 | + ]; | |
504 | + | |
505 | + "verb_łszy", [ | |
506 | + "", "ad", "ed"; | |
507 | + "", "ód", "od"; | |
508 | + "", "óg", "og"; | |
509 | + "", "ók", "ok"; | |
510 | + "", "ós", "os"; | |
511 | + "", "ót", "ot"; | |
512 | + "", "óz", "oz"; | |
513 | + "", "ąb", "ęb"; | |
514 | + "", "ąd", "ęd"; | |
515 | + "", "ąg", "ęg"; | |
516 | + "", "ąk", "ęk"; | |
517 | + "", "ąs", "ęs"; | |
518 | + "", "ązg", "ęzg"; | |
519 | + "", "ąz", "ęz"; | |
520 | + "", "b", "b"; | |
521 | + "", "d", "d"; | |
522 | + "", "g", "g"; | |
523 | + "", "h", "h"; | |
524 | + "", "k", "k"; | |
525 | + "", "p", "p"; | |
526 | + "", "s", "s"; | |
527 | + "", "t", "t"; | |
528 | + "", "z", "z"; | |
529 | + "", "ż", "ż"; | |
530 | + "", "r", "r"; | |
531 | + "", "ł", "ł"; | |
532 | + (* ruchome e *) | |
533 | + "", "ech", "ch"; | |
534 | + "", "sech", "esch"; | |
535 | + "", "szed", "esz"; | |
536 | + ]; | |
537 | + | |
538 | + "patal_j", [ | |
539 | + "", "szcz", "ść"; | |
540 | + "", "żdż", "źdź"; | |
541 | + "", "ż", "ź"; | |
542 | + "", "cz", "ć"; | |
543 | + "", "sz", "ś"; | |
544 | + "", "c", "ć"; | |
545 | + "", "n", "ń"; | |
546 | + "", "dz", "dź"; | |
547 | + ]; | |
548 | + | |
549 | + "verb_nie", [ (* wyjątki *) | |
550 | + "", "gn", "gi"; | |
551 | + "", "mn", "mi"; | |
552 | + "", "pn", "pi"; | |
553 | + "", "tn", "ci"; | |
554 | + "", "śn", "s"; | |
555 | + "", "źn", "z"; | |
556 | + "", "dm", "d"; | |
557 | + "", "jm", "j"; | |
558 | + "", "żm", "ż"; | |
559 | + "", "źm", "zi"; | |
560 | + (* ruchome e *) | |
561 | + "", "bejm", "bj"; | |
562 | + "", "dejm", "dj"; | |
563 | + "", "eczn", "cz"; | |
564 | + "", "edm", "d"; | |
565 | + "", "egn", "gi"; | |
566 | + "", "ekln", "kl"; | |
567 | + "", "emn", "mi"; | |
568 | + "", "epn", "pi"; | |
569 | + "", "espn", "spi"; | |
570 | + "", "eszczn", "szcz"; | |
571 | + "", "etn", "ci"; | |
572 | + "", "ezdm", "zd"; | |
573 | + "", "eźm", "zi"; | |
574 | + "", "eżm", "ż"; | |
575 | + "", "eżn", "ż"; | |
576 | + "", "zejm", "zj"; | |
577 | + "", "zekln", "skl"; | |
578 | + "", "zepn", "spi"; | |
579 | + "", "zetn", "ści"; | |
580 | + ]; | |
581 | + | |
582 | + "verb_nię", [ | |
583 | + "", "ś", "s"; | |
584 | + "", "ź", "z"; | |
585 | + "", "ęd", "ad"; | |
586 | + ]; | |
587 | + | |
588 | + "verb_ie", [ | |
589 | + "", "bierze", "ebr"; | |
590 | + "", "ce", "t"; | |
591 | + "", "cze", "k"; | |
592 | + "", "cze", "t"; | |
593 | + "", "dzie", "ch"; | |
594 | + "", "esze", "os"; | |
595 | + "", "jdzie", "laz"; | |
596 | + "", "szcze", "sk"; | |
597 | + "", "szcze", "st"; | |
598 | + "", "ędzie", "ad"; | |
599 | + "", "ęże", "eg"; | |
600 | + "", "ście", "s"; | |
601 | + "", "żdże", "zd"; | |
602 | + "", "żdże", "zg"; | |
603 | + "", "że", "g"; | |
604 | + "", "że", "z"; | |
605 | + "", "tanie", "t"; | |
606 | + "", "aje", "aj"; | |
607 | + "", "le", "l"; | |
608 | + "", "sze", "s"; | |
609 | + "", "owie", "w"; | |
610 | + "", "źre", "ziar"; | |
611 | + "", "orzy", "ar"; | |
612 | + "", "re", "ar"; | |
613 | + "", "rze", "ar"; | |
614 | + (* ruchome e *) | |
615 | + "", "ierze", "r"; | |
616 | + "", "pierze", "epr"; | |
617 | + "", "spierze", "zepr"; | |
618 | + "", "ściele", "esł"; | |
619 | + "", "ściele", "sł"; | |
620 | + "", "edrze", "dar"; | |
621 | + "", "emrze", "mar"; | |
622 | + "", "eprze", "par"; | |
623 | + "", "esprze", "spar"; | |
624 | + "", "etrze", "tar"; | |
625 | + "", "ewrze", "war"; | |
626 | + "", "eźre", "ziar"; | |
627 | + "", "eźrze", "ziar"; | |
628 | + "", "eżre", "żar"; | |
629 | + "", "iele", "eł"; | |
630 | + "", "zeprze", "spar"; | |
631 | + "", "zetrze", "star"; | |
632 | + ]; | |
633 | + | |
634 | + "r", [ | |
635 | + "", "r", "r"; | |
636 | + ]; | |
637 | + | |
638 | + "nieregularne", [ | |
639 | + "e", "je", "jad"; | |
640 | + "e", "re", "rz"; | |
641 | + "e", "iele", "eł"; | |
642 | + "ie", "wie", "wiedzi"; | |
643 | + "i", "śpi", "sp"; | |
644 | + "a", "ma", "mi"; | |
645 | + "eje", "reje", "r"; | |
646 | + "e", "ce", "ci"; | |
647 | + "i", "oi", ""; | |
648 | + "ie", "będzie", "b"; | |
649 | + ]; | |
650 | + | |
651 | + "verb_impt", [ | |
652 | + "", "cz", "k"; | |
653 | + "", "cz", "t"; | |
654 | + "", "dź", "ch"; | |
655 | + "", "edz", "ad"; | |
656 | + "", "edź", "od"; | |
657 | + "", "esz", "os"; | |
658 | + "", "eć", "ot"; | |
659 | + "", "eś", "os"; | |
660 | + "", "eź", "az"; | |
661 | + "", "eź", "oz"; | |
662 | + "", "gnij", "gi"; | |
663 | + "", "iel", "eł"; | |
664 | + "", "jdź", "laz"; | |
665 | + "", "mnij", "mi"; | |
666 | + "", "pnij", "pi"; | |
667 | + "", "ryj", "ar"; | |
668 | + "", "sz", "ch"; | |
669 | + "", "tnij", "ci"; | |
670 | + "", "zcz", "k"; | |
671 | + "", "zcz", "t"; | |
672 | + "", "ódź", "od"; | |
673 | + "", "órz", "or"; | |
674 | + "", "óż", "og"; | |
675 | + "", "ądź", "ęd"; | |
676 | + "", "ąś", "ęs"; | |
677 | + "", "ędź", "ad"; | |
678 | + "", "ęż", "eg"; | |
679 | + "", "ś", "s"; | |
680 | + "", "ścij", "s"; | |
681 | + "", "ślij", "sł"; | |
682 | + "", "śnij", "s"; | |
683 | + "", "śpij", "sp"; | |
684 | + "", "ź", "z"; | |
685 | + "", "źnij", "z"; | |
686 | + "", "ż", "g"; | |
687 | + "", "ż", "z"; | |
688 | + "", "żdż", "zd"; | |
689 | + "", "żdż", "zg"; | |
690 | + "", "żyj", "g"; | |
691 | + "", "z", "z"; | |
692 | + "", "laj", "l"; | |
693 | + "", "rej", "r"; | |
694 | + "", "sij", "s"; | |
695 | + "", "wij", "w"; | |
696 | + "", "rz", "r"; | |
697 | + "", "sz", "s"; | |
698 | + "", "bój", "b"; | |
699 | + "", "tój", "t"; | |
700 | + "", "bądź", "b"; | |
701 | + "", "dź", "d"; | |
702 | + (* ruchome e *) | |
703 | + "", "bejmij", "bj"; | |
704 | + "", "bierz", "ebr"; | |
705 | + "", "dejmij", "dj"; | |
706 | + "", "ecznij", "cz"; | |
707 | + "", "edmij", "d"; | |
708 | + "", "egnij", "gi"; | |
709 | + "", "eklnij", "kl"; | |
710 | + "", "emnij", "mi"; | |
711 | + "", "epnij", "pi"; | |
712 | + "", "espnij", "spi"; | |
713 | + "", "eszcznij", "szcz"; | |
714 | + "", "etnij", "ci"; | |
715 | + "", "ezdmij", "zd"; | |
716 | + "", "eź", "zi"; | |
717 | + "", "eźmij", "zi"; | |
718 | + "", "eźryj", "ziar"; | |
719 | + "", "eżmij", "ż"; | |
720 | + "", "eżnij", "ż"; | |
721 | + "", "eżryj", "żar"; | |
722 | + "", "ierz", "r"; | |
723 | + "", "pierz", "epr"; | |
724 | + "", "spierz", "zepr"; | |
725 | + "", "zejmij", "zj"; | |
726 | + "", "zeklnij", "skl"; | |
727 | + "", "zepnij", "spi"; | |
728 | + "", "zetnij", "ści"; | |
729 | + "", "ściel", "esł"; | |
730 | + "", "ściel", "sł"; | |
731 | + ]; | |
732 | + | |
733 | + "verb_ną", [ (* wyjątki *) | |
734 | + "", "gn", "gi"; | |
735 | + "", "mn", "mi"; | |
736 | + "", "pn", "pi"; | |
737 | + "", "tn", "ci"; | |
738 | + "", "dm", "d"; | |
739 | + "", "jm", "j"; | |
740 | + "", "ajm", "aj"; | |
741 | + "", "ejm", "ej"; | |
742 | + "", "ojm", "oj"; | |
743 | + "", "ujm", "uj"; | |
744 | + "", "yjm", "yj"; | |
745 | + "", "żm", "ż"; | |
746 | + (* ruchome e *) | |
747 | + "", "bejm", "bj"; | |
748 | + "", "dejm", "dj"; | |
749 | + "", "eczn", "cz"; | |
750 | + "", "edm", "d"; | |
751 | + "", "egn", "gi"; | |
752 | + "", "ekln", "kl"; | |
753 | + "", "emn", "mi"; | |
754 | + "", "epn", "pi"; | |
755 | + "", "espn", "spi"; | |
756 | + "", "eszczn", "szcz"; | |
757 | + "", "etn", "ci"; | |
758 | + "", "ezdm", "zd"; | |
759 | + "", "ezm", "zi"; | |
760 | + "", "eżm", "ż"; | |
761 | + "", "eżn", "ż"; | |
762 | + "", "zejm", "zj"; | |
763 | + "", "zekln", "skl"; | |
764 | + "", "zepn", "spi"; | |
765 | + "", "zetn", "ści"; | |
766 | + ]; | |
767 | + | |
768 | + "verb_j", [ | |
769 | + "", "ad", "ech"; | |
770 | + "", "cz", "k"; | |
771 | + "", "cz", "t"; | |
772 | + "", "c", "t"; | |
773 | + "", "ec", "ot"; | |
774 | + "", "d", "ch"; | |
775 | + "", "edz", "ad"; | |
776 | + "", "ez", "az"; | |
777 | + "", "ior", "r"; | |
778 | + "", "jd", "laz"; | |
779 | + "", "jd", "sz"; | |
780 | + "", "nid", "esz"; | |
781 | + "", "nid", "sz"; | |
782 | + "", "nijd", "esz"; | |
783 | + "", "nijd", "sz"; | |
784 | + "", "sz", "ch"; | |
785 | + "", "szcz", "sk"; | |
786 | + "", "szcz", "st"; | |
787 | + "", "ójd", "osz"; | |
788 | + "", "ąd", "ad"; | |
789 | + "", "ęd", "ad"; | |
790 | + "", "śl", "sł"; | |
791 | + "", "śpi", "sp"; | |
792 | + "", "żdż", "zd"; | |
793 | + "", "żdż", "zg"; | |
794 | + "", "ż", "g"; | |
795 | + "", "ęż", "eg"; | |
796 | + "", "ż", "z"; | |
797 | + "", "dadz", "d"; | |
798 | + "", "bi", "b"; | |
799 | + "", "mi", "m"; | |
800 | + "", "pi", "p"; | |
801 | + "", "st", "s"; | |
802 | + "", "rz", "r"; | |
803 | + "", "sz", "s"; | |
804 | + "", "b", "b"; | |
805 | + "", "d", "d"; | |
806 | + "", "g", "g"; | |
807 | + "", "aj", "aj"; | |
808 | + "", "k", "k"; | |
809 | + "", "l", "l"; | |
810 | + "", "s", "s"; | |
811 | + "", "t", "t"; | |
812 | + "", "w", "w"; | |
813 | + "", "owi", "w"; | |
814 | + "", "ow", "w"; | |
815 | + "", "z", "z"; | |
816 | + "", "cz", "cz"; | |
817 | + "", "źr", "ziar"; | |
818 | + "", "iel", "eł"; | |
819 | + "", "orz", "ar"; | |
820 | + "", "r", "ar"; | |
821 | + "", "dz", "d"; | |
822 | + (* ruchome e *) | |
823 | + "", "bior", "ebr"; | |
824 | + "", "pior", "epr"; | |
825 | + "", "spior", "zepr"; | |
826 | + "", "ściel", "esł"; | |
827 | + "", "ściel", "sł"; | |
828 | + "", "edr", "dar"; | |
829 | + "", "emr", "mar"; | |
830 | + "", "epr", "par"; | |
831 | + "", "espr", "spar"; | |
832 | + "", "etr", "tar"; | |
833 | + "", "ewr", "war"; | |
834 | + "", "eźr", "ziar"; | |
835 | + "", "eżr", "żar"; | |
836 | + "", "zepr", "spar"; | |
837 | + "", "zetr", "star"; | |
838 | + ]; | |
839 | + | |
840 | + "nieregularne_j", [ | |
841 | + "a", "ma", "mi"; | |
842 | + "aj", "maj", "mi"; | |
843 | + "ej", "rej", "r"; | |
844 | + "oj", "oj", ""; | |
845 | + "ε", "będ", "b"; | |
846 | + "e", "je", "jad"; | |
847 | + "ie", "wie", "wiedzi"; | |
848 | +(* "", "", ""; | |
849 | + "", "", "";*) | |
850 | + ]; | |
851 | + | |
852 | + "inf_e", [ | |
853 | + (* ruchome e *) | |
854 | + "", "edrz", "dar"; | |
855 | + "", "eml", "meł"; | |
856 | + "", "emrz", "mar"; | |
857 | + "", "epl", "peł"; | |
858 | + "", "eprz", "par"; | |
859 | + "", "esprz", "spar"; | |
860 | + "", "etrz", "tar"; | |
861 | + "", "ewrz", "war"; | |
862 | + "", "eźr", "ziar"; | |
863 | + "", "eźrz", "ziar"; | |
864 | + "", "eżr", "żar"; | |
865 | + "", "l", "eł"; | |
866 | + "", "r", "ar"; | |
867 | + "", "rz", "ar"; | |
868 | + "", "zeprz", "spar"; | |
869 | + "", "zetrz", "star"; | |
870 | + "", "źr", "ziar"; | |
871 | + ]; | |
872 | +(* | |
873 | + | |
874 | + "verb_ń", [ | |
875 | + "", "ń", ""; | |
876 | + ]; | |
877 | + | |
878 | + "verb_fin", [ | |
879 | + "i", "i", "i"; | |
880 | + "y", "y", "y"; | |
881 | + "y", "ędzie", "y"; | |
882 | + "e", "e", "e"; | |
883 | + "e", "i", "ie"; | |
884 | + "e", "y", "e"; | |
885 | + "e", "ma", "mie"; | |
886 | + "e", "ce", "cie"; | |
887 | + "e", "iele", "le"; | |
888 | + "e", "li", "le"; | |
889 | + "e", "miele", "emle"; | |
890 | + "e", "e", "edzie"; | |
891 | + "e", "piele", "eple"; | |
892 | + "e", "re", "rze"; | |
893 | + ]; | |
894 | + "verb_fin_ter", [ | |
895 | + "i", "i", "i"; | |
896 | + "i", "aj","ai"; | |
897 | + "i", "ej","ei"; | |
898 | + "i", "oj","oi"; | |
899 | + "i", "uj","ui"; | |
900 | + "i", "c","ci"; | |
901 | + "i", "cz","ci"; | |
902 | + "i", "dz","dzi"; | |
903 | + "i", "l","li"; | |
904 | + "i", "rż","rzi"; | |
905 | + "i", "szcz","ści"; | |
906 | + "i", "sz","si"; | |
907 | + "i", "żdż","ździ"; | |
908 | + "i", "ż","zi"; | |
909 | + "y", "cz", "czy"; | |
910 | + "y", "ż", "ży"; | |
911 | + "y", "sz", "szy"; | |
912 | + "y", "rz", "rzy"; | |
913 | + "y", "ęd", "y"; | |
914 | + "e", "sz", "sze"; | |
915 | + "e", "cz", "cze"; | |
916 | + "e", "rz", "rze"; | |
917 | + "e", "r", "re"; | |
918 | + "e", "ż", "że"; | |
919 | + "e", "r", "rze"; | |
920 | + "e", "szcz", "ście"; | |
921 | + "e", "sz", "sie"; | |
922 | + "e", "żr", "zrz"; | |
923 | + "e", "l", "le"; | |
924 | + "e", "dz", "dzie"; | |
925 | + "e", "maj", "mie"; | |
926 | + "e", "iel", "le"; | |
927 | + "e", "miel", "emle"; | |
928 | + "e", "piel", "eple"; | |
929 | + "e", "bi", "bie"; | |
930 | + "e", "mi", "mie"; | |
931 | + "e", "pi", "pie"; | |
932 | + "e", "ni", "nie"; | |
933 | + "e", "wi", "wie"; | |
934 | + "e", "si", "sie"; | |
935 | + "e", "fi", "fie"; | |
936 | + "e", "ci", "cie"; | |
937 | + "e", "zi", "zie"; | |
938 | + "e", "dzi", "dzie"; | |
939 | + "e", "n", "nie"; | |
940 | + "e", "c", "cie"; | |
941 | + "e", "j", "je"; | |
942 | + ]; | |
943 | + "verb_fin_pri", [ | |
944 | + "e", "a", "ie"; | |
945 | + "d", "e", "ad"; | |
946 | + "e", "e", "edzie"; | |
947 | + "a", "a", "a"; | |
948 | + "e", "e", "e"; | |
949 | + ]; | |
950 | + "verb_ger_n", [ | |
951 | + "a", "a", "a"; | |
952 | + "a", "ściele", "sła"; | |
953 | + "e", "e", "e"; | |
954 | + "e", "e", "ie"; | |
955 | + "e", "szcze", "ście"; | |
956 | + "e", "sze", "sie"; | |
957 | + "i", "ce", "ci"; | |
958 | + "i", "cze", "ci"; | |
959 | + "i", "dze", "dzi"; | |
960 | + "i", "rże", "rzi"; | |
961 | + "i", "szcze", "ści"; | |
962 | + "i", "sze", "si"; | |
963 | + "i", "żdże", "ździ"; | |
964 | + "i", "że", "zi"; | |
965 | + "i", "aje", "ai"; | |
966 | + "i", "eje", "ei"; | |
967 | + "i", "oje", "oi"; | |
968 | + "i", "bie", "bi"; | |
969 | + "i", "cie", "ci"; | |
970 | + "i", "fie", "fi"; | |
971 | + "i", "le", "li"; | |
972 | + "i", "mie", "mi"; | |
973 | + "i", "nie", "ni"; | |
974 | + "i", "pie", "pi"; | |
975 | + "i", "sie", "si"; | |
976 | + "i", "wie", "wi"; | |
977 | + "i", "zie", "zi"; | |
978 | + "i", "le", "li"; | |
979 | + "y", "e", "y"; | |
980 | + "k", "cze", "k"; | |
981 | + "t", "ece", "ot"; | |
982 | + "d", "edze", "ad"; | |
983 | + "d", "edze", "od"; | |
984 | + "g", "że", "g"; | |
985 | + "d", "dze", "d"; | |
986 | + "s", "esie", "os"; | |
987 | + "z", "ezie", "az"; | |
988 | + "z", "ezie", "oz"; | |
989 | + "b", "bie", "b"; | |
990 | + "s", "sie", "s"; | |
991 | + "z", "zie", "z"; | |
992 | + ]; | |
993 | + "verb_ger_c", [ | |
994 | + "ą", "ię", "ą"; | |
995 | + "ą", "ę", "ą"; | |
996 | + "ą", "śnię", "sną"; | |
997 | + "ą", "źnię", "zną"; | |
998 | + "ą", "rźnię", "rzną"; | |
999 | + "e", "ar", "re"; | |
1000 | + "e", "ar", "rze"; | |
1001 | + "e", "eł", "le"; | |
1002 | + "e", "ziar", "źre"; | |
1003 | + "i", "bi", "bi"; | |
1004 | + "i", "ni", "ni"; | |
1005 | + "i", "pi", "pi"; | |
1006 | + "i", "wi", "wi"; | |
1007 | +(* "u", "u", "u"; *) | |
1008 | + "y", "y", "y"; | |
1009 | + "g", "gnię", "g"; | |
1010 | + "k", "knię", "k"; | |
1011 | + ]; | |
1012 | + "verb_u", [ | |
1013 | + "e", "e", "e"; | |
1014 | + "i", "i", "i"; | |
1015 | + "u", "u", "u"; | |
1016 | + "y", "y", "y"; | |
1017 | + ]; | |
1018 | + | |
1019 | + "verb_'ε", [ | |
1020 | + "", "eź", "zi"; | |
1021 | + ]; | |
1022 | + | |
1023 | +*) | |
1024 | +] | |
1025 | + | |
(* Annotation attached to the output side of a guessing rule (see [rules]
   below): matching a suffix yields a list of [tags] values.
   NOTE(review): this [T] constructor shadows the [T] of type [tree]
   declared earlier in this file — confirm the shadowing is intended. *)
type tags =
    M of string           (* a complete morphosyntactic tag string,
                             e.g. M "subst:sg:gen:n1" (only seen in
                             commented-out rules in this chunk) *)
  | T of string * string  (* an attribute/value pair,
                             e.g. T("flex","y"), T("cat","adj") *)
  | A of string           (* a bare attribute name, e.g. A "con" —
                             presumably a marker whose value is supplied
                             elsewhere; TODO confirm against the rule
                             interpreter *)
1030 | + | |
1031 | +let rules = [ | |
1032 | + "KOLWIEK-SUFFIXES", [ | |
1033 | + "dowolne","żkolwiek",[T("suf","żkolwiek")]; | |
1034 | + "dowolne","żekolwiek",[T("suf","żkolwiek")]; | |
1035 | + "dowolne","śkolwiek",[T("suf","śkolwiek")]; | |
1036 | + "dowolne","kolwiek",[T("suf","kolwiek")]; | |
1037 | + "dowolne","ż",[T("suf","ż")]; | |
1038 | + "dowolne","że",[T("suf","ż")]; | |
1039 | + "dowolne","ściś",[T("suf","ściś")]; | |
1040 | + "dowolne","ciś",[T("suf","ciś")]; | |
1041 | + "dowolne","ś",[T("suf","ś")]; | |
1042 | + "dowolne","ści",[T("suf","ści")]; | |
1043 | + "dowolne","sik",[T("suf","sik")]; | |
1044 | + "dowolne","si",[T("suf","si")]; | |
1045 | + "dowolne","",[]; | |
1046 | + ]; | |
1047 | + | |
1048 | + "ADJ-FLEX", [ | |
1049 | + "funkcjonalnie_miekkie_iy", "", [T("flex","y"); A "con"; T("cat","adj")]; | |
1050 | + "funkcjonalnie_twarde_y", "", [T("flex","y"); A "con"; T("cat","adj")]; | |
1051 | + "funkcjonalnie_miekkie_iy", "ch", [T("flex","ych"); A "con"; T("cat","adj")]; | |
1052 | + "funkcjonalnie_twarde_y", "ch", [T("flex","ych"); A "con"; T("cat","adj")]; | |
1053 | + "funkcjonalnie_miekkie_iy", "m", [T("flex","ym"); A "con"; T("cat","adj")]; | |
1054 | + "funkcjonalnie_twarde_y", "m", [T("flex","ym"); A "con"; T("cat","adj")]; | |
1055 | + "funkcjonalnie_miekkie_iy", "mi", [T("flex","ymi"); A "con"; T("cat","adj")]; | |
1056 | + "funkcjonalnie_twarde_y", "mi", [T("flex","ymi"); A "con"; T("cat","adj")]; | |
1057 | + "funkcjonalnie_miekkie_ae", "e", [T("flex","e"); A "con"; T("cat","adj")]; | |
1058 | + "funkcjonalnie_twarde_e", "e", [T("flex","e"); A "con"; T("cat","adj")]; | |
1059 | + "funkcjonalnie_miekkie_ae", "ego",[T("flex","ego"); A "con"; T("cat","adj")]; | |
1060 | + "funkcjonalnie_twarde_e", "ego",[T("flex","ego"); A "con"; T("cat","adj")]; | |
1061 | + "funkcjonalnie_miekkie_ae", "ej", [T("flex","ej"); A "con"; T("cat","adj")]; | |
1062 | + "funkcjonalnie_twarde_e", "ej", [T("flex","ej"); A "con"; T("cat","adj")]; | |
1063 | + "funkcjonalnie_miekkie_ae", "emu",[T("flex","emu"); A "con"; T("cat","adj")]; | |
1064 | + "funkcjonalnie_twarde_e", "emu",[T("flex","emu"); A "con"; T("cat","adj")]; | |
1065 | + "funkcjonalnie_miekkie_ae", "a", [T("flex","a"); A "con"; T("cat","adj")]; | |
1066 | + "funkcjonalnie_twarde_a", "a", [T("flex","a"); A "con"; T("cat","adj")]; | |
1067 | + "funkcjonalnie_miekkie_ae", "ą", [T("flex","ą"); A "con"; T("cat","adj")]; | |
1068 | + "funkcjonalnie_twarde_a", "ą", [T("flex","ą"); A "con"; T("cat","adj")]; | |
1069 | + "funkcjonalnie_miekkie_ae", "o", [T("flex","o"); A "con"; T("cat","adj")]; | |
1070 | + "funkcjonalnie_twarde_a", "o", [T("flex","o"); A "con"; T("cat","adj")]; | |
1071 | + "funkcjonalnie_twarde_a", "u", [T("flex","u"); A "con"; T("cat","adj")]; | |
1072 | + "funkcjonalnie_twarde_i", "", [T("flex","i"); A "con"; T("cat","adj")]; | |
1073 | + "funkcjonalnie_miekkie_wyglos","", [T("flex","ε"); A "con"; T("cat","adj")]; | |
1074 | + "funkcjonalnie_twarde_wyglos", "", [T("flex","ε"); A "con"; T("cat","adj")]; | |
1075 | + ]; | |
1076 | + "ADJ-FLEX-GRAD", [ | |
1077 | + "sz", "y", [T("flex","y"); T("cat","adj:grad")]; | |
1078 | + "sz", "ych", [T("flex","ych"); T("cat","adj:grad")]; | |
1079 | + "sz", "ym", [T("flex","ym"); T("cat","adj:grad")]; | |
1080 | + "sz", "ymi", [T("flex","ymi"); T("cat","adj:grad")]; | |
1081 | + "sz", "e", [T("flex","e"); T("cat","adj:grad")]; | |
1082 | + "sz", "ego",[T("flex","ego"); T("cat","adj:grad")]; | |
1083 | + "sz", "ej", [T("flex","ej"); T("cat","adj:grad")]; | |
1084 | + "sz", "emu",[T("flex","emu"); T("cat","adj:grad")]; | |
1085 | + "sz", "a", [T("flex","a"); T("cat","adj:grad")]; | |
1086 | + "sz", "ą", [T("flex","ą"); T("cat","adj:grad")]; | |
1087 | + "sz_i", "", [T("flex","i"); T("cat","adj:grad")]; | |
1088 | + ]; | |
1089 | + | |
1090 | + "ADJ-GRAD", [ | |
1091 | + "adj_grad_sz", "sz", [T("grad","sz"); A "con"]; | |
1092 | + "adj_grad_iejsz", "ejsz", [T("grad","iejsz"); A "con"]; | |
1093 | + ]; | |
1094 | + | |
1095 | + "ADV-FLEX", [ | |
1096 | + "funkcjonalnie_miekkie_ae", "o", [T("flex","o"); A "con"; T("cat","adv")]; | |
1097 | + "funkcjonalnie_twarde_a", "o", [T("flex","o"); A "con"; T("cat","adv")]; | |
1098 | + "funkcjonalnie_twarde_ie", "", [T("flex","ie"); A "con"; T("cat","adv")]; | |
1099 | + "adv_grad", "ej", [T("flex","iej"); A "con"; T("cat","adv")]; | |
1100 | + "funkcjonalnie_twarde_ie", "j", [T("flex","iej"); A "con"; T("cat","adv")]; | |
1101 | + ]; | |
1102 | + | |
1103 | + "NOUN-FLEX", [ | |
1104 | + "funkcjonalnie_miekkie_iy", "", [T("flex","y1"); A "con"; T("cat","noun")]; | |
1105 | + "funkcjonalnie_twarde_y", "", [T("flex","y2"); A "con"; T("cat","noun")]; | |
1106 | + "funkcjonalnie_miekkie_ii", "", [T("flex","y3"); A "con"; T("cat","noun")]; | |
1107 | + "funkcjonalnie_miekkie_iy", "ch", [T("flex","ych1"); A "con"; T("cat","noun")]; | |
1108 | + "funkcjonalnie_twarde_y", "ch", [T("flex","ych2"); A "con"; T("cat","noun")]; | |
1109 | + "funkcjonalnie_miekkie_ii", "ch", [T("flex","ych3"); A "con"; T("cat","noun")]; | |
1110 | + "funkcjonalnie_miekkie_iy", "m", [T("flex","ym1"); A "con"; T("cat","noun")]; | |
1111 | + "funkcjonalnie_twarde_y", "m", [T("flex","ym2"); A "con"; T("cat","noun")]; | |
1112 | + "funkcjonalnie_miekkie_ii", "m", [T("flex","ym3"); A "con"; T("cat","noun")]; | |
1113 | + "funkcjonalnie_miekkie_iy", "mi", [T("flex","ymi1"); A "con"; T("cat","noun")]; | |
1114 | + "funkcjonalnie_twarde_y", "mi", [T("flex","ymi2"); A "con"; T("cat","noun")]; | |
1115 | + "funkcjonalnie_miekkie_ii", "mi", [T("flex","ymi3"); A "con"; T("cat","noun")]; | |
1116 | + "funkcjonalnie_miekkie_ae", "e", [T("flex","e1"); A "con"; T("cat","noun")]; | |
1117 | + "funkcjonalnie_twarde_e", "e", [T("flex","e2"); A "con"; T("cat","noun")]; | |
1118 | + "funkcjonalnie_miekkie_ii", "e", [T("flex","e3"); A "con"; T("cat","noun")]; | |
1119 | + "funkcjonalnie_miekkie_ae", "ego", [T("flex","ego1"); A "con"; T("cat","noun")]; | |
1120 | + "funkcjonalnie_twarde_e", "ego", [T("flex","ego2"); A "con"; T("cat","noun")]; | |
1121 | + "funkcjonalnie_miekkie_ii", "ego", [T("flex","ego3"); A "con"; T("cat","noun")]; | |
1122 | + "funkcjonalnie_miekkie_ae", "ej", [T("flex","ej1"); A "con"; T("cat","noun")]; | |
1123 | + "funkcjonalnie_twarde_e", "ej", [T("flex","ej2"); A "con"; T("cat","noun")]; | |
1124 | + "funkcjonalnie_miekkie_ae", "em", [T("flex","em1"); A "con"; T("cat","noun")]; | |
1125 | + "funkcjonalnie_twarde_e", "em", [T("flex","em2"); A "con"; T("cat","noun")]; | |
1126 | + "funkcjonalnie_miekkie_ii", "em", [T("flex","em3"); A "con"; T("cat","noun")]; | |
1127 | + "kontrakcje", "em", [T("flex","em4"); A "con"; T("cat","noun")]; | |
1128 | + "funkcjonalnie_miekkie_ae", "emu", [T("flex","emu1"); A "con"; T("cat","noun")]; | |
1129 | + "funkcjonalnie_twarde_e", "emu", [T("flex","emu2"); A "con"; T("cat","noun")]; | |
1130 | + "funkcjonalnie_miekkie_ii", "emu", [T("flex","emu3"); A "con"; T("cat","noun")]; | |
1131 | + "funkcjonalnie_miekkie_ae", "a", [T("flex","a1"); A "con"; T("cat","noun")]; | |
1132 | + "funkcjonalnie_twarde_a", "a", [T("flex","a2"); A "con"; T("cat","noun")]; | |
1133 | + "funkcjonalnie_miekkie_ii", "a", [T("flex","a3"); A "con"; T("cat","noun")]; | |
1134 | + "kontrakcje", "a", [T("flex","a4"); A "con"; T("cat","noun")]; | |
1135 | + "funkcjonalnie_miekkie_ae", "ach", [T("flex","ach1"); A "con"; T("cat","noun")]; | |
1136 | + "funkcjonalnie_twarde_a", "ach", [T("flex","ach2"); A "con"; T("cat","noun")]; | |
1137 | + "funkcjonalnie_miekkie_ii", "ach", [T("flex","ach3"); A "con"; T("cat","noun")]; | |
1138 | + "funkcjonalnie_miekkie_ae", "ami", [T("flex","ami1"); A "con"; T("cat","noun")]; | |
1139 | + "funkcjonalnie_twarde_a", "ami", [T("flex","ami2"); A "con"; T("cat","noun")]; | |
1140 | + "funkcjonalnie_miekkie_ii", "ami", [T("flex","ami3"); A "con"; T("cat","noun")]; | |
1141 | + "funkcjonalnie_miekkie_wyglos","mi", [T("flex","ami4"); A "con"; T("cat","noun")]; | |
1142 | + "funkcjonalnie_twarde_wyglos", "mi", [T("flex","ami5"); A "con"; T("cat","noun")]; | |
1143 | + "funkcjonalnie_miekkie_ae", "ą", [T("flex","ą1"); A "con"; T("cat","noun")]; | |
1144 | + "funkcjonalnie_twarde_a", "ą", [T("flex","ą2"); A "con"; T("cat","noun")]; | |
1145 | + "funkcjonalnie_miekkie_ii", "ą", [T("flex","ą3"); A "con"; T("cat","noun")]; | |
1146 | + "funkcjonalnie_miekkie_ae", "ę", [T("flex","ę1"); A "con"; T("cat","noun")]; | |
1147 | + "funkcjonalnie_twarde_a", "ę", [T("flex","ę2"); A "con"; T("cat","noun")]; | |
1148 | + "funkcjonalnie_miekkie_ii", "ę", [T("flex","ę3"); A "con"; T("cat","noun")]; | |
1149 | + "funkcjonalnie_miekkie_ae", "o", [T("flex","o1"); A "con"; T("cat","noun")]; | |
1150 | + "funkcjonalnie_twarde_a", "o", [T("flex","o2"); A "con"; T("cat","noun")]; | |
1151 | + "funkcjonalnie_miekkie_ii", "o", [T("flex","o3"); A "con"; T("cat","noun")]; | |
1152 | + "funkcjonalnie_miekkie_ae", "om", [T("flex","om1"); A "con"; T("cat","noun")]; | |
1153 | + "funkcjonalnie_twarde_a", "om", [T("flex","om2"); A "con"; T("cat","noun")]; | |
1154 | + "funkcjonalnie_miekkie_ii", "om", [T("flex","om3"); A "con"; T("cat","noun")]; | |
1155 | + "funkcjonalnie_miekkie_ae", "owi", [T("flex","owi1"); A "con"; T("cat","noun")]; | |
1156 | + "funkcjonalnie_twarde_a", "owi", [T("flex","owi2"); A "con"; T("cat","noun")]; | |
1157 | + "funkcjonalnie_miekkie_ii", "owi", [T("flex","owi3"); A "con"; T("cat","noun")]; | |
1158 | + "kontrakcje", "owi", [T("flex","owi4"); A "con"; T("cat","noun")]; | |
1159 | + "funkcjonalnie_miekkie_ae", "owie",[T("flex","owie1"); A "con"; T("cat","noun")]; | |
1160 | + "funkcjonalnie_twarde_a", "owie",[T("flex","owie2"); A "con"; T("cat","noun")]; | |
1161 | + "funkcjonalnie_miekkie_ii", "owie",[T("flex","owie3"); A "con"; T("cat","noun")]; | |
1162 | + "funkcjonalnie_miekkie_ae", "ów", [T("flex","ów1"); A "con"; T("cat","noun")]; | |
1163 | + "funkcjonalnie_twarde_a", "ów", [T("flex","ów2"); A "con"; T("cat","noun")]; | |
1164 | + "funkcjonalnie_miekkie_ii", "ów", [T("flex","ów3"); A "con"; T("cat","noun")]; | |
1165 | + "funkcjonalnie_miekkie_ae", "u", [T("flex","u1"); A "con"; T("cat","noun")]; | |
1166 | + "funkcjonalnie_twarde_a", "u", [T("flex","u2"); A "con"; T("cat","noun")]; | |
1167 | + "funkcjonalnie_miekkie_ii", "u", [T("flex","u3"); A "con"; T("cat","noun")]; | |
1168 | + "kontrakcje", "u", [T("flex","u4"); A "con"; T("cat","noun")]; | |
1169 | + "funkcjonalnie_twarde_a", "um", [T("flex","um1"); A "con"; T("cat","noun")]; | |
1170 | + "funkcjonalnie_miekkie_ii", "um", [T("flex","um2"); A "con"; T("cat","noun")]; | |
1171 | + "funkcjonalnie_twarde_a", "us", [T("flex","us"); A "con"; T("cat","noun")]; | |
1172 | + "funkcjonalnie_twarde_i", "", [T("flex","i1"); A "con"; T("cat","noun")]; | |
1173 | + "funkcjonalnie_miekkie_ii", "i", [T("flex","i2"); A "con"; T("cat","noun")]; | |
1174 | + "funkcjonalnie_twarde_ie", "", [T("flex","ie1"); A "con"; T("cat","noun")]; | |
1175 | + "kontrakcje", "ie", [T("flex","ie2"); A "con"; T("cat","noun")]; | |
1176 | + "funkcjonalnie_miekkie_wyglos","", [T("flex","ε1"); A "con"; T("cat","noun")]; | |
1177 | + "funkcjonalnie_twarde_wyglos", "", [T("flex","ε2"); A "con"; T("cat","noun")]; | |
1178 | + "funkcjonalnie_miekkie_ii_wyglos","", [T("flex","ε3"); A "con"; T("cat","noun")]; | |
1179 | + "kontrakcje", "", [T("flex","ε4"); A "con"; T("cat","noun")]; | |
1180 | + ]; | |
1181 | +(* "ε->t","",M "subst:sg:nom.acc.voc:n1"; | |
1182 | + "ci->t","a",M "subst:sg:gen:n1"; | |
1183 | + "ci->t","u",M "subst:sg:dat.loc:n1"; | |
1184 | + "ci->t","em",M "subst:sg:inst:n1";*) | |
1185 | + "NOUN-FLEX-CAP", [ | |
1186 | + "kapitaliki_y","", [T("flex","yC"); A "con"; T("cat","noun")]; | |
1187 | + "kapitaliki_e","e", [T("flex","eC"); A "con"; T("cat","noun")]; | |
1188 | + "kapitaliki_e","em", [T("flex","emC"); A "con"; T("cat","noun")]; | |
1189 | + "kapitaliki_a","ach", [T("flex","achC"); A "con"; T("cat","noun")]; | |
1190 | + "kapitaliki_a","ami", [T("flex","amiC"); A "con"; T("cat","noun")]; | |
1191 | + "kapitaliki_a","a", [T("flex","aC"); A "con"; T("cat","noun")]; | |
1192 | + "kapitaliki_a","ą", [T("flex","ąC"); A "con"; T("cat","noun")]; | |
1193 | + "kapitaliki_a","ę", [T("flex","ęC"); A "con"; T("cat","noun")]; | |
1194 | + "kapitaliki_a","o", [T("flex","oC"); A "con"; T("cat","noun")]; | |
1195 | + "kapitaliki_a","om", [T("flex","omC"); A "con"; T("cat","noun")]; | |
1196 | + "kapitaliki_a","owi", [T("flex","owiC"); A "con"; T("cat","noun")]; | |
1197 | + "kapitaliki_a","owie", [T("flex","owieC"); A "con"; T("cat","noun")]; | |
1198 | + "kapitaliki_a","ów", [T("flex","ówC"); A "con"; T("cat","noun")]; | |
1199 | + "kapitaliki_a","u", [T("flex","uC"); A "con"; T("cat","noun")]; | |
1200 | + "kapitaliki_ie","", [T("flex","ieC"); A "con"; T("cat","noun")]; | |
1201 | + "kapitaliki_wyglos","", [T("flex","εC"); A "con"; T("cat","noun")]; | |
1202 | + "kapitaliki_wyglos","A", [T("flex","AC"); A "con"; T("cat","noun")]; | |
1203 | + ]; | |
1204 | + | |
1205 | + "GER-FLEX", [ | |
1206 | + "ger", "om", [T("flex","om")]; | |
1207 | + "ger", "ami", [T("flex","ami")]; | |
1208 | + "ger", "ach", [T("flex","ach")]; | |
1209 | + "ger", "e", [T("flex","e")]; | |
1210 | + "ger", "a", [T("flex","a")]; | |
1211 | + "ger", "u", [T("flex","u")]; | |
1212 | + "ger", "em", [T("flex","em")]; | |
1213 | + "ger_wyglos", "", [T("flex","ε")]; | |
1214 | + ]; | |
1215 | + "PACT-FLEX", [ | |
1216 | + "pact", "ych", [T("flex","ych")]; | |
1217 | + "pact", "ym", [T("flex","ym")]; | |
1218 | + "pact", "ymi", [T("flex","ymi")]; | |
1219 | + "pact", "e", [T("flex","e")]; | |
1220 | + "pact", "o", [T("flex","o")]; | |
1221 | + "pact", "y", [T("flex","y")]; | |
1222 | + "pact", "ą", [T("flex","ą")]; | |
1223 | + "pact", "a", [T("flex","a")]; | |
1224 | + "pact", "ego", [T("flex","ego")]; | |
1225 | + "pact", "emu", [T("flex","emu")]; | |
1226 | + "pact", "ej", [T("flex","ej")]; | |
1227 | + "pact", "", [T("flex","ε")]; | |
1228 | + ]; | |
1229 | + "PPAS-FLEX", [ | |
1230 | + "ppas", "ych", [T("flex","ych")]; | |
1231 | + "ppas", "ym", [T("flex","ym")]; | |
1232 | + "ppas", "ymi", [T("flex","ymi")]; | |
1233 | + "ppas", "e", [T("flex","e")]; | |
1234 | + "ppas", "y", [T("flex","y")]; | |
1235 | + "ppas", "ą", [T("flex","ą")]; | |
1236 | + "ppas", "a", [T("flex","a")]; | |
1237 | + "ppas", "o", [T("flex","o")]; | |
1238 | + "ppas", "ego", [T("flex","ego")]; | |
1239 | + "ppas", "emu", [T("flex","emu")]; | |
1240 | + "ppas", "ej", [T("flex","ej")]; | |
1241 | + "ppas_i", "", [T("flex","i")]; | |
1242 | + ]; | |
1243 | + "PRAET-FLEX", [ | |
1244 | + "praet", "", [T("flex","ε")]; | |
1245 | + "praet", "em", [T("flex","em")]; | |
1246 | + "praet", "eś", [T("flex","eś")]; | |
1247 | + "praet", "a", [T("flex","a")]; | |
1248 | + "praet", "am", [T("flex","am")]; | |
1249 | + "praet", "aś", [T("flex","aś")]; | |
1250 | + "praet", "o", [T("flex","o")]; | |
1251 | + "praet", "om", [T("flex","om")]; | |
1252 | + "praet", "oś", [T("flex","oś")]; | |
1253 | + "praet", "y", [T("flex","y")]; | |
1254 | + "praet", "yśmy", [T("flex","yśmy")]; | |
1255 | + "praet", "yście", [T("flex","yście")]; | |
1256 | + "praet", "ego", [T("flex","ego")]; | |
1257 | + "praet", "emu", [T("flex","emu")]; | |
1258 | + "praet_i", "", [T("flex","i")]; | |
1259 | + "praet_i", "śmy", [T("flex","iśmy")]; | |
1260 | + "praet_i", "ście", [T("flex","iście")]; | |
1261 | + ]; | |
1262 | + "FIN-FLEX", [ | |
1263 | + "dowolne", "", [T("flex","ε")]; | |
1264 | + "dowolne", "my", [T("flex","my")]; | |
1265 | + "dowolne", "cie", [T("flex","cie")]; | |
1266 | + "dowolne", "sz", [T("flex","sz")]; | |
1267 | + ]; | |
1268 | + "FIN-FLEX-J", [ | |
1269 | + "ae", "m", [T("flex","m")]; | |
1270 | + "dowolne", "ą", [T("flex","ą")]; | |
1271 | + "dowolne", "ę", [T("flex","ę")]; | |
1272 | + ]; | |
1273 | + | |
1274 | + "VERB-FLEX2", [ | |
1275 | + "verb_inf_ć", "ć", [T("flex2","ć")]; | |
1276 | + "verb_inf_c", "c", [T("flex2","c")]; | |
1277 | + "dowolne", "ć", [T("flex2","ć")]; | |
1278 | +(* "eai", "", [T("flex2","ε?")]; *) | |
1279 | + "dowolne", "", [T("flex2","ε")]; | |
1280 | + "dowolne", "t", [T("flex2","t")]; | |
1281 | + "dowolne", "wszy",[T("flex2","wszy")]; | |
1282 | + "dowolne", "ł", [T("flex2","ł")]; | |
1283 | + "verb_łszy", "ł", [T("flex2","ł")]; | |
1284 | + "verb_łszy", "łszy",[T("flex2","łszy")]; | |
1285 | + ]; | |
1286 | + "VERB-FLEX2-J", [ | |
1287 | + "dowolne", "", [T("flex2","ε")]; | |
1288 | + "dowolne", "ń", [T("flex2","ń")]; | |
1289 | + "dowolne", "n", [T("flex2","n")]; | |
1290 | + "dowolne", "ąc", [T("flex2","ąc")]; | |
1291 | + ]; | |
1292 | + | |
1293 | + "VERB-GROUP-SUFIX", [ | |
1294 | + "dowolne","a",[T("group","a"); T("cat","verb")]; | |
1295 | +(* "dowolne","owa",[T("group","owa"); T("cat","verb")]; | |
1296 | + "dowolne","iwa",[T("group","iwa"); T("cat","verb")]; | |
1297 | + "dowolne","ywa",[T("group","ywa"); T("cat","verb")]; | |
1298 | + "dowolne","awa",[T("group","awa"); T("cat","verb")]; | |
1299 | + "dowolne","owywa",[T("group","owywa"); T("cat","verb")];*) | |
1300 | + "dowolne","u",[T("group","u"); T("cat","verb")]; | |
1301 | + "funkcjonalnie_twarde_y","",[T("group","y"); T("cat","verb")]; | |
1302 | + | |
1303 | + "dowolne","uje",[T("group","uje"); T("cat","verb")]; | |
1304 | + "dowolne","eje",[T("group","eje"); T("cat","verb")]; | |
1305 | + "dowolne","aje",[T("group","aje"); T("cat","verb")]; | |
1306 | +(* "dowolne","owuje",[T("group","owuje"); T("cat","verb")]; *) | |
1307 | + "funkcjonalnie_twarde_y","je",[T("group","yje"); T("cat","verb")]; | |
1308 | + | |
1309 | + "dowolne","uj",[T("group","uj"); T("cat","verb")]; | |
1310 | + "dowolne","ej",[T("group","ej"); T("cat","verb")]; | |
1311 | + "dowolne","aj",[T("group","aj"); T("cat","verb")]; | |
1312 | +(* "dowolne","owuj",[T("group","owuj"); T("cat","verb")]; *) | |
1313 | + "funkcjonalnie_twarde_y","j",[T("group","yj"); T("cat","verb")]; | |
1314 | + ]; | |
1315 | + | |
1316 | + "VERB-GROUP-PATAL", [ | |
1317 | + "funkcjonalnie_miekkie_iy", "", [T("group","y"); T("cat","verb")]; | |
1318 | + "funkcjonalnie_miekkie_ae", "e", [T("group","e"); T("cat","verb")]; | |
1319 | + "funkcjonalnie_miekkie_ae", "eje",[T("group","eje"); T("cat","verb")]; | |
1320 | + "funkcjonalnie_miekkie_iy", "je", [T("group","yje"); T("cat","verb")]; | |
1321 | + "funkcjonalnie_miekkie_ae", "ej", [T("group","ej"); T("cat","verb")]; | |
1322 | + "funkcjonalnie_miekkie_iy", "j", [T("group","yj"); T("cat","verb")]; | |
1323 | + "funkcjonalnie_miekkie_wyglos","", [T("group","ε"); T("cat","verb")]; | |
1324 | + "funkcjonalnie_miekkie_ae", "a", [T("group","a"); T("cat","verb")]; | |
1325 | + ]; | |
1326 | + | |
1327 | + "VERB-GROUP-J-PATAL", [ | |
1328 | + "funkcjonalnie_miekkie_ae", "", [T("group","J"); T("cat","verb")]; | |
1329 | + "funkcjonalnie_miekkie_iy", "j", [T("group","j"); T("cat","verb")]; | |
1330 | + "funkcjonalnie_miekkie_ae", "ej",[T("group","ej"); T("cat","verb")]; | |
1331 | + "patal_j", "", [T("group","J"); T("cat","verb")]; | |
1332 | + "funkcjonalnie_miekkie_ae", "e", [T("group","Je"); T("cat","verb")]; | |
1333 | + "patal_j", "e", [T("group","Je"); T("cat","verb")]; | |
1334 | + "funkcjonalnie_miekkie_ae", "a", [T("group","Ja"); T("cat","verb")]; | |
1335 | + "funkcjonalnie_miekkie_ae", "o", [T("group","Jo"); T("cat","verb")]; | |
1336 | + "patal_j", "o", [T("group","Jo"); T("cat","verb")]; | |
1337 | + ]; | |
1338 | + | |
1339 | + "VERB-GROUP-NĄ", [ | |
1340 | + "dowolne","ną",[T("group","ną"); T("cat","verb")]; | |
1341 | + "dowolne","ą",[T("group","ą"); T("cat","verb")]; | |
1342 | + "dowolne","nie",[T("group","nie"); T("cat","verb")]; | |
1343 | + "verb_nie","ie",[T("group","nie"); T("cat","verb")]; | |
1344 | + "dowolne","nię",[T("group","nię"); T("cat","verb")]; | |
1345 | + "verb_nię","nię",[T("group","nię"); T("cat","verb")]; | |
1346 | + "dowolne","ę",[T("group","ę"); T("cat","verb")]; | |
1347 | + "dowolne","nę",[T("group","nę"); T("cat","verb")]; | |
1348 | + "dowolne","",[T("group","ε"); T("cat","verb")]; | |
1349 | + "funkcjonalnie_twarde_ie","",[T("group","ie"); T("cat","verb")]; | |
1350 | + "verb_ie","",[T("group","ie"); T("cat","verb")]; | |
1351 | + "r","e",[T("group","e"); T("cat","verb")]; | |
1352 | + "nieregularne","",[A "group"; T("cat","verb")]; | |
1353 | + "dowolne","ń",[T("group","ń"); T("cat","verb")]; | |
1354 | + "dowolne","nij",[T("group","nij"); T("cat","verb")]; | |
1355 | + "dowolne","mij",[T("group","mij"); T("cat","verb")]; | |
1356 | + "verb_impt","",[T("group","ε"); T("cat","verb")]; | |
1357 | +(* "dowolne","nąć",[T("group","ną"); T("flex2","ć"); T("cat","verb")]; | |
1358 | + "dowolne","ąć",[T("group","ą"); T("flex2","ć"); T("cat","verb")]; | |
1359 | + "dowolne","nie",[T("group","nie"); T("flex2","ε"); T("cat","verb")]; | |
1360 | + "verb_nie","ie",[T("group","nie"); T("flex2","ε"); T("cat","verb")]; | |
1361 | + "dowolne","nięć",[T("group","nię"); T("flex2","ć"); T("cat","verb")]; | |
1362 | + "verb_nię","nięć",[T("group","nię"); T("flex2","ć"); T("cat","verb")]; | |
1363 | + "dowolne","ęć",[T("group","ę"); T("flex2","ć"); T("cat","verb")]; | |
1364 | + "dowolne","nięt",[T("group","nię"); T("flex2","t"); T("cat","verb")]; | |
1365 | + "verb_nię","nięt",[T("group","nię"); T("flex2","t"); T("cat","verb")]; | |
1366 | + "dowolne","ęt",[T("group","ę"); T("flex2","t"); T("cat","verb")]; | |
1367 | + "dowolne","nij",[T("group","ni"); T("flex2","j"); T("cat","verb")]; | |
1368 | + "verb_nie","ij",[T("group","ni"); T("flex2","j"); T("cat","verb")]; | |
1369 | + "dowolne","nąwszy",[T("group","ną"); T("flex2","wszy"); T("cat","verb")]; | |
1370 | + "dowolne","ąwszy",[T("group","ą"); T("flex2","wszy"); T("cat","verb")]; | |
1371 | + "dowolne","nęł",[T("group","nę"); T("flex2","ł"); T("cat","verb")]; | |
1372 | + "dowolne","ęł",[T("group","ę"); T("flex2","ł"); T("cat","verb")]; *) | |
1373 | + ]; | |
1374 | + | |
1375 | + "VERB-GROUP-J-NĄ", [ | |
1376 | + "dowolne","n",[T("group","n"); T("cat","verb")]; | |
1377 | + "verb_ną","",[T("group","n"); T("cat","verb")]; | |
1378 | + "verb_j","",[T("group","ε"); T("cat","verb")]; | |
1379 | + "nieregularne_j","",[A "group"; T("cat","verb")]; | |
1380 | + "verb_j","o",[T("group","o"); T("cat","verb")]; | |
1381 | + "funkcjonalnie_twarde_i","o",[T("group","io"); T("cat","verb")]; | |
1382 | + "verb_j","e",[T("group","e"); T("cat","verb")]; | |
1383 | + "funkcjonalnie_twarde_i","e",[T("group","ie"); T("cat","verb")]; | |
1384 | +(* "dowolne","ną",[T("group","n"); T("flex2","ε"); T("flex","ą"); T("cat","verb")]; | |
1385 | + "verb_ną","ą",[T("group","n"); T("flex2","ε"); T("flex","ą"); T("cat","verb")]; | |
1386 | + "dowolne","nę",[T("group","n"); T("flex2","ε"); T("flex","ę"); T("cat","verb")]; | |
1387 | + "verb_ną","ę",[T("group","n"); T("flex2","ε"); T("flex","ę"); T("cat","verb")]; | |
1388 | + "dowolne","nąc",[T("group","n"); T("flex2","ąc"); T("cat","verb")]; | |
1389 | + "verb_ną","ąc",[T("group","n"); T("flex2","ąc"); T("cat","verb")]; | |
1390 | + "verb_j","ą",[T("group","ε"); T("flex2","ą"); T("cat","verb")]; | |
1391 | + "verb_j","ę",[T("group","ε"); T("flex2","ę"); T("cat","verb")]; | |
1392 | + "verb_j","ąc",[T("group","ε"); T("flex2","ąc"); T("cat","verb")]; *) | |
1393 | + ]; | |
1394 | + | |
1395 | + | |
1396 | + ] | |
1397 | + | |
(* Lemma-side ("reverse") rule tables, keyed by rule-set name (the names are
   referenced from [schemata] below).  Each rule is a triple
     (class, suffix, tags)
   where [class] presumably names a character/stem class defined elsewhere in
   this file (e.g. "dowolne" = "any", "funkcjonalnie_miekkie" = "functionally
   soft") — TODO confirm against the class definitions; [suffix] is the
   literal string matched/attached at the lemma end ("" for the empty suffix);
   and [tags] assigns feature values via the [T(feature, value)] constructor
   ("ε" marks an empty realization).  NOTE(review): [T] here takes a pair, so
   it is not the [tree] constructor from the top of the file — confirm which
   type it belongs to. *)
let rev_rules = [
  (* Adjective lemmas. *)
  "ADJ-LEMMA", [
    "funkcjonalnie_miekkie_iy","",[T("lemma","y")];
    "funkcjonalnie_twarde_y","",[T("lemma","y")];
    "funkcjonalnie_miekkie_wyglos","",[T("lemma","ε")];
    "funkcjonalnie_twarde_wyglos","",[T("lemma","ε")];
    ];
  (* Adverb lemmas. *)
  "ADV-LEMMA", [
    "funkcjonalnie_miekkie_ae", "o", [T("lemma","o")];
    "funkcjonalnie_twarde_a", "o", [T("lemma","o")];
    "funkcjonalnie_twarde_ie", "", [T("lemma","ie")];
    ];
  (* Noun lemmas: endings -y/-e/-a/-o/-um/-us or bare stem ("ε"). *)
  "NOUN-LEMMA", [
    "funkcjonalnie_miekkie_iy", "", [T("lemma","y")];
    "funkcjonalnie_twarde_y", "", [T("lemma","y")];
    "funkcjonalnie_miekkie_ii", "", [T("lemma","y")];
    "funkcjonalnie_miekkie_ae", "e", [T("lemma","e")];
    "funkcjonalnie_twarde_e", "e", [T("lemma","e")];
    "funkcjonalnie_miekkie_ii", "e", [T("lemma","e")];
    "funkcjonalnie_miekkie_ae", "a", [T("lemma","a")];
    "funkcjonalnie_twarde_a", "a", [T("lemma","a")];
    "funkcjonalnie_miekkie_ii", "a", [T("lemma","a")];
    "funkcjonalnie_miekkie_ae", "o", [T("lemma","o")];
    "funkcjonalnie_twarde_a", "o", [T("lemma","o")];
    "funkcjonalnie_miekkie_ii", "o", [T("lemma","o")];
    "funkcjonalnie_twarde_a", "um", [T("lemma","um")];
    "funkcjonalnie_miekkie_ii", "um", [T("lemma","um")];
    "funkcjonalnie_twarde_a", "us", [T("lemma","us")];
    "funkcjonalnie_miekkie_wyglos","", [T("lemma","ε")];
    "funkcjonalnie_twarde_wyglos", "", [T("lemma","ε")];
    "kontrakcje", "", [T("lemma","ε")];
    ];
  (* Noun lemmas written in capitals (acronyms). *)
  "NOUN-LEMMA-CAP", [
    "kapitaliki_wyglos", "", [T("lemma","ε")];
    "kapitaliki_wyglos", "A", [T("lemma","A")];
    ];
  (* Verb lemmas for the palatalized conjugation group. *)
  "VERB-LEMMA-PATAL", [
    "funkcjonalnie_miekkie_iy","ć",[T("lemma","ć")];
    "funkcjonalnie_miekkie_ae","eć",[T("lemma","eć")];
    ];
  (* Verb lemmas built with derivational infinitive suffixes (-ować, -ywać, ...). *)
  "VERB-LEMMA-SUFIX", [
    "dowolne","ować",[T("lemma","ować")];
    "dowolne","ywać",[T("lemma","ywać")];
    "dowolne","iwać",[T("lemma","iwać")];
    "dowolne","awać",[T("lemma","awać")];
    "dowolne","owywać",[T("lemma","owywać")];
    "dowolne","ać",[T("lemma","ać")];
    "dowolne","uć",[T("lemma","uć")];
    "funkcjonalnie_twarde_y","ć",[T("lemma","yć")];
    ];
  (* Verb lemmas for the -ną-/-ą- conjugation group. *)
  "VERB-LEMMA-NĄ", [
    "dowolne","nąć",[T("lemma","nąć")];
    "dowolne","ąć",[T("lemma","ąć")];
    "verb_inf_ć","ć",[T("lemma","ć")];
    "verb_inf_c","c",[T("lemma","c")];
    "dowolne","ać",[T("lemma","ać")];
    "dowolne","eć",[T("lemma","eć")];
    "dowolne","yć",[T("lemma","yć")];
    "inf_e","eć",[T("lemma","eć")];
(*  "pleć_mleć_lemma","eć",[T("lemma","eć")]; *)
    ];
  ]
1460 | + | |
(* Rule-set pipelines: each schema is the ordered list of rule-table names
   (defined in the rule tables above) that together analyse one word shape,
   from inflectional ending down to the lemma rule.  Only the two noun
   schemata are currently enabled; the adjective, adverb and all verb
   pipelines are commented out — presumably work in progress. *)
let schemata = [
(* ["KOLWIEK-SUFFIXES";"ADJ-FLEX";"ADJ-LEMMA"];
  ["ADJ-FLEX-GRAD";"ADJ-GRAD";"ADJ-LEMMA"];
  ["ADV-FLEX";"ADV-LEMMA"];*)
  ["NOUN-FLEX";"NOUN-LEMMA"];
  ["NOUN-FLEX-CAP";"NOUN-LEMMA-CAP"];
(* ["FIN-FLEX"; "VERB-FLEX2";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["GER-FLEX"; "VERB-FLEX2";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["PPAS-FLEX"; "VERB-FLEX2";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["PRAET-FLEX";"VERB-FLEX2";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["FIN-FLEX-J";"VERB-FLEX2-J";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["GER-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["PACT-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["PPAS-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-SUFIX";"VERB-LEMMA-SUFIX"];
  ["FIN-FLEX"; "VERB-FLEX2";"VERB-GROUP-PATAL";"VERB-LEMMA-PATAL"];
  ["GER-FLEX"; "VERB-FLEX2";"VERB-GROUP-PATAL";"VERB-LEMMA-PATAL"];
  ["PPAS-FLEX"; "VERB-FLEX2";"VERB-GROUP-PATAL";"VERB-LEMMA-PATAL"];
  ["PRAET-FLEX";"VERB-FLEX2";"VERB-GROUP-PATAL";"VERB-LEMMA-PATAL"];
  ["FIN-FLEX-J";"VERB-FLEX2-J";"VERB-GROUP-J-PATAL";"VERB-LEMMA-PATAL"];
  ["GER-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-J-PATAL";"VERB-LEMMA-PATAL"];
  ["PACT-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-J-PATAL";"VERB-LEMMA-PATAL"];
  ["PPAS-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-J-PATAL";"VERB-LEMMA-PATAL"];
  ["FIN-FLEX"; "VERB-FLEX2";"VERB-GROUP-NĄ";"VERB-LEMMA-NĄ"];
  ["GER-FLEX"; "VERB-FLEX2";"VERB-GROUP-NĄ";"VERB-LEMMA-NĄ"];
  ["PPAS-FLEX"; "VERB-FLEX2";"VERB-GROUP-NĄ";"VERB-LEMMA-NĄ"];
  ["PRAET-FLEX";"VERB-FLEX2";"VERB-GROUP-NĄ";"VERB-LEMMA-NĄ"];
  ["FIN-FLEX-J";"VERB-FLEX2-J";"VERB-GROUP-J-NĄ";"VERB-LEMMA-NĄ"];
  ["GER-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-J-NĄ";"VERB-LEMMA-NĄ"];
  ["PACT-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-J-NĄ";"VERB-LEMMA-NĄ"];
  ["PPAS-FLEX"; "VERB-FLEX2-J";"VERB-GROUP-J-NĄ";"VERB-LEMMA-NĄ"]; *)

(* ["NOUN-FLEX-GENERAL";"NOUN-LEMMA-GENERAL"];
  ["NOUN-FLEX-ADJ";"NOUN-LEMMA-ADJ"];
  ["NOUN-FLEX-A";"NOUN-LEMMA-A"];
  ["NOUN-FLEX-F-WYGŁOS";"NOUN-LEMMA-F-WYGŁOS"];
  ["NOUN-FLEX-N1";"NOUN-LEMMA-N1"];
  ["NOUN-FLEX-N2";"NOUN-LEMMA-N2"];*)
  ]
... | ... |
guesser/stem.ml
0 → 100644
1 | +open Xstd | |
2 | +open Printf | |
3 | + | |
(* Stem-selection rules for adjectives, as
   (priority, morphosyntactic tag, (suffix to strip, replacement)) —
   this triple shape is what [prepare_stem_sel] destructures below.
   Here: the stem of an adjective is its sg fem nominative form minus "a". *)
let adj_stem_sel = [
  1,"adj:sg:nom.voc:f:pos", ("a","");
  1,"adj:sg:nom:f:pos", ("a","");
  ]
8 | + | |
(* Stem-selection rules for nouns, instantiated for every gender tag, as
   (priority, morphosyntactic tag, (suffix to strip, replacement)).
   Lower priority number wins (see the [min] merge in [generate_stem]).
   The pl:loc/pl:dat forms are preferred stem sources; sg:gen forms are a
   lower-priority fallback.  NOTE(review): pairs like ("kich","k") strip the
   ending while restoring the stem-final consonant — presumably undoing
   palatalization spelling; confirm against the Rules module. *)
let noun_stem_sel =
  List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f";"p1";"p2";"p3"] (fun gender -> [
    1,"subst:pl:loc:" ^ gender, ("’ach","");
    1,"subst:pl:loc:" ^ gender, ("-ach","");
    1,"subst:pl:loc:" ^ gender, ("-etach","");
    1,"subst:pl:loc:" ^ gender, ("-otach","");
    2,"subst:pl:dat:" ^ gender, ("om","");
    2,"subst:pl:loc:" ^ gender, ("ach","");
    2,"subst:pl:loc:" ^ gender, ("ych","");
    2,"subst:pl:loc:" ^ gender, ("bich","bi");
    2,"subst:pl:loc:" ^ gender, ("cich","ci");
    2,"subst:pl:loc:" ^ gender, ("dzich","dzi");
    2,"subst:pl:loc:" ^ gender, ("fich","fi");
    2,"subst:pl:loc:" ^ gender, ("mich","mi");
    2,"subst:pl:loc:" ^ gender, ("nich","ni");
    2,"subst:pl:loc:" ^ gender, ("pich","pi");
    2,"subst:pl:loc:" ^ gender, ("sich","si");
    2,"subst:pl:loc:" ^ gender, ("wich","wi");
    2,"subst:pl:loc:" ^ gender, ("zich","zi");
    2,"subst:pl:loc:" ^ gender, ("kich","k");
    2,"subst:pl:loc:" ^ gender, ("gich","g");
    2,"subst:pl:loc:" ^ gender, ("lich","l");
    2,"subst:pl:loc:" ^ gender, ("żich","żi");
    3,"subst:sg:gen:" ^ gender, ("kiego","k");
    3,"subst:sg:gen:" ^ gender, ("ojego","oj");
    3,"subst:sg:gen:" ^ gender, ("nego","n");
    3,"subst:sg:gen:" ^ gender, ("tego","t");
    3,"subst:sg:gen:" ^ gender, ("wego","w");
    3,"subst:sg:gen:" ^ gender, ("siego","si");
    3,"subst:sg:gen:" ^ gender, ("ojej","oj");
  ])) @
  (* Gender-specific exceptions appended outside the per-gender expansion. *)
  [3,"subst:pl:inst:p1", ("wem","w");
   3,"subst:pl:nom:m1", ("owie","");
   4,"subst:pl:gen:p1", ("oich","oj");
  ]
44 | + | |
(* Stem-selection rules for verbs, instantiated for every aspect tag, as
   (priority, morphosyntactic tag, (suffix to strip, replacement)).
   The sg fem past-tense form (praet:sg:f) is the preferred stem source;
   infinitive endings -ieć/-eć are lower-priority fallbacks.
   Fix: the ("zła","z") rule was listed twice; the duplicate is removed.
   This is behavior-preserving — equal-priority duplicates were merged away
   by the [min] in [prepare_stem_sel]/[generate_stem] anyway. *)
let verb_stem_sel2 =
  List.flatten (Xlist.map ["imperf";"perf";"imperf.perf"] (fun aspect -> [
    4,"praet:sg:f:" ^ aspect, ("kła","k");
    4,"praet:sg:f:" ^ aspect, ("gła","g");
    4,"praet:sg:f:" ^ aspect, ("zła","z");
    4,"praet:sg:f:" ^ aspect, ("sła","s");
    4,"praet:sg:f:" ^ aspect, ("dła","d");
    4,"praet:sg:f:" ^ aspect, ("tła","t");
    4,"praet:sg:f:" ^ aspect, ("bła","b");
    4,"praet:sg:f:" ^ aspect, ("łła","ł");
    4,"praet:sg:f:" ^ aspect, ("rła","r");
    5,"inf:" ^ aspect, ("ieć","");
    6,"inf:" ^ aspect, ("eć","");
(*  3,"ppas:sg:nom.voc:m1.m2.m3:" ^ aspect ^ ":aff", ("ty",""); *)
(*  3,"praaaet:sg:f:" ^ aspect, ("zła","z"); *)
  ]))
62 | + | |
(* Index a flat rule list by its morphosyntactic tag: the result maps each
   tag to the list of (pattern, replacement, priority) entries declared for
   it.  Entry order within a bucket is the reverse of declaration order. *)
let prepare_stem_sel stem_sel =
  Xlist.fold stem_sel StringMap.empty (fun index (prio, tags, (pat, repl)) ->
    let entry = pat, repl, prio in
    StringMap.add_inc index tags [entry] (fun entries -> entry :: entries))
66 | + | |
(* Shadow the raw rule lists with their indexed form:
   tag -> [(pattern, replacement, priority); ...], ready for lookup in
   [generate_stem].  (adv_stem_sel / verb_stem_sel below stay flat because
   they are applied to the lemma directly, not selected by tag.) *)
let adj_stem_sel = prepare_stem_sel adj_stem_sel
let noun_stem_sel = prepare_stem_sel noun_stem_sel
let verb_stem_sel2 = prepare_stem_sel verb_stem_sel2
70 | + | |
(* Stem rules applied directly to the lemma (see
   [generate_stem_lemma_as_stem]): flat (pattern, replacement, priority)
   triples — the same entry shape [prepare_stem_sel] builds for the
   tag-indexed tables.  For adverbs the lemma ending -o/-(i)e is stripped,
   restoring the underlying stem consonant (e.g. "dze" -> "g"). *)
let adv_stem_sel = [
  "o","",1;
  "wie","w",1;
  "nie","n",1;
  "dze","g",1;
  "le","ł",1;
  "cie","t",1;
  "dzie","d",1;
  "mie","m",1;
  "rze","r",1;
  "ce","k",1;
  ]
83 | + | |
(* Stem rules applied directly to the verb lemma (infinitive): flat
   (pattern, replacement, priority) triples.  Longer/derivational endings
   (-ować, -ywać, ...) get priority 1; shorter, more ambiguous endings get
   larger (weaker) priorities.  The commented entries were presumably
   disabled because they over-matched — TODO confirm. *)
let verb_stem_sel = [
  "ować","",1;
  "owywać","",1;
  "iwać","",1;
  "ywać","",2;
  "awać","",1;
  "ać","",3;
  "nąć","",1;
  "ąć","",2;
(* "eć","e",1; *)
  "ić","",1;
  "yć","",1;
  "uć","u",1;
(* "ć","",2; *)
  ]
99 | + | |
(* [generate_stem stem_sel lemma forms] derives a single stem for [lemma]
   from its inflected [forms] (pairs of orth form and interpretation tag).
   For every form, the rules registered under its tag in [stem_sel] are
   tried; each applicable rule yields a candidate stem with that rule's
   priority (the lowest priority seen per stem is kept).  Only the stems of
   the overall best (minimal) priority survive.  Returns "" when no stem is
   found or when the best priority is ambiguous between several stems
   (reported on stdout). *)
let generate_stem stem_sel lemma forms =
  let candidates =
    Xlist.fold forms StringMap.empty (fun acc (orth, interp) ->
      let rules = try StringMap.find stem_sel interp with Not_found -> [] in
      Xlist.fold rules acc (fun acc rule ->
        if Rules.is_applicable_rule rule orth then
          (let stem = Rules.apply_rule rule orth in
           let prio = Rules.get_tags rule in
           StringMap.add_inc acc stem prio (fun old -> min old prio))
        else acc)) in
  (* Keep only the stems carrying the globally minimal priority. *)
  let best, _ =
    StringMap.fold candidates ([], max_int) (fun (best, best_p) stem p ->
      if p < best_p then [stem], p
      else if p > best_p then best, best_p
      else stem :: best, best_p) in
  match best with
  | [] -> ""
  | [stem] -> stem
  | many ->
      print_endline ("many stems found for " ^ lemma ^ ": " ^ String.concat " " many);
      ""
118 | + | |
(* [generate_stem_lemma_as_stem stem_sel lemma] derives a stem from the
   lemma itself (no inflected forms available): the simplified lemma is run
   through the flat rule list [stem_sel]; applicable rules yield candidate
   stems with their priorities, the lowest priority per stem is kept, and
   only minimal-priority stems survive.  Returns "" when nothing matches or
   when several stems tie for best priority (reported on stdout). *)
let generate_stem_lemma_as_stem stem_sel lemma =
  let orth = Rules.simplify_lemma lemma in
  let candidates =
    Xlist.fold stem_sel StringMap.empty (fun acc rule ->
      if Rules.is_applicable_rule rule orth then
        (let stem = Rules.apply_rule rule orth in
         let prio = Rules.get_tags rule in
         StringMap.add_inc acc stem prio (fun old -> min old prio))
      else acc) in
  (* Keep only the stems carrying the globally minimal priority. *)
  let best, _ =
    StringMap.fold candidates ([], max_int) (fun (best, best_p) stem p ->
      if p < best_p then [stem], p
      else if p > best_p then best, best_p
      else stem :: best, best_p) in
  match best with
  | [] -> ""
  | [stem] -> stem
  | many ->
      print_endline ("many stems found for " ^ lemma ^ ": " ^ String.concat " " many);
      ""
135 | + | |
(* [merge_digraph chars] fuses adjacent single-character strings that spell a
   Polish digraph (cz, sz, rz, dz, dź, dż, ch, ...) or a soft-mark pair
   (b', f') into one element, scanning left to right greedily; all other
   elements pass through unchanged.  E.g. ["c";"z";"a";"s"] -> ["cz";"a";"s"].
   Fix: the original was not tail-recursive and could overflow the stack on
   very long inputs; this version accumulates and reverses once at the end.
   The digraph table and the greedy left-to-right behavior are unchanged. *)
let merge_digraph chars =
  let rec aux acc = function
    | [] -> List.rev acc
    | "b" :: "'" :: rest -> aux ("b'" :: acc) rest
    | "f" :: "'" :: rest -> aux ("f'" :: acc) rest
    | "c" :: "h" :: rest -> aux ("ch" :: acc) rest
    | "c" :: "z" :: rest -> aux ("cz" :: acc) rest
    | "d" :: "h" :: rest -> aux ("dh" :: acc) rest
    | "d" :: "z" :: rest -> aux ("dz" :: acc) rest
    | "d" :: "ź" :: rest -> aux ("dź" :: acc) rest
    | "d" :: "ż" :: rest -> aux ("dż" :: acc) rest
    | "g" :: "h" :: rest -> aux ("gh" :: acc) rest
    | "n" :: "h" :: rest -> aux ("nh" :: acc) rest
    | "r" :: "h" :: rest -> aux ("rh" :: acc) rest
    | "r" :: "z" :: rest -> aux ("rz" :: acc) rest
    | "s" :: "z" :: rest -> aux ("sz" :: acc) rest
    | "q" :: "u" :: rest -> aux ("qu" :: acc) rest
    | c :: rest -> aux (c :: acc) rest in
  aux [] chars
153 | + | |
154 | +(*let text_to_chars s = | |
155 | + (try UTF8.validate s with UTF8.Malformed_code -> failwith ("Invalid UTF8 string: " ^ s)); | |
156 | + let r = ref [] in | |
157 | + UTF8.iter (fun c -> | |
158 | + r := (UTF8.init 1 (fun _ -> c)) :: (!r)) s; | |
159 | + merge_digraph (List.rev (!r))*) | |
160 | + | |
(* [cut_stem_sufix s] drops the final UTF-8 character of [s]; when that
   final character is "i" or "j" the preceding character is dropped too
   (presumably an orthographic palatalization marker plus ending — TODO
   confirm against callers).  The empty string maps to "". *)
let cut_stem_sufix s =
  let chars = Xunicode.utf8_chars_of_utf8_string s in
  let kept =
    match List.rev chars with
    | ("i" | "j") :: _ :: rest -> rest
    | _ :: rest -> rest
    | [] -> [] in
  String.concat "" (List.rev kept)
169 | + | |
0 | 170 | \ No newline at end of file |
... | ... |