Commit 173c381a9761ad523dd96392859442abf3c8fd7b
1 parent
8e42d298
obsługa słowników
Showing
7 changed files
with
570 additions
and
378 deletions
guesser/dict.ml
0 → 100644
1 | +open Xstd | |
2 | +open Printf | |
3 | +open Types | |
4 | + | |
5 | +let get_form e = | |
6 | + match e.forms with | |
7 | + [form] -> form | |
8 | + | _ -> failwith "get_form" | |
9 | + | |
10 | +let load_tab filename = | |
11 | + File.load_tab filename (function | |
12 | + orth :: lemma :: interp :: _ -> | |
13 | + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type=""; | |
14 | + ndm=false; stem=""} | |
15 | + | line -> failwith ("load_tab: " ^ (String.concat "\t" line))) | |
16 | + | |
17 | +let load_tab_full filename = | |
18 | + File.load_tab filename (function | |
19 | + [orth; lemma; interp] -> | |
20 | + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type=""; | |
21 | + ndm=false; stem=""} | |
22 | + | [orth; lemma; interp; proper_type] -> | |
23 | + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type=proper_type; | |
24 | + ndm=false; stem=""} | |
25 | + | [orth; lemma; interp; proper_type; genre] -> | |
26 | + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=genre; validated=false}]; proper_type=proper_type; | |
27 | + ndm=false; stem=""} | |
28 | +(* | orth :: lemma :: interp :: proper_type :: genre -> (orth,lemma,interp,proper_type,String.concat ";" genre) :: l *) | |
29 | + | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line))) | |
30 | + | |
31 | +let pos = StringSet.of_list [ | |
32 | + "subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj"; | |
33 | + "brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr"; | |
34 | + "adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc"; | |
35 | + "winien";"siebie" | |
36 | + ] | |
37 | + | |
38 | +let rec find_pos rev = function | |
39 | + s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l | |
40 | + | [] -> failwith "find_pos" | |
41 | + | |
42 | +let split_lemma_interp s = | |
43 | + let l = Xstring.split_delim ":" s in | |
44 | + let lemma,interp = find_pos [List.hd l] (List.tl l) in | |
45 | + String.concat ":" lemma, String.concat ":" interp | |
46 | + | |
47 | +let rec remove_empties = function | |
48 | + "" :: l -> remove_empties l | |
49 | + | l -> l | |
50 | + | |
51 | +let split_freq_orth s = | |
52 | + match remove_empties (Xstring.split " " s) with | |
53 | + freq :: l -> (*print_endline (String.concat "|" (freq :: l));*) int_of_string freq, String.concat " " l | |
54 | + | _ -> failwith "split_freq_orth" | |
55 | + | |
56 | +let load_freq_tab filename = | |
57 | + File.load_tab filename (function | |
58 | + [freq_orth; lemma_interp] -> | |
59 | + (* print_endline (freq_orth ^ "\t" ^ lemma_interp); *) | |
60 | + let freq,orth = split_freq_orth freq_orth in | |
61 | + let lemma,interp = split_lemma_interp lemma_interp in | |
62 | + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=freq; genre=""; validated=false}]; proper_type=""; | |
63 | + ndm=false; stem=""} | |
64 | + | line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line))) | |
65 | + | |
66 | +let proper_type_selector e = e.proper_type | |
67 | +let genre_selector e = e.genre | |
68 | +let interp_selector e = (get_form e).interp | |
69 | +let freq_selector e = (get_form e).freq | |
70 | + | |
71 | +let print_quantities out_filename selector dict = | |
72 | + let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry -> | |
73 | + StringQMap.add qmap (selector entry)) in | |
74 | + File.file_out out_filename (fun file -> | |
75 | + StringQMap.iter qmap (fun k v -> | |
76 | + fprintf file "%6d\t%s\n" v k)) | |
77 | + | |
78 | +(**********************************************************************************) | |
79 | + | |
80 | +let load_dict_as_set filename = | |
81 | + let l = load_tab filename in | |
82 | + List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry -> | |
83 | + let form = get_form entry in | |
84 | + StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp])))) | |
85 | + | |
86 | +let load_dict_as_set_full filename = | |
87 | + let l = load_tab_full filename in | |
88 | + List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry -> | |
89 | + let form = get_form entry in | |
90 | + StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre])))) | |
91 | + | |
92 | +let rec compare_dicts_rec file = function | |
93 | + [],[] -> () | |
94 | + | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb) | |
95 | + | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[]) | |
96 | + | a :: la, b :: lb -> | |
97 | + if a = b then compare_dicts_rec file (la,lb) else | |
98 | + if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else | |
99 | + (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb)) | |
100 | + | |
101 | +let compare_dicts filename1 filename2 filename_out = | |
102 | + let dict1 = load_dict_as_set filename1 in | |
103 | + let dict2 = load_dict_as_set filename2 in | |
104 | + File.file_out filename_out (fun file -> | |
105 | + compare_dicts_rec file (dict1,dict2)) | |
106 | + | |
107 | +let compare_dicts_full filename1 filename2 filename_out = | |
108 | + let dict1 = load_dict_as_set_full filename1 in | |
109 | + let dict2 = load_dict_as_set_full filename2 in | |
110 | + File.file_out filename_out (fun file -> | |
111 | + compare_dicts_rec file (dict1,dict2)) | |
112 | + | |
113 | +(**********************************************************************************) | |
114 | + | |
115 | +let find_entry_cat entry = | |
116 | + let form = get_form entry in | |
117 | + let cat,tags = match Xstring.split ":" form.interp with | |
118 | + cat :: tags -> cat,tags | |
119 | + | _ -> failwith ("find_entry_type: " ^ form.interp) in | |
120 | + if cat = "subst" || cat = "depr" then "noun" else | |
121 | + if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else | |
122 | + if cat = "adv" then "adv" else | |
123 | + if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" || | |
124 | + cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else | |
125 | + if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" || | |
126 | + cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" || | |
127 | + cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "" then "other" else | |
128 | + if cat = "cond" then "cond" else | |
129 | + failwith ("find_entry_cat: " ^ cat) | |
130 | + | |
131 | +let assign_entry_cat dict = | |
132 | + Xlist.rev_map dict (fun entry -> | |
133 | + {entry with cat = find_entry_cat entry}) | |
134 | + | |
135 | +let split_dict path filename = | |
136 | + let dict = load_tab (path ^ filename) in | |
137 | + let dict = List.rev (assign_entry_cat dict) in | |
138 | + File.file_out (path ^ "noun_" ^ filename) (fun noun_file -> | |
139 | + File.file_out (path ^ "adj_" ^ filename) (fun adj_file -> | |
140 | + File.file_out (path ^ "adv_" ^ filename) (fun adv_file -> | |
141 | + File.file_out (path ^ "verb_" ^ filename) (fun verb_file -> | |
142 | + File.file_out (path ^ "other_" ^ filename) (fun other_file -> | |
143 | + Xlist.iter dict (fun entry -> | |
144 | + let form = get_form entry in | |
145 | + try | |
146 | + let file = match entry.cat with | |
147 | + "noun" -> noun_file | |
148 | + | "adj" -> adj_file | |
149 | + | "adv" -> adv_file | |
150 | + | "verb" -> verb_file | |
151 | + | "other" -> other_file | |
152 | + | "cond" -> raise Not_found | |
153 | + | _ -> failwith "split_dict" in | |
154 | + fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp | |
155 | + with Not_found -> ())))))) | |
156 | + | |
157 | +let merge_entries dict = | |
158 | + let dict = assign_entry_cat dict in | |
159 | + let map = Xlist.fold dict StringMap.empty (fun map entry -> | |
160 | + let form = get_form entry in | |
161 | + let key = | |
162 | + if entry.cat = "noun" then (* FIXME: problem z depr *) | |
163 | + let gender = match Xstring.split ":" form.interp with | |
164 | + ["depr";_;_;"m2"] -> "m1" | |
165 | + | "depr" :: _ -> failwith ("merge_entries: " ^ form.interp) | |
166 | + | [_;_;_;gender] -> gender | |
167 | + | _ -> failwith ("merge_entries: " ^ form.interp) in | |
168 | + entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender | |
169 | + else entry.lemma ^ "|" ^ entry.cat in | |
170 | + StringMap.add_inc map key entry (fun e -> | |
171 | + if entry.proper_type <> e.proper_type then | |
172 | + failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else | |
173 | + {e with forms = form :: e.forms})) in | |
174 | + StringMap.fold map [] (fun dict _ e -> e :: dict) | |
175 | + | |
176 | +(**********************************************************************************) | |
177 | + | |
178 | +let mark_ndm dict = | |
179 | + Xlist.fold dict [] (fun dict entry -> | |
180 | + if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else | |
181 | + let map = Xlist.fold entry.forms StringMap.empty (fun map form -> | |
182 | + StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in | |
183 | + let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths -> | |
184 | + StringSet.fold orths qmap StringQMap.add) in | |
185 | + let n = StringMap.size map in | |
186 | + let found = StringQMap.fold qmap [] (fun found orth v -> | |
187 | + if v = n then orth :: found else found) in | |
188 | + (* printf "%s\t%s\t%d\t%s\n%!" entry.lemma entry.cat n (String.concat " " found); *) | |
189 | + match found with | |
190 | + [] -> entry :: dict | |
191 | + | [orth] -> | |
192 | + let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form -> | |
193 | + if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in | |
194 | + let dict = {entry with forms=odm} :: dict in | |
195 | + {entry with forms=ndm; ndm=true} :: dict | |
196 | + | _ -> failwith ("mark_ndm: " ^ (String.concat " " found))) | |
197 | + | |
198 | +let print_ndm filename dict = | |
199 | + File.file_out filename (fun file -> | |
200 | + Xlist.iter dict (fun entry -> | |
201 | + if entry.ndm then | |
202 | + let orth = (List.hd entry.forms).orth in | |
203 | + fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat)) | |
204 | + | |
205 | +let kolwiek_lemmas = StringSet.of_list [ | |
206 | + (* adj *) | |
207 | + "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści"; | |
208 | + "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek"; | |
209 | + "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże"; | |
210 | + (* noun *) | |
211 | + "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek"; | |
212 | + "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż"; | |
213 | + "ktoś:s2"; "któż"; | |
214 | + (* adv *) | |
215 | + "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ] | |
216 | + | |
217 | +let kolwiek_suffixes = [ | |
218 | + "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"] | |
219 | + | |
220 | +let find_kolwiek_suffixes dict = | |
221 | + Xlist.rev_map dict (fun entry -> | |
222 | + if StringSet.mem kolwiek_lemmas entry.lemma then | |
223 | + {entry with forms=Xlist.map entry.forms (fun form -> | |
224 | + {form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf -> | |
225 | + if Xstring.check_sufix kolwiek_suf orth then | |
226 | + Xstring.cut_sufix kolwiek_suf orth | |
227 | + else orth)})} | |
228 | + else entry) | |
229 | + | |
230 | +let exceptional_lemmata = StringSet.of_list ([ | |
231 | + (* błąd w słowniku *) | |
232 | + "ówże"; | |
233 | + (* wiele stemów *) | |
234 | + "twój:a"; "swój"; "mój:a"; "wszystek"; | |
235 | + (* oboczności w stemie *) | |
236 | + "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek"; | |
237 | + "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic"; | |
238 | + "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb"; | |
239 | + "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim"; | |
240 | + "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz"; | |
241 | + "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan"; | |
242 | + "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester"; | |
243 | + "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień"; | |
244 | + "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb"; | |
245 | + "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2"; | |
246 | + "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób"; | |
247 | + "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; ""; | |
248 | + "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; ""; | |
249 | + (* oboczności w odmianie *) | |
250 | + "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże"; | |
251 | + "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze"; | |
252 | + "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; | |
253 | + "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
254 | + "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
255 | + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
256 | + (* pozostawione *) | |
257 | + "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki"; | |
258 | + "Pia"; "jo-jo"; "+lecie"; ""; | |
259 | + "zagrząźć"; "zrzeć"; | |
260 | + (* niepełny paradygmat *) | |
261 | + "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc"; | |
262 | + "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc"; | |
263 | + "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść"; | |
264 | + "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść"; | |
265 | + "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2"; | |
266 | + (* wiele stemów *) | |
267 | + "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec"; | |
268 | + "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec"; | |
269 | + "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć"; | |
270 | + (* inne *) | |
271 | + "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1"; | |
272 | + "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć"; | |
273 | + "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść"; | |
274 | + "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść" | |
275 | +(* | |
276 | + "moi"; "twoi"; | |
277 | + (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *) | |
278 | + (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids"; | |
279 | + "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina"; | |
280 | + "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan"; | |
281 | + "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
282 | + "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
283 | + "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
284 | + "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw"; | |
285 | + "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
286 | + "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
287 | + "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
288 | + "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
289 | + "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA"; | |
290 | + "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA"; | |
291 | + "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; | |
292 | + "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT"; | |
293 | + "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; ""; | |
294 | + "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
295 | + "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
296 | + "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; ""; | |
297 | + "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia"; | |
298 | + "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha"; | |
299 | + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
300 | + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
301 | + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
302 | + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*) | |
303 | + ] @ File.load_lines "data/obce.tab" @ File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @ File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab") | |
304 | + | |
305 | +let remove_exceptional_lemmata dict = | |
306 | + Xlist.fold dict [] (fun dict entry -> | |
307 | + if StringSet.mem exceptional_lemmata entry.lemma then dict | |
308 | + else entry :: dict) | |
309 | + | |
310 | +let generate_stem dict = | |
311 | + Xlist.rev_map dict (fun entry -> | |
312 | + {entry with stem= | |
313 | + if entry.ndm then (List.hd entry.forms).orth else | |
314 | + if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then | |
315 | + Stem.generate_stem entry | |
316 | + else ""}) | |
317 | + | |
318 | +let validate dict = | |
319 | + let rules = Rules.interp_compound_rule_trees in | |
320 | + Xlist.rev_map dict (fun entry -> | |
321 | + let simple_lemma = Stem.simplify_lemma entry.lemma in | |
322 | + let forms = Xlist.rev_map entry.forms (fun form -> | |
323 | + (* printf "E %s\t%s\t%s\n" orth lemma interp; *) | |
324 | + let candidates = Rules.CharTrees.find rules form.orth in | |
325 | + (* printf "S %d\n" (Xlist.size forms); *) | |
326 | + let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) -> | |
327 | + (* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *) | |
328 | + if stem ^ rule.Rules.set = simple_lemma then (stem,rule) :: candidates else candidates) in | |
329 | + if candidates = [] then form else {form with validated=true}) in | |
330 | + {entry with forms=forms}) | |
331 | + | |
332 | +let remove_validated_forms dict = | |
333 | + Xlist.fold dict [] (fun dict entry -> | |
334 | + let forms = Xlist.fold entry.forms [] (fun forms form -> | |
335 | + if form.validated then forms else form :: forms) in | |
336 | + if forms = [] then dict else {entry with forms=forms} :: dict) | |
337 | + | |
338 | +let remove_validated_entries dict = | |
339 | + Xlist.fold dict [] (fun dict entry -> | |
340 | + let forms = Xlist.fold entry.forms [] (fun forms form -> | |
341 | + if form.validated then forms else form :: forms) in | |
342 | + if forms = [] then dict else entry :: dict) | |
343 | + | |
344 | +let remove_not_validated_forms dict = | |
345 | + Xlist.fold dict [] (fun dict entry -> | |
346 | + let forms = Xlist.fold entry.forms [] (fun forms form -> | |
347 | + if form.validated then form :: forms else forms) in | |
348 | + if forms = [] then dict else {entry with forms=forms} :: dict) | |
349 | + | |
350 | +let remove_not_validated_entries dict = | |
351 | + Xlist.fold dict [] (fun dict entry -> | |
352 | + let forms = Xlist.fold entry.forms [] (fun forms form -> | |
353 | + if form.validated then form :: forms else forms) in | |
354 | + if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict) | |
355 | + | |
356 | +let print filename dict = | |
357 | + File.file_out filename (fun file -> | |
358 | + Xlist.iter dict (fun entry -> | |
359 | + Xlist.iter entry.forms (fun form -> | |
360 | + fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp))) | |
361 | + | |
362 | +let print_lemmata filename dict = | |
363 | + File.file_out filename (fun file -> | |
364 | + Xlist.iter dict (fun entry -> | |
365 | + fprintf file "%s\n" entry.lemma)) | |
... | ... |
guesser/generate.ml
... | ... | @@ -15,11 +15,172 @@ let zasoby_path = |
15 | 15 | (* | "mozart" -> "." *) |
16 | 16 | | s -> failwith ("unknown host: " ^ s) |
17 | 17 | |
18 | -let morfeusz_path = zasoby_path ^ "morfeusz/" | |
18 | +let lematy_nkjp_filename = "lematy_NKJP1M_freq.tab" | |
19 | + | |
20 | +let morfeusz_old_path = zasoby_path ^ "morfeusz/old2/" | |
19 | 21 | let sgjp_filename2015 = "sgjp-20151020.tab" |
20 | 22 | let polimorf_filename2015 = "polimorf-20151020.tab" |
21 | -let sgjp_filename = "sgjp-20160508.tab" | |
22 | -let polimorf_filename = "polimorf-20160508.tab" | |
23 | + | |
24 | +let morfeusz_path = zasoby_path ^ "morfeusz/" | |
25 | +let sgjp_filename201605 = "sgjp-20160508.tab" | |
26 | +let polimorf_filename201605 = "polimorf-20160508.tab" | |
27 | +let sgjp_filename = "sgjp-20160724.tab" | |
28 | +let polimorf_filename = "polimorf-20160724.tab" | |
29 | + | |
30 | +let adv_sgjp_filename = "adv_" ^ sgjp_filename201605 | |
31 | +let odm_adj_sgjp_filename = "odm_adj_" ^ sgjp_filename201605 | |
32 | +let ndm_adj_sgjp_filename = "ndm_adj_" ^ sgjp_filename201605 | |
33 | +let odm_noun_sgjp_filename = "odm_noun_" ^ sgjp_filename201605 | |
34 | +let ndm_noun_sgjp_filename = "ndm_noun_" ^ sgjp_filename201605 | |
35 | +let verb_sgjp_filename = "verb_" ^ sgjp_filename201605 | |
36 | + | |
37 | +(* Test wczytywania słowników i liczenie częstości *) | |
38 | +let _ = | |
39 | + (* print_endline "sgjp_filename2015"; | |
40 | + let _ = Dict.load_tab (morfeusz_old_path ^ sgjp_filename2015) in | |
41 | + print_endline "polimorf_filename2015"; | |
42 | + let _ = Dict.load_tab (morfeusz_old_path ^ polimorf_filename2015) in | |
43 | + print_endline "sgjp_filename201605"; | |
44 | + let _ = Dict.load_tab (morfeusz_path ^ sgjp_filename201605) in | |
45 | + print_endline "polimorf_filename201605"; | |
46 | + let _ = Dict.load_tab (morfeusz_path ^ polimorf_filename201605) in | |
47 | + print_endline "sgjp_filename"; | |
48 | + let _ = Dict.load_tab (morfeusz_path ^ sgjp_filename) in | |
49 | + print_endline "polimorf_filename"; | |
50 | + let _ = Dict.load_tab (morfeusz_path ^ polimorf_filename) in | |
51 | + print_endline "adv_sgjp_filename"; | |
52 | + let _ = Dict.load_tab (morfeusz_path ^ adv_sgjp_filename) in | |
53 | + print_endline "odm_adj_sgjp_filename"; | |
54 | + let _ = Dict.load_tab (morfeusz_path ^ odm_adj_sgjp_filename) in | |
55 | + print_endline "ndm_adj_sgjp_filename"; | |
56 | + let _ = Dict.load_tab (morfeusz_path ^ ndm_adj_sgjp_filename) in | |
57 | + print_endline "odm_noun_sgjp_filename"; | |
58 | + let _ = Dict.load_tab (morfeusz_path ^ odm_noun_sgjp_filename) in | |
59 | + print_endline "ndm_noun_sgjp_filename"; | |
60 | + let _ = Dict.load_tab (morfeusz_path ^ ndm_noun_sgjp_filename) in | |
61 | + print_endline "verb_sgjp_filename"; | |
62 | + let _ = Dict.load_tab (morfeusz_path ^ verb_sgjp_filename) in | |
63 | + print_endline "sgjp_filename2015"; | |
64 | + let dict = Dict.load_tab_full (morfeusz_old_path ^ sgjp_filename2015) in | |
65 | + Dict.print_quantities "results/proper-type-sgjp-20151020.txt" Dict.proper_type_selector dict; | |
66 | + Dict.print_quantities "results/genre-sgjp-20151020.txt" Dict.genre_selector dict; | |
67 | + Dict.print_quantities "results/interp-sgjp-20151020.txt" Dict.interp_selector dict; | |
68 | + print_endline "polimorf_filename2015"; | |
69 | + let dict = Dict.load_tab_full (morfeusz_old_path ^ polimorf_filename2015) in | |
70 | + Dict.print_quantities "results/proper-type-polimorf-20151020.txt" Dict.proper_type_selector dict; | |
71 | + Dict.print_quantities "results/genre-polimorf-20151020.txt" Dict.genre_selector dict; | |
72 | + Dict.print_quantities "results/interp-polimorf-20151020.txt" Dict.interp_selector dict; | |
73 | + print_endline "sgjp_filename201605"; | |
74 | + let dict = Dict.load_tab_full (morfeusz_path ^ sgjp_filename201605) in | |
75 | + Dict.print_quantities "results/proper-type-sgjp-20160508.txt" Dict.proper_type_selector dict; | |
76 | + Dict.print_quantities "results/genre-sgjp-20160508.txt" Dict.genre_selector dict; | |
77 | + Dict.print_quantities "results/interp-sgjp-20160508.txt" Dict.interp_selector dict; | |
78 | + print_endline "polimorf_filename201605"; | |
79 | + let dict = Dict.load_tab_full (morfeusz_path ^ polimorf_filename201605) in | |
80 | + Dict.print_quantities "results/proper-type-polimorf-20160508.txt" Dict.proper_type_selector dict; | |
81 | + Dict.print_quantities "results/genre-polimorf-20160508.txt" Dict.genre_selector dict; | |
82 | + Dict.print_quantities "results/interp-polimorf-20160508.txt" Dict.interp_selector dict; | |
83 | + print_endline "sgjp_filename"; | |
84 | + let dict = Dict.load_tab_full (morfeusz_path ^ sgjp_filename) in | |
85 | + Dict.print_quantities "results/proper-type-sgjp-20160724.txt" Dict.proper_type_selector dict; | |
86 | + Dict.print_quantities "results/genre-sgjp-20160724.txt" Dict.genre_selector dict; | |
87 | + Dict.print_quantities "results/interp-sgjp-20160724.txt" Dict.interp_selector dict; | |
88 | + print_endline "polimorf_filename"; | |
89 | + let dict = Dict.load_tab_full (morfeusz_path ^ polimorf_filename) in | |
90 | + Dict.print_quantities "results/proper-type-polimorf-20160724.txt" Dict.proper_type_selector dict; | |
91 | + Dict.print_quantities "results/genre-polimorf-20160724.txt" Dict.genre_selector dict; | |
92 | + Dict.print_quantities "results/interp-polimorf-20160724.txt" Dict.interp_selector dict; | |
93 | + print_endline "adv_sgjp_filename"; | |
94 | + let dict = Dict.load_tab_full (morfeusz_path ^ adv_sgjp_filename) in | |
95 | + print_endline "odm_adj_sgjp_filename"; | |
96 | + let dict = Dict.load_tab_full (morfeusz_path ^ odm_adj_sgjp_filename) in | |
97 | + print_endline "ndm_adj_sgjp_filename"; | |
98 | + let dict = Dict.load_tab_full (morfeusz_path ^ ndm_adj_sgjp_filename) in | |
99 | + print_endline "odm_noun_sgjp_filename"; | |
100 | + let dict = Dict.load_tab_full (morfeusz_path ^ odm_noun_sgjp_filename) in | |
101 | + print_endline "ndm_noun_sgjp_filename"; | |
102 | + let dict = Dict.load_tab_full (morfeusz_path ^ ndm_noun_sgjp_filename) in | |
103 | + print_endline "verb_sgjp_filename"; | |
104 | + let dict = Dict.load_tab_full (morfeusz_path ^ verb_sgjp_filename) in | |
105 | + print_endline "lematy_nkjp_filename"; | |
106 | + let dict = Dict.load_freq_tab (zasoby_path ^ lematy_nkjp_filename) in | |
107 | + Dict.print_quantities "results/interp-lematy_nkjp.txt" Dict.interp_selector dict; | |
108 | + Dict.print_quantities "results/freq-lematy_nkjp.txt" Dict.freq_selector dict;*) | |
109 | + () | |
110 | + | |
111 | +(* Porównanie wersji słowników *) | |
112 | +let _ = | |
113 | + (* Dict.compare_dicts_full (morfeusz_old_path ^ sgjp_filename2015) (morfeusz_path ^ sgjp_filename201605) "results/comparition_sgjp1_full.out"; *) | |
114 | + (* Dict.compare_dicts_full (morfeusz_path ^ sgjp_filename201605) (morfeusz_path ^ sgjp_filename) "results/comparition_sgjp2_full.out"; *) | |
115 | + (* Dict.compare_dicts_full (morfeusz_old_path ^ polimorf_filename2015) (morfeusz_path ^ polimorf_filename201605) "results/comparition_polimorf1_full.out"; *) | |
116 | + (* Dict.compare_dicts_full (morfeusz_path ^ polimorf_filename201605) (morfeusz_path ^ polimorf_filename) "results/comparition_polimorf2_full.out"; *) | |
117 | +(* compare_dicts morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp.out"; *) | |
118 | + () | |
119 | + | |
120 | +(* Podział słownika *) | |
121 | +let _ = | |
122 | + (* Dict.split_dict morfeusz_path sgjp_filename; *) | |
123 | + () | |
124 | + | |
125 | +(* Wypisanie nieodmiennych *) | |
126 | +let _ = | |
127 | + (* let dict = Dict.load_tab_full (morfeusz_path ^ sgjp_filename) in | |
128 | + let dict = Dict.merge_entries dict in | |
129 | + let dict = Dict.mark_ndm dict in | |
130 | + Dict.print_ndm "results/ndm-sgjp.tab" dict; *) | |
131 | + () | |
132 | + | |
133 | +let check_stem_generation path filename = | |
134 | + let dict = Dict.load_tab_full (path ^ filename) in | |
135 | + let dict = Dict.merge_entries dict in | |
136 | + let dict = Dict.mark_ndm dict in | |
137 | + let dict = Dict.find_kolwiek_suffixes dict in | |
138 | + let _ = Dict.generate_stem dict in | |
139 | + () | |
140 | + | |
141 | +(* Sprawdzenie działania stemowania *) | |
142 | +let _ = | |
143 | + check_stem_generation morfeusz_path sgjp_filename; | |
144 | +(* check_stem_generation morfeusz_path ("odm_adj_" ^ sgjp_filename); *) | |
145 | +(* check_stem_generation morfeusz_path ("odm_noun_" ^ sgjp_filename); *) | |
146 | + () | |
147 | + | |
148 | +let find_not_validated_forms path filename out_filename = | |
149 | + let dict = Dict.load_tab (path ^ filename) in | |
150 | + let dict = Dict.validate dict in | |
151 | + let dict = Dict.remove_validated_forms dict in | |
152 | + Dict.print out_filename dict | |
153 | + | |
154 | +let find_not_validated_entries path filename out_filename = | |
155 | + let dict = Dict.load_tab (path ^ filename) in | |
156 | + let dict = Dict.merge_entries dict in | |
157 | + let dict = Dict.validate dict in | |
158 | + let dict = Dict.remove_validated_entries dict in | |
159 | + Dict.print out_filename dict | |
160 | + | |
161 | +(* Wypisanie niezwalidowanych form *) | |
162 | +let _ = | |
163 | + find_not_validated_forms morfeusz_path odm_adj_sgjp_filename "results/not_validated_odm_adj.tab"; | |
164 | + (* find_not_validated_entries morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/not_validated_odm_adj.tab"; *) | |
165 | + (* find_not_validated_entries "results/" "not_validated_odm_adj.tab" "results/not_validated_odm_adj2.tab"; *) | |
166 | + (* find_not_validated_entries morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/not_validated_odm_noun.tab"; *) | |
167 | + (* find_not_validated_entries morfeusz_path ("adv_" ^ sgjp_filename) "results/not_validated_adv.tab";*) | |
168 | + (* find_not_validated_entries morfeusz_path ("verb_" ^ sgjp_filename) "results/not_validated_verb.tab"; *) | |
169 | + () | |
170 | + | |
171 | +let find_validated_lemmata path filename out_filename = | |
172 | + let dict = Dict.load_tab (path ^ filename) in | |
173 | + let dict = Dict.merge_entries dict in | |
174 | + let dict = Dict.validate dict in | |
175 | + let dict = Dict.remove_not_validated_entries dict in | |
176 | + Dict.print_lemmata out_filename dict | |
177 | + | |
178 | +(* Wypisanie zwalidowanych lematów *) | |
179 | +let _ = | |
180 | + find_validated_lemmata morfeusz_path odm_adj_sgjp_filename "results/validated_odm_adj.tab"; | |
181 | + find_validated_lemmata morfeusz_path odm_noun_sgjp_filename "results/validated_odm_noun.tab"; | |
182 | + find_validated_lemmata morfeusz_path verb_sgjp_filename "results/validated_verb.tab"; | |
183 | + () | |
23 | 184 | |
24 | 185 | (**********************************************************************************) |
25 | 186 | |
... | ... | @@ -164,75 +325,6 @@ let load_tab_full filename = |
164 | 325 | | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line))) |
165 | 326 | (* | _ -> failwith ("load_tab_full: " ^ line)) *) |
166 | 327 | |
167 | -let load_dict_as_set filename = | |
168 | - let l = load_tab filename in | |
169 | - List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set (orth,lemma,interp) -> | |
170 | - StringSet.add set (String.concat "\t" [orth;lemma;interp])))) | |
171 | - | |
172 | -let load_dict_as_set_full filename = | |
173 | - let l = load_tab_full filename in | |
174 | - List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set (orth,lemma,interp,cl,cl2) -> | |
175 | - StringSet.add set (String.concat "\t" [orth;lemma;interp;cl;cl2])))) | |
176 | - | |
177 | -let rec compare_dicts_rec file = function | |
178 | - [],[] -> () | |
179 | - | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb) | |
180 | - | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[]) | |
181 | - | a :: la, b :: lb -> | |
182 | - if a = b then compare_dicts_rec file (la,lb) else | |
183 | - if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else | |
184 | - (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb)) | |
185 | - | |
186 | -let compare_dicts path filename1 filename2 filename_out = | |
187 | - let dict1 = load_dict_as_set (path ^ filename1) in | |
188 | - let dict2 = load_dict_as_set (path ^ filename2) in | |
189 | - File.file_out filename_out (fun file -> | |
190 | - compare_dicts_rec file (dict1,dict2)) | |
191 | - | |
192 | -let compare_dicts_full path filename1 filename2 filename_out = | |
193 | - let dict1 = load_dict_as_set_full (path ^ filename1) in | |
194 | - let dict2 = load_dict_as_set_full (path ^ filename2) in | |
195 | - File.file_out filename_out (fun file -> | |
196 | - compare_dicts_rec file (dict1,dict2)) | |
197 | - | |
198 | -(* Porównanie wersji słowników *) | |
199 | -let _ = | |
200 | -(* compare_dicts_full morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp_full.out"; *) | |
201 | -(* compare_dicts_full morfeusz_path polimorf_filename2015 polimorf_filename "results/comparition_polimorf_full.out"; *) | |
202 | -(* compare_dicts morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp.out"; *) | |
203 | - () | |
204 | - | |
205 | -let split_dict path filename = | |
206 | - let dict = load_tab (path ^ filename) in | |
207 | - File.file_out (path ^ "noun_" ^ filename) (fun noun_file -> | |
208 | - File.file_out (path ^ "adj_" ^ filename) (fun adj_file -> | |
209 | - File.file_out (path ^ "adv_" ^ filename) (fun adv_file -> | |
210 | - File.file_out (path ^ "verb_" ^ filename) (fun verb_file -> | |
211 | - File.file_out (path ^ "other_" ^ filename) (fun other_file -> | |
212 | - Xlist.iter dict (fun (orth,lemma,interp) -> | |
213 | - let cat,tags = match Xstring.split ":" interp with | |
214 | - cat :: tags -> cat,tags | |
215 | - | _ -> failwith ("split_dict: " ^ interp) in | |
216 | - if cat = "subst" || cat = "depr" then | |
217 | - fprintf noun_file "%s\t%s\t%s\n" orth lemma interp else | |
218 | - if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then | |
219 | - fprintf adj_file "%s\t%s\t%s\n" orth lemma interp else | |
220 | - if cat = "adv" then | |
221 | - fprintf adv_file "%s\t%s\t%s\n" orth lemma interp else | |
222 | - if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" || | |
223 | - cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then | |
224 | - fprintf verb_file "%s\t%s\t%s\n" orth lemma interp else | |
225 | - if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" || | |
226 | - cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" || cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "" then | |
227 | - fprintf other_file "%s\t%s\t%s\n" orth lemma interp else | |
228 | - if cat = "cond" then () else | |
229 | - print_endline cat)))))) | |
230 | - | |
231 | - | |
232 | -(* Podział słownika *) | |
233 | -let _ = | |
234 | -(* split_dict morfeusz_path sgjp_filename; *) | |
235 | - () | |
236 | 328 | |
237 | 329 | let map_of_tab data = |
238 | 330 | Xlist.fold data StringMap.empty (fun map (orth,lemma,interp) -> |
... | ... | @@ -294,128 +386,7 @@ let _ = |
294 | 386 | () |
295 | 387 | |
296 | 388 | (**********************************************************************************) |
297 | - | |
298 | -let kolwiek_lemmas = StringSet.of_list [ | |
299 | - (* adj *) | |
300 | - "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści"; | |
301 | - "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek"; | |
302 | - "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże"; | |
303 | - (* noun *) | |
304 | - "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek"; | |
305 | - "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż"; | |
306 | - "ktoś:s2"; "któż"; | |
307 | - (* adv *) | |
308 | - "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ] | |
309 | - | |
310 | -let kolwiek_suffixes = [ | |
311 | - "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"] | |
312 | - | |
313 | -let find_kolwiek_suffixes morfs = | |
314 | - StringMap.mapi morfs (fun lemma interps -> | |
315 | - if StringSet.mem kolwiek_lemmas lemma then | |
316 | - Xlist.map interps (fun (orth,interp) -> | |
317 | - (Xlist.fold kolwiek_suffixes orth (fun orth kolwiek_suf -> | |
318 | - if Xstring.check_sufix kolwiek_suf orth then | |
319 | - Xstring.cut_sufix kolwiek_suf orth | |
320 | - else orth)), interp) | |
321 | - else interps) | |
322 | - | |
323 | - | |
324 | -let exceptional_lemmata = StringSet.of_list ([ | |
325 | - (* błąd w słowniku *) | |
326 | - "ówże"; | |
327 | - (* wiele stemów *) | |
328 | - "twój:a"; "swój"; "mój:a"; "wszystek"; | |
329 | - (* oboczności w stemie *) | |
330 | - "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek"; | |
331 | - "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic"; | |
332 | - "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb"; | |
333 | - "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim"; | |
334 | - "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz"; | |
335 | - "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan"; | |
336 | - "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester"; | |
337 | - "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień"; | |
338 | - "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb"; | |
339 | - "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2"; | |
340 | - "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób"; | |
341 | - "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; ""; | |
342 | - "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; ""; | |
343 | - (* oboczności w odmianie *) | |
344 | - "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże"; | |
345 | - "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze"; | |
346 | - "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; | |
347 | - "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
348 | - "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
349 | - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
350 | - (* pozostawione *) | |
351 | - "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki"; | |
352 | - "Pia"; "jo-jo"; "+lecie"; ""; | |
353 | - "zagrząźć"; "zrzeć"; | |
354 | - (* niepełny paradygmat *) | |
355 | - "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc"; | |
356 | - "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc"; | |
357 | - "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść"; | |
358 | - "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść"; | |
359 | - "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2"; | |
360 | - (* wiele stemów *) | |
361 | - "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec"; | |
362 | - "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec"; | |
363 | - "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć"; | |
364 | - (* inne *) | |
365 | - "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1"; | |
366 | - "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć"; | |
367 | - "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść"; | |
368 | - "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść" | |
369 | 389 | (* |
370 | - "moi"; "twoi"; | |
371 | - (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *) | |
372 | - (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids"; | |
373 | - "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina"; | |
374 | - "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan"; | |
375 | - "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
376 | - "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
377 | - "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
378 | - "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw"; | |
379 | - "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
380 | - "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
381 | - "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
382 | - "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
383 | - "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA"; | |
384 | - "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA"; | |
385 | - "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; | |
386 | - "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT"; | |
387 | - "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; ""; | |
388 | - "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
389 | - "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
390 | - "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; ""; | |
391 | - "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia"; | |
392 | - "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha"; | |
393 | - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
394 | - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
395 | - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; | |
396 | - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*) | |
397 | - ] @ File.load_lines "data/obce.tab" @ File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @ File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab") | |
398 | - | |
399 | -let remove_exceptional_lemmata dict = | |
400 | - StringMap.fold dict StringMap.empty (fun dict lemma interps -> | |
401 | - if StringSet.mem exceptional_lemmata lemma then dict | |
402 | - else StringMap.add dict lemma interps) | |
403 | - | |
404 | -let check_stem_generation stem_sel path filename = | |
405 | - let dict = load_tab (path ^ filename) in | |
406 | - let dict = map_of_tab dict in | |
407 | - let dict = remove_exceptional_lemmata dict in | |
408 | - let dict = find_kolwiek_suffixes dict in | |
409 | - StringMap.iter dict (fun lemma forms -> | |
410 | - (* let _ = Stem.generate_stem stem_sel lemma forms in FIXME*) | |
411 | - ()) | |
412 | - | |
413 | -(* Sprawdzenie działania stemowania *) | |
414 | -let _ = | |
415 | -(* check_stem_generation Stem.adj_stem_sel morfeusz_path ("odm_adj_" ^ sgjp_filename); *) | |
416 | -(* check_stem_generation Stem.noun_stem_sel morfeusz_path ("odm_noun_" ^ sgjp_filename); *) | |
417 | - () | |
418 | - | |
419 | 390 | let remove_com_sup dict = |
420 | 391 | List.rev (Xlist.fold dict [] (fun l (orth,lemma,interp) -> |
421 | 392 | if Xstring.check_sufix ":com" interp || Xstring.check_sufix ":sup" interp then l else (orth,lemma,interp) :: l)) |
... | ... | @@ -470,19 +441,6 @@ let generate_adj_com_rules path filename adj_com_rules_filename = |
470 | 441 | StringMap.iter rules2 (fun rule (q,l) -> |
471 | 442 | fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l)))) |
472 | 443 | |
473 | -let find_validated_lemata_adj_pos dict = | |
474 | - let dict = map_of_tab dict in | |
475 | - StringMap.fold dict [] (fun lemmata lemma forms -> | |
476 | - if Rules.validate_entry lemma forms then lemma :: lemmata else lemmata) | |
477 | - | |
478 | -let find_validated_lemata_adj path filename = | |
479 | - let interp_sel = load_interp_sel "data/interps.tab" in | |
480 | - let dict = load_tab (path ^ filename) in | |
481 | - let dict = split_into_groups interp_sel dict in | |
482 | - if StringMap.size dict <> 3 then failwith ("generate_adj_rules: " ^ | |
483 | - String.concat " " (StringMap.fold dict [] (fun l s _ -> s :: l))) else | |
484 | - find_validated_lemata_adj_pos (StringMap.find dict "adj") | |
485 | - | |
486 | 444 | let generate_adv_pos_rules rules_filename dict = |
487 | 445 | let dict = map_of_tab dict in |
488 | 446 | let dict = find_kolwiek_suffixes dict in |
... | ... | @@ -575,12 +533,6 @@ let generate_verb_rules path filename rules_filename = |
575 | 533 | generate_verb_rules2 rules_filename (StringMap.find dict "verb"); |
576 | 534 | () |
577 | 535 | |
578 | -let find_validated_lemata_noun path filename = | |
579 | - let dict = load_tab (path ^ filename) in | |
580 | - let dict = map_of_tab dict in | |
581 | - StringMap.fold dict [] (fun lemmata lemma forms -> | |
582 | - if Rules.validate_entry lemma forms then lemma :: lemmata else lemmata) | |
583 | - | |
584 | 536 | let find_tags_lemata_noun path filename = |
585 | 537 | let dict = load_tab (path ^ filename) in |
586 | 538 | let dict = map_of_tab dict in |
... | ... | @@ -591,19 +543,6 @@ let find_tags_lemata_noun path filename = |
591 | 543 | String.concat " " (Xlist.map l (fun (k,v) -> k ^ "=" ^ v)) |
592 | 544 | ))))) |
593 | 545 | |
594 | -let find_validated_lemata_verb2 dict = | |
595 | - let dict = map_of_tab dict in | |
596 | - StringMap.fold dict [] (fun lemmata lemma forms -> | |
597 | - if Rules.validate_entry lemma forms then lemma :: lemmata else lemmata) | |
598 | - | |
599 | -let find_validated_lemata_verb path filename = | |
600 | - let interp_sel = load_interp_sel "data/interps.tab" in | |
601 | - let dict = load_tab (path ^ filename) in | |
602 | - let dict = split_into_groups interp_sel dict in | |
603 | - if StringMap.size dict <> 2 then failwith ("generate_verb_rules: " ^ | |
604 | - String.concat " " (StringMap.fold dict [] (fun l s _ -> s :: l))) else | |
605 | - find_validated_lemata_verb2 (StringMap.find dict "verb") | |
606 | - | |
607 | 546 | let find_tags_lemata_verb2 dict = |
608 | 547 | let dict = map_of_tab dict in |
609 | 548 | StringMap.iter dict (fun lemma forms -> |
... | ... | @@ -632,15 +571,6 @@ let _ = |
632 | 571 | () |
633 | 572 | |
634 | 573 | let _ = |
635 | -(* let l = find_validated_lemata_adj morfeusz_path ("odm_adj_" ^ sgjp_filename) in *) | |
636 | -(* let l = find_validated_lemata_noun morfeusz_path ("odm_noun_" ^ sgjp_filename) in *) | |
637 | -(* let l = find_validated_lemata_verb morfeusz_path ("verb_" ^ sgjp_filename) in *) | |
638 | -(* let l = find_validated_lemata_noun "data/" "nouns_ex.tab" in *) | |
639 | -(* let l = find_validated_lemata_verb "data/" "verbs_ex.tab" in *) | |
640 | -(* Xlist.iter l print_endline; *) | |
641 | - () | |
642 | - | |
643 | -let _ = | |
644 | 574 | (* find_tags_lemata_verb "data/" "verbs_ex.tab"; *) |
645 | 575 | (* find_tags_lemata_noun "data/" "nouns_ex.tab"; *) |
646 | 576 | () |
... | ... | @@ -649,23 +579,6 @@ let _ = |
649 | 579 | (* Rules.print "results/rules/"; *) |
650 | 580 | (* Rules.print_compound_rules "results/compounds.dic" (Rules.create_rules ()); *) |
651 | 581 | () |
652 | -(* FIXME | |
653 | -let find_not_parsed_entries path filename out_filename = | |
654 | - let rules = Rules.create_char_trees (Rules.create_rules ()) in | |
655 | - (* let interp_sel = load_interp_sel "data/interps.tab" in *) | |
656 | - let dict = load_tab (path ^ filename) in | |
657 | - let not_parsed = Xlist.fold dict [] (fun not_parsed (orth,lemma,interp) -> | |
658 | - (* printf "E %s\t%s\t%s\n" orth lemma interp; *) | |
659 | - let simple_lemma = Rules.simplify_lemma lemma in | |
660 | - let forms = Rules.CharTrees.find rules orth in | |
661 | - (* printf "S %d\n" (Xlist.size forms); *) | |
662 | - let forms = Xlist.fold forms [] (fun forms (stem,rule) -> | |
663 | - (* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *) | |
664 | - if stem ^ rule.Rules.set = simple_lemma then (stem,rule) :: forms else forms) in | |
665 | - if forms = [] then (orth,lemma,interp) :: not_parsed else not_parsed) in | |
666 | - File.file_out out_filename (fun file -> | |
667 | - Xlist.iter not_parsed (fun (orth,lemma,interp) -> | |
668 | - fprintf file "%s\t%s\t%s\n" orth lemma interp)) | |
669 | 582 | |
670 | 583 | let interp_rule_string tags interp = |
671 | 584 | let tags = Xlist.sort tags compare_tag in |
... | ... | @@ -699,14 +612,9 @@ let print_interp_rules con_flag group_flag lemma_flag path filename out_filename |
699 | 612 | else interp_rules)) in |
700 | 613 | File.file_out out_filename (fun file -> |
701 | 614 | StringMap.iter interp_rules (fun k (q,l) -> |
702 | - fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) *) | |
615 | + fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) | |
703 | 616 | |
704 | 617 | let _ = |
705 | - (* find_not_parsed_entries morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/not_parsed_odm_adj.tab"; *) | |
706 | - (* find_not_parsed_entries "results/" "not_parsed_odm_adj.tab" "results/not_parsed_odm_adj2.tab"; *) | |
707 | - (* find_not_parsed_entries morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/not_parsed_odm_noun.tab"; *) | |
708 | - (* find_not_parsed_entries morfeusz_path ("adv_" ^ sgjp_filename) "results/not_parsed_adv.tab";*) | |
709 | - (* find_not_parsed_entries morfeusz_path ("verb_" ^ sgjp_filename) "results/not_parsed_verb.tab"; *) | |
710 | 618 | (* print_interp_rules true true true morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/interp_rules_odm_adj.tab"; *) |
711 | 619 | (* print_interp_rules false true true morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/interp_rules_odm_adj2.tab"; *) |
712 | 620 | (* print_interp_rules true true true morfeusz_path ("adv_" ^ sgjp_filename) "results/interp_rules_adv.tab"; *) |
... | ... | @@ -715,78 +623,4 @@ let _ = |
715 | 623 | (* print_interp_rules true true true morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/interp_rules_odm_noun.tab"; |
716 | 624 | print_interp_rules true true false morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/interp_rules_odm_noun2.tab"; *) |
717 | 625 | () |
718 | - | |
719 | -(*** | |
720 | -let expand_tags tags = | |
721 | - if tags = "" then [] else | |
722 | - List.flatten (Xlist.map (Xstring.split "|" tags) (fun tags -> | |
723 | - let tags = Xlist.map (Xstring.split ":" tags) (Xstring.split "\\.") in | |
724 | - Xlist.map (Xlist.multiply_list tags) (String.concat ":"))) | |
725 | - | |
726 | -let prepare_rules l = | |
727 | - Xlist.fold l [] (fun rules rule_set_name -> | |
728 | - let rule_set = StringMap.find rule_map rule_set_name in | |
729 | - Xlist.fold rule_set rules (fun rules (alternation_name, sufix, tags) -> | |
730 | - let alternation = StringMap.find alternation_map alternation_name in | |
731 | - Xlist.fold alternation rules (fun rules (a,b) -> | |
732 | - (a ^ sufix, b, expand_tags tags) :: rules))) | |
733 | - | |
734 | -let prepare_rules_simple l = | |
735 | - Xlist.fold l [] (fun rules rule_set_name -> | |
736 | - let rule_set = StringMap.find rule_map rule_set_name in | |
737 | - Xlist.fold rule_set rules (fun rules (alternation_name, sufix, tags) -> | |
738 | - let alternation = StringMap.find alternation_map alternation_name in | |
739 | - Xlist.fold alternation rules (fun rules (a,b) -> | |
740 | - (a ^ sufix, b, [tags]) :: rules))) | |
741 | - | |
742 | -let rules_adj_flex = prepare_rules_simple ["ADJ-FLEX"] | |
743 | -let rules_adj_lemma = prepare_rules ["ADJ-LEMMA"] | |
744 | - | |
745 | -let rules_a = prepare_rules ["NOUN-FLEX-GENERAL";"NOUN-FLEX-A"] | |
746 | -let rules_noun_as_adj = prepare_rules ["NOUN-FLEX-GENERAL";"NOUN-ADJ-FLEX"] | |
747 | -let rules_noun_lemma = prepare_rules ["NOUN-LEMMA"] | |
748 | - | |
749 | -let is_applicable_rule (a,_,_) s = check_sufix a s | |
750 | - | |
751 | -let apply_rule (a,b,_) s = | |
752 | - (cut_sufix a s) ^ b | |
753 | - | |
754 | -let match_interp (_,_,l) s = | |
755 | - Xlist.mem l s | |
756 | - | |
757 | -let get_interps (_,_,l) = l | |
758 | - | |
759 | -let apply_rules rules s = | |
760 | - Xlist.fold rules [] (fun l rule -> | |
761 | - if is_applicable_rule rule s then | |
762 | - (apply_rule rule s, get_interps rule) :: l | |
763 | - else l) | |
764 | - | |
765 | -let check_inflexion rules stem interps = | |
766 | - StringMap.fold interps true (fun b interp orths -> | |
767 | - Xlist.fold orths b (fun b orth -> | |
768 | - let c = Xlist.fold rules false (fun c rule -> | |
769 | - if is_applicable_rule rule orth && match_interp rule interp then | |
770 | - if apply_rule rule orth = stem then true else c | |
771 | - else c) in | |
772 | - if c then b else false)) | |
773 | - | |
774 | -let has_known_inflexion_noun stem interps = | |
775 | - let b1 = check_inflexion rules_a stem interps in | |
776 | - let b2 = check_inflexion rules_noun_as_adj stem interps in | |
777 | - b1 || b2 | |
778 | - | |
779 | -let has_known_inflexion_adj stem interps = | |
780 | - let b = check_inflexion rules_adj_flex stem interps in | |
781 | - b | |
782 | - | |
783 | -let select_inflexion rules stem interps = | |
784 | - StringMap.fold interps StringMap.empty (fun interps interp orths -> | |
785 | - let orths = Xlist.fold orths [] (fun orths orth -> | |
786 | - let c = Xlist.fold rules false (fun c rule -> | |
787 | - if is_applicable_rule rule orth && match_interp rule interp then | |
788 | - if apply_rule rule orth = stem then true else c | |
789 | - else c) in | |
790 | - if c then orths else orth :: orths) in | |
791 | - if orths = [] then interps else StringMap.add interps interp orths) | |
792 | -***) | |
626 | +*) | |
... | ... |
guesser/makefile
... | ... | @@ -6,7 +6,7 @@ OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa |
7 | 7 | |
8 | 8 | all: |
9 | - $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) rules.ml stem.ml ruleGenerator.ml generate.ml | |
9 | + $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) types.ml rules.ml stem.ml ruleGenerator.ml dict.ml generate.ml | |
10 | 10 | |
11 | 11 | lib: |
12 | 12 | $(OCAMLOPT) -linkall -a -o inflexion.cmxa $(INCLUDES) $(MODS3) |
... | ... |
guesser/ruleGenerator.ml
1 | 1 | open Xstd |
2 | 2 | open Printf |
3 | +open Types | |
3 | 4 | |
4 | 5 | let alternation_map = Rules.alternation_map |
5 | 6 | |
... | ... | @@ -163,13 +164,13 @@ let generate_rule stem stem_pref orth = |
163 | 164 | let c,f = rule_code (a,b) in |
164 | 165 | if f then "\t" ^ c else sprintf "%s\t%s\t%s" c a b |
165 | 166 | |
166 | -let rec classify_entry lemma stem forms = function | |
167 | +let rec classify_entry entry = function | |
167 | 168 | (class_interp,suf,cl) :: class_sel -> |
168 | - let l = Xlist.fold forms [] (fun l (orth,interp) -> | |
169 | - if interp = class_interp then orth :: l else l) in | |
169 | + let l = Xlist.fold entry.forms [] (fun l form -> | |
170 | + if form.interp = class_interp then form.orth :: l else l) in | |
170 | 171 | let b = Xlist.fold l false (fun b orth -> |
171 | 172 | if Xstring.check_sufix suf orth then true else b) in |
172 | - if b then cl else classify_entry lemma stem forms class_sel | |
173 | + if b then cl else classify_entry entry class_sel | |
173 | 174 | (* let l = StringSet.to_list (Xlist.fold l StringSet.empty (fun set orth -> |
174 | 175 | if check_prefix stem orth then |
175 | 176 | StringSet.add set (cut_prefix stem orth) |
... | ... | @@ -210,25 +211,21 @@ let entry_classes = |
210 | 211 | "subst:sg:nom:n2","um","UM"; |
211 | 212 | ] |
212 | 213 | |
213 | -let generate_rules_entry cat rules lemma forms = | |
214 | - let stem = Stem.generate_stem cat lemma forms in | |
215 | - let stem_pref = Stem.cut_stem_sufix stem in | |
216 | - let cl = classify_entry lemma stem forms entry_classes in | |
217 | - let forms = Rules.select_not_validated lemma forms in | |
218 | - Xlist.fold forms rules (fun rules (orth,interp) -> | |
219 | - let rule = cl ^ "\t" ^ generate_rule stem stem_pref orth in | |
220 | - let rules2 = try StringMap.find rules interp with Not_found -> StringMap.empty in | |
221 | - let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in | |
222 | - StringMap.add rules interp rules2) | |
214 | +let generate_rules_entry rules entry = | |
215 | + let stem_pref = Stem.cut_stem_sufix entry.stem in | |
216 | + let cl = classify_entry entry entry_classes in | |
217 | + Xlist.fold entry.forms rules (fun rules form -> | |
218 | + let rule = cl ^ "\t" ^ generate_rule entry.stem stem_pref form.orth in | |
219 | + let rules2 = try StringMap.find rules form.interp with Not_found -> StringMap.empty in | |
220 | + let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in | |
221 | + StringMap.add rules form.interp rules2) | |
223 | 222 | |
224 | -let generate_rules_com_entry cat rules lemma forms = | |
225 | - let stem = Stem.generate_stem cat lemma forms in | |
226 | - let stem_pref = Stem.cut_stem_sufix stem in | |
227 | -(* let cl = classify_entry lemma stem forms entry_classes in *) | |
228 | - let forms = Rules.select_not_validated lemma forms in | |
229 | - Xlist.fold forms rules (fun rules (orth,interp) -> | |
230 | - if not (Xstring.check_sufix ":com" interp) then rules else | |
231 | - let rule = "\t" ^ generate_rule stem stem_pref orth in | |
232 | - let rules2 = try StringMap.find rules interp with Not_found -> StringMap.empty in | |
233 | - let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in | |
234 | - StringMap.add rules interp rules2) | |
223 | +let generate_rules_com_entry rules entry = | |
224 | + let stem_pref = Stem.cut_stem_sufix entry.stem in | |
225 | +(* let cl = classify_entry entry entry_classes in *) | |
226 | + Xlist.fold entry.forms rules (fun rules form -> | |
227 | + if not (Xstring.check_sufix ":com" form.interp) then rules else | |
228 | + let rule = "\t" ^ generate_rule entry.stem stem_pref form.orth in | |
229 | + let rules2 = try StringMap.find rules form.interp with Not_found -> StringMap.empty in | |
230 | + let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in | |
231 | + StringMap.add rules form.interp rules2) | |
... | ... |
guesser/rules.ml
... | ... | @@ -282,7 +282,7 @@ let interp_compound_rule_trees = CharTrees.create interp_compound_rules |
282 | 282 | (**********************************************************************************************) |
283 | 283 | (**********************************************************************************************) |
284 | 284 | |
285 | -let rec select_tag tag rev = function | |
285 | +(* let rec select_tag tag rev = function | |
286 | 286 | [] -> "", rev |
287 | 287 | | (k,v) :: l -> if k = tag then v, rev @ l else select_tag tag ((k,v) :: rev) l |
288 | 288 | |
... | ... | @@ -375,4 +375,4 @@ let string_of_rule rule = |
375 | 375 | let print_compound_rules filename rules = |
376 | 376 | File.file_out filename (fun file -> |
377 | 377 | Xlist.iter rules (fun rule -> |
378 | - fprintf file "%s\n" (string_of_rule rule))) | |
378 | + fprintf file "%s\n" (string_of_rule rule))) *) | |
... | ... |
guesser/stem.ml
1 | 1 | open Xstd |
2 | 2 | open Printf |
3 | +open Types | |
3 | 4 | |
4 | 5 | (* Zakładam, że zbiór form należy do jednego leksemu *) |
5 | 6 | |
... | ... | @@ -120,18 +121,18 @@ let simplify_lemma s = |
120 | 121 | | [s;_] -> s |
121 | 122 | | _ -> failwith "simplify_lemma" |
122 | 123 | |
123 | -let generate_stem cat lemma forms = | |
124 | - let orth = simplify_lemma lemma in | |
125 | - let lemma_stem_sel = try StringMap.find lemma_stem_sel cat with Not_found -> [] in | |
124 | +let generate_stem entry = | |
125 | + let orth = simplify_lemma entry.lemma in | |
126 | + let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in | |
126 | 127 | let stems = Xlist.fold lemma_stem_sel StringMap.empty (fun stems sel -> |
127 | 128 | if is_applicable_sel sel orth then |
128 | 129 | StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel)) |
129 | 130 | else stems) in |
130 | - let stems2 = Xlist.fold forms StringMap.empty (fun stems (orth,interp) -> | |
131 | - let sels = try StringMap.find stem_sel interp with Not_found -> [] in | |
131 | + let stems2 = Xlist.fold entry.forms StringMap.empty (fun stems form -> | |
132 | + let sels = try StringMap.find stem_sel form.interp with Not_found -> [] in | |
132 | 133 | Xlist.fold sels stems (fun stems sel -> |
133 | - if is_applicable_sel sel orth then | |
134 | - StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel)) | |
134 | + if is_applicable_sel sel form.orth then | |
135 | + StringMap.add_inc stems (apply_sel sel form.orth) (get_priority sel) (fun priority -> min priority (get_priority sel)) | |
135 | 136 | else stems)) in |
136 | 137 | let stems = if StringMap.is_empty stems then stems2 else stems in |
137 | 138 | let stems,_ = StringMap.fold stems ([],max_int) (fun (stems,priority) stem p -> |
... | ... | @@ -139,12 +140,12 @@ let generate_stem cat lemma forms = |
139 | 140 | if p > priority then stems,priority else |
140 | 141 | stem :: stems, priority) in |
141 | 142 | match stems with |
142 | - [] -> (*print_endline ("stem not found for " ^ lemma); | |
143 | - Xlist.iter forms (fun (orth,interp) -> printf " %s\t%s\n" orth interp);*) | |
143 | + [] -> (*print_endline ("stem not found for " ^ entry.lemma); | |
144 | + Xlist.iter entry.forms (fun (form.orth,form.interp) -> printf " %s\t%s\n" form.orth form.interp);*) | |
144 | 145 | "" |
145 | 146 | | [s] -> s |
146 | - | l -> print_endline ("many stems found for " ^ lemma ^ ": " ^ String.concat " " l); "" | |
147 | - (*printf "\"%s\"; " lemma; ""*) | |
147 | + | l -> print_endline ("many stems found for " ^ entry.lemma ^ ": " ^ String.concat " " l); "" | |
148 | + (*printf "\"%s\"; " entry.lemma; ""*) | |
148 | 149 | |
149 | 150 | let rec merge_digraph = function |
150 | 151 | [] -> [] |
... | ... | @@ -164,13 +165,6 @@ let rec merge_digraph = function |
164 | 165 | | "q" :: "u" :: l -> "qu" :: (merge_digraph l) |
165 | 166 | | s :: l -> s :: (merge_digraph l) |
166 | 167 | |
167 | -(*let text_to_chars s = | |
168 | - (try UTF8.validate s with UTF8.Malformed_code -> failwith ("Invalid UTF8 string: " ^ s)); | |
169 | - let r = ref [] in | |
170 | - UTF8.iter (fun c -> | |
171 | - r := (UTF8.init 1 (fun _ -> c)) :: (!r)) s; | |
172 | - merge_digraph (List.rev (!r))*) | |
173 | - | |
174 | 168 | let cut_stem_sufix s = |
175 | 169 | let l = Xunicode.utf8_chars_of_utf8_string (*text_to_chars*) s in |
176 | 170 | let l = match List.rev l with |
... | ... |
guesser/types.ml
0 → 100644