Commit 78e7e267fa3cce13bebcd3e10dc5e2b490d59322
1 parent
baf1c22d
dodanie inflexion
Showing
6 changed files
with
135 additions
and
24 deletions
guesser/data/schemata.dic
1 | 1 | KOLWIEK-SUFFIXES ADJ-FLEX ADJ-LEMMA |
2 | 2 | PREF-SUP ADJ-FLEX-GRAD ADJ-GRAD ADJ-LEMMA |
3 | -PREF-SUP ADV-FLEX ADV-LEMMA | |
4 | -NOUN-FLEX NOUN-LEMMA | |
5 | -NOUN-FLEX-CAP NOUN-LEMMA-CAP | |
3 | +#PREF-SUP ADV-FLEX ADV-LEMMA | |
4 | +#NOUN-FLEX NOUN-LEMMA | |
5 | +#NOUN-FLEX-CAP NOUN-LEMMA-CAP | |
6 | 6 | #PREF-ε FIN-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX |
7 | -PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
8 | -PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
7 | +#PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
8 | +#PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
9 | 9 | #PREF-ε PRAET-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX |
10 | 10 | #PREF-ε FIN-FLEX-J VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX |
11 | -PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
12 | -PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
13 | -PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
11 | +#PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
12 | +#PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
13 | +#PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX | |
14 | 14 | #PREF-ε FIN-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL |
15 | -PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL | |
16 | -PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL | |
15 | +#PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL | |
16 | +#PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL | |
17 | 17 | #PREF-ε PRAET-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL |
18 | 18 | #PREF-ε FIN-FLEX-J VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL |
19 | -PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL | |
20 | -PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL | |
21 | -PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL | |
19 | +#PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL | |
20 | +#PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL | |
21 | +#PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL | |
22 | 22 | #PREF-ε FIN-FLEX VERB-FLEX2 VERB-GROUP-NĄ VERB-LEMMA-NĄ |
23 | 23 | #PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-NĄ VERB-LEMMA-NĄ |
24 | 24 | #PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-NĄ VERB-LEMMA-NĄ |
... | ... |
guesser/dict.ml
... | ... | @@ -331,7 +331,7 @@ let validate dict = |
331 | 331 | if candidates = [] then {form with validated=false} else {form with validated=true}) in |
332 | 332 | {entry with forms=forms}) |
333 | 333 | |
334 | -let validate_interp dict = | |
334 | +let validate_interp rules dict = | |
335 | 335 | let rules = Rules.interp_compound_rule_trees in |
336 | 336 | Xlist.rev_map dict (fun entry -> |
337 | 337 | let simple_lemma = Stem.simplify_lemma entry.lemma in |
... | ... | @@ -428,7 +428,7 @@ let generate_interp_rules con_flag group_flag lemma_flag path filename rules_fil |
428 | 428 | let dict = remove_exceptional_lemmata dict in |
429 | 429 | (* printf "g5 %d\n%!" (Xlist.size dict); *) |
430 | 430 | let dict = generate_stem dict in |
431 | - let dict = validate_interp dict in | |
431 | + let dict = validate_interp Rules.interp_compound_rule_trees dict in | |
432 | 432 | let dict = remove_validated_forms dict in |
433 | 433 | (* printf "g6 %d\n%!" (Xlist.size dict); *) |
434 | 434 | (* printf "g7 %d\n%!" (Xlist.size dict); *) |
... | ... | @@ -441,3 +441,52 @@ let generate_interp_rules con_flag group_flag lemma_flag path filename rules_fil |
441 | 441 | File.file_out rules_filename (fun file -> |
442 | 442 | StringMap.iter interp_rules (fun k (q,l) -> |
443 | 443 | fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) |
444 | + | |
445 | +let generate_rule_frequencies path filename rules_filename = | |
446 | + let rules = Rules.interp_compound_rule_trees in | |
447 | + let dict = load_tab (path ^ filename) in | |
448 | + (* printf "g1 %d\n%!" (Xlist.size dict); *) | |
449 | + let dict = merge_entries dict in | |
450 | + (* printf "g2 %d\n%!" (Xlist.size dict); *) | |
451 | + let dict = mark_ndm dict in (* FIXME: remove_ndm? *) | |
452 | + (* printf "g3 %d\n%!" (Xlist.size dict); *) | |
453 | + (* printf "g4 %d\n%!" (Xlist.size dict); *) | |
454 | + let dict = remove_exceptional_lemmata dict in | |
455 | + (* printf "g5 %d\n%!" (Xlist.size dict); *) | |
456 | + let dict = generate_stem dict in | |
457 | + (* printf "g6 %d\n%!" (Xlist.size dict); *) | |
458 | + (* printf "g7 %d\n%!" (Xlist.size dict); *) | |
459 | + let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry -> | |
460 | + let simple_lemma = Stem.simplify_lemma entry.lemma in | |
461 | + Xlist.fold entry.forms freq_rules (fun freq_rules form -> | |
462 | + (* printf "E %s\t%s\t%s\n" orth lemma interp; *) | |
463 | + let candidates = Rules.CharTrees.find rules form.orth in | |
464 | + (* printf "S %d\n" (Xlist.size forms); *) | |
465 | + let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) -> | |
466 | + (* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *) | |
467 | + if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then | |
468 | + (stem,rule) :: candidates else candidates) in | |
469 | + if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in | |
470 | + File.file_out rules_filename (fun file -> | |
471 | + Rules.RuleQMap.iter freq_rules (fun rule freq -> | |
472 | + fprintf file "%s\n" (Rules.string_of_freq_rule {rule with Rules.freq=freq}))) | |
473 | + | |
474 | +let generate_stem_dict rules_filename path filename out_filename = | |
475 | + let rules = Rules.load_freq_rules rules_filename in | |
476 | + let rules = Rules.CharTrees.create rules in | |
477 | + let dict = load_tab (path ^ filename) in | |
478 | + let dict = merge_entries dict in | |
479 | + let dict = mark_ndm dict in (* FIXME: remove_ndm? *) | |
480 | + let stems = Xlist.fold dict StringMap.empty (fun stems entry -> | |
481 | + let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in | |
482 | + Xlist.fold entry.forms stems (fun stems form -> | |
483 | + let candidates = Rules.CharTrees.find rules form.orth in | |
484 | + let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) -> | |
485 | + if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then | |
486 | + (stem,rule) :: candidates else candidates) in | |
487 | + if candidates = [] then stems else | |
488 | + let stem,rule = List.hd candidates in | |
489 | + StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.Rules.id] (fun l -> rule.Rules.id :: l))) in | |
490 | + File.file_out out_filename (fun file -> | |
491 | + StringMap.iter stems (fun stem ids -> | |
492 | + fprintf file "%s\t%s\n" stem (String.concat " " ids))) | |
... | ... |
guesser/generate.ml
... | ... | @@ -192,7 +192,7 @@ let _ = |
192 | 192 | |
193 | 193 | (* Generowanie reguł dla interpretacji *) |
194 | 194 | let _ = |
195 | - Dict.generate_interp_rules true true true morfeusz_path odm_adj_sgjp_filename "results/interp_rules_odm_adj.tab"; | |
195 | + (* Dict.generate_interp_rules true true true morfeusz_path odm_adj_sgjp_filename "results/interp_rules_odm_adj.tab"; *) | |
196 | 196 | (* Dict.generate_interp_rules false true true morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/interp_rules_odm_adj2.tab"; *) |
197 | 197 | (* Dict.generate_interp_rules true true true morfeusz_path ("adv_" ^ sgjp_filename) "results/interp_rules_adv.tab"; *) |
198 | 198 | (* Dict.generate_interp_rules true true true morfeusz_path ("verb_" ^ sgjp_filename) "results/interp_rules_verb.tab"; |
... | ... | @@ -201,6 +201,29 @@ Dict.generate_interp_rules true false true morfeusz_path ("verb_" ^ sgjp_filenam |
201 | 201 | Dict.generate_interp_rules true true false morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/interp_rules_odm_noun2.tab";*) |
202 | 202 | () |
203 | 203 | |
204 | +(* Generowanie złożonych reguł zaopatrzonych we frekwencje *) | |
205 | +let _ = | |
206 | + (* Dict.generate_rule_frequencies morfeusz_path odm_adj_sgjp_filename "results/freq_rules-odm-adj.txt"; *) | |
207 | + () | |
208 | + | |
209 | +let generate_alt rules_filename path filename out_filename = (* loads frequency rules from rules_filename, runs the dictionary at path ^ filename through validate_interp and remove_validated_forms, then hands the result to Dict.print with out_filename *) | |
210 | + let rules = Rules.load_freq_rules rules_filename in | |
211 | + let rules = Rules.CharTrees.create rules in | |
212 | + let dict = Dict.load_tab (path ^ filename) in | |
213 | + let dict = Dict.merge_entries dict in | |
214 | + let dict = Dict.validate_interp rules dict in (* NOTE(review): validate_interp as shown in dict.ml rebinds rules to Rules.interp_compound_rule_trees on its first line, so the rules passed here look ignored - confirm *) | |
215 | + let dict = Dict.remove_validated_forms dict in (* presumably leaves only the forms the rules failed to validate, i.e. the exception list - verify against Dict *) | |
216 | + Dict.print out_filename dict | |
217 | + | |
218 | +(* Walidacja reguł zaopatrzonych we frekwencje/generowanie listy wyjątków *) | |
219 | +let _ = | |
220 | + (* generate_alt "results/freq_rules-odm-adj.txt" morfeusz_path odm_adj_sgjp_filename "results/alt-odm-adj.txt"; *) | |
221 | + () | |
222 | + | |
223 | +(* Generowanie stemów z regułami *) | |
224 | +let _ = | |
225 | + (* Dict.generate_stem_dict "results/freq_rules-odm-adj.txt" morfeusz_path odm_adj_sgjp_filename "results/stem-odm-adj.txt"; *) | |
226 | + () | |
204 | 227 | |
205 | 228 | (**********************************************************************************) |
206 | 229 | |
... | ... |
guesser/makefile
... | ... | @@ -7,6 +7,7 @@ OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa |
7 | 7 | |
8 | 8 | all: |
9 | 9 | $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) types.ml rules.ml stem.ml ruleGenerator.ml dict.ml generate.ml |
10 | + $(OCAMLOPT) -o inflexion $(OCAMLOPTFLAGS) types.ml rules.ml stem.ml ruleGenerator.ml dict.ml inflexion.ml | |
10 | 11 | |
11 | 12 | lib: |
12 | 13 | $(OCAMLOPT) -linkall -a -o inflexion.cmxa $(INCLUDES) $(MODS3) |
... | ... | @@ -33,4 +34,4 @@ lib: |
33 | 34 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
34 | 35 | |
35 | 36 | clean: |
36 | - rm -f *~ *.cm[oix] *.o generate | |
37 | + rm -f *~ *.cm[oix] *.o generate inflexion | |
... | ... |
guesser/rules.ml
... | ... | @@ -78,7 +78,15 @@ let rules = load_suf_rules "data/rules.dic" |
78 | 78 | let rev_rules = load_suf_rules "data/rev_rules.dic" |
79 | 79 | let pref_rules = load_pref_rules "data/pref_rules.dic" |
80 | 80 | |
81 | -type rule = {star: star; pref: string; find: string; set: string; tags: (string * string) list; interp: string} | |
81 | +type rule = {star: star; pref: string; find: string; set: string; tags: (string * string) list; | |
82 | + interp: string; id: string; freq: int} | |
83 | + | |
84 | +let load_freq_rules filename = (* reads tab-separated rule rows from filename into a list of rule records, consing each row onto the accumulator; raises Failure on a row that does not have exactly 7 fields *) | |
85 | + File.fold_tab filename [] (fun rules -> function | |
86 | + [id; freq; star; pref; find; set; interp] -> (* same field order as written by string_of_freq_rule *) | |
87 | + {id=id; freq=int_of_string freq; star=parse_star star; pref=pref; find=find; set=set; (* int_of_string raises Failure when freq is not a number *) | |
88 | + tags=[]; interp=interp} :: rules (* tags are not stored in the file, so they are loaded empty *) | |
89 | + | line -> failwith ("load_freq_rules: " ^ (String.concat "\t" line))) (* include the offending row, as load_interp_rules does *) | |
82 | 90 | |
83 | 91 | let expand_tags x l = |
84 | 92 | Xlist.map l (function |
... | ... | @@ -94,17 +102,19 @@ let prepare_rules suf_rules = |
94 | 102 | Xlist.fold suf_rules [] (fun rules s -> |
95 | 103 | let alternation = try StringMap.find alternation_map s.salt_name with Not_found -> failwith ("prepare_rules: " ^ s.salt_name) in |
96 | 104 | Xlist.fold alternation rules (fun rules a -> |
97 | - {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind ^ s.ssufix; set=a.aset; tags=expand_tags a.aphone s.stags; interp=""} :: rules)) | |
105 | + {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind ^ s.ssufix; set=a.aset; | |
106 | + tags=expand_tags a.aphone s.stags; interp=""; id=""; freq=0} :: rules)) | |
98 | 107 | |
99 | 108 | let prepare_rev_rules suf_rules = |
100 | 109 | Xlist.fold suf_rules [] (fun rules s -> |
101 | 110 | let alternation = try StringMap.find rev_alternation_map s.salt_name with Not_found -> failwith ("prepare_rev_rules: " ^ s.salt_name) in |
102 | 111 | Xlist.fold alternation rules (fun rules a -> |
103 | - {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind; set=a.aset ^ s.ssufix; tags=expand_tags a.aphone s.stags; interp=""} :: rules)) | |
112 | + {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind; set=a.aset ^ s.ssufix; | |
113 | + tags=expand_tags a.aphone s.stags; interp=""; id=""; freq=0} :: rules)) | |
104 | 114 | |
105 | 115 | let prepare_pref_rules pref_rules = |
106 | 116 | Xlist.fold pref_rules [] (fun rules p -> |
107 | - {star=p.pstar; pref=p.pprefix; find=""; set=""; tags=expand_tags "" p.ptags; interp=""} :: rules) | |
117 | + {star=p.pstar; pref=p.pprefix; find=""; set=""; tags=expand_tags "" p.ptags; interp=""; id=""; freq=0} :: rules) | |
108 | 118 | |
109 | 119 | let rule_map = |
110 | 120 | let map = Xlist.fold rules StringMap.empty (fun map (k,v) -> StringMap.add map k (prepare_rules v)) in |
... | ... | @@ -121,7 +131,7 @@ let rec extract_tag s rev = function |
121 | 131 | |
122 | 132 | let create_compound_rules schemata rule_map = |
123 | 133 | let found = Xlist.fold schemata [] (fun found schema -> |
124 | - let compounds = Xlist.fold schema [{star=Productive;pref="";find="";set="";tags=[];interp=""}] (fun compounds rule_set_name -> | |
134 | + let compounds = Xlist.fold schema [{star=Productive;pref="";find="";set="";tags=[];interp=""; id=""; freq=0}] (fun compounds rule_set_name -> | |
125 | 135 | let rules = try StringMap.find rule_map rule_set_name with Not_found -> failwith ("create_rules: " ^ rule_set_name) in |
126 | 136 | Xlist.fold compounds [] (fun compounds compound -> |
127 | 137 | Xlist.fold rules compounds (fun compounds rule -> |
... | ... | @@ -180,7 +190,7 @@ let load_interp_rules filename = |
180 | 190 | star :: tags :: interp :: comment :: [] -> |
181 | 191 | {star=parse_star star; |
182 | 192 | pref=""; find=""; set=""; |
183 | - tags=expand_tags_simple (parse_tags tags); interp=interp; (*comment=comment*)} | |
193 | + tags=expand_tags_simple (parse_tags tags); interp=interp; (*comment=comment;*) id=""; freq=0} | |
184 | 194 | | line -> failwith ("load_tab: " ^ (String.concat "\t" line))) |
185 | 195 | |
186 | 196 | module InterpTree = struct |
... | ... | @@ -231,7 +241,11 @@ let create_interp_compound_rules interp_tree compound_rules = |
231 | 241 | Xlist.fold interp_rules interp_compound_rules (fun interp_compound_rules interp_rule -> |
232 | 242 | {rule with interp=interp_rule.interp; star=merge_stars (rule.star, interp_rule.star)} :: interp_compound_rules)) |
233 | 243 | |
234 | -let interp_compound_rules = create_interp_compound_rules interp_tree compound_rules | |
244 | +let assign_ids rules = | |
245 | + fst (Xlist.fold rules ([],1) (fun (rules,id) rule -> | |
246 | + {rule with id=string_of_int id} :: rules, id+1)) | |
247 | + | |
248 | +let interp_compound_rules = assign_ids (create_interp_compound_rules interp_tree compound_rules) | |
235 | 249 | |
236 | 250 | (**********************************************************************************************) |
237 | 251 | |
... | ... | @@ -280,6 +294,24 @@ let compound_rule_trees = CharTrees.create compound_rules |
280 | 294 | let interp_compound_rule_trees = CharTrees.create interp_compound_rules |
281 | 295 | |
282 | 296 | (**********************************************************************************************) |
297 | + | |
298 | +module OrderedRule = struct | |
299 | + | |
300 | + type t = rule | |
301 | + | |
302 | + let compare = compare | |
303 | + | |
304 | +end | |
305 | + | |
306 | +module RuleQMap = Xmap.MakeQ(OrderedRule) | |
307 | + | |
308 | +let string_of_star = function | |
309 | + Productive -> "" | |
310 | + | Star -> "*" | |
311 | + | |
312 | +let string_of_freq_rule rule = | |
313 | + sprintf "%s\t%d\t%s\t%s\t%s\t%s\t%s" rule.id rule.freq (string_of_star rule.star) rule.pref rule.find rule.set rule.interp | |
314 | + | |
283 | 315 | (**********************************************************************************************) |
284 | 316 | |
285 | 317 | (* let is_applicable_rule rule s = Xstring.check_sufix rule.find s |
... | ... |
guesser/stem.ml
... | ... | @@ -121,6 +121,12 @@ let simplify_lemma s = |
121 | 121 | | [s;_] -> s |
122 | 122 | | _ -> failwith "simplify_lemma" |
123 | 123 | |
124 | +let simplify_lemma_full s = (* splits s on ":" and returns a pair: one piece gives (piece, ""), two pieces give (first, second) *) | |
125 | + match Xstring.split ":" s with | |
126 | + [s] -> s,"" | |
127 | + | [s;t] -> s,t | |
128 | + | _ -> failwith "simplify_lemma_full" (* any other number of pieces; raised under this function's own name, not simplify_lemma's *) | |
129 | + | |
124 | 130 | let generate_stem entry = |
125 | 131 | let orth = simplify_lemma entry.lemma in |
126 | 132 | let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in |
... | ... |