Commit 78e7e267fa3cce13bebcd3e10dc5e2b490d59322

Authored by Wojciech Jaworski
1 parent baf1c22d

dodanie inflexion

guesser/data/schemata.dic
1 1 KOLWIEK-SUFFIXES ADJ-FLEX ADJ-LEMMA
2 2 PREF-SUP ADJ-FLEX-GRAD ADJ-GRAD ADJ-LEMMA
3   -PREF-SUP ADV-FLEX ADV-LEMMA
4   -NOUN-FLEX NOUN-LEMMA
5   -NOUN-FLEX-CAP NOUN-LEMMA-CAP
  3 +#PREF-SUP ADV-FLEX ADV-LEMMA
  4 +#NOUN-FLEX NOUN-LEMMA
  5 +#NOUN-FLEX-CAP NOUN-LEMMA-CAP
6 6 #PREF-ε FIN-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
7   -PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
8   -PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
  7 +#PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
  8 +#PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
9 9 #PREF-ε PRAET-FLEX VERB-FLEX2 VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
10 10 #PREF-ε FIN-FLEX-J VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
11   -PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
12   -PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
13   -PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
  11 +#PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
  12 +#PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
  13 +#PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-SUFIX VERB-LEMMA-SUFIX
14 14 #PREF-ε FIN-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL
15   -PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL
16   -PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL
  15 +#PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL
  16 +#PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL
17 17 #PREF-ε PRAET-FLEX VERB-FLEX2 VERB-GROUP-PATAL VERB-LEMMA-PATAL
18 18 #PREF-ε FIN-FLEX-J VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
19   -PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
20   -PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
21   -PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
  19 +#PREF-NIE GER-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
  20 +#PREF-NIE PACT-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
  21 +#PREF-NIE PPAS-FLEX VERB-FLEX2-J VERB-GROUP-J-PATAL VERB-LEMMA-PATAL
22 22 #PREF-ε FIN-FLEX VERB-FLEX2 VERB-GROUP-NĄ VERB-LEMMA-NĄ
23 23 #PREF-NIE GER-FLEX VERB-FLEX2 VERB-GROUP-NĄ VERB-LEMMA-NĄ
24 24 #PREF-NIE PPAS-FLEX VERB-FLEX2 VERB-GROUP-NĄ VERB-LEMMA-NĄ
... ...
guesser/dict.ml
... ... @@ -331,7 +331,7 @@ let validate dict =
331 331 if candidates = [] then {form with validated=false} else {form with validated=true}) in
332 332 {entry with forms=forms})
333 333  
334   -let validate_interp dict =
  334 +let validate_interp rules dict =
335 335 let rules = Rules.interp_compound_rule_trees in
336 336 Xlist.rev_map dict (fun entry ->
337 337 let simple_lemma = Stem.simplify_lemma entry.lemma in
... ... @@ -428,7 +428,7 @@ let generate_interp_rules con_flag group_flag lemma_flag path filename rules_fil
428 428 let dict = remove_exceptional_lemmata dict in
429 429 (* printf "g5 %d\n%!" (Xlist.size dict); *)
430 430 let dict = generate_stem dict in
431   - let dict = validate_interp dict in
  431 + let dict = validate_interp Rules.interp_compound_rule_trees dict in
432 432 let dict = remove_validated_forms dict in
433 433 (* printf "g6 %d\n%!" (Xlist.size dict); *)
434 434 (* printf "g7 %d\n%!" (Xlist.size dict); *)
... ... @@ -441,3 +441,52 @@ let generate_interp_rules con_flag group_flag lemma_flag path filename rules_fil
441 441 File.file_out rules_filename (fun file ->
442 442 StringMap.iter interp_rules (fun k (q,l) ->
443 443 fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l)))
  444 +
  445 +let generate_rule_frequencies path filename rules_filename =
  446 + let rules = Rules.interp_compound_rule_trees in
  447 + let dict = load_tab (path ^ filename) in
  448 + (* printf "g1 %d\n%!" (Xlist.size dict); *)
  449 + let dict = merge_entries dict in
  450 + (* printf "g2 %d\n%!" (Xlist.size dict); *)
  451 + let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
  452 + (* printf "g3 %d\n%!" (Xlist.size dict); *)
  453 + (* printf "g4 %d\n%!" (Xlist.size dict); *)
  454 + let dict = remove_exceptional_lemmata dict in
  455 + (* printf "g5 %d\n%!" (Xlist.size dict); *)
  456 + let dict = generate_stem dict in
  457 + (* printf "g6 %d\n%!" (Xlist.size dict); *)
  458 + (* printf "g7 %d\n%!" (Xlist.size dict); *)
  459 + let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
  460 + let simple_lemma = Stem.simplify_lemma entry.lemma in
  461 + Xlist.fold entry.forms freq_rules (fun freq_rules form ->
  462 + (* printf "E %s\t%s\t%s\n" orth lemma interp; *)
  463 + let candidates = Rules.CharTrees.find rules form.orth in
  464 + (* printf "S %d\n" (Xlist.size forms); *)
  465 + let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
  466 + (* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *)
  467 + if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then
  468 + (stem,rule) :: candidates else candidates) in
  469 + if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in
  470 + File.file_out rules_filename (fun file ->
  471 + Rules.RuleQMap.iter freq_rules (fun rule freq ->
  472 + fprintf file "%s\n" (Rules.string_of_freq_rule {rule with Rules.freq=freq})))
  473 +
  474 +let generate_stem_dict rules_filename path filename out_filename =
  475 + let rules = Rules.load_freq_rules rules_filename in
  476 + let rules = Rules.CharTrees.create rules in
  477 + let dict = load_tab (path ^ filename) in
  478 + let dict = merge_entries dict in
  479 + let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
  480 + let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
  481 + let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
  482 + Xlist.fold entry.forms stems (fun stems form ->
  483 + let candidates = Rules.CharTrees.find rules form.orth in
  484 + let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
  485 + if stem ^ rule.Rules.set = simple_lemma && form.interp = rule.Rules.interp then
  486 + (stem,rule) :: candidates else candidates) in
  487 + if candidates = [] then stems else
  488 + let stem,rule = List.hd candidates in
  489 + StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.Rules.id] (fun l -> rule.Rules.id :: l))) in
  490 + File.file_out out_filename (fun file ->
  491 + StringMap.iter stems (fun stem ids ->
  492 + fprintf file "%s\t%s\n" stem (String.concat " " ids)))
... ...
guesser/generate.ml
... ... @@ -192,7 +192,7 @@ let _ =
192 192  
193 193 (* Generowanie reguł dla interpretacji *)
194 194 let _ =
195   - Dict.generate_interp_rules true true true morfeusz_path odm_adj_sgjp_filename "results/interp_rules_odm_adj.tab";
  195 + (* Dict.generate_interp_rules true true true morfeusz_path odm_adj_sgjp_filename "results/interp_rules_odm_adj.tab"; *)
196 196 (* Dict.generate_interp_rules false true true morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/interp_rules_odm_adj2.tab"; *)
197 197 (* Dict.generate_interp_rules true true true morfeusz_path ("adv_" ^ sgjp_filename) "results/interp_rules_adv.tab"; *)
198 198 (* Dict.generate_interp_rules true true true morfeusz_path ("verb_" ^ sgjp_filename) "results/interp_rules_verb.tab";
... ... @@ -201,6 +201,29 @@ Dict.generate_interp_rules true false true morfeusz_path ("verb_" ^ sgjp_filenam
201 201 Dict.generate_interp_rules true true false morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/interp_rules_odm_noun2.tab";*)
202 202 ()
203 203  
  204 +(* Generowanie złożonych reguł zaopatrzonych we frekwencje *)
  205 +let _ =
  206 + (* Dict.generate_rule_frequencies morfeusz_path odm_adj_sgjp_filename "results/freq_rules-odm-adj.txt"; *)
  207 + ()
  208 +
  209 +let generate_alt rules_filename path filename out_filename =
  210 + let rules = Rules.load_freq_rules rules_filename in
  211 + let rules = Rules.CharTrees.create rules in
  212 + let dict = Dict.load_tab (path ^ filename) in
  213 + let dict = Dict.merge_entries dict in
  214 + let dict = Dict.validate_interp rules dict in
  215 + let dict = Dict.remove_validated_forms dict in
  216 + Dict.print out_filename dict
  217 +
  218 +(* Walidacja reguł zaopatrzonych we frekwencje/generowanie listy wyjątków *)
  219 +let _ =
  220 + (* generate_alt "results/freq_rules-odm-adj.txt" morfeusz_path odm_adj_sgjp_filename "results/alt-odm-adj.txt"; *)
  221 + ()
  222 +
  223 +(* Generowanie stemów z regułami *)
  224 +let _ =
  225 + (* Dict.generate_stem_dict "results/freq_rules-odm-adj.txt" morfeusz_path odm_adj_sgjp_filename "results/stem-odm-adj.txt"; *)
  226 + ()
204 227  
205 228 (**********************************************************************************)
206 229  
... ...
guesser/makefile
... ... @@ -7,6 +7,7 @@ OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa
7 7  
8 8 all:
9 9 $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) types.ml rules.ml stem.ml ruleGenerator.ml dict.ml generate.ml
  10 + $(OCAMLOPT) -o inflexion $(OCAMLOPTFLAGS) types.ml rules.ml stem.ml ruleGenerator.ml dict.ml inflexion.ml
10 11  
11 12 lib:
12 13 $(OCAMLOPT) -linkall -a -o inflexion.cmxa $(INCLUDES) $(MODS3)
... ... @@ -33,4 +34,4 @@ lib:
33 34 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
34 35  
35 36 clean:
36   - rm -f *~ *.cm[oix] *.o generate
  37 + rm -f *~ *.cm[oix] *.o generate inflexion
... ...
guesser/rules.ml
... ... @@ -78,7 +78,15 @@ let rules = load_suf_rules "data/rules.dic"
78 78 let rev_rules = load_suf_rules "data/rev_rules.dic"
79 79 let pref_rules = load_pref_rules "data/pref_rules.dic"
80 80  
81   -type rule = {star: star; pref: string; find: string; set: string; tags: (string * string) list; interp: string}
  81 +type rule = {star: star; pref: string; find: string; set: string; tags: (string * string) list;
  82 + interp: string; id: string; freq: int}
  83 +
  84 +let load_freq_rules filename =
  85 + File.fold_tab filename [] (fun rules -> function
  86 + [id; freq; star; pref; find; set; interp] ->
  87 + {id=id; freq=int_of_string freq; star=parse_star star; pref=pref; find=find; set=set;
  88 + tags=[]; interp=interp} :: rules
  89 + | _ -> failwith "load_freq_rules")
82 90  
83 91 let expand_tags x l =
84 92 Xlist.map l (function
... ... @@ -94,17 +102,19 @@ let prepare_rules suf_rules =
94 102 Xlist.fold suf_rules [] (fun rules s ->
95 103 let alternation = try StringMap.find alternation_map s.salt_name with Not_found -> failwith ("prepare_rules: " ^ s.salt_name) in
96 104 Xlist.fold alternation rules (fun rules a ->
97   - {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind ^ s.ssufix; set=a.aset; tags=expand_tags a.aphone s.stags; interp=""} :: rules))
  105 + {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind ^ s.ssufix; set=a.aset;
  106 + tags=expand_tags a.aphone s.stags; interp=""; id=""; freq=0} :: rules))
98 107  
99 108 let prepare_rev_rules suf_rules =
100 109 Xlist.fold suf_rules [] (fun rules s ->
101 110 let alternation = try StringMap.find rev_alternation_map s.salt_name with Not_found -> failwith ("prepare_rev_rules: " ^ s.salt_name) in
102 111 Xlist.fold alternation rules (fun rules a ->
103   - {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind; set=a.aset ^ s.ssufix; tags=expand_tags a.aphone s.stags; interp=""} :: rules))
  112 + {star=merge_stars (s.sstar,a.astar); pref=""; find=a.afind; set=a.aset ^ s.ssufix;
  113 + tags=expand_tags a.aphone s.stags; interp=""; id=""; freq=0} :: rules))
104 114  
105 115 let prepare_pref_rules pref_rules =
106 116 Xlist.fold pref_rules [] (fun rules p ->
107   - {star=p.pstar; pref=p.pprefix; find=""; set=""; tags=expand_tags "" p.ptags; interp=""} :: rules)
  117 + {star=p.pstar; pref=p.pprefix; find=""; set=""; tags=expand_tags "" p.ptags; interp=""; id=""; freq=0} :: rules)
108 118  
109 119 let rule_map =
110 120 let map = Xlist.fold rules StringMap.empty (fun map (k,v) -> StringMap.add map k (prepare_rules v)) in
... ... @@ -121,7 +131,7 @@ let rec extract_tag s rev = function
121 131  
122 132 let create_compound_rules schemata rule_map =
123 133 let found = Xlist.fold schemata [] (fun found schema ->
124   - let compounds = Xlist.fold schema [{star=Productive;pref="";find="";set="";tags=[];interp=""}] (fun compounds rule_set_name ->
  134 + let compounds = Xlist.fold schema [{star=Productive;pref="";find="";set="";tags=[];interp=""; id=""; freq=0}] (fun compounds rule_set_name ->
125 135 let rules = try StringMap.find rule_map rule_set_name with Not_found -> failwith ("create_rules: " ^ rule_set_name) in
126 136 Xlist.fold compounds [] (fun compounds compound ->
127 137 Xlist.fold rules compounds (fun compounds rule ->
... ... @@ -180,7 +190,7 @@ let load_interp_rules filename =
180 190 star :: tags :: interp :: comment :: [] ->
181 191 {star=parse_star star;
182 192 pref=""; find=""; set="";
183   - tags=expand_tags_simple (parse_tags tags); interp=interp; (*comment=comment*)}
  193 + tags=expand_tags_simple (parse_tags tags); interp=interp; (*comment=comment;*) id=""; freq=0}
184 194 | line -> failwith ("load_tab: " ^ (String.concat "\t" line)))
185 195  
186 196 module InterpTree = struct
... ... @@ -231,7 +241,11 @@ let create_interp_compound_rules interp_tree compound_rules =
231 241 Xlist.fold interp_rules interp_compound_rules (fun interp_compound_rules interp_rule ->
232 242 {rule with interp=interp_rule.interp; star=merge_stars (rule.star, interp_rule.star)} :: interp_compound_rules))
233 243  
234   -let interp_compound_rules = create_interp_compound_rules interp_tree compound_rules
  244 +let assign_ids rules =
  245 + fst (Xlist.fold rules ([],1) (fun (rules,id) rule ->
  246 + {rule with id=string_of_int id} :: rules, id+1))
  247 +
  248 +let interp_compound_rules = assign_ids (create_interp_compound_rules interp_tree compound_rules)
235 249  
236 250 (**********************************************************************************************)
237 251  
... ... @@ -280,6 +294,24 @@ let compound_rule_trees = CharTrees.create compound_rules
280 294 let interp_compound_rule_trees = CharTrees.create interp_compound_rules
281 295  
282 296 (**********************************************************************************************)
  297 +
  298 +module OrderedRule = struct
  299 +
  300 + type t = rule
  301 +
  302 + let compare = compare
  303 +
  304 +end
  305 +
  306 +module RuleQMap = Xmap.MakeQ(OrderedRule)
  307 +
  308 +let string_of_star = function
  309 + Productive -> ""
  310 + | Star -> "*"
  311 +
  312 +let string_of_freq_rule rule =
  313 + sprintf "%s\t%d\t%s\t%s\t%s\t%s\t%s" rule.id rule.freq (string_of_star rule.star) rule.pref rule.find rule.set rule.interp
  314 +
283 315 (**********************************************************************************************)
284 316  
285 317 (* let is_applicable_rule rule s = Xstring.check_sufix rule.find s
... ...
guesser/stem.ml
... ... @@ -121,6 +121,12 @@ let simplify_lemma s =
121 121 | [s;_] -> s
122 122 | _ -> failwith "simplify_lemma"
123 123  
  124 +let simplify_lemma_full s =
  125 + match Xstring.split ":" s with
  126 + [s] -> s,""
  127 + | [s;t] -> s,t
  128 + | _ -> failwith "simplify_lemma"
  129 +
124 130 let generate_stem entry =
125 131 let orth = simplify_lemma entry.lemma in
126 132 let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in
... ...