Commit 783dcb676121d8fa065bcd58365aee5a93cdcb09
1 parent
78e7e267
dodanie inflexion 2
Showing
1 changed file
with
76 additions
and
0 deletions
guesser/inflexion.ml
0 → 100644
1 | +open Types | |
2 | +open Xstd | |
3 | + | |
(* Resource file names for the "adj" data set. These three names are the
   ones handed to [prepare_inflexion] when this module is initialised. *)
let alt_adj, stem_adj, rules_adj =
  "alt-odm-adj.txt", "stem-odm-adj.txt", "freq_rules-odm-adj.txt"
(* Disabled alternative: the same data set in its .tab variant.
let alt_adj = "alt-odm-adj.tab"
let stem_adj = "stem-odm-adj.tab"
let rules_adj = "rules-odm-adj.tab" *)

(* Resource file names for the "all" data set. They are not referenced in
   the visible part of this file. *)
let alt_all, stem_all, rules_all =
  "alt.tab", "stem.tab", "rules.tab"

(* Directory prefix; it is concatenated directly in front of a resource file
   name, so it must end with a path separator. *)
let resource_path = "results/"
15 | + | |
(* [load_stems filename] folds over the rows of the tab-separated file
   [filename] and builds a map from a stem to the set of ids listed for it.

   Each row must have exactly three fields: the stem, a second field that is
   read but not used here, and a space-separated list of ids.

   Raises [Failure] with the offending row's fields when a row does not have
   exactly three fields.

   NOTE(review): [StringMap.add] is called once per row; if the same stem can
   occur in more than one row, a later row presumably replaces the ids of an
   earlier one rather than being merged with them -- confirm against the
   data and the Xstd semantics. *)
let load_stems filename =
  File.fold_tab filename StringMap.empty (fun stems -> function
      (* The second column is intentionally ignored; the underscore prefix
         silences the unused-variable warning. *)
      [stem; _lemma_suf; ids] ->
        StringMap.add stems stem (StringSet.of_list (Xstring.split " " ids))
    | l -> failwith ("load_stems: " ^ String.concat " " l))
20 | + | |
(* [prepare_inflexion resource_path alt_filename stem_filename rules_filename]
   loads the three resources found under [resource_path] and returns them as
   the triple [(alt, stems, rules)]:

   - [alt] maps an orthographic form to a list of
     [(lemma, interp, 1, [])] tuples, one per form of every entry read by
     [Dict.load_tab];
   - [stems] is the result of [load_stems];
   - [rules] is the structure built by [Rules.CharTrees.create] from the
     rules read by [Rules.load_freq_rules].

   [resource_path] is prepended to each file name without adding a
   separator. *)
let prepare_inflexion resource_path alt_filename stem_filename rules_filename =
  let in_resources name = resource_path ^ name in
  let entries = Dict.load_tab (in_resources alt_filename) in
  let alt =
    Xlist.fold entries StringMap.empty (fun by_orth entry ->
      Xlist.fold entry.forms by_orth (fun by_orth form ->
        (* Listed forms get a fixed count of 1 and no diagnostic tags. *)
        let item = entry.lemma, form.interp, 1, [] in
        StringMap.add_inc by_orth form.orth [item] (fun items -> item :: items))) in
  let stems = load_stems (in_resources stem_filename) in
  let rules =
    Rules.CharTrees.create
      (Rules.load_freq_rules (in_resources rules_filename)) in
  alt, stems, rules
31 | + | |
(* Module-level resources, loaded from disk once when this module is
   initialised; [get_interpretations] reads all three. *)
let (alt, stems, rules) =
  prepare_inflexion resource_path alt_adj stem_adj rules_adj
33 | + | |
(* [get_interpretations orth] returns the candidate analyses of the form
   [orth] as a list of [(lemma, interp, freq, tags)] tuples.

   Candidates come from three sources:
   - the entries stored for [orth] in [alt] (none if [orth] is absent);
   - every [(stem, rule)] pair returned by [Rules.CharTrees.find]; such a
     candidate is tagged ["lemma not validated"] unless the rule's id belongs
     to the id set stored for [stem] in [stems];
   - a fallback [(orth, "unk", 1, ["token not found"])], which is always
     added.

   If at least one candidate carries no tags, only the untagged candidates
   are returned; otherwise the complete list, fallback included, is
   returned. *)
let get_interpretations orth =
  let matches = Rules.CharTrees.find rules orth in
  let listed = try StringMap.find alt orth with Not_found -> [] in
  let everything =
    Xlist.fold matches listed (fun acc (stem, rule) ->
      let known_ids =
        try StringMap.find stems stem with Not_found -> StringSet.empty in
      let tags =
        if StringSet.mem known_ids rule.Rules.id then []
        else ["lemma not validated"] in
      (stem ^ rule.Rules.set, rule.Rules.interp, rule.Rules.freq, tags) :: acc) in
  let everything = (orth, "unk", 1, ["token not found"]) :: everything in
  (* Keep only the candidates that have an empty tag list. *)
  let untagged =
    Xlist.fold everything [] (fun acc candidate ->
      match candidate with
        _, _, _, [] -> candidate :: acc
      | _ -> acc) in
  match untagged with
    [] -> everything
  | _ -> untagged
46 | + | |
(* Tests *)
48 | + | |
(* [print_interpretations l] sorts the [(lemma, interp, freq, tags)] tuples
   of [l] with the polymorphic [compare] and writes one tab-separated line
   per tuple to standard output; the tags are joined with ";". *)
let print_interpretations l =
  let print_row (lemma, interp, freq, tags) =
    Printf.printf "%s\t%s\t%d\t%s\n" lemma interp freq (String.concat ";" tags) in
  Xlist.iter (Xlist.sort l compare) print_row
52 | + | |
(* Smoke test executed when the module is initialised: looks up a fixed list
   of sample word forms, in this order, and prints the interpretations found
   for each of them. *)
let () =
  Xlist.iter [
    "życzliwą";
    "żyźniejszego";
    "zwiśli";
    "najzieleńsza";
    "najtandetniejsza";
    "nieżelazny";
    "któregokolwiek";
    "większą";
    "bordo";
    "sexi";
    "sepulkową";
  ] (fun orth -> print_interpretations (get_interpretations orth))
... | ... |