Commit b7bfd55d140b508a75896d82e31959ec8f557d82
1 parent: 766cb2a4
Inicjalizacja tokenizera, morfologii i subsyntax
Showing 11 changed files with 49 additions and 20 deletions
LCGlexicon/ENIAMcategoriesPL.ml
... | ... | @@ -89,7 +89,7 @@ let adv_modes = |
89 | 89 | try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function |
90 | 90 | [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l) |
91 | 91 | | _ -> failwith "adv_modes") |
92 | - with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty) | |
92 | + with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); ) in | |
93 | 93 | |
94 | 94 | let noun_type proper lemma pos = |
95 | 95 | let nsyn = |
... | ... |
morphology2/ENIAMinflexion.ml
... | ... | @@ -40,26 +40,35 @@ let simplify_lemma s = |
40 | 40 | | [s;_] -> s |
41 | 41 | | _ -> failwith "simplify_lemma" |
42 | 42 | |
43 | -let prepare_inflexion alt_filename stem_filename rules_filename = | |
43 | +let prepare_alt alt_filename = | |
44 | 44 | let alt = load_tab alt_filename in |
45 | 45 | let alt = Xlist.fold alt StringMap.empty (fun alt entry -> |
46 | 46 | Xlist.fold entry.forms alt (fun alt form -> |
47 | 47 | let simple_lemma = simplify_lemma entry.lemma in |
48 | 48 | let v = simple_lemma, form.interp, 1, [] in |
49 | 49 | StringMap.add_inc alt form.orth [v] (fun l -> v :: l))) in |
50 | - let stems = load_stems stem_filename in | |
50 | + alt | |
51 | + | |
52 | +let prepare_rules rules_filename = | |
51 | 53 | let rules = ENIAMmorphologyRules.load_freq_rules rules_filename in |
52 | 54 | let rules = ENIAMmorphologyRules.CharTrees.create rules in |
53 | - alt,stems,rules | |
55 | + rules | |
56 | + | |
57 | +let alt = ref (StringMap.empty : (string * string * int * string list) list StringMap.t) | |
58 | +let stems = ref (StringMap.empty : StringSet.t StringMap.t) | |
59 | +let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list) | |
54 | 60 | |
55 | -let alt,stems,rules = prepare_inflexion alt_filename stem_filename rules_filename | |
61 | +let initialize () = | |
62 | + alt := prepare_alt alt_filename; | |
63 | + stems := load_stems stem_filename; | |
64 | + rules := prepare_rules rules_filename | |
56 | 65 | |
57 | 66 | let get_interpretations orth = |
58 | - let candidates = ENIAMmorphologyRules.CharTrees.find rules orth in | |
59 | - let found = try StringMap.find alt orth with Not_found -> [] in | |
67 | + let candidates = ENIAMmorphologyRules.CharTrees.find !rules orth in | |
68 | + let found = try StringMap.find !alt orth with Not_found -> [] in | |
60 | 69 | let found = Xlist.fold candidates found (fun found (stem,rule) -> |
61 | 70 | (* Printf.printf "%s\t%s\n%!" stem (ENIAMmorphologyRules.string_of_freq_rule rule); *) |
62 | - let ids = try StringMap.find stems stem with Not_found -> StringSet.empty in | |
71 | + let ids = try StringMap.find !stems stem with Not_found -> StringSet.empty in | |
63 | 72 | if not (StringSet.mem ids rule.id) && rule.star <> Productive then found else |
64 | 73 | let tags = if StringSet.mem ids rule.id then [] else ["lemma not validated"] in |
65 | 74 | (stem ^ rule.set, rule.interp, rule.freq, tags) :: found) in |
... | ... |
morphology2/test.ml
... | ... | @@ -32,6 +32,7 @@ let string_of_token (lemma,interp,quantity,attrs) = |
32 | 32 | Printf.sprintf "%s\t%s\t%d\t%s" lemma interp quantity (String.concat ", " attrs) |
33 | 33 | |
34 | 34 | let _ = |
35 | + ENIAMinflexion.initialize (); | |
35 | 36 | print_endline "Testy wbudowane"; |
36 | 37 | Xlist.iter test_strings (fun s -> |
37 | 38 | print_endline ("\nTEST: " ^ s); |
... | ... |
subsyntax/ENIAM_MWE.ml
... | ... | @@ -28,13 +28,15 @@ let load_dict dict filename = |
28 | 28 | StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l) |
29 | 29 | | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'")) |
30 | 30 | |
31 | -let mwe_dict = | |
31 | +let load_mwe_dict () = | |
32 | 32 | let dict = load_dict StringMap.empty brev_filename in |
33 | 33 | let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in |
34 | 34 | (* let dict = load_dict dict complete_entries_filename in*) |
35 | 35 | let dict = load_dict dict mwe_filename in |
36 | 36 | dict |
37 | 37 | |
38 | +let mwe_dict = ref (StringMap.empty : (string * string * string) list StringMap.t) | |
39 | + | |
38 | 40 | let preselect_dict orths dict = |
39 | 41 | StringSet.fold orths [] (fun rules orth -> |
40 | 42 | try |
... | ... | @@ -280,7 +282,7 @@ let add_ordnum_rules rules paths = |
280 | 282 | let process (paths,last) = |
281 | 283 | let paths = Xlist.fold paths IntMap.empty add_token in |
282 | 284 | let orths = get_orths paths in |
283 | - let rules = preselect_dict orths mwe_dict in | |
285 | + let rules = preselect_dict orths !mwe_dict in | |
284 | 286 | let rules = add_ordnum_rules rules paths in |
285 | 287 | let paths = Xlist.fold rules paths apply_rule in |
286 | 288 | let paths = IntMap.fold paths [] (fun paths _ map -> |
... | ... |
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -30,7 +30,7 @@ let load_lemma_frequencies filename = |
30 | 30 | [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.)) |
31 | 31 | | _ -> failwith ("load_lemma_frequencies: " ^ line)) |
32 | 32 | |
33 | -let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename | |
33 | +let lemma_frequencies = ref (StringMap.empty : float StringMap.t) | |
34 | 34 | |
35 | 35 | let modify_weights paths = |
36 | 36 | List.rev (Xlist.fold paths [] (fun paths t -> |
... | ... | @@ -41,8 +41,8 @@ let modify_weights paths = |
41 | 41 | | "lemmatized as lowercase" -> w -. 0.1 |
42 | 42 | | _ -> w) in |
43 | 43 | let w = match t.token with |
44 | - Lemma(lemma,cat,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w) | |
45 | - | Proper(lemma,cat,_,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w) | |
44 | + Lemma(lemma,cat,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w) | |
45 | + | Proper(lemma,cat,_,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w) | |
46 | 46 | | _ -> w in |
47 | 47 | {t with weight = w} :: paths)) |
48 | 48 | |
... | ... | @@ -210,12 +210,14 @@ let load_proper_name proper = function |
210 | 210 | StringMap.add_inc proper lemma types (fun types2 -> types @ types2) |
211 | 211 | | l -> failwith ("proper_names: " ^ String.concat " " l) |
212 | 212 | |
213 | -let proper_names = | |
213 | +let load_proper_names () = | |
214 | 214 | let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in |
215 | 215 | let proper = File.fold_tab proper_names_filename2 proper load_proper_name in |
216 | 216 | let proper = File.fold_tab proper_names_filename3 proper load_proper_name in |
217 | 217 | proper |
218 | 218 | |
219 | +let proper_names = ref (StringMap.empty : string list StringMap.t) | |
220 | + | |
219 | 221 | let remove l s = |
220 | 222 | Xlist.fold l [] (fun l t -> |
221 | 223 | if s = t then l else t :: l) |
... | ... | @@ -223,14 +225,21 @@ let remove l s = |
223 | 225 | let find_proper_names t = |
224 | 226 | match t.token with |
225 | 227 | Lemma(lemma,pos,interp) -> |
226 | - if StringMap.mem proper_names lemma then | |
227 | - {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma); | |
228 | + if StringMap.mem !proper_names lemma then | |
229 | + {t with token=Proper(lemma,pos,interp,StringMap.find !proper_names lemma); | |
228 | 230 | attrs=remove t.attrs "notvalidated proper"} else |
229 | 231 | if Xlist.mem t.attrs "notvalidated proper" then |
230 | 232 | {t with token=Proper(lemma,pos,interp,[])} |
231 | 233 | else t |
232 | 234 | | _ -> t |
233 | 235 | |
236 | +let initialize () = | |
237 | + ENIAMtokenizer.initialize (); | |
238 | + ENIAMinflexion.initialize (); | |
239 | + ENIAM_MWE.mwe_dict := ENIAM_MWE.load_mwe_dict (); | |
240 | + lemma_frequencies := load_lemma_frequencies lemma_frequencies_filename; | |
241 | + proper_names := load_proper_names () | |
242 | + | |
234 | 243 | let parse query = |
235 | 244 | let l = ENIAMtokenizer.parse query in |
236 | 245 | (* print_endline "a6"; *) |
... | ... |
subsyntax/interface.ml
subsyntax/test.ml
tokenizer/ENIAMacronyms.ml
... | ... | @@ -19,7 +19,7 @@ |
19 | 19 | |
20 | 20 | open ENIAMtokenizerTypes |
21 | 21 | |
22 | -let mte_patterns = | |
22 | +let load_mte_patterns () = | |
23 | 23 | let lines = try File.load_lines mte_filename |
24 | 24 | with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in |
25 | 25 | let l = List.rev (Xlist.rev_map lines (fun line -> |
... | ... | @@ -29,6 +29,7 @@ let mte_patterns = |
29 | 29 | List.rev (Xlist.rev_map l (fun (orths,lemma,interp) -> |
30 | 30 | Xlist.map orths (fun orth -> O orth), (fun (_:token_env list) -> ENIAMtokens.make_lemma (lemma,interp)))) |
31 | 31 | |
32 | +let mte_patterns = (ref [] : (pat list * (token_env list -> token)) list ref) | |
32 | 33 | |
33 | 34 | let compose_lemma t lemma_suf interp = |
34 | 35 | ENIAMtokens.make_lemma (ENIAMtokens.get_orth t.token ^ lemma_suf, interp) |
... | ... |
tokenizer/ENIAMpatterns.ml
... | ... | @@ -470,7 +470,7 @@ let find_replacement_patterns tokens = |
470 | 470 | let tokens = normalize_tokens [] tokens in |
471 | 471 | let tokens = find_patterns ENIAMacronyms.acronym_patterns tokens in |
472 | 472 | let tokens = normalize_tokens [] tokens in |
473 | - let tokens = find_patterns ENIAMacronyms.mte_patterns tokens in | |
473 | + let tokens = find_patterns !ENIAMacronyms.mte_patterns tokens in | |
474 | 474 | let tokens = normalize_tokens [] tokens in |
475 | 475 | (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *) |
476 | 476 | let tokens = find_patterns ENIAMacronyms.name_patterns tokens in |
... | ... |
tokenizer/ENIAMtokenizer.ml
tokenizer/test.ml
... | ... | @@ -51,7 +51,7 @@ let test_strings = [ |
51 | 51 | (* "Szpak śpiewa."; |
52 | 52 | "Ala ma kota."; |
53 | 53 | "Ale mają kota:"*) |
54 | - "Matura."; | |
54 | +(* "Matura."; | |
55 | 55 | "matura"; |
56 | 56 | "„Matura.”"; |
57 | 57 | "„Matura”."; |
... | ... | @@ -59,10 +59,12 @@ let test_strings = [ |
59 | 59 | "- matura"; |
60 | 60 | "- Matura"; |
61 | 61 | "2 jabłka"; |
62 | - "- 2 jabłka"; | |
62 | + "- 2 jabłka";*) | |
63 | + "ping-ponga" | |
63 | 64 | ] |
64 | 65 | |
65 | 66 | let _ = |
67 | + ENIAMtokenizer.initialize (); | |
66 | 68 | print_endline "Testy wbudowane"; |
67 | 69 | Xlist.iter test_strings (fun s -> |
68 | 70 | print_endline ("\nTEST: " ^ s); |
... | ... |