Commit b7bfd55d140b508a75896d82e31959ec8f557d82

Authored by Wojciech Jaworski
1 parent 766cb2a4

Inicjalizacja tokenizera, morphologii i subsyntax

LCGlexicon/ENIAMcategoriesPL.ml
... ... @@ -89,7 +89,7 @@ let adv_modes =
89 89 try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function
90 90 [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l)
91 91 | _ -> failwith "adv_modes")
92   - with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty)
  92 + with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); ) in
93 93  
94 94 let noun_type proper lemma pos =
95 95 let nsyn =
... ...
morphology2/ENIAMinflexion.ml
... ... @@ -40,26 +40,35 @@ let simplify_lemma s =
40 40 | [s;_] -> s
41 41 | _ -> failwith "simplify_lemma"
42 42  
43   -let prepare_inflexion alt_filename stem_filename rules_filename =
  43 +let prepare_alt alt_filename =
44 44 let alt = load_tab alt_filename in
45 45 let alt = Xlist.fold alt StringMap.empty (fun alt entry ->
46 46 Xlist.fold entry.forms alt (fun alt form ->
47 47 let simple_lemma = simplify_lemma entry.lemma in
48 48 let v = simple_lemma, form.interp, 1, [] in
49 49 StringMap.add_inc alt form.orth [v] (fun l -> v :: l))) in
50   - let stems = load_stems stem_filename in
  50 + alt
  51 +
  52 +let prepare_rules rules_filename =
51 53 let rules = ENIAMmorphologyRules.load_freq_rules rules_filename in
52 54 let rules = ENIAMmorphologyRules.CharTrees.create rules in
53   - alt,stems,rules
  55 + rules
  56 +
  57 +let alt = ref (StringMap.empty : (string * string * int * string list) list StringMap.t)
  58 +let stems = ref (StringMap.empty : StringSet.t StringMap.t)
  59 +let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list)
54 60  
55   -let alt,stems,rules = prepare_inflexion alt_filename stem_filename rules_filename
  61 +let initialize () =
  62 + alt := prepare_alt alt_filename;
  63 + stems := load_stems stem_filename;
  64 + rules := prepare_rules rules_filename
56 65  
57 66 let get_interpretations orth =
58   - let candidates = ENIAMmorphologyRules.CharTrees.find rules orth in
59   - let found = try StringMap.find alt orth with Not_found -> [] in
  67 + let candidates = ENIAMmorphologyRules.CharTrees.find !rules orth in
  68 + let found = try StringMap.find !alt orth with Not_found -> [] in
60 69 let found = Xlist.fold candidates found (fun found (stem,rule) ->
61 70 (* Printf.printf "%s\t%s\n%!" stem (ENIAMmorphologyRules.string_of_freq_rule rule); *)
62   - let ids = try StringMap.find stems stem with Not_found -> StringSet.empty in
  71 + let ids = try StringMap.find !stems stem with Not_found -> StringSet.empty in
63 72 if not (StringSet.mem ids rule.id) && rule.star <> Productive then found else
64 73 let tags = if StringSet.mem ids rule.id then [] else ["lemma not validated"] in
65 74 (stem ^ rule.set, rule.interp, rule.freq, tags) :: found) in
... ...
morphology2/test.ml
... ... @@ -32,6 +32,7 @@ let string_of_token (lemma,interp,quantity,attrs) =
32 32 Printf.sprintf "%s\t%s\t%d\t%s" lemma interp quantity (String.concat ", " attrs)
33 33  
34 34 let _ =
  35 + ENIAMinflexion.initialize ();
35 36 print_endline "Testy wbudowane";
36 37 Xlist.iter test_strings (fun s ->
37 38 print_endline ("\nTEST: " ^ s);
... ...
subsyntax/ENIAM_MWE.ml
... ... @@ -28,13 +28,15 @@ let load_dict dict filename =
28 28 StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l)
29 29 | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
30 30  
31   -let mwe_dict =
  31 +let load_mwe_dict () =
32 32 let dict = load_dict StringMap.empty brev_filename in
33 33 let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
34 34 (* let dict = load_dict dict complete_entries_filename in*)
35 35 let dict = load_dict dict mwe_filename in
36 36 dict
37 37  
  38 +let mwe_dict = ref (StringMap.empty : (string * string * string) list StringMap.t)
  39 +
38 40 let preselect_dict orths dict =
39 41 StringSet.fold orths [] (fun rules orth ->
40 42 try
... ... @@ -280,7 +282,7 @@ let add_ordnum_rules rules paths =
280 282 let process (paths,last) =
281 283 let paths = Xlist.fold paths IntMap.empty add_token in
282 284 let orths = get_orths paths in
283   - let rules = preselect_dict orths mwe_dict in
  285 + let rules = preselect_dict orths !mwe_dict in
284 286 let rules = add_ordnum_rules rules paths in
285 287 let paths = Xlist.fold rules paths apply_rule in
286 288 let paths = IntMap.fold paths [] (fun paths _ map ->
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -30,7 +30,7 @@ let load_lemma_frequencies filename =
30 30 [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
31 31 | _ -> failwith ("load_lemma_frequencies: " ^ line))
32 32  
33   -let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename
  33 +let lemma_frequencies = ref (StringMap.empty : float StringMap.t)
34 34  
35 35 let modify_weights paths =
36 36 List.rev (Xlist.fold paths [] (fun paths t ->
... ... @@ -41,8 +41,8 @@ let modify_weights paths =
41 41 | "lemmatized as lowercase" -> w -. 0.1
42 42 | _ -> w) in
43 43 let w = match t.token with
44   - Lemma(lemma,cat,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
45   - | Proper(lemma,cat,_,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
  44 + Lemma(lemma,cat,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
  45 + | Proper(lemma,cat,_,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
46 46 | _ -> w in
47 47 {t with weight = w} :: paths))
48 48  
... ... @@ -210,12 +210,14 @@ let load_proper_name proper = function
210 210 StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
211 211 | l -> failwith ("proper_names: " ^ String.concat " " l)
212 212  
213   -let proper_names =
  213 +let load_proper_names () =
214 214 let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
215 215 let proper = File.fold_tab proper_names_filename2 proper load_proper_name in
216 216 let proper = File.fold_tab proper_names_filename3 proper load_proper_name in
217 217 proper
218 218  
  219 +let proper_names = ref (StringMap.empty : string list StringMap.t)
  220 +
219 221 let remove l s =
220 222 Xlist.fold l [] (fun l t ->
221 223 if s = t then l else t :: l)
... ... @@ -223,14 +225,21 @@ let remove l s =
223 225 let find_proper_names t =
224 226 match t.token with
225 227 Lemma(lemma,pos,interp) ->
226   - if StringMap.mem proper_names lemma then
227   - {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
  228 + if StringMap.mem !proper_names lemma then
  229 + {t with token=Proper(lemma,pos,interp,StringMap.find !proper_names lemma);
228 230 attrs=remove t.attrs "notvalidated proper"} else
229 231 if Xlist.mem t.attrs "notvalidated proper" then
230 232 {t with token=Proper(lemma,pos,interp,[])}
231 233 else t
232 234 | _ -> t
233 235  
  236 +let initialize () =
  237 + ENIAMtokenizer.initialize ();
  238 + ENIAMinflexion.initialize ();
  239 + ENIAM_MWE.mwe_dict := ENIAM_MWE.load_mwe_dict ();
  240 + lemma_frequencies := load_lemma_frequencies lemma_frequencies_filename;
  241 + proper_names := load_proper_names ()
  242 +
234 243 let parse query =
235 244 let l = ENIAMtokenizer.parse query in
236 245 (* print_endline "a6"; *)
... ...
subsyntax/interface.ml
... ... @@ -86,6 +86,7 @@ let rec main_loop in_chan out_chan =
86 86 let _ =
87 87 prerr_endline message;
88 88 Arg.parse spec_list anon_fun usage_msg;
  89 + ENIAMsubsyntax.initialize ();
89 90 Gc.compact ();
90 91 prerr_endline "Ready!";
91 92 if !comm_stdio then main_loop stdin stdout
... ...
subsyntax/test.ml
... ... @@ -41,6 +41,7 @@ let test_strings2 = [
41 41 ]
42 42  
43 43 let _ =
  44 + ENIAMsubsyntax.initialize ();
44 45 let test_num = ref 1 in
45 46 print_endline "Testy wbudowane";
46 47 Xlist.iter test_strings (fun s ->
... ...
tokenizer/ENIAMacronyms.ml
... ... @@ -19,7 +19,7 @@
19 19  
20 20 open ENIAMtokenizerTypes
21 21  
22   -let mte_patterns =
  22 +let load_mte_patterns () =
23 23 let lines = try File.load_lines mte_filename
24 24 with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
25 25 let l = List.rev (Xlist.rev_map lines (fun line ->
... ... @@ -29,6 +29,7 @@ let mte_patterns =
29 29 List.rev (Xlist.rev_map l (fun (orths,lemma,interp) ->
30 30 Xlist.map orths (fun orth -> O orth), (fun (_:token_env list) -> ENIAMtokens.make_lemma (lemma,interp))))
31 31  
  32 +let mte_patterns = (ref [] : (pat list * (token_env list -> token)) list ref)
32 33  
33 34 let compose_lemma t lemma_suf interp =
34 35 ENIAMtokens.make_lemma (ENIAMtokens.get_orth t.token ^ lemma_suf, interp)
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -470,7 +470,7 @@ let find_replacement_patterns tokens =
470 470 let tokens = normalize_tokens [] tokens in
471 471 let tokens = find_patterns ENIAMacronyms.acronym_patterns tokens in
472 472 let tokens = normalize_tokens [] tokens in
473   - let tokens = find_patterns ENIAMacronyms.mte_patterns tokens in
  473 + let tokens = find_patterns !ENIAMacronyms.mte_patterns tokens in
474 474 let tokens = normalize_tokens [] tokens in
475 475 (* Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *)
476 476 let tokens = find_patterns ENIAMacronyms.name_patterns tokens in
... ...
tokenizer/ENIAMtokenizer.ml
... ... @@ -20,6 +20,9 @@
20 20 open Xstd
21 21 open ENIAMtokenizerTypes
22 22  
  23 +let initialize () =
  24 + ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ()
  25 +
23 26 let string_of =
24 27 ENIAMtokens.string_of_tokens
25 28  
... ...
tokenizer/test.ml
... ... @@ -51,7 +51,7 @@ let test_strings = [
51 51 (* "Szpak śpiewa.";
52 52 "Ala ma kota.";
53 53 "Ale mają kota:"*)
54   - "Matura.";
  54 +(* "Matura.";
55 55 "matura";
56 56 "„Matura.”";
57 57 "„Matura”.";
... ... @@ -59,10 +59,12 @@ let test_strings = [
59 59 "- matura";
60 60 "- Matura";
61 61 "2 jabłka";
62   - "- 2 jabłka";
  62 + "- 2 jabłka";*)
  63 + "ping-ponga"
63 64 ]
64 65  
65 66 let _ =
  67 + ENIAMtokenizer.initialize ();
66 68 print_endline "Testy wbudowane";
67 69 Xlist.iter test_strings (fun s ->
68 70 print_endline ("\nTEST: " ^ s);
... ...