Inicjalizacja tokenizera, morphologii i subsyntax

Wojciech Jaworski
1 parent 766cb2a4
Showing 11 changed files with 49 additions and 20 deletions
LCGlexicon/ENIAMcategoriesPL.ml
morphology2/ENIAMinflexion.ml
morphology2/test.ml
subsyntax/ENIAM_MWE.ml
subsyntax/ENIAMsubsyntax.ml
subsyntax/interface.ml
subsyntax/test.ml
tokenizer/ENIAMacronyms.ml
tokenizer/ENIAMpatterns.ml
tokenizer/ENIAMtokenizer.ml
tokenizer/test.ml
@@ -89,7 +89,7 @@ let adv_modes =
   (* Accumulates, for every adverb read from adv_modes_filename, the list of its modes. *)
   try File.fold_tab adv_modes_filename StringMap.empty (fun adv_modes -> function
       [adv;mode] -> StringMap.add_inc adv_modes adv [mode] (fun l -> mode :: l)
     | _ -> failwith "adv_modes")
-  with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty)
+  (* The handler must yield a map like the try body does, so fall back to the empty map; *)
+  (* no trailing "in": adv_modes is a top-level definition. *)
+  with _ -> (prerr_endline ("ENIAMlexicon adv_modes file " ^ adv_modes_filename ^ " not found"); StringMap.empty)
  
 let noun_type proper lemma pos =
   let nsyn =
@@ -40,26 +40,35 @@ let simplify_lemma s =
   | [s;_] -> s
   | _ -> failwith "simplify_lemma"
  
-let prepare_inflexion alt_filename stem_filename rules_filename =
+let prepare_alt alt_filename = (* Loads the entries of alt_filename and indexes their forms by form.orth. *)
   let alt = load_tab alt_filename in
   let alt = Xlist.fold alt StringMap.empty (fun alt entry ->
     Xlist.fold entry.forms alt (fun alt form ->
       let simple_lemma = simplify_lemma entry.lemma in
       let v = simple_lemma, form.interp, 1, [] in (* same 4-tuple shape that get_interpretations builds: lemma, interp, then 1 and [] in the freq and tags positions *)
       StringMap.add_inc alt form.orth [v] (fun l -> v :: l))) in
-  let stems = load_stems stem_filename in
+  alt
+
+let prepare_rules rules_filename = (* Loads the frequency rules from rules_filename and builds the CharTrees structure from them. *)
   let rules = ENIAMmorphologyRules.load_freq_rules rules_filename in
   let rules = ENIAMmorphologyRules.CharTrees.create rules in
-  alt,stems,rules
+  rules
+
+let alt = ref (StringMap.empty : (string * string * int * string list) list StringMap.t) (* empty until initialize stores the result of prepare_alt *)
+let stems = ref (StringMap.empty : StringSet.t StringMap.t) (* empty until initialize stores the result of load_stems *)
+let rules = ref ([] : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list) (* empty until initialize stores the result of prepare_rules *)
  
-let alt,stems,rules = prepare_inflexion alt_filename stem_filename rules_filename
+let initialize () = (* Loads the three resources into the refs alt, stems and rules; before this call they hold empty values. *)
+  alt := prepare_alt alt_filename;
+  stems := load_stems stem_filename;
+  rules := prepare_rules rules_filename
  
 let get_interpretations orth =
-  let candidates = ENIAMmorphologyRules.CharTrees.find rules orth in
-  let found = try StringMap.find alt orth with Not_found -> [] in
+  let candidates = ENIAMmorphologyRules.CharTrees.find !rules orth in
+  let found = try StringMap.find !alt orth with Not_found -> [] in
   let found = Xlist.fold candidates found (fun found (stem,rule) ->
     (* Printf.printf "%s\t%s\n%!" stem (ENIAMmorphologyRules.string_of_freq_rule rule); *)
-    let ids = try StringMap.find stems stem with Not_found -> StringSet.empty in
+    let ids = try StringMap.find !stems stem with Not_found -> StringSet.empty in
     if not (StringSet.mem ids rule.id) && rule.star <> Productive then found else
     let tags = if StringSet.mem ids rule.id then [] else ["lemma not validated"] in
     (stem ^ rule.set, rule.interp, rule.freq, tags) :: found) in
@@ -32,6 +32,7 @@ let string_of_token (lemma,interp,quantity,attrs) =
   Printf.sprintf "%s\t%s\t%d\t%s" lemma interp quantity (String.concat ", " attrs)
  
 let _ =
+  ENIAMinflexion.initialize ();
   print_endline "Testy wbudowane";
   Xlist.iter test_strings (fun s ->
     print_endline ("\nTEST: " ^ s);
@@ -28,13 +28,15 @@ let load_dict dict filename =
         StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l)
     | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'"))
  
-let mwe_dict =
+let load_mwe_dict () = (* Builds one dictionary by running load_dict over brev_filename, fixed_filename and mwe_filename in turn. *)
   let dict = load_dict StringMap.empty brev_filename in
   (* Only the fixed_filename load is guarded: on any exception a message is printed and the dictionary built so far is kept. *)
   let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in
 (*    let dict = load_dict dict complete_entries_filename in*)
   let dict = load_dict dict mwe_filename in
   dict
  
+let mwe_dict = ref (StringMap.empty : (string * string * string) list StringMap.t) (* empty until a caller assigns the result of load_mwe_dict (); process reads it through !mwe_dict *)
+
 let preselect_dict orths dict =
   StringSet.fold orths [] (fun rules orth ->
     try
@@ -280,7 +282,7 @@ let add_ordnum_rules rules paths =
 let process (paths,last) =
   let paths = Xlist.fold paths IntMap.empty add_token in
   let orths = get_orths paths in
-  let rules = preselect_dict orths mwe_dict in
+  let rules = preselect_dict orths !mwe_dict in
   let rules = add_ordnum_rules rules paths in
   let paths = Xlist.fold rules paths apply_rule in
   let paths = IntMap.fold paths [] (fun paths _ map ->
@@ -30,7 +30,7 @@ let load_lemma_frequencies filename =
       [count; lemma; cat] -> StringMap.add map (lemma ^ "\t" ^ cat) (log10 (float_of_string count +. 1.))
     | _ -> failwith ("load_lemma_frequencies: " ^ line))
  
-let lemma_frequencies = load_lemma_frequencies lemma_frequencies_filename
+let lemma_frequencies = ref (StringMap.empty : float StringMap.t) (* empty until initialize stores the result of load_lemma_frequencies; until then modify_weights finds no entry and leaves weights unchanged *)
  
 let modify_weights paths =
   List.rev (Xlist.fold paths [] (fun paths t ->
@@ -41,8 +41,8 @@ let modify_weights paths =
       | "lemmatized as lowercase" -> w -. 0.1
       | _ -> w) in
     let w = match t.token with
-        Lemma(lemma,cat,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
-      | Proper(lemma,cat,_,_) -> (try w +. StringMap.find lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
+        Lemma(lemma,cat,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
+      | Proper(lemma,cat,_,_) -> (try w +. StringMap.find !lemma_frequencies (lemma ^ "\t" ^ cat) with Not_found -> w)
       | _ -> w in
     {t with weight = w} :: paths))
  
@@ -210,12 +210,14 @@ let load_proper_name proper = function
     StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
   | l -> failwith ("proper_names: " ^ String.concat " " l)
  
-let proper_names =
+let load_proper_names () = (* Folds load_proper_name over the three proper-name files in turn, accumulating into a single map. *)
   let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
   let proper = File.fold_tab proper_names_filename2 proper load_proper_name in
   let proper = File.fold_tab proper_names_filename3 proper load_proper_name in
   proper
  
+let proper_names = ref (StringMap.empty : string list StringMap.t) (* empty until initialize stores the result of load_proper_names (); read by find_proper_names *)
+
 let remove l s =
   Xlist.fold l [] (fun l t ->
       if s = t then l else t :: l)
@@ -223,14 +225,21 @@ let remove l s =
 (* Turns a Lemma token into a Proper token where appropriate; every other token is returned unchanged. *)
 let find_proper_names t =
   match t.token with
     Lemma(lemma,pos,interp) ->
     (* Known proper name: attach the types stored for the lemma and drop the "notvalidated proper" attribute. *)
-    if StringMap.mem proper_names lemma then
-      {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
+    if StringMap.mem !proper_names lemma then
+      {t with token=Proper(lemma,pos,interp,StringMap.find !proper_names lemma);
               attrs=remove t.attrs "notvalidated proper"} else
     (* Unknown lemma already flagged "notvalidated proper": becomes Proper with no types, attributes kept as they are. *)
     if Xlist.mem t.attrs "notvalidated proper" then
       {t with token=Proper(lemma,pos,interp,[])}
     else t
   | _ -> t
  
+let initialize () = (* Runs the tokenizer and inflexion initializers, then fills the MWE dictionary, lemma frequencies and proper names; these three refs hold empty maps until this is called. *)
+  ENIAMtokenizer.initialize ();
+  ENIAMinflexion.initialize ();
+  ENIAM_MWE.mwe_dict := ENIAM_MWE.load_mwe_dict ();
+  lemma_frequencies := load_lemma_frequencies lemma_frequencies_filename;
+  proper_names := load_proper_names ()
+
 let parse query =
   let l = ENIAMtokenizer.parse query in
 (*   print_endline "a6"; *)
@@ -86,6 +86,7 @@ let rec main_loop in_chan out_chan =
 let _ =
   prerr_endline message;
   Arg.parse spec_list anon_fun usage_msg;
+  ENIAMsubsyntax.initialize ();
   Gc.compact ();
   prerr_endline "Ready!";
   if !comm_stdio then main_loop stdin stdout
@@ -41,6 +41,7 @@ let test_strings2 = [
   ]
  
 let _ =
+  ENIAMsubsyntax.initialize ();
   let test_num = ref 1 in
   print_endline "Testy wbudowane";
   Xlist.iter test_strings (fun s ->
@@ -19,7 +19,7 @@
  
 open ENIAMtokenizerTypes
  
-let mte_patterns =
+let load_mte_patterns () =
   let lines = try File.load_lines mte_filename
    with _ -> (prerr_endline ("ENIAMtokenizer mte file " ^ mte_filename ^ " not found"); []) in
   let l = List.rev (Xlist.rev_map lines (fun line ->
@@ -29,6 +29,7 @@ let mte_patterns =
   List.rev (Xlist.rev_map l (fun (orths,lemma,interp) ->
     Xlist.map orths (fun orth -> O orth), (fun (_:token_env list) -> ENIAMtokens.make_lemma (lemma,interp))))
  
+let mte_patterns = (ref [] : (pat list * (token_env list -> token)) list ref) (* empty list until a caller assigns the result of load_mte_patterns () *)
  
 let compose_lemma t lemma_suf interp =
   ENIAMtokens.make_lemma (ENIAMtokens.get_orth t.token ^ lemma_suf, interp)
@@ -470,7 +470,7 @@ let find_replacement_patterns tokens =
   let tokens = normalize_tokens [] tokens in
   let tokens = find_patterns ENIAMacronyms.acronym_patterns tokens in
   let tokens = normalize_tokens [] tokens in
-  let tokens = find_patterns ENIAMacronyms.mte_patterns tokens in
+  let tokens = find_patterns !ENIAMacronyms.mte_patterns tokens in
   let tokens = normalize_tokens [] tokens in
 (*   Xlist.iter tokens (fun t -> print_endline (ENIAMtokens.string_of_tokens 0 t)); *)
   let tokens = find_patterns ENIAMacronyms.name_patterns tokens in
@@ -20,6 +20,9 @@
 open Xstd
 open ENIAMtokenizerTypes
  
+let initialize () = (* Loads the MTE patterns into ENIAMacronyms.mte_patterns, which is an empty list until then. *)
+  ENIAMacronyms.mte_patterns := ENIAMacronyms.load_mte_patterns ()
+
 let string_of =
   ENIAMtokens.string_of_tokens
  
@@ -51,7 +51,7 @@ let test_strings = [
 (*  "Szpak śpiewa.";
   "Ala ma kota.";
   "Ale mają kota:"*)
-  "Matura.";
+(*  "Matura.";
   "matura";
   "„Matura.”";
   "„Matura”.";
@@ -59,10 +59,12 @@ let test_strings = [
   "- matura";
   "- Matura";
   "2 jabłka";
-  "- 2 jabłka";
+  "- 2 jabłka";*)
+  "ping-ponga"
   ]
  
 let _ =
+  ENIAMtokenizer.initialize ();
   print_endline "Testy wbudowane";
   Xlist.iter test_strings (fun s ->
     print_endline ("\nTEST: " ^ s);