(* generate.ml 18.1 KB *)
open Xstd
open Printf
open Types

(* Machine-specific absolute resource directories. [zasoby_path] selects one
   of them according to the host name returned by [get_host_name]. *)
(* Selected when the host name is "nexus". *)
let nexus_path = "/home/yacheu/Dokumenty/Badania/Jezyk i Umysl/Przetwarzanie Języka Naturalnego/zasoby/"
(* Selected when the host name is "toshiba-UB". *)
let toshiba_ub_path = "/home/wjaworski/Dokumenty/zasoby/"

(* [get_host_name ()] runs the shell command [uname -n] and returns the first
   line it prints, without the trailing newline.

   The process channel is always closed with [Unix.close_process_in], both on
   success and on failure, so that the pipe descriptor is released and the
   child process is reaped. The exit status of the command is ignored.

   Raises [End_of_file] if the command prints nothing. *)
let get_host_name () =
  let chan = Unix.open_process_in "uname -n" in
  let name =
    try input_line chan
    with e ->
      (* Release the channel before propagating the original error. *)
      ignore (Unix.close_process_in chan);
      raise e in
  ignore (Unix.close_process_in chan);
  name

(* Resource directory for the machine this program runs on. It is resolved
   once, at module initialisation, from the host name returned by
   [get_host_name].
   Raises [Failure] with the message "unknown host: <name>" when the host name
   is not listed below. *)
let zasoby_path =
  let host = get_host_name () in
  let known_hosts = [
    "nexus", nexus_path;
    "toshiba-UB", toshiba_ub_path;
    (* "mozart", "."; *)
  ] in
  try List.assoc host known_hosts
  with Not_found -> failwith ("unknown host: " ^ host)

(* Relative locations of the input resources and of the generated output.
   All of them are used as plain string prefixes (joined with [^]), which is
   why each one ends with a slash. *)
let nlp_resources_path = "../../NLP resources/"
(* SGJP directory inside [nlp_resources_path]. In this file it is used as the
   prefix for both the sgjp-* and the polimorf-* archives listed below. *)
let sgjp_path = nlp_resources_path ^ "SGJP/"
(* Directory that holds the per-category tables read by this file and the
   output files it writes. *)
let results_path = "results/"
(* Path passed to [Dict.generate_rules_lu] in the (currently disabled) calls
   further down. *)
let lu_path = "../morphology2/plWordnet/"

(* Table loaded with [Dict.load_freq_tab] from [zasoby_path] in the (currently
   disabled) frequency step. *)
let lematy_nkjp_filename = "lematy_NKJP1M_freq.tab"

(* Dictionary archives located under [sgjp_path].
   NOTE(review): the digits in each file name look like a YYYYMMDD release
   date - confirm. The names without a year suffix hold the most recent date
   of each series. *)
let sgjp_filename2015 = "sgjp-20151020.tab.gz"
let polimorf_filename2015 = "polimorf-20151020.tab.gz"
let sgjp_filename201605 = "sgjp-20160508.tab.gz"
let polimorf_filename201605 = "polimorf-20160508.tab.gz"
let sgjp_filename201607 = "sgjp-20160724.tab.gz"
let polimorf_filename201607 = "polimorf-20160724.tab.gz"
let sgjp_filename = "sgjp-20170730.tab.gz"
let polimorf_filename = "polimorf-20170402.tab.gz"

(* Per-category tables (adv, adj, noun, verb) read from [results_path].
   NOTE(review): presumably these are the files written by [Dict.split_dict]
   into [results_path] - confirm. *)
let adv_sgjp_filename = "adv_sgjp-20170730.tab"
let adj_sgjp_filename = "adj_sgjp-20170730.tab"
let noun_sgjp_filename = "noun_sgjp-20170730.tab"
let verb_sgjp_filename = "verb_sgjp-20170730.tab"
let adv_polimorf_filename = "adv_polimorf-20170402.tab"
let adj_polimorf_filename = "adj_polimorf-20170402.tab"
let noun_polimorf_filename = "noun_polimorf-20170402.tab"
let verb_polimorf_filename = "verb_polimorf-20170402.tab"

(* Test of dictionary loading and frequency counting.
   Every step below is commented out, so this top-level binding currently
   evaluates to [()] and does nothing. The disabled steps load each dictionary
   version with [Dict.load_tab] / [Dict.load_tab_full], write per-selector
   counts with [Dict.print_quantities], and finally load the frequency table
   with [Dict.load_freq_tab]. *)
let _ =
  (* print_endline "sgjp_filename2015";
  let _ = Dict.load_tab (sgjp_path ^ sgjp_filename2015) in
  print_endline "polimorf_filename2015";
  let _ = Dict.load_tab (sgjp_path ^ polimorf_filename2015) in
  print_endline "sgjp_filename201605";
  let _ = Dict.load_tab (sgjp_path ^ sgjp_filename201605) in
  print_endline "polimorf_filename201605";
  let _ = Dict.load_tab (sgjp_path ^ polimorf_filename201605) in
  print_endline "sgjp_filename201607";
  let _ = Dict.load_tab (sgjp_path ^ sgjp_filename201607) in
  print_endline "polimorf_filename201607";
  let _ = Dict.load_tab (sgjp_path ^ polimorf_filename201607) in
  print_endline "sgjp_filename";
  let _ = Dict.load_tab (sgjp_path ^ sgjp_filename) in
  print_endline "polimorf_filename";
  let _ = Dict.load_tab (sgjp_path ^ polimorf_filename) in
  print_endline "adv_sgjp_filename";
  let _ = Dict.load_tab (results_path ^ adv_sgjp_filename) in
  print_endline "adj_sgjp_filename";
  let _ = Dict.load_tab (results_path ^ adj_sgjp_filename) in
  print_endline "noun_sgjp_filename";
  let _ = Dict.load_tab (results_path ^ noun_sgjp_filename) in
  print_endline "verb_sgjp_filename";
  let _ = Dict.load_tab (results_path ^ verb_sgjp_filename) in
  print_endline "sgjp_filename2015";
  let dict = Dict.load_tab_full (sgjp_path ^ sgjp_filename2015) in
  Dict.print_quantities "results/proper-type-sgjp-20151020.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-sgjp-20151020.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-sgjp-20151020.txt" Dict.interp_selector dict;
  print_endline "polimorf_filename2015";
  let dict = Dict.load_tab_full (sgjp_path ^ polimorf_filename2015) in
  Dict.print_quantities "results/proper-type-polimorf-20151020.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-polimorf-20151020.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-polimorf-20151020.txt" Dict.interp_selector dict;
  print_endline "sgjp_filename201605";
  let dict = Dict.load_tab_full (sgjp_path ^ sgjp_filename201605) in
  Dict.print_quantities "results/proper-type-sgjp-20160508.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-sgjp-20160508.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-sgjp-20160508.txt" Dict.interp_selector dict;
  print_endline "polimorf_filename201605";
  let dict = Dict.load_tab_full (sgjp_path ^ polimorf_filename201605) in
  Dict.print_quantities "results/proper-type-polimorf-20160508.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-polimorf-20160508.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-polimorf-20160508.txt" Dict.interp_selector dict;
  print_endline "sgjp_filename201607";
  let dict = Dict.load_tab_full (sgjp_path ^ sgjp_filename201607) in
  Dict.print_quantities "results/proper-type-sgjp-20160724.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-sgjp-20160724.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-sgjp-20160724.txt" Dict.interp_selector dict;
  print_endline "polimorf_filename201607";
  let dict = Dict.load_tab_full (sgjp_path ^ polimorf_filename201607) in
  Dict.print_quantities "results/proper-type-polimorf-20160724.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-polimorf-20160724.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-polimorf-20160724.txt" Dict.interp_selector dict;
  print_endline "sgjp_filename";
  let dict = Dict.load_tab_full (sgjp_path ^ sgjp_filename) in
  Dict.print_quantities "results/proper-type-sgjp-20170730.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-sgjp-20170730.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-sgjp-20170730.txt" Dict.interp_selector dict;
  print_endline "polimorf_filename";
  let dict = Dict.load_tab_full (sgjp_path ^ polimorf_filename) in
  Dict.print_quantities "results/proper-type-polimorf-20170402.txt" Dict.proper_type_selector dict;
  Dict.print_quantities "results/genre-polimorf-20170402.txt" Dict.genre_selector dict;
  Dict.print_quantities "results/interp-polimorf-20170402.txt" Dict.interp_selector dict;
  print_endline "adv_sgjp_filename";
  let dict = Dict.load_tab_full (results_path ^ adv_sgjp_filename) in
  print_endline "adj_sgjp_filename";
  let dict = Dict.load_tab_full (results_path ^ adj_sgjp_filename) in
  print_endline "noun_sgjp_filename";
  let dict = Dict.load_tab_full (results_path ^ noun_sgjp_filename) in
  print_endline "verb_sgjp_filename";
  let dict = Dict.load_tab_full (results_path ^ verb_sgjp_filename) in
  print_endline "lematy_nkjp_filename";
  let dict = Dict.load_freq_tab (zasoby_path ^ lematy_nkjp_filename) in
  Dict.print_quantities "results/interp-lematy_nkjp.txt" Dict.interp_selector dict;
  Dict.print_quantities "results/freq-lematy_nkjp.txt" Dict.freq_selector dict;*)
  ()

(* Comparison of dictionary versions.
   All calls are disabled; each one passes two dictionary files and an output
   file name to [Dict.compare_dicts_full]. Uncomment a line to run it. *)
let _ =
  (* Dict.compare_dicts_full (sgjp_path ^ sgjp_filename2015) (sgjp_path ^ sgjp_filename201605) "results/comparition_sgjp1_full.out"; *)
  (* Dict.compare_dicts_full (sgjp_path ^ sgjp_filename201605) (sgjp_path ^ sgjp_filename) "results/comparition_sgjp2_full.out"; *)
  (* Dict.compare_dicts_full (sgjp_path ^ polimorf_filename2015) (sgjp_path ^ polimorf_filename201605) "results/comparition_polimorf1_full.out"; *)
  (* Dict.compare_dicts_full (sgjp_path ^ polimorf_filename201605) (sgjp_path ^ polimorf_filename) "results/comparition_polimorf2_full.out"; *)
  ()

(* Splitting the dictionary.
   All calls are disabled; each one passes an input directory, a dictionary
   file name and [results_path] to [Dict.split_dict]. *)
let _ =
  (* Dict.split_dict sgjp_path sgjp_filename results_path; *)
  (* Dict.split_dict sgjp_path sgjp_filename201607 results_path;  *)
  (* Dict.split_dict sgjp_path sgjp_filename201605 results_path; *)
  (* Dict.split_dict sgjp_path polimorf_filename results_path; *)
  ()

(* Printing the uninflected (ndm) entries.
   Disabled; the commented steps load the full dictionary, merge its entries,
   mark them with [Dict.mark_ndm] and write the result with [Dict.print_ndm]. *)
let _ =
  (* let dict = Dict.load_tab_full (sgjp_path ^ sgjp_filename) in
  let dict = Dict.merge_entries dict in
  let dict = Dict.mark_ndm dict in
  Dict.print_ndm "results/ndm-sgjp.tab" dict; *)
  ()

(* [check_stem_generation path filename] loads the table [path ^ filename]
   with [Dict.load_tab_full] and feeds it through [Dict.merge_entries],
   [Dict.mark_ndm], [Dict.find_kolwiek_suffixes] and [Dict.generate_stem],
   in that order. The value returned by the last step is discarded, so the
   function is only useful for whatever those steps report or raise. *)
let check_stem_generation path filename =
  Dict.load_tab_full (path ^ filename)
  |> Dict.merge_entries
  |> Dict.mark_ndm
  |> Dict.find_kolwiek_suffixes
  |> Dict.generate_stem
  |> ignore

(* Checking that stemming works.
   All calls are disabled; each one runs [check_stem_generation] on one
   dictionary file. *)
let _ =
  (* check_stem_generation sgjp_path sgjp_filename; *)
(*   check_stem_generation results_path adj_sgjp_filename;  *)
  (* check_stem_generation results_path noun_sgjp_filename; *)
  (* check_stem_generation results_path noun_polimorf_filename; *)
  ()

(* Rule data shared by the steps below. All three values are computed once,
   when this module is initialised. *)
(* Result of [Rules.make_compound_rules ()]; the input for both values below. *)
let compound_rules = Rules.make_compound_rules ()
(* Built from [compound_rules]; passed as the [rules] argument of the
   validation and rule-generation helpers in this file. *)
let compound_rule_trees = Rules.make_compound_rule_trees compound_rules
(* Built from the same [compound_rules] by
   [Rules.make_interp_compound_rule_trees]; passed to
   [Dict.generate_interp_rules] together with [compound_rule_trees]. *)
let interp_compound_rule_trees = Rules.make_interp_compound_rule_trees compound_rules

(* Disabled debugging aid: writes [compound_rule_trees] to a file. *)
(* let _ = Rules.CharTrees.print_rules "results/rules_tree.txt" compound_rule_trees *)

(* [find_not_validated_forms rules path filename out_filename] loads the table
   [path ^ filename] with [Dict.load_tab], runs [Dict.validate rules] on it,
   applies [Dict.remove_validated_forms] and writes what is left to
   [out_filename] with [Dict.print]. Entries are not merged first. *)
let find_not_validated_forms rules path filename out_filename =
  Dict.load_tab (path ^ filename)
  |> Dict.validate rules
  |> Dict.remove_validated_forms
  |> Dict.print out_filename

(* [find_not_validated_entries rules path filename out_filename] loads the
   table [path ^ filename] with [Dict.load_tab], merges its entries, marks and
   then removes the ndm entries, runs [Dict.validate rules], applies
   [Dict.remove_validated_entries] and writes what is left to [out_filename]
   with [Dict.print]. *)
let find_not_validated_entries rules path filename out_filename =
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.mark_ndm
  |> Dict.remove_ndm
  |> Dict.validate rules
  |> Dict.remove_validated_entries
  |> Dict.print out_filename

(* Printing the forms that were not validated.
   All calls are disabled; each one runs [find_not_validated_forms] or
   [find_not_validated_entries] with [compound_rule_trees] on one table and
   names the output file. Some runs take the output of an earlier run
   (for example not_validated_noun.tab) as their input. *)
let _ =
  (* find_not_validated_forms compound_rule_trees results_path adj_sgjp_filename "results/not_validated_adj.tab";  *)
  (* find_not_validated_entries compound_rule_trees results_path adj_sgjp_filename "results/not_validated_adj.tab"; *)
  (* find_not_validated_entries compound_rule_trees results_path adj_polimorf_filename "results/not_validated_p_adj.tab"; *)
  (* find_not_validated_entries compound_rule_trees "results/" "not_validated_adj.tab" "results/not_validated_adj2.tab"; *)
  (* find_not_validated_entries compound_rule_trees results_path adv_sgjp_filename "results/not_validated_adv.tab"; *)
  (* find_not_validated_entries compound_rule_trees results_path adv_polimorf_filename "results/not_validated_p_adv.tab"; *)
  (* find_not_validated_entries compound_rule_trees results_path noun_sgjp_filename "results/not_validated_noun.tab";
  find_not_validated_forms compound_rule_trees results_path noun_sgjp_filename "results/not_validated_noun2.tab";
  find_not_validated_entries compound_rule_trees results_path noun_polimorf_filename "results/not_validated_p_noun.tab";
  find_not_validated_forms compound_rule_trees results_path noun_polimorf_filename "results/not_validated_p_noun2.tab"; *)
  (* find_not_validated_entries compound_rule_trees results_path "not_validated_noun.tab" "results/not_validated_noun3.tab"; *)
  (* find_not_validated_forms compound_rule_trees results_path "not_validated_noun.tab" "results/not_validated_noun4.tab"; *)
  (* find_not_validated_entries compound_rule_trees results_path verb_sgjp_filename "results/not_validated_verb.tab";
  find_not_validated_forms compound_rule_trees results_path verb_sgjp_filename "results/not_validated_verb2.tab";
  find_not_validated_entries compound_rule_trees results_path verb_polimorf_filename "results/not_validated_p_verb.tab";
  find_not_validated_forms compound_rule_trees results_path verb_polimorf_filename "results/not_validated_p_verb2.tab"; *)
  ()

(* [find_not_validated_lemmata rules path filename out_filename] loads the
   table [path ^ filename] with [Dict.load_tab], merges its entries, runs
   [Dict.validate rules], applies [Dict.remove_validated_entries] and writes
   the remaining entries to [out_filename] with [Dict.print_lemmata]. *)
let find_not_validated_lemmata rules path filename out_filename =
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.validate rules
  |> Dict.remove_validated_entries
  |> Dict.print_lemmata out_filename

(* Printing the lemmata that were not validated.
   The single call is disabled; it runs [find_not_validated_lemmata] with
   [compound_rule_trees] on not_validated_noun.tab from [results_path]. *)
let _ =
  (* find_not_validated_lemmata compound_rule_trees results_path "not_validated_noun.tab" "results/not_validated_noun_lemma.tab"; *)
  ()

(* [find_validated_lemmata rules path filename out_filename] is the
   counterpart of [find_not_validated_lemmata]: it loads the table
   [path ^ filename] with [Dict.load_tab], merges its entries, runs
   [Dict.validate rules], applies [Dict.remove_not_validated_entries] and
   writes the remaining entries to [out_filename] with [Dict.print_lemmata]. *)
let find_validated_lemmata rules path filename out_filename =
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.validate rules
  |> Dict.remove_not_validated_entries
  |> Dict.print_lemmata out_filename

(* Printing the validated lemmata.
   All calls are disabled; each one runs [find_validated_lemmata] with
   [compound_rule_trees] on one per-category SGJP table. *)
let _ =
  (* find_validated_lemmata compound_rule_trees results_path adj_sgjp_filename "results/validated_adj.tab";
  find_validated_lemmata compound_rule_trees results_path noun_sgjp_filename "results/validated_noun.tab";
  find_validated_lemmata compound_rule_trees results_path verb_sgjp_filename "results/validated_verb.tab"; *)
  ()

(* Rule generation.
   All calls are disabled; each one passes [compound_rule_trees], an input
   location and an output file name to [Dict.generate_rules] or
   [Dict.generate_rules_lu].
   NOTE(review): the integer passed to [Dict.generate_rules_lu] is not
   explained anywhere in this file - check its meaning in [Dict]. *)
let _ =
  (* Dict.generate_rules compound_rule_trees results_path adj_sgjp_filename "results/rules-odm-adj.txt"; *)
  (* Dict.generate_rules compound_rule_trees results_path noun_sgjp_filename "results/rules-odm-noun.txt"; *)
  (* Dict.generate_rules compound_rule_trees results_path adv_sgjp_filename "results/rules-adv.txt"; *)
  (* Dict.generate_rules compound_rule_trees results_path verb_sgjp_filename "results/rules-verb.txt"; *)
  (* Dict.generate_rules compound_rule_trees results_path "test.tab" "results/rules-test.txt"; *)
  (* Dict.generate_rules compound_rule_trees results_path "not_validated_noun.tab" "results/rules-nv-noun.txt"; *)
  (* Dict.generate_rules compound_rule_trees results_path "not_validated_p_noun.tab" "results/rules-nv-noun.txt"; *)
  (* Dict.generate_rules_lu compound_rule_trees 142 lu_path "results/rules-142_lu.txt"; *)
  (* Dict.generate_rules_lu compound_rule_trees 148 lu_path "results/rules-148_lu.txt"; *)
  (* Dict.generate_rules_lu compound_rule_trees 42 lu_path "results/rules-42_lu.txt"; *)
  ()

(* Generating rules for interpretations.
   This is the only stage of the file with an enabled call: the noun run that
   writes results/interp_rules_noun2.tab. The remaining invocations are kept
   commented out as ready-to-use alternatives.
   NOTE(review): the three boolean flags are positional and their meaning is
   defined by [Dict.generate_interp_rules] - check it before changing them. *)
let () =
  (* Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees true true true results_path adj_sgjp_filename "results/interp_rules_adj.tab";
  Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees false true true results_path adj_sgjp_filename "results/interp_rules_adj2.tab"; *)
  (* Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees true true true results_path adv_sgjp_filename "results/interp_rules_adv.tab"; *)
  (* Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees true true true results_path verb_sgjp_filename "results/interp_rules_verb.tab";
  Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees true false true results_path verb_sgjp_filename "results/interp_rules_verb2.tab"; *)
  (* Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees true true true results_path noun_sgjp_filename "results/interp_rules_noun.tab"; *)
  Dict.generate_interp_rules
    compound_rule_trees interp_compound_rule_trees
    true true false
    results_path noun_sgjp_filename
    "results/interp_rules_noun2.tab";
  (* Dict.generate_interp_rules compound_rule_trees interp_compound_rule_trees false true true results_path noun_sgjp_filename "results/interp_rules_noun3.tab"; *)
  ()

(* Generating compound rules annotated with frequencies.
   All calls are disabled; each one passes [interp_compound_rule_trees], an
   input table and an output file name to [Dict.generate_rule_frequencies].
   The output names match the rule files read by the disabled [generate_alt]
   and [Dict.generate_stem_dict] calls later in this file. *)
let _ =
  (* Dict.generate_rule_frequencies interp_compound_rule_trees results_path adj_sgjp_filename "results/freq_rules-adj.tab"; *)
  (* Dict.generate_rule_frequencies interp_compound_rule_trees sgjp_path sgjp_filename "results/freq_rules.tab"; *)
  ()

(* [generate_alt rules_filename path filename out_filename] validates a
   dictionary against frequency-annotated rules and prints what is left.

   The rules are read from [rules_filename] with [Rules.load_freq_rules] and
   turned into trees with [Rules.CharTrees.create] before the dictionary is
   touched. The table [path ^ filename] is then loaded with [Dict.load_tab],
   merged, stripped of the "cond" category, marked with [Dict.mark_ndm],
   checked with [Dict.validate_interp], filtered by
   [Dict.remove_validated_forms] and written to [out_filename]. *)
let generate_alt rules_filename path filename out_filename =
  let rule_trees =
    rules_filename
    |> Rules.load_freq_rules
    |> Rules.CharTrees.create in
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.remove_cat "cond"
  |> Dict.mark_ndm
  |> Dict.validate_interp rule_trees
  |> Dict.remove_validated_forms
  |> Dict.print out_filename

(* Validating the frequency-annotated rules / generating the list of
   exceptions.
   All calls are disabled; each one runs [generate_alt] with a rule file from
   the results directory, an input table and an output file name. *)
let _ =
  (* generate_alt "results/freq_rules-adj.tab" results_path adj_sgjp_filename "results/alt-adj.tab"; *)
  (* generate_alt "results/freq_rules.tab" sgjp_path sgjp_filename "results/alt.tab"; *)
  ()

(* Generating stems with rules.
   All calls are disabled; each one passes a rule file, an input table and an
   output file name to [Dict.generate_stem_dict]. *)
let _ =
  (* Dict.generate_stem_dict "results/freq_rules-adj.tab" results_path adj_sgjp_filename "results/stem-adj.tab"; *)
  (* Dict.generate_stem_dict "results/freq_rules.tab" sgjp_path sgjp_filename "results/stem.tab"; *)
  ()

(**********************************************************************************)
(* Tests *)

(* [print_interpretations l] sorts the list of
   [(lemma, interp, freq, tags)] tuples with the polymorphic [compare] and
   prints one line per tuple to standard output: lemma, interpretation and
   frequency separated by tabs, followed by the tags joined with ";". *)
let print_interpretations l =
  let print_row (lemma, interp, freq, tags) =
    let joined_tags = String.concat ";" tags in
    Printf.printf "%s\t%s\t%d\t%s\n" lemma interp freq joined_tags in
  let sorted = Xlist.sort l compare in
  Xlist.iter sorted print_row

(* Manual checks of the analyser: each disabled pair of lines looks up one
   sample word form with [Inflexion.get_interpretations] and prints the result
   with [print_interpretations]. Everything is commented out, so this binding
   currently does nothing. *)
let _ =
  (* let l = Inflexion.get_interpretations "życzliwą" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "żyźniejszego" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "zwiśli" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "najzieleńsza" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "najtandetniejsza" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "nieżelazny" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "któregokolwiek" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "większą" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "bordo" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "sexi" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "sexy" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "sepulkową" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "profesory" in
  print_interpretations l; *)
  (* let l = Inflexion.get_interpretations "chrobotnąwszy" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "chronografowi" in
  print_interpretations l;
  let l = Inflexion.get_interpretations "chowaniami" in
  print_interpretations l; *)
  (* let l = Inflexion.get_interpretations "Czechami" in
  print_interpretations l; *)
  (* let l = Inflexion.get_interpretations "Włoszech" in
  print_interpretations l; *)
  ()

(* let _ =
  let dict = Dict.load_tab (sgjp_path ^ sgjp_filename) in
  let dict = Dict.merge_entries dict in
  let dict = Dict.remove_cat "cond" dict in
  Xlist.iter dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.iter entry.forms (fun form ->
      let l = Inflexion.get_interpretations form.orth in
      let n = Xlist.fold l 0 (fun n (lemma,interp,freq,tags) ->
        if lemma = simple_lemma && interp = form.interp then n+1 else n) in
      if n <> 1 then printf "%d\t%s\t%s\t%s\n%!" n form.orth entry.lemma form.interp)) *)

(**********************************************************************************)
(**********************************************************************************)