(* generate.ml 2.01 KB *)
open Xstd

(* Relative path of the directory holding the shared NLP resources. *)
let nlp_resources_path = "../../NLP resources/"

(* Location of the SGJP dictionary inside the NLP resources directory, and
   the name of the dictionary file read from it (judging by its extension,
   a gzipped tab-separated dump -- TODO confirm). *)
let sgjp_path = Printf.sprintf "%sSGJP/" nlp_resources_path
let sgjp_filename = "sgjp-20170730.tab.gz"


(* Input dictionaries as (directory, file name) pairs: the SGJP dictionary
   first, followed by the supplementary tables stored under "data/".
   The list is handed to [Dict.generate_rule_frequencies_list] by the
   top-level driver of this file. *)
let sources =
  let local_tables = [
    "noun-supplement-acro.tab";
    "noun-supplement-polimorf.tab";
    "dial_ach.tab";
    "dial_ami2.tab";
    "dial_ami3.tab";
    "dial_ami4.tab";
    "dial_ami.tab";
    "dial_ą2.tab";
    "dial_ą.tab";
    "dial_ę.tab";
    "dial_my.tab";
    "dial_sz.tab";
    "dial_ym.tab";
  ] in
  (sgjp_path, sgjp_filename)
  :: List.map (fun table -> ("data/", table)) local_tables

(* Compound rules built once when this module is initialised. *)
let compound_rules =
  ENIAMmorphologyRules.make_compound_rules ()

(* Rule trees derived from [compound_rules]; passed to
   [Dict.generate_rule_frequencies_list] by the top-level driver. *)
let interp_compound_rule_trees =
  compound_rules
  |> ENIAMmorphologyRules.make_interp_compound_rule_trees

(* [generate_alt rules_filename path filename out_filename] loads the
   frequency rules from [rules_filename] and the dictionary from
   [path ^ filename], runs the dictionary through the processing steps
   listed below (in that order) and prints the result to [out_filename]. *)
let generate_alt rules_filename path filename out_filename =
  (* The rules are loaded and indexed before the dictionary is read. *)
  let rule_trees =
    rules_filename
    |> ENIAMmorphologyRules.load_freq_rules
    |> ENIAMmorphologyRules.CharTrees.create in
  Dict.load_tab (path ^ filename)
  |> Dict.merge_entries
  |> Dict.process_interps
  |> Dict.remove_cat "cond"
  (* disabled step: |> Dict.mark_ndm *)
  |> Dict.validate_interp rule_trees
  |> Dict.remove_validated_forms
  |> Dict.print out_filename

(* [generate_lemmata path filename out_filename] reads the dictionary at
   [path ^ filename], gathers the lemma of every entry (after
   [Stem.simplify_lemma]) into a string set, and writes the set's elements
   to [out_filename], one per line. *)
let generate_lemmata path filename out_filename =
  let entries = Dict.load_tab (path ^ filename) in
  (* Fold step: add the simplified lemma of one entry to the set. *)
  let add_lemma seen entry =
    let lemma = Stem.simplify_lemma entry.ENIAMmorphologyTypes.lemma in
    StringSet.add seen lemma in
  let lemmata = Xlist.fold entries StringSet.empty add_lemma in
  File.file_out out_filename (fun out ->
    StringSet.iter lemmata (fun lemma -> Printf.fprintf out "%s\n" lemma))


(* Entry point: regenerates the files under "resources/" one after another.
   [let () =] is used instead of [let _ =] so that the compiler checks the
   whole sequence really has type [unit].

   The first step is given "resources/freq_rules.tab" as its last argument,
   and the same path is then passed as the rules file to the next two steps,
   so the order of the calls presumably matters (the first step is expected
   to produce that file -- TODO confirm against [Dict]). *)
let () =
  (* Single binding for the path shared by the first three steps. *)
  let freq_rules = "resources/freq_rules.tab" in
  Dict.generate_rule_frequencies_list interp_compound_rule_trees sources freq_rules;
  generate_alt freq_rules sgjp_path sgjp_filename "resources/alt.tab";
  Dict.generate_stem_dict freq_rules sgjp_path sgjp_filename "resources/stem.tab";
  Dict.generate_wyglos sgjp_path sgjp_filename "resources/wyglos.tab";
  generate_lemmata sgjp_path sgjp_filename "resources/lemmata.tab";
  ()