dict.ml 55.2 KB

Edit Raw Blame History

open Xstd
open Printf
open Types

let get_form e =
  match e.forms with
    [form] -> form
  | _ -> failwith "get_form"

let load_tab filename =
  File.load_tab filename (function
      orth :: lemma :: interp :: _ ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
    | line -> failwith ("load_tab: " ^ (String.concat "\t" line)))

let load_tab_full filename =
  File.load_tab filename (function
      [orth; lemma; interp] ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
    | [orth; lemma; interp; proper_type] ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]; proper_type=proper_type}
    | [orth; lemma; interp; proper_type; genre] ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; genre=genre}]; proper_type=proper_type}
    | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))

let load_lu dict id path =
  let filename = path ^ "morf_rel_" ^ string_of_int id ^ "_lu.tab" in
  File.fold_tab filename dict (fun dict -> function
      [lemma1; lemma2] ->
        {lemma1=lemma1; lemma2=lemma2; rel_id=id; lu_stem=""; lu_validated=false;validated1=false;validated2=false} :: dict
    | line -> failwith ("load_lu: " ^ (String.concat "\t" line)))

let pos = StringSet.of_list [
  "subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
  "brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
  "adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
  "winien";"siebie";"numcomp"
  ]

let rec find_pos rev = function
    s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l
  | [] -> failwith "find_pos"

let split_lemma_interp s =
  let l = Xstring.split_delim ":" s in
  let lemma,interp = find_pos [List.hd l] (List.tl l) in
  String.concat ":" lemma, String.concat ":" interp

let rec remove_empties = function
    "" :: l -> remove_empties l
  | l -> l

let split_freq_orth s =
  match remove_empties (Xstring.split " " s) with
    freq :: l -> int_of_string freq, String.concat " " l
  | _ -> failwith "split_freq_orth"

let load_freq_tab filename =
  File.load_tab filename (function
      [freq_orth; lemma_interp] ->
        let freq,orth = split_freq_orth freq_orth in
        let lemma,interp = split_lemma_interp lemma_interp  in
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; freq=freq}]}
    | line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line)))

let proper_type_selector e = e.proper_type
let genre_selector e = (get_form e).genre
let interp_selector e = (get_form e).interp
let freq_selector e = (get_form e).freq

let print_quantities out_filename selector dict =
  let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry ->
    StringQMap.add qmap (selector entry)) in
  File.file_out out_filename (fun file ->
    StringQMap.iter qmap (fun k v ->
      fprintf file "%6d\t%s\n" v k))

(**********************************************************************************)

let load_dict_as_set filename =
  let l = load_tab filename in
  List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]))))

let load_dict_as_set_full filename =
  let l = load_tab_full filename in
  List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]))))

let rec compare_dicts_rec file = function
    [],[] -> ()
  | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
  | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
  | a :: la, b :: lb ->
       if a = b then compare_dicts_rec file (la,lb) else
       if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
       (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))

let compare_dicts filename1 filename2 filename_out =
  let dict1 = load_dict_as_set filename1 in
  let dict2 = load_dict_as_set filename2 in
  File.file_out filename_out (fun file ->
    compare_dicts_rec file (dict1,dict2))

let compare_dicts_full filename1 filename2 filename_out =
  let dict1 = load_dict_as_set_full filename1 in
  let dict2 = load_dict_as_set_full filename2 in
  File.file_out filename_out (fun file ->
    compare_dicts_rec file (dict1,dict2))

(**********************************************************************************)

let find_entry_cat entry =
  let form = get_form entry in
  let cat,tags = match Xstring.split ":" form.interp with
      cat :: tags -> cat,tags
    | _ -> failwith ("find_entry_type: " ^ form.interp) in
  if cat = "praet" then
    let t = match tags with
      _ :: _ :: t :: _ -> t
    | _ -> failwith ("find_entry_cat: " ^ form.interp) in
    if t = "pri" || t = "sec" || t = "ter" then "cond" else "verb" else
  if cat = "subst" || cat = "depr" then "noun" else
  if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else
  if cat = "adv" then "adv" else
  if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
     cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else
  if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
     cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" ||
     cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "numcomp" || cat = "" then "other" else
  if cat = "cond" then "cond" else
  failwith ("find_entry_cat: " ^ cat)

let assign_entry_cat dict =
  Xlist.rev_map dict (fun entry ->
    {entry with cat = find_entry_cat entry})

let split_dict in_path filename out_path =
  let dict = load_tab (in_path ^ filename) in
  let dict = List.rev (assign_entry_cat dict) in
  let filename = if Xstring.check_sufix ".gz" filename then
    Xstring.cut_sufix ".gz" filename else filename in
  File.file_out (out_path ^ "noun_" ^ filename) (fun noun_file ->
  File.file_out (out_path ^ "adj_" ^ filename) (fun adj_file ->
  File.file_out (out_path ^ "adv_" ^ filename) (fun adv_file ->
  File.file_out (out_path ^ "verb_" ^ filename) (fun verb_file ->
  File.file_out (out_path ^ "other_" ^ filename) (fun other_file ->
  Xlist.iter dict (fun entry ->
    let form = get_form entry in
    try
      let file = match entry.cat with
          "noun" -> noun_file
        | "adj" -> adj_file
        | "adv" -> adv_file
        | "verb" -> verb_file
        | "other" -> other_file
        | "cond" -> raise Not_found
        | _ -> failwith "split_dict" in
      fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
    with Not_found -> ()))))))

let merge_entries dict =
  let dict = assign_entry_cat dict in
  let map = Xlist.fold dict StringMap.empty (fun map entry ->
    let form = get_form entry in
    let key =
      if entry.cat = "noun" then
        let gender = match Xstring.split ":" form.interp with
            ["depr";_;_;"m2"] -> "m1"
          | "depr" :: _ -> failwith ("merge_entries: " ^ form.interp)
          | [_;_;_;gender] -> gender
          | [_;_;_;gender;col] -> gender ^ ":" ^ col
          | _ -> failwith ("merge_entries: " ^ form.interp) in
        entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender
      else entry.lemma ^ "|" ^ entry.cat in
    StringMap.add_inc map key entry (fun e ->
      if entry.proper_type <> e.proper_type then
        failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else
      {e with forms = form :: e.forms})) in
  StringMap.fold map [] (fun dict _ e -> e :: dict)

let remove_cat cat dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.cat = cat then dict
    else entry :: dict)

(*
*)
let rec get_aspect lemma = function
    (f : form) :: l ->
      (match Xstring.split ":" f.interp with
        ["inf";a] -> a
      | ["ger";_;_;_;a;_] -> a
      | _ -> get_aspect lemma l)
  | [] -> failwith ("get_aspect: " ^ lemma)

let get_lemma_suf lemma =
  let lemma_suf =
    if lemma = "" then "" else
    List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string (Stem.simplify_lemma lemma))) in
  match lemma_suf with
    "a" -> "a"
  | "e" -> "e"
  | "o" -> "o"
  | "y" -> "y"
  | "i" -> "y"
  | "ę" -> "ę"
  | _ -> "ε"

let get_orth_suf orth =
  let orth_suf =
    if orth = "" then "" else
    List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string orth)) in
  match orth_suf with
    "j" -> "j"
  | "e" -> "e"
  | _ -> "ε"

let merge_interps lemma forms =
  let lemma_suf = get_lemma_suf lemma in
  let map = Xlist.fold forms StringMap.empty (fun map form ->
    (* printf "merge_interps 1: %s %s\n%!" form.orth form.interp; *)
    StringMap.add_inc map form.orth (StringSet.singleton form.interp) (fun set -> StringSet.add set form.interp)) in
  StringMap.fold map [] (fun forms orth set ->
    (* printf "merge_interps 2: %s %s\n%!" orth (String.concat " " (StringSet.to_list set)); *)
    let orth_suf = get_orth_suf orth in
    match lemma_suf, Xlist.sort (StringSet.to_list set) compare with
      _,["adv"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms
    | _,["adv";"adv:pos"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";"adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms
    | _,["adj:sg:acc:m1.m2:pos";"adj:sg:gen:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:pos|adj:sg:acc:m1.m2:pos"} :: forms
    | _,["adj:sg:dat:f:pos";"adj:sg:gen:f:pos";"adj:sg:loc:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:pos"} :: forms
    | _,["adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:pos|adj:pl:acc:m1:pos"} :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:pos";"adj:sg:inst:m1.m2.m3.n:pos";"adj:sg:loc:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:pos|adj:pl:dat:m1.m2.m3.f.n:pos"} :: forms
    | _,["adj:sg:acc:f:pos";"adj:sg:inst:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:pos"} :: forms
    | _,["adj:pl:nom.voc:m1:pos";"adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos|adj:pl:nom.voc:m1:pos"} :: forms
    | _,["adj:sg:acc:m3:pos";"adj:sg:nom:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms
    | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos"} :: forms
    | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adja"} :: forms
    | _,["adj:pl:nom:m2.m3.f.n:pos";"adj:sg:nom:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:n:pos|adj:pl:nom:m2.m3.f.n:pos"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:sup";"adj:pl:nom.voc:m2.m3.f.n:sup";"adj:sg:acc:n:sup";"adj:sg:nom.voc:n:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:sup|adj:pl:nom.acc.voc:m2.m3.f.n:sup"} :: forms
    | _,["adj:sg:acc:m1.m2:sup";"adj:sg:gen:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:sup|adj:sg:acc:m1.m2:sup"} :: forms
    | _,["adj:sg:dat:f:sup";"adj:sg:gen:f:sup";"adj:sg:loc:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:sup"} :: forms
    | _,["adj:sg:acc:m3:sup";"adj:sg:nom.voc:m1.m2.m3:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:sup|adj:sg:acc:m3:sup"} :: forms
    | _,["adj:pl:acc:m1:sup";"adj:pl:gen:m1.m2.m3.f.n:sup";"adj:pl:loc:m1.m2.m3.f.n:sup"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:sup|adj:pl:acc:m1:sup"} :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:sup";"adj:sg:inst:m1.m2.m3.n:sup";"adj:sg:loc:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:sup|adj:pl:dat:m1.m2.m3.f.n:sup"} :: forms
    | _,["adj:sg:acc:f:sup";"adj:sg:inst:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:sup"} :: forms
    | _,["adj:pl:acc:m2.m3.f.n:com";"adj:pl:nom.voc:m2.m3.f.n:com";"adj:sg:acc:n:com";"adj:sg:nom.voc:n:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:com|adj:pl:nom.acc.voc:m2.m3.f.n:com"} :: forms
    | _,["adj:sg:acc:m1.m2:com";"adj:sg:gen:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:com|adj:sg:acc:m1.m2:com"} :: forms
    | _,["adj:sg:dat:f:com";"adj:sg:gen:f:com";"adj:sg:loc:f:com"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:com"} :: forms
    | _,["adj:sg:acc:m3:com";"adj:sg:nom.voc:m1.m2.m3:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:com|adj:sg:acc:m3:com"} :: forms
    | _,["adj:pl:acc:m1:com";"adj:pl:gen:m1.m2.m3.f.n:com";"adj:pl:loc:m1.m2.m3.f.n:com"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:com|adj:pl:acc:m1:com"} :: forms
    | _,["adj:pl:dat:m1.m2.m3.f.n:com";"adj:sg:inst:m1.m2.m3.n:com";"adj:sg:loc:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:com|adj:pl:dat:m1.m2.m3.f.n:com"} :: forms
    | _,["adj:sg:acc:f:com";"adj:sg:inst:f:com"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:com"} :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
       "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
       "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
       "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
       "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos"} :: forms
    | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";
       "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";
       "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos";
       "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos";
       "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos|adja"} :: forms
    | _,["ger:pl:nom.acc:n:imperf.perf:aff";"ger:sg:gen:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:aff|ger:pl:nom.acc:n:imperf.perf:aff"} :: forms
    | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff";"ppas:sg:nom.acc.voc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:aff|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff"} :: forms
    | _,["ppas:sg:acc:m1.m2:imperf.perf:aff";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff|ppas:sg:acc:m1.m2:imperf.perf:aff"} :: forms
    | _,["ppas:sg:acc:m3:imperf.perf:aff";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff|ppas:sg:acc:m3:imperf.perf:aff"} :: forms
    | _,["ppas:pl:acc:m1:imperf.perf:aff";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff|ppas:pl:acc:m1:imperf.perf:aff"} :: forms
    | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff"} :: forms
    | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff";"pact:sg:nom.acc.voc:n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:aff|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff"} :: forms
    | _,["pact:sg:acc:m1.m2:imperf:aff";"pact:sg:gen:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:aff|pact:sg:acc:m1.m2:imperf:aff"} :: forms
    | _,["pact:pl:nom.voc:m1:imperf:aff";"pact:sg:acc:m3:imperf:aff";"pact:sg:nom.voc:m1.m2.m3:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:aff|pact:sg:acc:m3:imperf:aff|pact:pl:nom.voc:m1:imperf:aff"} :: forms
    | _,["pact:pl:acc:m1:imperf:aff";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff|pact:pl:acc:m1:imperf:aff"} :: forms
    | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:aff";"pact:sg:inst.loc:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:aff|pact:pl:dat:m1.m2.m3.f.n:imperf:aff"} :: forms
    | _,["ger:pl:nom.acc:n:imperf.perf:neg";"ger:sg:gen:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:neg|ger:pl:nom.acc:n:imperf.perf:neg"} :: forms
    | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg";"ppas:sg:nom.acc.voc:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:neg|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg"} :: forms
    | _,["ppas:sg:acc:m1.m2:imperf.perf:neg";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg|ppas:sg:acc:m1.m2:imperf.perf:neg"} :: forms
    | _,["ppas:sg:acc:m3:imperf.perf:neg";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg|ppas:sg:acc:m3:imperf.perf:neg"} :: forms
    | _,["ppas:pl:acc:m1:imperf.perf:neg";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg|ppas:pl:acc:m1:imperf.perf:neg"} :: forms
    | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg"} :: forms
    | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg";"pact:sg:nom.acc.voc:n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:neg|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg"} :: forms
    | _,["pact:sg:acc:m1.m2:imperf:neg";"pact:sg:gen:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:neg|pact:sg:acc:m1.m2:imperf:neg"} :: forms
    | _,["pact:pl:nom.voc:m1:imperf:neg";"pact:sg:acc:m3:imperf:neg";"pact:sg:nom.voc:m1.m2.m3:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:neg|pact:sg:acc:m3:imperf:neg|pact:pl:nom.voc:m1:imperf:neg"} :: forms
    | _,["pact:pl:acc:m1:imperf:neg";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg|pact:pl:acc:m1:imperf:neg"} :: forms
    | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:neg";"pact:sg:inst.loc:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:neg|pact:pl:dat:m1.m2.m3.f.n:imperf:neg"} :: forms
    | _,["ger:pl:gen:n:imperf.perf:aff";"inf:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="inf:imperf.perf"} :: forms
    | _,["praet:sg:m1.m2.m3:imperf.perf";"praet:sg:m1.m2.m3:imperf.perf:nagl"] -> {empty_form with orth=orth; interp="praet:sg:m1.m2.m3:imperf.perf:nagl"} :: forms
    | _,["fin:sg:ter:imperf.perf";"ger:sg:nom.acc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:sg:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ger:sg:nom.acc:n:imperf.perf:aff"} :: forms
    | _,["ger:pl:gen:n:imperf.perf:aff";"impt:sg:sec:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="impt:sg:sec:imperf.perf"} :: forms
    | _,["fin:pl:ter:imperf.perf";"ppas:sg:acc.inst:f:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:pl:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ppas:sg:acc.inst:f:imperf.perf:aff"} :: forms
    | "a",["subst:sg:dat.loc:f";"subst:sg:gen:f"] ->
          if orth_suf = "j" then {empty_form with orth=orth; interp="subst:sg:gen.dat.loc:f"} :: forms
          else {empty_form with orth=orth; interp="subst:sg:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: forms
    (* | "a",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f"] -> (*print_endline lemma;*) {empty_form with orth=orth; interp="subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: {empty_form with orth=orth; interp="subst:sg:gen:f"} :: forms *)
    | "a",["subst:pl:gen:f";"subst:pl:loc:f"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:f"} :: forms
    | "ε",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "ε",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
    | "ε",["subst:sg:loc:m1";"subst:sg:voc:m1"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
    | "ε",["subst:sg:loc:m2";"subst:sg:voc:m2"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m2"} :: forms
    | "ε",["subst:sg:loc:m3";"subst:sg:voc:m3"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: forms
    | "ε",["subst:sg:gen:m3";"subst:sg:loc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: {empty_form with orth=orth; interp="subst:sg:gen:m3"} :: forms
    | "ε",["subst:sg:acc:f";"subst:sg:nom:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc:f"} :: forms
    | "ε",["subst:pl:gen:f";"subst:pl:nom.acc.voc:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:pl:nom.acc.voc:f"} :: forms
    | "ε",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: forms
    | "y",["subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: forms
    | "y",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "y",["subst:pl:gen.acc:m1";"subst:pl:loc:m1"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m1"} :: forms
    | "y",["subst:pl:dat:m1";"subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1|subst:pl:dat:m1"} :: forms
    | "y",["subst:sg:nom:m2";"subst:sg:voc:m2"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m2"} :: forms
    | "y",["subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: forms
    | "y",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
    | "y",["subst:pl:gen:m2";"subst:pl:loc:m2"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m2"} :: forms
    | "y",["subst:pl:dat:m2";"subst:sg:inst:m2";"subst:sg:loc:m2"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m2|subst:pl:dat:m2"} :: forms
    | "y",["subst:sg:nom.acc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc.voc:m3"} :: forms
    | "y",["subst:pl:gen:m3";"subst:pl:loc:m3"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:m3"} :: forms
    | "y",["subst:pl:dat:m3";"subst:sg:inst:m3";"subst:sg:loc:m3"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m3|subst:pl:dat:m3"} :: forms
    | "y",["subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:nom:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.loc.voc:f"} :: forms
    | "e",["depr:pl:nom.acc.voc:m2";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
    | "e",["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"} :: {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms
    | "e",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "e",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1"} :: forms
    | "e",["subst:pl:dat:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: {empty_form with orth=orth; interp="subst:pl:dat:n:ncol"} :: forms
    | "e",["subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: forms
    | "e",["subst:pl:gen:n:ncol";"subst:pl:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:n:ncol"} :: forms
    | "e",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms
    | "e",["subst:pl:nom.acc.voc:n:ncol";"subst:sg:gen:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:gen:n:ncol|subst:pl:nom.acc.voc:n:ncol"} :: forms
    | "o",["subst:sg:nom.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
    | "o",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms
    | "o",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: {empty_form with orth=orth; interp="depr:pl:nom.acc.voc:m2"} :: forms
    | "o",["subst:sg:dat.loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:dat:m1"} :: {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: forms
    | "o",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms
    | "o",["subst:pl:dat:m1:pt";"subst:pl:loc:m1:pt"] -> {empty_form with orth=orth; interp="subst:pl:dat.loc:m1:pt"} :: forms
    | "ε",["subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms
    | "ε",["subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol";"subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms
    | "ę",["subst:sg:dat:n:col";"subst:sg:loc:n:col"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:col"} :: forms
    | "ę",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms
    | "o",["subst:sg:loc:m1";"subst:sg:voc:m1"] ->
          if orth_suf = "e" then {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms
          else {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms
    | _,[interp] -> {empty_form with orth=orth; interp=interp} :: forms
    | _,interps ->
        (* print_endline ("merge_interps: " ^ lemma_suf ^ " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *)
        Xlist.fold interps forms (fun forms interp ->
          {empty_form with orth=orth; interp=interp} :: forms))

let process_interps dict =
  Xlist.rev_map dict (fun entry ->
    if entry.cat = "verb" then
      let aspect = get_aspect entry.lemma entry.forms in
      let forms = Xlist.rev_map entry.forms (fun f ->
        let interp = match (Xstring.split ":" f.interp) with
            ["fin";n;p;_] -> String.concat ":" ["fin";n;p;"imperf.perf"]
          | ["impt";n;p;_] -> String.concat ":" ["impt";n;p;"imperf.perf"]
          | "pcon" :: _ -> f.interp
          | "pacta" :: _ -> f.interp
          | "pact" :: _ -> f.interp
          | ["ger";n;c;g;_;a] -> String.concat ":" ["ger";n;c;g;"imperf.perf";a]
          | ["praet";n;g;_] -> String.concat ":" ["praet";n;g;"imperf.perf"]
          | ["praet";n;g;_;a] -> String.concat ":" ["praet";n;g;"imperf.perf";a]
          | ["inf";_] -> String.concat ":" ["inf";"imperf.perf"]
          | ["pant";_] -> String.concat ":" ["pant";"imperf.perf"]
          | ["imps";_] -> String.concat ":" ["imps";"imperf.perf"]
          | ["ppas";n;c;g;_;a] -> String.concat ":" ["ppas";n;c;g;"imperf.perf";a]
          | _ -> print_endline ("merge_interps: " ^ f.interp); f.interp in
        {f with interp=interp}) in
      let forms = merge_interps entry.lemma forms in
      {entry with aspect=aspect; forms=forms} else
    {entry with forms=merge_interps entry.lemma entry.forms})

(**********************************************************************************)

let mark_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.cat <> "noun" &&  entry.cat <> "adj" then entry :: dict else
    let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
      StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
    let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
      StringSet.fold orths qmap StringQMap.add) in
    let n = StringMap.size map in
    let found = StringQMap.fold qmap [] (fun found orth v ->
      if v = n then orth :: found else found) in
    match found with
      [] -> entry :: dict
    | [orth] ->
        let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
          if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
        let dict = {entry with forms=odm} :: dict in
        {entry with forms=ndm; ndm=true} :: dict
    | _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))

let print_ndm filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      if entry.ndm then
        let orth = (List.hd entry.forms).orth in
        fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))

let remove_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.ndm then dict
    else entry :: dict)

let remove_not_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if not entry.ndm then dict
    else entry :: dict)


let kolwiek_lemmas = StringSet.of_list [
  (* adj *)
  "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
  "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
  "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
  (* noun *)
  "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
  "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
  "ktoś:s2"; "któż";
  (* adv *)
  "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]

let kolwiek_suffixes = [
  "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]

let find_kolwiek_suffixes dict =
  Xlist.rev_map dict (fun entry ->
    if StringSet.mem kolwiek_lemmas entry.lemma then
      {entry with forms=Xlist.map entry.forms (fun form ->
          {form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf ->
            if Xstring.check_sufix kolwiek_suf orth then
              Xstring.cut_sufix kolwiek_suf orth
            else orth)})}
    else entry)

let exceptional_lemmata = StringSet.of_list ([
  (* wiele stemów *)
  "Apollo"; "Aujeszky"; "Białystok"; "Gózd"; "Krasnystaw"; "Różanystok"; "Wielkanoc"; "białagłowa";
  "deszcz"; "imćpan"; "iściec"; "otrząs"; "rzeczpospolita"; "wilczełyko"; "woleoczko";

  "prapraojciec"; "praojciec"; "ojciec"; "współbrat"; "spółbrat"; "półbrat"; "brat";
  "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; "człowiek";
  "półdziecko"; "+lecie"; "zimoziele"; "ziele"; "trójziele"; "nasienie";
  "ksiądz"; "dech"; "tydzień"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest";
  "dziecko"; "ucho:s2"; "oko:s2"; "cześć:s"; "jo-jo"; "Zabłotce"; "tysiąc:s1"; "półmiesiąc"; "miesiąc"; ""; ""; "";
  "Pia"; "ręka"; "człek"; "Kozak:s1"; "bóg"; "psubrat"; "pieniądz"; ""; ""; ""; "";
  "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
  "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
  "niebiosa"; "Włochy:s1"; "Niemcy"; "Węgry"; "Austro-Węgry"; ""; ""; ""; ""; ""; ""; "";
  "zając:s1"; "tysiąc:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; "";
(*  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";*)
  "bliscy"; "ojcowie"; "teściowie"; "ichmościowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; "";
  "małżonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; "";
  "starzy"; "wasi"; "nasi"; "najmłodsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "głównia"; "głownia"; "dźwignia"; ""; ""; ""; "";
  "workowiśnia"; "wiśnia"; "sośnia"; "laurowiśnia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; "";
  "makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; "";
  "Uj"; "PIT"; "ChAT"; "podczłowiek"; "nieczłowiek"; "cześć"; "ktoś"; "ktosik"; ""; ""; ""; "";
  "+ówna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coś"; "cośtam"; ""; ""; "";
  "zając"; "tysiąc"; "rok"; "roczek"; "oko"; "ucho"; "Włochy"; "niebiosy"; "wici"; ""; ""; "";
  "André"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";

  "zły:a"; "dobry:a"; "przymały"; "mały:a"; "duży"; "wielki:a";
  "ppoż."; "ppanc."; "pepanc."; "midi:a"; ""; ""; ""; ""; ""; ""; ""; "";
  "zwać"; "wiedzieć"; "pójść"; "przejść"; "dojść"; "zsiąść"; "iść"; ""; ""; ""; ""; "";
  "być"; "zasłonić"; "słonić"; "przysłonić"; "przesłonić"; "osłonić"; "odsłonić"; ""; ""; ""; ""; "";

  (*
  (* błąd w słowniku *)
  "ówże";
  (* wiele stemów *)
  "twój:a"; "swój"; "mój:a"; "wszystek";
  (* oboczności w stemie *)
  "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
  "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
  "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
  "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
  "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
  "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
  "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
  "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
  "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
  "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
  "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
  "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
  (* oboczności w odmianie *)
  "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
  "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  (* pozostawione *)
  "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
  "Pia"; "jo-jo"; "+lecie"; "";
  "zagrząźć"; "zrzeć";
  (* niepełny paradygmat *)
  "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
  "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
  "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
  "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
  "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
  (* wiele stemów *)
  "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
  "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
  "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
  (* inne *)
  "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
  "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
  "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
  "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
(*
   "moi"; "twoi";
  (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
  (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
  "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
  "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
  "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
  "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
  "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
  "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
  "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
  "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
  "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
  "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  ] @ File.load_lines "../morphology/data/obce.tab" @ File.load_lines "../morphology/data/akronimy.tab" @
    (* File.load_lines "results/interp_validated_verb.tab" @  *)
    (*File.load_lines "results/interp_validated_noun.tab" @ File.load_lines "results/interp_validated_adj.tab" @
    File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @
    File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *) [])

let remove_exceptional_lemmata dict =
  Xlist.fold dict [] (fun dict entry ->
    if StringSet.mem exceptional_lemmata entry.lemma then dict
    else entry :: dict)

let remove_exceptional_lemmata_gen ex dict =
  Xlist.fold dict [] (fun dict entry ->
    if StringSet.mem ex entry.lemma then dict
    else entry :: dict)

let generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    {entry with stem=
      if entry.ndm then (List.hd entry.forms).orth else
      if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
        Stem.generate_stem entry
      else ""})

let phon_generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    {entry with phon_stem=
      if entry.ndm then (List.hd entry.forms).phon_orth else
      if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
        Stem.phon_generate_stem entry
      else []})

let generate_stem_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with lu_stem=Stem.generate_stem_lu entry.lemma1 entry.lemma2})

let lowercase_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with
      lemma1=Xunicode.lowercase_utf8_string entry.lemma1;
      lemma2=Xunicode.lowercase_utf8_string entry.lemma2})


let fonetic_translation dict =
  Xlist.fold dict [] (fun dict e ->
    try
      let lemma = Stem.simplify_lemma e.lemma in
      let phon_lemma = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules lemma in
      let phon_stem = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules e.stem in
      {e with phon_lemma = phon_lemma; phon_stem=phon_stem;
        forms = Xlist.map e.forms (fun f ->
          let phon_orth = Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules f.orth in
          {f with phon_orth = phon_orth})} :: dict
    with
      Fonetics.NotFound(x,s) -> printf "NF %s %s %s\n%!" e.lemma x s; dict
    | Fonetics.NotEqual(x,s,t) -> printf "NE %s %s %s %s\n%!" e.lemma x s t; dict
    | Fonetics.MulipleSolutions(x,s,l) -> printf "MS %s %s %s: %s\n%!" e.lemma x s (String.concat " " l); dict
    | _ -> dict)

let validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if stem ^ rule.set = simple_lemma then (stem,rule) :: candidates else candidates) in
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})

let phon_validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Xlist.fold form.phon_orth [] (fun candidates s ->
        Xlist.fold (Rules.CharTrees.find rules s) candidates (fun candidates (stem,rule) ->
          let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          if candidate_lemma = simple_lemma then (stem,rule) :: candidates else candidates)) in
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})

let validate_lu rules dict =
  Xlist.rev_map dict (fun entry ->
    let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
    let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
    let b = Xlist.fold candidates1 false (fun b (stem1,rule1) ->
      Xlist.fold candidates2 b (fun b (stem2,rule2) ->
        (* Printf.printf "%s %s %s %s\n%!" stem1 stem2 (string_of_rule rule1) (string_of_rule rule1); *)
        if stem1 ^ rule1.set = stem2 ^ rule2.set then true else b)) in
    (* if b then print_endline "validated"; *)
    let b1 = Xlist.fold candidates1 false (fun b (stem1,rule1) -> if stem1 = entry.lu_stem then true else b) in
    let b2 = Xlist.fold candidates2 false (fun b (stem2,rule2) -> if stem2 = entry.lu_stem then true else b) in
    {entry with lu_validated=b; validated1=b1; validated2=b2})

let validate_interp rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Xlist.fold form.phon_orth [] (fun candidates s ->
        Xlist.fold (Rules.CharTrees.find rules s) candidates (fun candidates (stem,rule) ->
          if rule.star = Ndm && not entry.ndm then candidates else
          if rule.star <> Ndm && entry.ndm then candidates else
          let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          if candidate_lemma = simple_lemma && form.interp = rule.interp then
            (stem,rule) :: candidates else candidates)) in
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})

let remove_validated_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then forms else form :: forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let remove_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then forms else form :: forms) in
    if forms = [] then dict else entry :: dict)

let remove_validated_lu dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.lu_validated then dict else entry :: dict)

let remove_not_validated_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then form :: forms else forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let remove_not_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then form :: forms else forms) in
    if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict)

let print filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      Xlist.iter entry.forms (fun form ->
        fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))

let print_lemmata filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      fprintf file "%s\n" entry.lemma))

let remove_sup_neg_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if Xstring.check_sufix ":neg" form.interp || Xstring.check_sufix ":sup" form.interp then
        forms else form :: forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let generate_rules rules path filename rules_filename =
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = process_interps dict in
  let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
  let dict = remove_exceptional_lemmata dict in
  let dict = find_kolwiek_suffixes dict in (* FIXME: lematy z kolwiek_suffixes nie są walidowane *)
  let dict = generate_stem dict in
  let dict = fonetic_translation dict in
  let dict = phon_validate rules dict in
  let dict = remove_validated_forms dict in
  let dict = remove_sup_neg_forms dict in (* FIXME *)
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.phon_generate_rules_entry entry) rules (fun rules (key,rule) ->
      let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
      let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
      StringMap.add rules key rules2)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))

let generate_rules_lu rules id path rules_filename =
  let dict = load_lu [] id path in
  let dict = lowercase_lu dict in
  let dict = generate_stem_lu dict in
  let dict = validate_lu rules dict in
  let dict = remove_validated_lu dict in
  (* let dict = remove_sup_neg_forms dict in *)
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) rules (fun rules (key,rule,lemma) ->
      let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
      let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
      StringMap.add rules key rules2)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))

let rec get_first n l =
  if n = 0 || l = [] then [] else
  List.hd l :: (get_first (n-1) (List.tl l))

let generate_interp_rules rules interp_rules selected_tags path filename rules_filename =
  let selected_tags = StringSet.of_list selected_tags in
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = process_interps dict in
  let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
  let dict = remove_exceptional_lemmata dict in
  (* let dict = find_kolwiek_suffixes dict in *)
  (* let dict = generate_stem dict in *)
  let dict = fonetic_translation dict in
  let dict = validate_interp interp_rules dict in
  let dict = remove_validated_forms dict in
  let interp_rules = Xlist.fold dict StringMap.empty (fun interp_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms interp_rules (fun interp_rules form ->
      let candidates = RuleGenerator.phon_generate_interp_rules rules selected_tags simple_lemma form in
      Xlist.fold candidates interp_rules (fun interp_rules (v,cand) ->
        (* StringMap.add_inc interp_rules cand (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l)))) in *)
        StringMap.add_inc interp_rules cand (v,StringSet.singleton entry.lemma) (fun (v,set) -> v,StringSet.add set entry.lemma)))) in
  let interp_rules = List.rev (List.sort compare (StringMap.fold interp_rules [] (fun l k (v,set) ->
    (v,k,set) :: l))) in
  File.file_out rules_filename (fun file ->
    Xlist.iter interp_rules (fun (v,k,set)(*q,l*) ->
      (* fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) *)
      (*if StringSet.size set > 1000 then*) fprintf file "\t%s\t# %d %s\n" k (StringSet.size set) (String.concat " " (get_first 20 (List.rev (StringSet.to_list set))))))

let generate_ndm_rules dict =
  let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let rule = {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in
      Rules.RuleQMap.add freq_rules rule)) in
   fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq ->
     Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1))

let generate_rule_frequencies rules path filename rules_filename =
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = process_interps dict in
  let dict = remove_cat "cond" dict in
  let dict = mark_ndm dict in
  let freq_rules = generate_ndm_rules (remove_not_ndm dict) in
  let dict = remove_ndm dict in
  let dict = remove_exceptional_lemmata dict in
  let dict = generate_stem dict in
  let freq_rules = Xlist.fold dict freq_rules (fun freq_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in
  File.file_out rules_filename (fun file ->
    Rules.RuleQMap.iter freq_rules (fun rule freq ->
      fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq=freq})))

let generate_stem_dict rules_filename path filename out_filename =
  let rules = Rules.load_freq_rules rules_filename in
  let rules = Rules.CharTrees.create rules in
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = process_interps dict in
  let dict = remove_cat "cond" dict in
  let dict = mark_ndm dict in
  let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
    let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
    Xlist.fold entry.forms stems (fun stems form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if rule.star = Ndm && not entry.ndm then candidates else
        if rule.star <> Ndm && entry.ndm then candidates else
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then stems else
      let stem,rule = List.hd candidates in
      StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.id] (fun l -> rule.id :: l))) in
  File.file_out out_filename (fun file ->
    StringMap.iter stems (fun stem ids ->
      fprintf file "%s\t%s\n" stem (String.concat " " ids)))