open Xstd
open Printf
open Types

(* Return the single form of entry [e].
   Fails with "get_form" unless [e.forms] holds exactly one form. *)
let get_form e =
  match e.forms with
  | [only] -> only
  | _ -> failwith "get_form"

(* Load a tab-separated dictionary through [File.load_tab].
   Each row needs at least three columns: orth, lemma, interp; further
   columns are ignored. A shorter row fails with "load_tab: " followed by
   the row joined with tabs. *)
let load_tab filename =
  let entry_of_row row =
    match row with
    | orth :: lemma :: interp :: _ ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
    | _ -> failwith ("load_tab: " ^ (String.concat "\t" row)) in
  File.load_tab filename entry_of_row

(* Like [load_tab], but the row must have exactly 3, 4 or 5 columns:
   orth, lemma, interp, then optionally proper_type, then optionally genre.
   Any other row length fails with "load_tab_full: " and the row. *)
let load_tab_full filename =
  File.load_tab filename (fun row ->
    match row with
    | [orth; lemma; interp] ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]}
    | [orth; lemma; interp; proper_type] ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp}]; proper_type=proper_type}
    | [orth; lemma; interp; proper_type; genre] ->
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; genre=genre}]; proper_type=proper_type}
    | _ -> failwith ("load_tab_full: " ^ (String.concat "\t" row)))

(* Fold the two-column file [path ^ "morf_rel_" ^ id ^ "_lu.tab"] into
   [dict], prepending one relation record per row. The record carries both
   lemmas and [rel_id = id]; the stem is empty and all validation flags are
   false. Rows that do not have exactly two columns fail with "load_lu: ". *)
let load_lu dict id path =
  let filename = path ^ "morf_rel_" ^ string_of_int id ^ "_lu.tab" in
  let add_row acc row =
    match row with
    | [lemma1; lemma2] ->
        {lemma1=lemma1; lemma2=lemma2; rel_id=id; lu_stem=""; lu_validated=false;validated1=false;validated2=false} :: acc
    | _ -> failwith ("load_lu: " ^ (String.concat "\t" row)) in
  File.fold_tab filename dict add_row

(* Tags recognised by [find_pos] as the first element of an interpretation. *)
let pos = StringSet.of_list [
  "subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
  "brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
  "adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
  "winien";"siebie";"numcomp"
  ]

(* Scan a list of segments for the first one that is a member of [pos].
   [rev] accumulates the segments seen so far in reverse order. Returns the
   segments before the tag (in original order) and the list starting at the
   tag. Fails with "find_pos" when no segment is a known tag. *)
let rec find_pos rev = function
  | [] -> failwith "find_pos"
  | tag :: rest when StringSet.mem pos tag -> List.rev rev, tag :: rest
  | seg :: rest -> find_pos (seg :: rev) rest

(* Split a colon-separated "lemma:interp" string. The first segment always
   belongs to the lemma; the interpretation starts at the first later
   segment found in [pos]. Both halves are re-joined with ":". *)
let split_lemma_interp s =
  let segments = Xstring.split_delim ":" s in
  let lemma_segs, interp_segs = find_pos [List.hd segments] (List.tl segments) in
  String.concat ":" lemma_segs, String.concat ":" interp_segs

(* Drop the empty strings at the front of a list; later ones are kept. *)
let rec remove_empties l =
  match l with
  | "" :: rest -> remove_empties rest
  | _ -> l

(* Split a "freq orth" field on spaces: the first non-empty piece is parsed
   with [int_of_string], the remaining pieces are re-joined with single
   spaces. Fails with "split_freq_orth" when nothing is left after dropping
   the leading empty pieces. *)
let split_freq_orth s = match remove_empties (Xstring.split " " s)
with freq :: l -> int_of_string freq, String.concat " " l | _ -> failwith "split_freq_orth"

(* Load a two-column frequency list. The first column is split by
   [split_freq_orth] into a count and an orth, the second by
   [split_lemma_interp] into lemma and interp. Rows with any other number of
   columns fail with "load_freq_tab: " and the row joined with tabs. *)
let load_freq_tab filename =
  File.load_tab filename (fun row ->
    match row with
    | [freq_orth; lemma_interp] ->
        let freq,orth = split_freq_orth freq_orth in
        let lemma,interp = split_lemma_interp lemma_interp in
        {empty_entry with lemma=lemma; forms=[{empty_form with orth=orth; interp=interp; freq=freq}]}
    | _ -> failwith ("load_freq_tab: " ^ (String.concat "\t" row)))

(* Selectors for [print_quantities]. The last three go through [get_form]
   and therefore fail on entries that do not have exactly one form. *)
let proper_type_selector e = e.proper_type

let genre_selector e =
  let form = get_form e in
  form.genre

let interp_selector e =
  let form = get_form e in
  form.interp

let freq_selector e =
  let form = get_form e in
  form.freq

(* Tally [selector entry] over [dict] in a [StringQMap] and write one line
   per key to [out_filename], formatted as "%6d\t%s\n" (quantity, key). *)
let print_quantities out_filename selector dict =
  let tally qmap entry = StringQMap.add qmap (selector entry) in
  let quantities = Xlist.fold dict StringQMap.empty tally in
  let write_all file =
    StringQMap.iter quantities (fun key quantity -> fprintf file "%6d\t%s\n" quantity key) in
  File.file_out out_filename write_all

(**********************************************************************************)

(* Load [filename] with [load_tab] and return the distinct
   "orth<TAB>lemma<TAB>interp" lines, sorted with [compare]. *)
let load_dict_as_set filename =
  let add_line set entry =
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]) in
  let lines = Xlist.fold (load_tab filename) StringSet.empty add_line in
  List.sort compare (StringSet.to_list lines)

(* Same as [load_dict_as_set], but reads with [load_tab_full] and appends
   the proper_type and genre columns to every line. *)
let load_dict_as_set_full filename =
  let add_line set entry =
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]) in
  let lines = Xlist.fold (load_tab_full filename) StringSet.empty add_line in
  List.sort compare (StringSet.to_list lines)

(* Walk a pair of sorted line lists in step and print their difference to
   [file]: lines found only in the first list are prefixed with "< ", lines
   found only in the second with "> "; equal lines are skipped. *)
let rec compare_dicts_rec file = function
  | [], [] -> ()
  | [], right :: rights ->
      fprintf file "> %s\n" right;
      compare_dicts_rec file ([], rights)
  | left :: lefts, [] ->
      fprintf file "< %s\n" left;
      compare_dicts_rec file (lefts, [])
  | left :: lefts, right :: rights ->
      if left = right then compare_dicts_rec file (lefts, rights)
      else if left < right then begin
        fprintf file "< %s\n" left;
        compare_dicts_rec file (lefts, right :: rights)
      end else begin
        fprintf file "> %s\n" right;
        compare_dicts_rec file (left :: lefts, rights)
      end

(* Write the difference between two dictionaries, loaded with
   [load_dict_as_set], to [filename_out] using [compare_dicts_rec]. *)
let compare_dicts filename1 filename2 filename_out = let dict1 = load_dict_as_set
filename1 in let dict2 = load_dict_as_set filename2 in File.file_out filename_out (fun file -> compare_dicts_rec file (dict1,dict2))

(* Same as [compare_dicts], but both dictionaries are loaded with
   [load_dict_as_set_full], so proper_type and genre take part in the
   comparison. *)
let compare_dicts_full filename1 filename2 filename_out =
  let dict1 = load_dict_as_set_full filename1 in
  let dict2 = load_dict_as_set_full filename2 in
  File.file_out filename_out (fun file -> compare_dicts_rec file (dict1,dict2))

(**********************************************************************************)

(* Classify a single-form entry by the first tag of its interpretation.
   Returns one of "noun", "adj", "adv", "verb", "other" or "cond".
   - "praet" is "cond" when its third following tag is "pri", "sec" or
     "ter", and "verb" otherwise; it needs at least three tags after it.
   - An empty first tag is classified as "verb".
   Every failure is reported as Failure with the prefix "find_entry_cat: ":
   an interpretation without any tag, a "praet" with fewer than three tags,
   or a first tag that is not listed below. *)
let find_entry_cat entry =
  let form = get_form entry in
  let cat,tags =
    match Xstring.split ":" form.interp with
    | cat :: tags -> cat,tags
    | _ -> failwith ("find_entry_cat: " ^ form.interp) in
  match cat with
  | "praet" ->
      (match tags with
       | _ :: _ :: ("pri" | "sec" | "ter") :: _ -> "cond"
       | _ :: _ :: _ :: _ -> "verb"
       | _ -> failwith ("find_entry_cat: " ^ form.interp))
  | "subst" | "depr" -> "noun"
  | "adj" | "adja" | "adjc" | "adjp" -> "adj"
  | "adv" -> "adv"
  | "inf" | "fin" | "ppas" | "pact" | "pacta" | "impt" | "imps"
  | "pcon" | "pant" | "ger" | "" -> "verb"
  | "bedzie" | "pred" | "prep" | "num" | "aglt" | "winien" | "qub" | "brev"
  | "comp" | "interj" | "burk" | "conj" | "ppron12" | "ppron3" | "numcomp" -> "other"
  | "cond" -> "cond"
  | _ -> failwith ("find_entry_cat: " ^ cat)

(* Set the [cat] field of every entry to [find_entry_cat entry].
   Built with [Xlist.rev_map]; callers such as [split_dict] re-reverse the
   result, so the output order is presumably reversed - confirm in Xstd. *)
let assign_entry_cat dict =
  Xlist.rev_map dict (fun entry -> {entry with cat = find_entry_cat entry})

(* Split the dictionary [in_path ^ filename] into per-category files under
   [out_path], named "noun_", "adj_", "adv_", "verb_" and "other_" followed
   by [filename] with a trailing ".gz" removed. Each entry is written as
   "orth<TAB>lemma<TAB>interp"; entries of category "cond" are skipped and
   any other category fails with "split_dict". *)
let split_dict in_path filename out_path =
  let dict = load_tab (in_path ^ filename) in
  let dict = List.rev (assign_entry_cat dict) in
  let filename = if Xstring.check_sufix ".gz" filename then Xstring.cut_sufix ".gz" filename else filename in
  File.file_out (out_path ^ "noun_" ^ filename) (fun noun_file ->
  File.file_out (out_path ^ "adj_" ^ filename) (fun
adj_file -> File.file_out (out_path ^ "adv_" ^ filename) (fun adv_file -> File.file_out (out_path ^ "verb_" ^ filename) (fun verb_file -> File.file_out (out_path ^ "other_" ^ filename) (fun other_file -> Xlist.iter dict (fun entry -> let form = get_form entry in try let file = match entry.cat with "noun" -> noun_file | "adj" -> adj_file | "adv" -> adv_file | "verb" -> verb_file | "other" -> other_file | "cond" -> raise Not_found | _ -> failwith "split_dict" in fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp with Not_found -> ()))))))

(* Group single-form entries into one entry per lexeme.
   The grouping key is "lemma|cat"; nouns additionally get "|gender", where
   the gender is the fourth interp tag, extended with ":col" when a fifth
   tag is present. A "depr" form must have exactly four tags ending in "m2"
   and is keyed as "m1". Any other noun interp shape fails with
   "merge_entries: " and the interp.
   The first entry seen for a key is kept and the forms of later entries
   are prepended to it. Merging two entries whose [proper_type] differs
   fails with "merge_entries: ", the key and both types. *)
let merge_entries dict =
  let noun_gender interp =
    match Xstring.split ":" interp with
    | ["depr";_;_;"m2"] -> "m1"
    | "depr" :: _ -> failwith ("merge_entries: " ^ interp)
    | [_;_;_;gender] -> gender
    | [_;_;_;gender;col] -> gender ^ ":" ^ col
    | _ -> failwith ("merge_entries: " ^ interp) in
  let add_entry map entry =
    let form = get_form entry in
    let key =
      if entry.cat <> "noun" then entry.lemma ^ "|" ^ entry.cat
      else
        let gender = noun_gender form.interp in
        entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender in
    StringMap.add_inc map key entry (fun merged ->
      if entry.proper_type = merged.proper_type then {merged with forms = form :: merged.forms}
      else failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ merged.proper_type)) in
  let by_key = Xlist.fold (assign_entry_cat dict) StringMap.empty add_entry in
  StringMap.fold by_key [] (fun merged_entries _ merged -> merged :: merged_entries)

(* Drop every entry whose category equals [cat]. The kept entries are
   prepended to the accumulator one by one while folding over [dict]. *)
let remove_cat cat dict =
  Xlist.fold dict [] (fun kept entry ->
    if entry.cat <> cat then entry :: kept else kept)

(* Return the aspect tag of the first form whose interp is "inf:<a>" or a
   six-tag "ger" interp (aspect is its fifth tag). Fails with
   "get_aspect: " and [lemma] when no form qualifies. *)
let rec get_aspect lemma forms =
  match forms with
  | [] -> failwith ("get_aspect: " ^ lemma)
  | (f : form) :: rest ->
      (match Xstring.split ":" f.interp with
       | ["inf";aspect] | ["ger";_;_;_;aspect;_] -> aspect
       | _ -> get_aspect lemma rest)

(* Reduce a lemma to its ending class: the last UTF-8 character of
   [Stem.simplify_lemma lemma] when it is one of a, e, o, y, ę; "i" counts
   as "y"; anything else, including the empty lemma, gives "ε". *)
let get_lemma_suf lemma =
  let last_char =
    if lemma = "" then ""
    else
      let chars = Xunicode.utf8_chars_of_utf8_string (Stem.simplify_lemma lemma) in
      List.hd (List.rev chars) in
  match last_char with
  | ("a" | "e" | "o" | "y" | "ę") as suf -> suf
  | "i" -> "y"
  | _ -> "ε"

(* Reduce an orth to its ending class: "j" or "e" when that is its last
   UTF-8 character, "ε" otherwise (also for the empty orth). *)
let get_orth_suf orth = let orth_suf = if orth = "" then "" else
List.hd (List.rev (Xunicode.utf8_chars_of_utf8_string orth)) in match orth_suf with "j" -> "j" | "e" -> "e" | _ -> "ε"

(* Collapse the interpretations of one lexeme that share the same orth.
   Steps:
   1. Group [forms] by orth, collecting the set of interps of each orth.
   2. For each orth, sort its interps with [compare] and match the pair
      (ending class of [lemma] from [get_lemma_suf], sorted interp list)
      against the hand-written table below.
   3. A matching arm replaces the whole set by one or more merged tags;
      alternatives inside one tag are joined with "|".
   Notable arms:
   - a bare "adv" becomes "adv:pos";
   - two arms also look at the ending class of the orth ([get_orth_suf]);
   - a single interp not covered earlier is passed through unchanged;
   - any other unmatched set is emitted as one form per interp.
   Every resulting form is built from [empty_form], so only orth and interp
   are carried over from the input forms.
   NOTE(review): the "y" arm for ["subst:pl:gen:m2";"subst:pl:loc:m2"]
   produces "subst:pl:gen.acc.loc:m2" although no acc tag is in its input;
   the parallel m3 arm produces "gen.loc". Confirm whether "acc" is
   intended there. *)
let merge_interps lemma forms = let lemma_suf = get_lemma_suf lemma in let map = Xlist.fold forms StringMap.empty (fun map form -> (* printf "merge_interps 1: %s %s\n%!" form.orth form.interp; *) StringMap.add_inc map form.orth (StringSet.singleton form.interp) (fun set -> StringSet.add set form.interp)) in StringMap.fold map [] (fun forms orth set -> (* printf "merge_interps 2: %s %s\n%!" orth (String.concat " " (StringSet.to_list set)); *) let orth_suf = get_orth_suf orth in match lemma_suf, Xlist.sort (StringSet.to_list set) compare with _,["adv"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms | _,["adv";"adv:pos"] -> {empty_form with orth=orth; interp="adv:pos"} :: forms | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos";"adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms | _,["adj:sg:acc:m1.m2:pos";"adj:sg:gen:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:pos|adj:sg:acc:m1.m2:pos"} :: forms | _,["adj:sg:dat:f:pos";"adj:sg:gen:f:pos";"adj:sg:loc:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:pos"} :: forms | _,["adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms | _,["adj:pl:acc:m1:pos";"adj:pl:gen:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:pos|adj:pl:acc:m1:pos"} :: forms | _,["adj:pl:dat:m1.m2.m3.f.n:pos";"adj:sg:inst:m1.m2.m3.n:pos";"adj:sg:loc:m1.m2.m3.n:pos"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:pos|adj:pl:dat:m1.m2.m3.f.n:pos"} :: forms | _,["adj:sg:acc:f:pos";"adj:sg:inst:f:pos"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:pos"} :: forms | 
_,["adj:pl:nom.voc:m1:pos";"adj:sg:acc:m3:pos";"adj:sg:nom.voc:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos|adj:pl:nom.voc:m1:pos"} :: forms | _,["adj:sg:acc:m3:pos";"adj:sg:nom:m1.m2.m3:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos"} :: forms | _,["adj:pl:acc:m2.m3.f.n:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"] -> {empty_form with orth=orth; interp="adj:pl:nom.acc.voc:m2.m3.f.n:pos"} :: forms | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos"} :: forms | _,["adj:sg:acc:n:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:pos|adja"} :: forms | _,["adj:pl:nom:m2.m3.f.n:pos";"adj:sg:nom:n:pos"] -> {empty_form with orth=orth; interp="adj:sg:nom:n:pos|adj:pl:nom:m2.m3.f.n:pos"} :: forms | _,["adj:pl:acc:m2.m3.f.n:sup";"adj:pl:nom.voc:m2.m3.f.n:sup";"adj:sg:acc:n:sup";"adj:sg:nom.voc:n:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:sup|adj:pl:nom.acc.voc:m2.m3.f.n:sup"} :: forms | _,["adj:sg:acc:m1.m2:sup";"adj:sg:gen:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:sup|adj:sg:acc:m1.m2:sup"} :: forms | _,["adj:sg:dat:f:sup";"adj:sg:gen:f:sup";"adj:sg:loc:f:sup"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:sup"} :: forms | _,["adj:sg:acc:m3:sup";"adj:sg:nom.voc:m1.m2.m3:sup"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:sup|adj:sg:acc:m3:sup"} :: forms | _,["adj:pl:acc:m1:sup";"adj:pl:gen:m1.m2.m3.f.n:sup";"adj:pl:loc:m1.m2.m3.f.n:sup"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:sup|adj:pl:acc:m1:sup"} :: forms | _,["adj:pl:dat:m1.m2.m3.f.n:sup";"adj:sg:inst:m1.m2.m3.n:sup";"adj:sg:loc:m1.m2.m3.n:sup"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:sup|adj:pl:dat:m1.m2.m3.f.n:sup"} :: forms | _,["adj:sg:acc:f:sup";"adj:sg:inst:f:sup"] -> {empty_form with 
orth=orth; interp="adj:sg:acc.inst:f:sup"} :: forms | _,["adj:pl:acc:m2.m3.f.n:com";"adj:pl:nom.voc:m2.m3.f.n:com";"adj:sg:acc:n:com";"adj:sg:nom.voc:n:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.acc.voc:n:com|adj:pl:nom.acc.voc:m2.m3.f.n:com"} :: forms | _,["adj:sg:acc:m1.m2:com";"adj:sg:gen:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:gen:m1.m2.m3.n:com|adj:sg:acc:m1.m2:com"} :: forms | _,["adj:sg:dat:f:com";"adj:sg:gen:f:com";"adj:sg:loc:f:com"] -> {empty_form with orth=orth; interp="adj:sg:gen.dat.loc:f:com"} :: forms | _,["adj:sg:acc:m3:com";"adj:sg:nom.voc:m1.m2.m3:com"] -> {empty_form with orth=orth; interp="adj:sg:nom.voc:m1.m2.m3:com|adj:sg:acc:m3:com"} :: forms | _,["adj:pl:acc:m1:com";"adj:pl:gen:m1.m2.m3.f.n:com";"adj:pl:loc:m1.m2.m3.f.n:com"] -> {empty_form with orth=orth; interp="adj:pl:gen.loc:m1.m2.m3.f.n:com|adj:pl:acc:m1:com"} :: forms | _,["adj:pl:dat:m1.m2.m3.f.n:com";"adj:sg:inst:m1.m2.m3.n:com";"adj:sg:loc:m1.m2.m3.n:com"] -> {empty_form with orth=orth; interp="adj:sg:inst.loc:m1.m2.m3.n:com|adj:pl:dat:m1.m2.m3.f.n:com"} :: forms | _,["adj:sg:acc:f:com";"adj:sg:inst:f:com"] -> {empty_form with orth=orth; interp="adj:sg:acc.inst:f:com"} :: forms | _,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos"; "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"; "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos"; "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos"; "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos"} :: forms | 
_,["adj:pl:acc:m1:pos";"adj:pl:acc:m2.m3.f.n:pos";"adj:pl:dat:m1.m2.m3.f.n:pos";"adj:pl:gen:m1.m2.m3.f.n:pos"; "adj:pl:inst:m1.m2.m3.f.n:pos";"adj:pl:loc:m1.m2.m3.f.n:pos";"adj:pl:nom.voc:m1:pos";"adj:pl:nom.voc:m2.m3.f.n:pos"; "adj:sg:acc:f:pos";"adj:sg:acc:m1.m2:pos";"adj:sg:acc:m3:pos";"adj:sg:acc:n:pos";"adj:sg:dat:f:pos"; "adj:sg:dat:m1.m2.m3.n:pos";"adj:sg:gen:f:pos";"adj:sg:gen:m1.m2.m3.n:pos";"adj:sg:inst:f:pos";"adj:sg:inst:m1.m2.m3.n:pos"; "adj:sg:loc:f:pos";"adj:sg:loc:m1.m2.m3.n:pos";"adj:sg:nom.voc:f:pos";"adj:sg:nom.voc:m1.m2.m3:pos";"adj:sg:nom.voc:n:pos";"adja"] -> {empty_form with orth=orth; interp="adj:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1.m2.m3.f.n:pos|adja"} :: forms | _,["ger:pl:nom.acc:n:imperf.perf:aff";"ger:sg:gen:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:aff|ger:pl:nom.acc:n:imperf.perf:aff"} :: forms | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff";"ppas:sg:nom.acc.voc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:aff|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:aff"} :: forms | _,["ppas:sg:acc:m1.m2:imperf.perf:aff";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:aff|ppas:sg:acc:m1.m2:imperf.perf:aff"} :: forms | _,["ppas:sg:acc:m3:imperf.perf:aff";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff|ppas:sg:acc:m3:imperf.perf:aff"} :: forms | _,["ppas:pl:acc:m1:imperf.perf:aff";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:aff|ppas:pl:acc:m1:imperf.perf:aff"} :: forms | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:aff|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:aff"} :: forms | 
_,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff";"pact:sg:nom.acc.voc:n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:aff|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:aff"} :: forms | _,["pact:sg:acc:m1.m2:imperf:aff";"pact:sg:gen:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:aff|pact:sg:acc:m1.m2:imperf:aff"} :: forms | _,["pact:pl:nom.voc:m1:imperf:aff";"pact:sg:acc:m3:imperf:aff";"pact:sg:nom.voc:m1.m2.m3:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:aff|pact:sg:acc:m3:imperf:aff|pact:pl:nom.voc:m1:imperf:aff"} :: forms | _,["pact:pl:acc:m1:imperf:aff";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:aff|pact:pl:acc:m1:imperf:aff"} :: forms | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:aff";"pact:sg:inst.loc:m1.m2.m3.n:imperf:aff"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:aff|pact:pl:dat:m1.m2.m3.f.n:imperf:aff"} :: forms | _,["ger:pl:nom.acc:n:imperf.perf:neg";"ger:sg:gen:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ger:sg:gen:n:imperf.perf:neg|ger:pl:nom.acc:n:imperf.perf:neg"} :: forms | _,["ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg";"ppas:sg:nom.acc.voc:n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.acc.voc:n:imperf.perf:neg|ppas:pl:nom.acc.voc:m2.m3.f.n:imperf.perf:neg"} :: forms | _,["ppas:sg:acc:m1.m2:imperf.perf:neg";"ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:gen:m1.m2.m3.n:imperf.perf:neg|ppas:sg:acc:m1.m2:imperf.perf:neg"} :: forms | _,["ppas:sg:acc:m3:imperf.perf:neg";"ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg|ppas:sg:acc:m3:imperf.perf:neg"} :: forms | _,["ppas:pl:acc:m1:imperf.perf:neg";"ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg"] -> {empty_form with orth=orth; 
interp="ppas:pl:gen.loc:m1.m2.m3.f.n:imperf.perf:neg|ppas:pl:acc:m1:imperf.perf:neg"} :: forms | _,["ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg";"ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg"] -> {empty_form with orth=orth; interp="ppas:sg:inst.loc:m1.m2.m3.n:imperf.perf:neg|ppas:pl:dat:m1.m2.m3.f.n:imperf.perf:neg"} :: forms | _,["pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg";"pact:sg:nom.acc.voc:n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.acc.voc:n:imperf:neg|pact:pl:nom.acc.voc:m2.m3.f.n:imperf:neg"} :: forms | _,["pact:sg:acc:m1.m2:imperf:neg";"pact:sg:gen:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:gen:m1.m2.m3.n:imperf:neg|pact:sg:acc:m1.m2:imperf:neg"} :: forms | _,["pact:pl:nom.voc:m1:imperf:neg";"pact:sg:acc:m3:imperf:neg";"pact:sg:nom.voc:m1.m2.m3:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:nom.voc:m1.m2.m3:imperf:neg|pact:sg:acc:m3:imperf:neg|pact:pl:nom.voc:m1:imperf:neg"} :: forms | _,["pact:pl:acc:m1:imperf:neg";"pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:pl:gen.loc:m1.m2.m3.f.n:imperf:neg|pact:pl:acc:m1:imperf:neg"} :: forms | _,["pact:pl:dat:m1.m2.m3.f.n:imperf:neg";"pact:sg:inst.loc:m1.m2.m3.n:imperf:neg"] -> {empty_form with orth=orth; interp="pact:sg:inst.loc:m1.m2.m3.n:imperf:neg|pact:pl:dat:m1.m2.m3.f.n:imperf:neg"} :: forms | _,["ger:pl:gen:n:imperf.perf:aff";"inf:imperf.perf"] -> {empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="inf:imperf.perf"} :: forms | _,["praet:sg:m1.m2.m3:imperf.perf";"praet:sg:m1.m2.m3:imperf.perf:nagl"] -> {empty_form with orth=orth; interp="praet:sg:m1.m2.m3:imperf.perf:nagl"} :: forms | _,["fin:sg:ter:imperf.perf";"ger:sg:nom.acc:n:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:sg:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ger:sg:nom.acc:n:imperf.perf:aff"} :: forms | _,["ger:pl:gen:n:imperf.perf:aff";"impt:sg:sec:imperf.perf"] -> 
{empty_form with orth=orth; interp="ger:pl:gen:n:imperf.perf:aff"} :: {empty_form with orth=orth; interp="impt:sg:sec:imperf.perf"} :: forms | _,["fin:pl:ter:imperf.perf";"ppas:sg:acc.inst:f:imperf.perf:aff"] -> {empty_form with orth=orth; interp="fin:pl:ter:imperf.perf"} :: {empty_form with orth=orth; interp="ppas:sg:acc.inst:f:imperf.perf:aff"} :: forms | "a",["subst:sg:dat.loc:f";"subst:sg:gen:f"] -> if orth_suf = "j" then {empty_form with orth=orth; interp="subst:sg:gen.dat.loc:f"} :: forms else {empty_form with orth=orth; interp="subst:sg:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: forms (* | "a",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f"] -> (*print_endline lemma;*) {empty_form with orth=orth; interp="subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:sg:dat.loc:f"} :: {empty_form with orth=orth; interp="subst:sg:gen:f"} :: forms *) | "a",["subst:pl:gen:f";"subst:pl:loc:f"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:f"} :: forms | "ε",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms | "ε",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms | "ε",["subst:sg:loc:m1";"subst:sg:voc:m1"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms | "ε",["subst:sg:loc:m2";"subst:sg:voc:m2"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m2"} :: forms | "ε",["subst:sg:loc:m3";"subst:sg:voc:m3"]-> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: forms | "ε",["subst:sg:gen:m3";"subst:sg:loc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:loc.voc:m3"} :: {empty_form with orth=orth; interp="subst:sg:gen:m3"} :: forms | "ε",["subst:sg:acc:f";"subst:sg:nom:f"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc:f"} :: forms | "ε",["subst:pl:gen:f";"subst:pl:nom.acc.voc:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> 
{empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: {empty_form with orth=orth; interp="subst:pl:nom.acc.voc:f"} :: forms | "ε",["subst:pl:gen:f";"subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:voc:f"] -> {empty_form with orth=orth; interp="subst:sg:gen.dat.loc.voc:f|subst:pl:gen:f"} :: forms | "y",["subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: forms | "y",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms | "y",["subst:pl:gen.acc:m1";"subst:pl:loc:m1"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m1"} :: forms | "y",["subst:pl:dat:m1";"subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1|subst:pl:dat:m1"} :: forms | "y",["subst:sg:nom:m2";"subst:sg:voc:m2"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m2"} :: forms | "y",["subst:pl:nom.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1"} :: {empty_form with orth=orth; interp="subst:pl:nom.voc:m1"} :: forms | "y",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms | "y",["subst:pl:gen:m2";"subst:pl:loc:m2"] -> {empty_form with orth=orth; interp="subst:pl:gen.acc.loc:m2"} :: forms | "y",["subst:pl:dat:m2";"subst:sg:inst:m2";"subst:sg:loc:m2"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m2|subst:pl:dat:m2"} :: forms | "y",["subst:sg:nom.acc:m3";"subst:sg:voc:m3"] -> {empty_form with orth=orth; interp="subst:sg:nom.acc.voc:m3"} :: forms | "y",["subst:pl:gen:m3";"subst:pl:loc:m3"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:m3"} :: forms | "y",["subst:pl:dat:m3";"subst:sg:inst:m3";"subst:sg:loc:m3"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m3|subst:pl:dat:m3"} :: forms | "y",["subst:sg:dat.loc:f";"subst:sg:gen:f";"subst:sg:nom:f";"subst:sg:voc:f"] -> 
{empty_form with orth=orth; interp="subst:sg:nom.gen.dat.loc.voc:f"} :: forms | "e",["depr:pl:nom.acc.voc:m2";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms | "e",["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1";"subst:sg:nom:m1";"subst:sg:voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: {empty_form with orth=orth; interp="subst:sg:nom.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms | "e",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms | "e",["subst:sg:inst:m1";"subst:sg:loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:m1"} :: forms | "e",["subst:pl:dat:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: {empty_form with orth=orth; interp="subst:pl:dat:n:ncol"} :: forms | "e",["subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:inst.loc:n:ncol"} :: forms | "e",["subst:pl:gen:n:ncol";"subst:pl:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:pl:gen.loc:n:ncol"} :: forms | "e",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms | "e",["subst:pl:nom.acc.voc:n:ncol";"subst:sg:gen:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:gen:n:ncol|subst:pl:nom.acc.voc:n:ncol"} :: forms | "o",["subst:sg:nom.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg:nom:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms | "o",["subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: forms | "o",["depr:pl:nom.acc.voc:m2";"subst:pl:nom.voc:m1";"subst:sg:gen.acc:m1";"subst:sg:gen:m1"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m1"} :: {empty_form with orth=orth; 
interp="subst:pl:nom.voc:m1"} :: {empty_form with orth=orth; interp="depr:pl:nom.acc.voc:m2"} :: forms | "o",["subst:sg:dat.loc:m1"] -> {empty_form with orth=orth; interp="subst:sg:dat:m1"} :: {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: forms | "o",["subst:sg:gen.acc:m2";"subst:sg:gen:m2"] -> {empty_form with orth=orth; interp="subst:sg:gen.acc:m2"} :: forms | "o",["subst:pl:dat:m1:pt";"subst:pl:loc:m1:pt"] -> {empty_form with orth=orth; interp="subst:pl:dat.loc:m1:pt"} :: forms | "ε",["subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms | "ε",["subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol";"subst:sg:dat:n:ncol";"subst:sg:gen:n:ncol";"subst:sg:inst:n:ncol";"subst:sg:loc:n:ncol";"subst:sg:nom.acc.voc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: {empty_form with orth=orth; interp="subst:sg:nom.gen.dat.acc.inst.loc.voc:n:ncol"} :: forms | "ę",["subst:sg:dat:n:col";"subst:sg:loc:n:col"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:col"} :: forms | "ę",["subst:sg:dat:n:ncol";"subst:sg:loc:n:ncol"] -> {empty_form with orth=orth; interp="subst:sg:dat.loc:n:ncol"} :: forms | "o",["subst:sg:loc:m1";"subst:sg:voc:m1"] -> if orth_suf = "e" then {empty_form with orth=orth; interp="subst:sg:loc.voc:m1"} :: forms else {empty_form with orth=orth; interp="subst:sg:loc:m1"} :: {empty_form with orth=orth; interp="subst:sg:voc:m1"} :: forms | _,["depr:pl:nom.acc.voc:m2";"subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1"] -> {empty_form with orth=orth; interp="subst:sg.pl:nom.gen.dat.acc.inst.loc.voc:m1|depr:pl:nom.acc.voc:m2"} :: forms | _,[interp] -> {empty_form with orth=orth; interp=interp} :: forms | _,interps -> (* print_endline ("merge_interps: " (*^ lemma_suf*) ^ " [\"" ^ String.concat "\";\"" interps ^ "\"]"); *) Xlist.fold interps forms 
(fun forms interp -> {empty_form with orth=orth; interp=interp} :: forms))

(* Normalise and merge the interpretations of every entry in [dict].
   For entries with [cat = "verb"]:
   - the aspect is taken from the forms with [get_aspect] and stored in the
     [aspect] field of the entry;
   - the aspect tag inside each fin, impt, ger, praet, inf, pant, imps and
     ppas interp is replaced by "imperf.perf", while pcon, pacta and pact
     interps are left as they are;
   - the rewritten forms are then passed to [merge_interps].
   A verb interp of any other shape is printed to stdout and kept unchanged.
   All other entries only have their forms passed through [merge_interps].
   The result is built with [Xlist.rev_map].
   NOTE(review): the diagnostic printed for unrecognised verb interps is
   prefixed with "merge_interps: " although it is emitted here. *)
let process_interps dict = Xlist.rev_map dict (fun entry -> if entry.cat = "verb" then let aspect = get_aspect entry.lemma entry.forms in let forms = Xlist.rev_map entry.forms (fun f -> let interp = match (Xstring.split ":" f.interp) with ["fin";n;p;_] -> String.concat ":" ["fin";n;p;"imperf.perf"] | ["impt";n;p;_] -> String.concat ":" ["impt";n;p;"imperf.perf"] | "pcon" :: _ -> f.interp | "pacta" :: _ -> f.interp | "pact" :: _ -> f.interp | ["ger";n;c;g;_;a] -> String.concat ":" ["ger";n;c;g;"imperf.perf";a] | ["praet";n;g;_] -> String.concat ":" ["praet";n;g;"imperf.perf"] | ["praet";n;g;_;a] -> String.concat ":" ["praet";n;g;"imperf.perf";a] | ["inf";_] -> String.concat ":" ["inf";"imperf.perf"] | ["pant";_] -> String.concat ":" ["pant";"imperf.perf"] | ["imps";_] -> String.concat ":" ["imps";"imperf.perf"] | ["ppas";n;c;g;_;a] -> String.concat ":" ["ppas";n;c;g;"imperf.perf";a] | _ -> print_endline ("merge_interps: " ^ f.interp); f.interp in {f with interp=interp}) in let forms = merge_interps entry.lemma forms in {entry with aspect=aspect; forms=forms} else {entry with forms=merge_interps entry.lemma entry.forms})

(**********************************************************************************) (*let mark_ndm dict = Xlist.fold dict [] (fun dict entry -> if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else let map = Xlist.fold entry.forms StringMap.empty (fun map form -> StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths -> StringSet.fold orths qmap StringQMap.add) in let n = StringMap.size map in let found = StringQMap.fold qmap [] (fun found orth v -> if v = n then orth :: found else found) in match found with [] -> entry :: dict | [orth] -> let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form -> if form.orth = orth then form 
:: ndm, odm else ndm, form :: odm) in let dict = {entry with forms=odm} :: dict in {entry with forms=ndm; ndm=true} :: dict | _ -> failwith ("mark_ndm: " ^ (String.concat " " found))) let print_ndm filename dict = File.file_out filename (fun file -> Xlist.iter dict (fun entry -> if entry.ndm then let orth = (List.hd entry.forms).orth in fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat)) let remove_ndm dict = Xlist.fold dict [] (fun dict entry -> if entry.ndm then dict else entry :: dict) let remove_not_ndm dict = Xlist.fold dict [] (fun dict entry -> if not entry.ndm then dict else entry :: dict)*)

(* Lemmas whose forms get their trailing particle stripped by
   [find_kolwiek_suffixes]. *)
let kolwiek_lemmas = StringSet.of_list [
  (* adj *)
  "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś";
  "jakiści"; "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś";
  "kijże"; "kiż"; "którykolwiek"; "któryś"; "któryż"; "któryżkolwiek";
  "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
  (* noun *)
  "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek";
  "któżkolwiek"; "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże";
  "cóż"; "ktoś:s2"; "któż";
  (* adv *)
  "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*)
  ]

(* Suffixes tried, in this order, by [find_kolwiek_suffixes]. *)
let kolwiek_suffixes = [
  "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek";
  "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]

(* For every entry whose lemma is in [kolwiek_lemmas], cut the suffixes of
   [kolwiek_suffixes] off the orth of each form. The suffixes are tried in
   list order and each test is applied to the result of the previous cuts,
   so more than one suffix may be removed from a single orth. Other entries
   are returned unchanged. The result is built with [Xlist.rev_map]. *)
let find_kolwiek_suffixes dict =
  let strip_suffix orth kolwiek_suf =
    if Xstring.check_sufix kolwiek_suf orth then Xstring.cut_sufix kolwiek_suf orth
    else orth in
  let strip_form (f : form) =
    {f with orth=Xlist.fold kolwiek_suffixes f.orth strip_suffix} in
  Xlist.rev_map dict (fun entry ->
    if StringSet.mem kolwiek_lemmas entry.lemma
    then {entry with forms=Xlist.map entry.forms strip_form}
    else entry)

let exceptional_lemmata = StringSet.of_list ([ (* wiele stemów *) "Apollo"; "Aujeszky"; "Białystok"; "Gózd"; "Krasnystaw"; "Różanystok"; "Wielkanoc"; "białagłowa"; "deszcz"; "imćpan"; "iściec"; "otrząs"; "rzeczpospolita"; "wilczełyko"; "woleoczko"; "prapraojciec"; "praojciec"; "ojciec"; "współbrat"; 
"spółbrat"; "półbrat"; "brat"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; "człowiek"; "półdziecko"; "+lecie"; "zimoziele"; "ziele"; "trójziele"; "nasienie"; "ksiądz"; "dech"; "tydzień"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest"; "dziecko"; "ucho:s2"; "oko:s2"; "cześć:s"; "jo-jo"; "Zabłotce"; "tysiąc:s1"; "półmiesiąc"; "miesiąc"; ""; ""; ""; "Pia"; "ręka"; "człek"; "Kozak:s1"; "bóg"; "psubrat"; "pieniądz"; ""; ""; ""; ""; "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic"; "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek"; "niebiosa"; "Włochy:s1"; "Niemcy"; "Węgry"; "Austro-Węgry"; ""; ""; ""; ""; ""; ""; ""; "zając:s1"; "tysiąc:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; ""; (* "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";*) "bliscy"; "ojcowie"; "teściowie"; "ichmościowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; ""; "małżonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; ""; "starzy"; "wasi"; "nasi"; "najmłodsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; ""; "IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "głównia"; "głownia"; "dźwignia"; ""; ""; ""; ""; "workowiśnia"; "wiśnia"; "sośnia"; "laurowiśnia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; ""; "makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; ""; "Uj"; "PIT"; "ChAT"; "podczłowiek"; "nieczłowiek"; "cześć"; "ktoś"; "ktosik"; ""; ""; ""; ""; "+ówna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coś"; "cośtam"; ""; ""; ""; "zając"; "tysiąc"; "rok"; "roczek"; "oko"; "ucho"; "Włochy"; "niebiosy"; "wici"; ""; ""; ""; "André"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "zły:a"; "dobry:a"; "przymały"; "mały:a"; "duży"; "wielki:a"; 
"ppoż."; "ppanc."; "pepanc."; "midi:a"; ""; ""; ""; ""; ""; ""; ""; ""; "zwać"; "wiedzieć"; "pójść"; "przejść"; "dojść"; "zsiąść"; "iść"; ""; ""; ""; ""; ""; "być"; "zasłonić"; "słonić"; "przysłonić"; "przesłonić"; "osłonić"; "odsłonić"; ""; ""; ""; ""; ""; (* (* błąd w słowniku *) "ówże"; (* wiele stemów *) "twój:a"; "swój"; "mój:a"; "wszystek"; (* oboczności w stemie *) "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek"; "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic"; "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb"; "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim"; "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz"; "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan"; "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester"; "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień"; "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb"; "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2"; "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób"; "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; ""; "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; ""; (* oboczności w odmianie *) "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże"; "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze"; "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; 
"PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*) (* pozostawione *) "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki"; "Pia"; "jo-jo"; "+lecie"; ""; "zagrząźć"; "zrzeć"; (* niepełny paradygmat *) "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc"; "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc"; "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść"; "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść"; "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2"; (* wiele stemów *) "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec"; "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec"; "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć"; (* inne *) "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1"; "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć"; "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść"; "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść" (* "moi"; "twoi"; (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) 
(*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *) (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids"; "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina"; "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan"; "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw"; "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA"; "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT"; "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; ""; "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; ""; "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia"; "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; 
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  ] @ File.load_lines "../morphology/data/obce.tab" @
  File.load_lines "../morphology/data/akronimy.tab" @
  (* File.load_lines "results/interp_validated_verb.tab" @ *)
  (*File.load_lines "results/interp_validated_noun.tab" @
  File.load_lines "results/interp_validated_adj.tab" @
  File.load_lines "../morphology/data/validated_adj.tab" @
  File.load_lines "../morphology/data/validated_noun.tab" @
  File.load_lines "../morphology/data/validated_verb.tab" @
  File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *)
  [])

(* Upper bound on the number of example lemmas kept / printed per rule. *)
let max_rule_examples = 20

(* Returns the entries of [dict] whose lemma is NOT a member of the set [ex].
   The result is accumulated by prepending, so (assuming Xlist.fold walks the
   list front to back) it comes out in reverse input order. *)
let remove_exceptional_lemmata_gen ex dict =
  Xlist.fold dict [] (fun kept entry ->
    if StringSet.mem ex entry.lemma then kept else entry :: kept)

(* Same as remove_exceptional_lemmata_gen with the module-level
   exceptional_lemmata set. *)
let remove_exceptional_lemmata dict =
  remove_exceptional_lemmata_gen exceptional_lemmata dict

(* Fills in the stem field of every entry: Stem.generate_stem for the
   categories noun, adj, adv and verb, the empty string for anything else.
   (A former special case for ndm entries is disabled.) *)
let generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    let stem =
      match entry.cat with
        "noun" | "adj" | "adv" | "verb" -> Stem.generate_stem entry
      | _ -> "" in
    {entry with stem=stem})

(*let phon_generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    {entry with phon_stem=
      if entry.ndm then (List.hd entry.forms).phon_orth else
      if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
        Stem.phon_generate_stem entry
      else []})*)

(* Fills in lu_stem of every lemma pair using Stem.generate_stem_lu. *)
let generate_stem_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with lu_stem=Stem.generate_stem_lu entry.lemma1 entry.lemma2})

(* Lowercases both lemmas of every lemma pair. *)
let lowercase_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with
      lemma1=Xunicode.lowercase_utf8_string entry.lemma1;
      lemma2=Xunicode.lowercase_utf8_string entry.lemma2})

(* Adds the phonetic translation of the (simplified) lemma, of the stem and
   of every form to each entry. An entry for which the translation raises is
   left out of the result and a diagnostic line is printed on stdout,
   prefixed with NF, NE or MS for the three Fonetics exceptions and with EX
   for any other exception. Previously the latter were dropped without any
   trace, which hid unrelated failures. *)
let fonetic_translation dict =
  let translate s =
    Fonetics.translate_and_check true Fonetics.rules Fonetics.rev_rules s in
  Xlist.fold dict [] (fun dict e ->
    try
      let lemma = Stem.simplify_lemma e.lemma in
      let phon_lemma = translate lemma in
      let phon_stem = translate e.stem in
      {e with phon_lemma = phon_lemma; phon_stem=phon_stem;
        forms = Xlist.map e.forms (fun f ->
          let phon_orth = translate f.orth in
          {f with phon_orth = phon_orth})} :: dict
    with
      Fonetics.NotFound(x,s) -> printf "NF %s %s %s\n%!" e.lemma x s; dict
    | Fonetics.NotEqual(x,s,t) -> printf "NE %s %s %s %s\n%!" e.lemma x s t; dict
    | Fonetics.MulipleSolutions(x,s,l) -> printf "MS %s %s %s: %s\n%!" e.lemma x s (String.concat " " l); dict
    | ex -> printf "EX %s %s\n%!" e.lemma (Printexc.to_string ex); dict)

(* Marks a form as validated when at least one (stem, rule) candidate found
   for its orth in [rules] rebuilds the simplified lemma as stem ^ rule.set. *)
let validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let validated = Xlist.fold (Rules.CharTrees.find rules form.orth) false (fun found (stem,rule) ->
        if stem ^ rule.set = simple_lemma then true else found) in
      {form with validated=validated}) in
    {entry with forms=forms})

(* Shared body of phon_validate and validate_interp. A form is validated when
   some candidate for one of its phonetic spellings translates back to the
   simplified lemma and additionally satisfies [accept form rule].
   Fonetics.translate_single is deliberately called for every candidate, as
   before, so that any exception it raises still surfaces. *)
let phon_validate_with accept rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let validated = Xlist.fold form.phon_orth false (fun found s ->
        Xlist.fold (Rules.CharTrees.find rules s) found (fun found (stem,rule) ->
          let candidate_lemma = Fonetics.translate_single true Fonetics.rev_rules (stem ^ rule.set) in
          if candidate_lemma = simple_lemma && accept form rule then true else found)) in
      {form with validated=validated}) in
    {entry with forms=forms})

(* Phonetic counterpart of validate: only the lemma has to match. *)
let phon_validate rules dict =
  phon_validate_with (fun _ _ -> true) rules dict

(* Checks every ordered pair of candidates for lemma1 and lemma2:
   lu_validated is set when some pair yields the same stem ^ rule.set;
   validated1 / validated2 are set when some candidate stem of the respective
   lemma equals lu_stem. *)
let validate_lu rules dict =
  Xlist.rev_map dict (fun entry ->
    let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
    let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
    let b = Xlist.fold candidates1 false (fun b (stem1,rule1) ->
      Xlist.fold candidates2 b (fun b (stem2,rule2) ->
        if stem1 ^ rule1.set = stem2 ^ rule2.set then true else b)) in
    let has_lu_stem candidates = Xlist.fold candidates false (fun b (stem,_) ->
      if stem = entry.lu_stem then true else b) in
    let b1 = has_lu_stem candidates1 in
    let b2 = has_lu_stem candidates2 in
    {entry with lu_validated=b; validated1=b1; validated2=b2})

(* Like phon_validate, but the rule must also carry the interp of the form. *)
let validate_interp rules dict =
  phon_validate_with (fun form rule -> form.interp = rule.interp) rules dict

(* Keeps, in every entry, only the forms accepted by [keep]; entries left
   without any form are dropped. Both the forms and the entries are
   accumulated by prepending. *)
let keep_forms_where keep dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if keep form then form :: forms else forms) in
    match forms with
      [] -> dict
    | _ -> {entry with forms=forms} :: dict)

(* Drops validated forms (and entries that consist of validated forms only). *)
let remove_validated_forms dict =
  keep_forms_where (fun form -> not form.validated) dict

(* Keeps whole, unmodified entries that have at least one form which is not
   validated. *)
let remove_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let pending = Xlist.fold entry.forms false (fun b form ->
      b || not form.validated) in
    if pending then entry :: dict else dict)

(* Drops lemma pairs whose lu_validated flag is set. *)
let remove_validated_lu dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.lu_validated then dict else entry :: dict)

(* Drops forms that are not validated (and entries left without forms). *)
let remove_not_validated_forms dict =
  keep_forms_where (fun form -> form.validated) dict

(* Keeps whole, unmodified entries all of whose forms are validated; an entry
   without forms is kept as well. *)
let remove_not_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let all_validated = Xlist.fold entry.forms true (fun b form ->
      b && form.validated) in
    if all_validated then entry :: dict else dict)

(* Writes one orth / lemma / interp line, tab separated, per form. *)
let print filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      Xlist.iter entry.forms (fun form ->
        fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))

(* Writes the lemma of every entry, one per line. *)
let print_lemmata filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      fprintf file "%s\n" entry.lemma))

(* Drops forms whose interp ends with :neg or :sup (and entries left without
   forms). *)
let remove_sup_neg_forms dict =
  keep_forms_where (fun form ->
    not (Xstring.check_sufix ":neg" form.interp || Xstring.check_sufix ":sup" form.interp)) dict

(* Registers one occurrence of [rule] under [key] in the two-level map
   [rules]. Each rule carries its occurrence count and a list of example
   lemmas; the list stops growing once max_rule_examples lemmas are stored. *)
let add_rule_example rules key rule lemma =
  let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
  let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) ->
    q+1, if q < max_rule_examples then lemma :: l else l) in
  StringMap.add rules key rules2

(* Writes the map built with add_rule_example: one @RULES header per key,
   followed by one line per rule with its count and example lemmas. *)
let print_rule_map rules_filename rules =
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))

(* Loads the dictionary [path ^ filename], removes everything the existing
   [rules] already explain and writes rule proposals for the remaining forms
   to [rules_filename]. *)
let generate_rules rules path filename rules_filename =
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = process_interps dict in
(*   let dict = mark_ndm dict in (* FIXME: remove_ndm? *) *)
  let dict = remove_exceptional_lemmata dict in
  let dict = find_kolwiek_suffixes dict in (* FIXME: lemmas from kolwiek_suffixes are not validated *)
  let dict = generate_stem dict in
  let dict = fonetic_translation dict in
  let dict = phon_validate rules dict in
  let dict = remove_validated_forms dict in
  let dict = remove_sup_neg_forms dict in (* FIXME *)
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.phon_generate_rules_entry entry) rules (fun rules (key,rule) ->
      add_rule_example rules key rule entry.lemma)) in
  print_rule_map rules_filename rules

(* Counterpart of generate_rules for the lemma pairs of relation [id] loaded
   with load_lu. *)
let generate_rules_lu rules id path rules_filename =
  let dict = load_lu [] id path in
  let dict = lowercase_lu dict in
  let dict = generate_stem_lu dict in
  let dict = validate_lu rules dict in
  let dict = remove_validated_lu dict in
(*   let dict = remove_sup_neg_forms dict in *)
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) rules (fun rules (key,rule,lemma) ->
      add_rule_example rules key rule lemma)) in
  print_rule_map rules_filename rules

(* First [n] elements of [l], or all of [l] when it is shorter. A
   non-positive [n] yields the empty list (a negative [n] used to return the
   whole list). *)
let rec get_first n l =
  match l with
    x :: rest when n > 0 -> x :: get_first (n-1) rest
  | _ -> []

(* Proposes interpretation rules for the forms that [interp_rules] do not
   validate. The proposals are written to [rules_filename] in descending
   order of the tuple (v, rule text), each with the number of distinct lemmas
   supporting it and up to max_rule_examples of those lemmas. *)
let generate_interp_rules rules interp_rules selected_tags path filename rules_filename =
  let selected_tags = StringSet.of_list selected_tags in
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = process_interps dict in
(*   let dict = mark_ndm dict in (* FIXME: remove_ndm? *) *)
  let dict = remove_exceptional_lemmata dict in
(*   let dict = find_kolwiek_suffixes dict in *)
(*   let dict = generate_stem dict in *)
  let dict = fonetic_translation dict in
  let dict = validate_interp interp_rules dict in
  let dict = remove_validated_forms dict in
  let found_rules = Xlist.fold dict StringMap.empty (fun found_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms found_rules (fun found_rules form ->
      let candidates = RuleGenerator.phon_generate_interp_rules rules selected_tags simple_lemma form in
      Xlist.fold candidates found_rules (fun found_rules (v,cand) ->
        (* the v stored with the first occurrence of cand is kept *)
        StringMap.add_inc found_rules cand (v,StringSet.singleton entry.lemma) (fun (v,set) ->
          v,StringSet.add set entry.lemma)))) in
  let sorted = List.rev (List.sort compare (StringMap.fold found_rules [] (fun l k (v,set) ->
    (v,k,set) :: l))) in
  File.file_out rules_filename (fun file ->
    Xlist.iter sorted (fun (_,k,set) ->
      fprintf file "\t%s\t# %d %s\n" k (StringSet.size set)
        (String.concat " " (get_first max_rule_examples (List.rev (StringSet.to_list set))))))

(* let generate_ndm_rules dict = let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry -> Xlist.fold entry.forms freq_rules (fun freq_rules form -> let rule = {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in Rules.RuleQMap.add freq_rules rule)) in fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq -> 
Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1)) *)

(* Counts how often each rule of [rules] explains a form of the dictionary
   [path ^ filename] and writes the rules, with freq set to that count, to
   [rules_filename]. A form is explained by a candidate (stem, rule) when
   stem ^ rule.set equals the simplified lemma and the interps agree; when
   several candidates qualify, the one at the head of the accumulated list
   (the last qualifying one visited by the fold) is counted. *)
let generate_rule_frequencies rules path filename rules_filename =
  let entries = load_tab (path ^ filename) in
  let entries = merge_entries entries in
  let entries = process_interps entries in
  let entries = remove_cat "cond" entries in
(*   let dict = mark_ndm dict in
  let freq_rules = generate_ndm_rules (remove_not_ndm dict) in
  let dict = remove_ndm dict in *)
  let entries = remove_exceptional_lemmata entries in
  let entries = generate_stem entries in
  let counts = Xlist.fold entries Rules.RuleQMap.empty (fun counts entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms counts (fun counts form ->
      let matching = Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun acc (stem,rule) ->
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then (stem,rule) :: acc else acc) in
      match matching with
        [] -> counts
      | (_,rule) :: _ -> Rules.RuleQMap.add counts rule)) in
  File.file_out rules_filename (fun file ->
    Rules.RuleQMap.iter counts (fun rule freq ->
      fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq=freq})))

(* Builds a stem dictionary: the frequency rules are loaded from
   [rules_filename] and, for every form of the dictionary [path ^ filename]
   explained by a rule (same criterion and same choice among several
   candidates as in generate_rule_frequencies), the id of that rule is
   recorded under the key made of the stem, a tab and the lemma suffix
   returned by Stem.simplify_lemma_full. Each key is written to
   [out_filename] followed by its space separated rule ids. *)
let generate_stem_dict rules_filename path filename out_filename =
  let rules = Rules.CharTrees.create (Rules.load_freq_rules rules_filename) in
  let entries = load_tab (path ^ filename) in
  let entries = merge_entries entries in
  let entries = process_interps entries in
  let entries = remove_cat "cond" entries in
(*   let dict = mark_ndm dict in *)
  let stems = Xlist.fold entries StringMap.empty (fun stems entry ->
    let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
    Xlist.fold entry.forms stems (fun stems form ->
      let matching = Xlist.fold (Rules.CharTrees.find rules form.orth) [] (fun acc (stem,rule) ->
(*         if rule.star = Ndm && not entry.ndm then candidates else
        if rule.star <> Ndm && entry.ndm then candidates else *)
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then (stem,rule) :: acc else acc) in
      match matching with
        [] -> stems
      | (stem,rule) :: _ ->
          StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.id] (fun ids -> rule.id :: ids))) in
  File.file_out out_filename (fun file ->
    StringMap.iter stems (fun stem ids ->
      fprintf file "%s\t%s\n" stem (String.concat " " ids)))