Commit 173c381a9761ad523dd96392859442abf3c8fd7b

Authored by Wojciech Jaworski
1 parent 8e42d298

obsługa słowników

guesser/dict.ml 0 → 100644
  1 +open Xstd
  2 +open Printf
  3 +open Types
  4 +
  5 +let get_form e =
  6 + match e.forms with
  7 + [form] -> form
  8 + | _ -> failwith "get_form"
  9 +
  10 +let load_tab filename =
  11 + File.load_tab filename (function
  12 + orth :: lemma :: interp :: _ ->
  13 + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type="";
  14 + ndm=false; stem=""}
  15 + | line -> failwith ("load_tab: " ^ (String.concat "\t" line)))
  16 +
  17 +let load_tab_full filename =
  18 + File.load_tab filename (function
  19 + [orth; lemma; interp] ->
  20 + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type="";
  21 + ndm=false; stem=""}
  22 + | [orth; lemma; interp; proper_type] ->
  23 + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type=proper_type;
  24 + ndm=false; stem=""}
  25 + | [orth; lemma; interp; proper_type; genre] ->
  26 + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=genre; validated=false}]; proper_type=proper_type;
  27 + ndm=false; stem=""}
  28 +(* | orth :: lemma :: interp :: proper_type :: genre -> (orth,lemma,interp,proper_type,String.concat ";" genre) :: l *)
  29 + | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))
  30 +
  31 +let pos = StringSet.of_list [
  32 + "subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
  33 + "brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
  34 + "adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
  35 + "winien";"siebie"
  36 + ]
  37 +
  38 +let rec find_pos rev = function
  39 + s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l
  40 + | [] -> failwith "find_pos"
  41 +
  42 +let split_lemma_interp s =
  43 + let l = Xstring.split_delim ":" s in
  44 + let lemma,interp = find_pos [List.hd l] (List.tl l) in
  45 + String.concat ":" lemma, String.concat ":" interp
  46 +
  47 +let rec remove_empties = function
  48 + "" :: l -> remove_empties l
  49 + | l -> l
  50 +
  51 +let split_freq_orth s =
  52 + match remove_empties (Xstring.split " " s) with
  53 + freq :: l -> (*print_endline (String.concat "|" (freq :: l));*) int_of_string freq, String.concat " " l
  54 + | _ -> failwith "split_freq_orth"
  55 +
  56 +let load_freq_tab filename =
  57 + File.load_tab filename (function
  58 + [freq_orth; lemma_interp] ->
  59 + (* print_endline (freq_orth ^ "\t" ^ lemma_interp); *)
  60 + let freq,orth = split_freq_orth freq_orth in
  61 + let lemma,interp = split_lemma_interp lemma_interp in
  62 + {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=freq; genre=""; validated=false}]; proper_type="";
  63 + ndm=false; stem=""}
  64 + | line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line)))
  65 +
  66 +let proper_type_selector e = e.proper_type
  67 +let genre_selector e = e.genre
  68 +let interp_selector e = (get_form e).interp
  69 +let freq_selector e = (get_form e).freq
  70 +
  71 +let print_quantities out_filename selector dict =
  72 + let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry ->
  73 + StringQMap.add qmap (selector entry)) in
  74 + File.file_out out_filename (fun file ->
  75 + StringQMap.iter qmap (fun k v ->
  76 + fprintf file "%6d\t%s\n" v k))
  77 +
  78 +(**********************************************************************************)
  79 +
  80 +let load_dict_as_set filename =
  81 + let l = load_tab filename in
  82 + List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
  83 + let form = get_form entry in
  84 + StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]))))
  85 +
  86 +let load_dict_as_set_full filename =
  87 + let l = load_tab_full filename in
  88 + List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
  89 + let form = get_form entry in
  90 + StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]))))
  91 +
  92 +let rec compare_dicts_rec file = function
  93 + [],[] -> ()
  94 + | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
  95 + | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
  96 + | a :: la, b :: lb ->
  97 + if a = b then compare_dicts_rec file (la,lb) else
  98 + if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
  99 + (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))
  100 +
  101 +let compare_dicts filename1 filename2 filename_out =
  102 + let dict1 = load_dict_as_set filename1 in
  103 + let dict2 = load_dict_as_set filename2 in
  104 + File.file_out filename_out (fun file ->
  105 + compare_dicts_rec file (dict1,dict2))
  106 +
  107 +let compare_dicts_full filename1 filename2 filename_out =
  108 + let dict1 = load_dict_as_set_full filename1 in
  109 + let dict2 = load_dict_as_set_full filename2 in
  110 + File.file_out filename_out (fun file ->
  111 + compare_dicts_rec file (dict1,dict2))
  112 +
  113 +(**********************************************************************************)
  114 +
  115 +let find_entry_cat entry =
  116 + let form = get_form entry in
  117 + let cat,tags = match Xstring.split ":" form.interp with
  118 + cat :: tags -> cat,tags
  119 + | _ -> failwith ("find_entry_type: " ^ form.interp) in
  120 + if cat = "subst" || cat = "depr" then "noun" else
  121 + if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else
  122 + if cat = "adv" then "adv" else
  123 + if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
  124 + cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else
  125 + if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
  126 + cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" ||
  127 + cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "" then "other" else
  128 + if cat = "cond" then "cond" else
  129 + failwith ("find_entry_cat: " ^ cat)
  130 +
  131 +let assign_entry_cat dict =
  132 + Xlist.rev_map dict (fun entry ->
  133 + {entry with cat = find_entry_cat entry})
  134 +
  135 +let split_dict path filename =
  136 + let dict = load_tab (path ^ filename) in
  137 + let dict = List.rev (assign_entry_cat dict) in
  138 + File.file_out (path ^ "noun_" ^ filename) (fun noun_file ->
  139 + File.file_out (path ^ "adj_" ^ filename) (fun adj_file ->
  140 + File.file_out (path ^ "adv_" ^ filename) (fun adv_file ->
  141 + File.file_out (path ^ "verb_" ^ filename) (fun verb_file ->
  142 + File.file_out (path ^ "other_" ^ filename) (fun other_file ->
  143 + Xlist.iter dict (fun entry ->
  144 + let form = get_form entry in
  145 + try
  146 + let file = match entry.cat with
  147 + "noun" -> noun_file
  148 + | "adj" -> adj_file
  149 + | "adv" -> adv_file
  150 + | "verb" -> verb_file
  151 + | "other" -> other_file
  152 + | "cond" -> raise Not_found
  153 + | _ -> failwith "split_dict" in
  154 + fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
  155 + with Not_found -> ()))))))
  156 +
  157 +let merge_entries dict =
  158 + let dict = assign_entry_cat dict in
  159 + let map = Xlist.fold dict StringMap.empty (fun map entry ->
  160 + let form = get_form entry in
  161 + let key =
  162 + if entry.cat = "noun" then (* FIXME: problem z depr *)
  163 + let gender = match Xstring.split ":" form.interp with
  164 + ["depr";_;_;"m2"] -> "m1"
  165 + | "depr" :: _ -> failwith ("merge_entries: " ^ form.interp)
  166 + | [_;_;_;gender] -> gender
  167 + | _ -> failwith ("merge_entries: " ^ form.interp) in
  168 + entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender
  169 + else entry.lemma ^ "|" ^ entry.cat in
  170 + StringMap.add_inc map key entry (fun e ->
  171 + if entry.proper_type <> e.proper_type then
  172 + failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else
  173 + {e with forms = form :: e.forms})) in
  174 + StringMap.fold map [] (fun dict _ e -> e :: dict)
  175 +
  176 +(**********************************************************************************)
  177 +
  178 +let mark_ndm dict =
  179 + Xlist.fold dict [] (fun dict entry ->
  180 + if entry.cat <> "noun" && entry.cat <> "adj" then entry :: dict else
  181 + let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
  182 + StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
  183 + let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
  184 + StringSet.fold orths qmap StringQMap.add) in
  185 + let n = StringMap.size map in
  186 + let found = StringQMap.fold qmap [] (fun found orth v ->
  187 + if v = n then orth :: found else found) in
  188 + (* printf "%s\t%s\t%d\t%s\n%!" entry.lemma entry.cat n (String.concat " " found); *)
  189 + match found with
  190 + [] -> entry :: dict
  191 + | [orth] ->
  192 + let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
  193 + if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
  194 + let dict = {entry with forms=odm} :: dict in
  195 + {entry with forms=ndm; ndm=true} :: dict
  196 + | _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))
  197 +
  198 +let print_ndm filename dict =
  199 + File.file_out filename (fun file ->
  200 + Xlist.iter dict (fun entry ->
  201 + if entry.ndm then
  202 + let orth = (List.hd entry.forms).orth in
  203 + fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))
  204 +
  205 +let kolwiek_lemmas = StringSet.of_list [
  206 + (* adj *)
  207 + "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
  208 + "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
  209 + "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
  210 + (* noun *)
  211 + "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
  212 + "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
  213 + "ktoś:s2"; "któż";
  214 + (* adv *)
  215 + "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]
  216 +
  217 +let kolwiek_suffixes = [
  218 + "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]
  219 +
  220 +let find_kolwiek_suffixes dict =
  221 + Xlist.rev_map dict (fun entry ->
  222 + if StringSet.mem kolwiek_lemmas entry.lemma then
  223 + {entry with forms=Xlist.map entry.forms (fun form ->
  224 + {form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf ->
  225 + if Xstring.check_sufix kolwiek_suf orth then
  226 + Xstring.cut_sufix kolwiek_suf orth
  227 + else orth)})}
  228 + else entry)
  229 +
  230 +let exceptional_lemmata = StringSet.of_list ([
  231 + (* błąd w słowniku *)
  232 + "ówże";
  233 + (* wiele stemów *)
  234 + "twój:a"; "swój"; "mój:a"; "wszystek";
  235 + (* oboczności w stemie *)
  236 + "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
  237 + "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
  238 + "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
  239 + "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
  240 + "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
  241 + "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
  242 + "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
  243 + "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
  244 + "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
  245 + "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
  246 + "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
  247 + "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  248 + "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
  249 + (* oboczności w odmianie *)
  250 + "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
  251 + "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
  252 + "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  253 + "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  254 + "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  255 + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  256 + (* pozostawione *)
  257 + "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
  258 + "Pia"; "jo-jo"; "+lecie"; "";
  259 + "zagrząźć"; "zrzeć";
  260 + (* niepełny paradygmat *)
  261 + "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
  262 + "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
  263 + "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
  264 + "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
  265 + "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
  266 + (* wiele stemów *)
  267 + "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
  268 + "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
  269 + "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
  270 + (* inne *)
  271 + "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
  272 + "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
  273 + "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
  274 + "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
  275 +(*
  276 + "moi"; "twoi";
  277 + (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
  278 + (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
  279 + "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
  280 + "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
  281 + "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  282 + "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  283 + "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  284 + "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
  285 + "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  286 + "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  287 + "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  288 + "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  289 + "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
  290 + "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
  291 + "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
  292 + "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
  293 + "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
  294 + "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  295 + "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  296 + "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
  297 + "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
  298 + "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
  299 + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  300 + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  301 + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  302 + ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  303 + ] @ File.load_lines "data/obce.tab" @ File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @ File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab")
  304 +
  305 +let remove_exceptional_lemmata dict =
  306 + Xlist.fold dict [] (fun dict entry ->
  307 + if StringSet.mem exceptional_lemmata entry.lemma then dict
  308 + else entry :: dict)
  309 +
  310 +let generate_stem dict =
  311 + Xlist.rev_map dict (fun entry ->
  312 + {entry with stem=
  313 + if entry.ndm then (List.hd entry.forms).orth else
  314 + if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
  315 + Stem.generate_stem entry
  316 + else ""})
  317 +
  318 +let validate dict =
  319 + let rules = Rules.interp_compound_rule_trees in
  320 + Xlist.rev_map dict (fun entry ->
  321 + let simple_lemma = Stem.simplify_lemma entry.lemma in
  322 + let forms = Xlist.rev_map entry.forms (fun form ->
  323 + (* printf "E %s\t%s\t%s\n" orth lemma interp; *)
  324 + let candidates = Rules.CharTrees.find rules form.orth in
  325 + (* printf "S %d\n" (Xlist.size forms); *)
  326 + let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
  327 + (* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *)
  328 + if stem ^ rule.Rules.set = simple_lemma then (stem,rule) :: candidates else candidates) in
  329 + if candidates = [] then form else {form with validated=true}) in
  330 + {entry with forms=forms})
  331 +
  332 +let remove_validated_forms dict =
  333 + Xlist.fold dict [] (fun dict entry ->
  334 + let forms = Xlist.fold entry.forms [] (fun forms form ->
  335 + if form.validated then forms else form :: forms) in
  336 + if forms = [] then dict else {entry with forms=forms} :: dict)
  337 +
  338 +let remove_validated_entries dict =
  339 + Xlist.fold dict [] (fun dict entry ->
  340 + let forms = Xlist.fold entry.forms [] (fun forms form ->
  341 + if form.validated then forms else form :: forms) in
  342 + if forms = [] then dict else entry :: dict)
  343 +
  344 +let remove_not_validated_forms dict =
  345 + Xlist.fold dict [] (fun dict entry ->
  346 + let forms = Xlist.fold entry.forms [] (fun forms form ->
  347 + if form.validated then form :: forms else forms) in
  348 + if forms = [] then dict else {entry with forms=forms} :: dict)
  349 +
  350 +let remove_not_validated_entries dict =
  351 + Xlist.fold dict [] (fun dict entry ->
  352 + let forms = Xlist.fold entry.forms [] (fun forms form ->
  353 + if form.validated then form :: forms else forms) in
  354 + if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict)
  355 +
  356 +let print filename dict =
  357 + File.file_out filename (fun file ->
  358 + Xlist.iter dict (fun entry ->
  359 + Xlist.iter entry.forms (fun form ->
  360 + fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))
  361 +
  362 +let print_lemmata filename dict =
  363 + File.file_out filename (fun file ->
  364 + Xlist.iter dict (fun entry ->
  365 + fprintf file "%s\n" entry.lemma))
... ...
guesser/generate.ml
... ... @@ -15,11 +15,172 @@ let zasoby_path =
15 15 (* | "mozart" -> "." *)
16 16 | s -> failwith ("unknown host: " ^ s)
17 17  
18   -let morfeusz_path = zasoby_path ^ "morfeusz/"
  18 +let lematy_nkjp_filename = "lematy_NKJP1M_freq.tab"
  19 +
  20 +let morfeusz_old_path = zasoby_path ^ "morfeusz/old2/"
19 21 let sgjp_filename2015 = "sgjp-20151020.tab"
20 22 let polimorf_filename2015 = "polimorf-20151020.tab"
21   -let sgjp_filename = "sgjp-20160508.tab"
22   -let polimorf_filename = "polimorf-20160508.tab"
  23 +
  24 +let morfeusz_path = zasoby_path ^ "morfeusz/"
  25 +let sgjp_filename201605 = "sgjp-20160508.tab"
  26 +let polimorf_filename201605 = "polimorf-20160508.tab"
  27 +let sgjp_filename = "sgjp-20160724.tab"
  28 +let polimorf_filename = "polimorf-20160724.tab"
  29 +
  30 +let adv_sgjp_filename = "adv_" ^ sgjp_filename201605
  31 +let odm_adj_sgjp_filename = "odm_adj_" ^ sgjp_filename201605
  32 +let ndm_adj_sgjp_filename = "ndm_adj_" ^ sgjp_filename201605
  33 +let odm_noun_sgjp_filename = "odm_noun_" ^ sgjp_filename201605
  34 +let ndm_noun_sgjp_filename = "ndm_noun_" ^ sgjp_filename201605
  35 +let verb_sgjp_filename = "verb_" ^ sgjp_filename201605
  36 +
  37 +(* Test wczytywania słowników i liczenie częstości *)
  38 +let _ =
  39 + (* print_endline "sgjp_filename2015";
  40 + let _ = Dict.load_tab (morfeusz_old_path ^ sgjp_filename2015) in
  41 + print_endline "polimorf_filename2015";
  42 + let _ = Dict.load_tab (morfeusz_old_path ^ polimorf_filename2015) in
  43 + print_endline "sgjp_filename201605";
  44 + let _ = Dict.load_tab (morfeusz_path ^ sgjp_filename201605) in
  45 + print_endline "polimorf_filename201605";
  46 + let _ = Dict.load_tab (morfeusz_path ^ polimorf_filename201605) in
  47 + print_endline "sgjp_filename";
  48 + let _ = Dict.load_tab (morfeusz_path ^ sgjp_filename) in
  49 + print_endline "polimorf_filename";
  50 + let _ = Dict.load_tab (morfeusz_path ^ polimorf_filename) in
  51 + print_endline "adv_sgjp_filename";
  52 + let _ = Dict.load_tab (morfeusz_path ^ adv_sgjp_filename) in
  53 + print_endline "odm_adj_sgjp_filename";
  54 + let _ = Dict.load_tab (morfeusz_path ^ odm_adj_sgjp_filename) in
  55 + print_endline "ndm_adj_sgjp_filename";
  56 + let _ = Dict.load_tab (morfeusz_path ^ ndm_adj_sgjp_filename) in
  57 + print_endline "odm_noun_sgjp_filename";
  58 + let _ = Dict.load_tab (morfeusz_path ^ odm_noun_sgjp_filename) in
  59 + print_endline "ndm_noun_sgjp_filename";
  60 + let _ = Dict.load_tab (morfeusz_path ^ ndm_noun_sgjp_filename) in
  61 + print_endline "verb_sgjp_filename";
  62 + let _ = Dict.load_tab (morfeusz_path ^ verb_sgjp_filename) in
  63 + print_endline "sgjp_filename2015";
  64 + let dict = Dict.load_tab_full (morfeusz_old_path ^ sgjp_filename2015) in
  65 + Dict.print_quantities "results/proper-type-sgjp-20151020.txt" Dict.proper_type_selector dict;
  66 + Dict.print_quantities "results/genre-sgjp-20151020.txt" Dict.genre_selector dict;
  67 + Dict.print_quantities "results/interp-sgjp-20151020.txt" Dict.interp_selector dict;
  68 + print_endline "polimorf_filename2015";
  69 + let dict = Dict.load_tab_full (morfeusz_old_path ^ polimorf_filename2015) in
  70 + Dict.print_quantities "results/proper-type-polimorf-20151020.txt" Dict.proper_type_selector dict;
  71 + Dict.print_quantities "results/genre-polimorf-20151020.txt" Dict.genre_selector dict;
  72 + Dict.print_quantities "results/interp-polimorf-20151020.txt" Dict.interp_selector dict;
  73 + print_endline "sgjp_filename201605";
  74 + let dict = Dict.load_tab_full (morfeusz_path ^ sgjp_filename201605) in
  75 + Dict.print_quantities "results/proper-type-sgjp-20160508.txt" Dict.proper_type_selector dict;
  76 + Dict.print_quantities "results/genre-sgjp-20160508.txt" Dict.genre_selector dict;
  77 + Dict.print_quantities "results/interp-sgjp-20160508.txt" Dict.interp_selector dict;
  78 + print_endline "polimorf_filename201605";
  79 + let dict = Dict.load_tab_full (morfeusz_path ^ polimorf_filename201605) in
  80 + Dict.print_quantities "results/proper-type-polimorf-20160508.txt" Dict.proper_type_selector dict;
  81 + Dict.print_quantities "results/genre-polimorf-20160508.txt" Dict.genre_selector dict;
  82 + Dict.print_quantities "results/interp-polimorf-20160508.txt" Dict.interp_selector dict;
  83 + print_endline "sgjp_filename";
  84 + let dict = Dict.load_tab_full (morfeusz_path ^ sgjp_filename) in
  85 + Dict.print_quantities "results/proper-type-sgjp-20160724.txt" Dict.proper_type_selector dict;
  86 + Dict.print_quantities "results/genre-sgjp-20160724.txt" Dict.genre_selector dict;
  87 + Dict.print_quantities "results/interp-sgjp-20160724.txt" Dict.interp_selector dict;
  88 + print_endline "polimorf_filename";
  89 + let dict = Dict.load_tab_full (morfeusz_path ^ polimorf_filename) in
  90 + Dict.print_quantities "results/proper-type-polimorf-20160724.txt" Dict.proper_type_selector dict;
  91 + Dict.print_quantities "results/genre-polimorf-20160724.txt" Dict.genre_selector dict;
  92 + Dict.print_quantities "results/interp-polimorf-20160724.txt" Dict.interp_selector dict;
  93 + print_endline "adv_sgjp_filename";
  94 + let dict = Dict.load_tab_full (morfeusz_path ^ adv_sgjp_filename) in
  95 + print_endline "odm_adj_sgjp_filename";
  96 + let dict = Dict.load_tab_full (morfeusz_path ^ odm_adj_sgjp_filename) in
  97 + print_endline "ndm_adj_sgjp_filename";
  98 + let dict = Dict.load_tab_full (morfeusz_path ^ ndm_adj_sgjp_filename) in
  99 + print_endline "odm_noun_sgjp_filename";
  100 + let dict = Dict.load_tab_full (morfeusz_path ^ odm_noun_sgjp_filename) in
  101 + print_endline "ndm_noun_sgjp_filename";
  102 + let dict = Dict.load_tab_full (morfeusz_path ^ ndm_noun_sgjp_filename) in
  103 + print_endline "verb_sgjp_filename";
  104 + let dict = Dict.load_tab_full (morfeusz_path ^ verb_sgjp_filename) in
  105 + print_endline "lematy_nkjp_filename";
  106 + let dict = Dict.load_freq_tab (zasoby_path ^ lematy_nkjp_filename) in
  107 + Dict.print_quantities "results/interp-lematy_nkjp.txt" Dict.interp_selector dict;
  108 + Dict.print_quantities "results/freq-lematy_nkjp.txt" Dict.freq_selector dict;*)
  109 + ()
  110 +
  111 +(* Porównanie wersji słowników *)
  112 +let _ =
  113 + (* Dict.compare_dicts_full (morfeusz_old_path ^ sgjp_filename2015) (morfeusz_path ^ sgjp_filename201605) "results/comparition_sgjp1_full.out"; *)
  114 + (* Dict.compare_dicts_full (morfeusz_path ^ sgjp_filename201605) (morfeusz_path ^ sgjp_filename) "results/comparition_sgjp2_full.out"; *)
  115 + (* Dict.compare_dicts_full (morfeusz_old_path ^ polimorf_filename2015) (morfeusz_path ^ polimorf_filename201605) "results/comparition_polimorf1_full.out"; *)
  116 + (* Dict.compare_dicts_full (morfeusz_path ^ polimorf_filename201605) (morfeusz_path ^ polimorf_filename) "results/comparition_polimorf2_full.out"; *)
  117 +(* compare_dicts morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp.out"; *)
  118 + ()
  119 +
  120 +(* Podział słownika *)
  121 +let _ =
  122 + (* Dict.split_dict morfeusz_path sgjp_filename; *)
  123 + ()
  124 +
  125 +(* Wypisanie nieodmiennych *)
  126 +let _ =
  127 + (* let dict = Dict.load_tab_full (morfeusz_path ^ sgjp_filename) in
  128 + let dict = Dict.merge_entries dict in
  129 + let dict = Dict.mark_ndm dict in
  130 + Dict.print_ndm "results/ndm-sgjp.tab" dict; *)
  131 + ()
  132 +
  133 +let check_stem_generation path filename =
  134 + let dict = Dict.load_tab_full (path ^ filename) in
  135 + let dict = Dict.merge_entries dict in
  136 + let dict = Dict.mark_ndm dict in
  137 + let dict = Dict.find_kolwiek_suffixes dict in
  138 + let _ = Dict.generate_stem dict in
  139 + ()
  140 +
  141 +(* Sprawdzenie działania stemowania *)
  142 +let _ =
  143 + check_stem_generation morfeusz_path sgjp_filename;
  144 +(* check_stem_generation morfeusz_path ("odm_adj_" ^ sgjp_filename); *)
  145 +(* check_stem_generation morfeusz_path ("odm_noun_" ^ sgjp_filename); *)
  146 + ()
  147 +
  148 +let find_not_validated_forms path filename out_filename =
  149 + let dict = Dict.load_tab (path ^ filename) in
  150 + let dict = Dict.validate dict in
  151 + let dict = Dict.remove_validated_forms dict in
  152 + Dict.print out_filename dict
  153 +
  154 +let find_not_validated_entries path filename out_filename =
  155 + let dict = Dict.load_tab (path ^ filename) in
  156 + let dict = Dict.merge_entries dict in
  157 + let dict = Dict.validate dict in
  158 + let dict = Dict.remove_validated_entries dict in
  159 + Dict.print out_filename dict
  160 +
  161 +(* Wypisanie niezwalidowanych form *)
  162 +let _ =
  163 + find_not_validated_forms morfeusz_path odm_adj_sgjp_filename "results/not_validated_odm_adj.tab";
  164 + (* find_not_validated_entries morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/not_validated_odm_adj.tab"; *)
  165 + (* find_not_validated_entries "results/" "not_validated_odm_adj.tab" "results/not_validated_odm_adj2.tab"; *)
  166 + (* find_not_validated_entries morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/not_validated_odm_noun.tab"; *)
  167 + (* find_not_validated_entries morfeusz_path ("adv_" ^ sgjp_filename) "results/not_validated_adv.tab";*)
  168 + (* find_not_validated_entries morfeusz_path ("verb_" ^ sgjp_filename) "results/not_validated_verb.tab"; *)
  169 + ()
  170 +
  171 +let find_validated_lemmata path filename out_filename =
  172 + let dict = Dict.load_tab (path ^ filename) in
  173 + let dict = Dict.merge_entries dict in
  174 + let dict = Dict.validate dict in
  175 + let dict = Dict.remove_not_validated_entries dict in
  176 + Dict.print_lemmata out_filename dict
  177 +
  178 +(* Wypisanie zwalidowanych lematów *)
  179 +let _ =
  180 + find_validated_lemmata morfeusz_path odm_adj_sgjp_filename "results/validated_odm_adj.tab";
  181 + find_validated_lemmata morfeusz_path odm_noun_sgjp_filename "results/validated_odm_noun.tab";
  182 + find_validated_lemmata morfeusz_path verb_sgjp_filename "results/validated_verb.tab";
  183 + ()
23 184  
24 185 (**********************************************************************************)
25 186  
... ... @@ -164,75 +325,6 @@ let load_tab_full filename =
164 325 | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))
165 326 (* | _ -> failwith ("load_tab_full: " ^ line)) *)
166 327  
167   -let load_dict_as_set filename =
168   - let l = load_tab filename in
169   - List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set (orth,lemma,interp) ->
170   - StringSet.add set (String.concat "\t" [orth;lemma;interp]))))
171   -
172   -let load_dict_as_set_full filename =
173   - let l = load_tab_full filename in
174   - List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set (orth,lemma,interp,cl,cl2) ->
175   - StringSet.add set (String.concat "\t" [orth;lemma;interp;cl;cl2]))))
176   -
177   -let rec compare_dicts_rec file = function
178   - [],[] -> ()
179   - | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
180   - | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
181   - | a :: la, b :: lb ->
182   - if a = b then compare_dicts_rec file (la,lb) else
183   - if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
184   - (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))
185   -
186   -let compare_dicts path filename1 filename2 filename_out =
187   - let dict1 = load_dict_as_set (path ^ filename1) in
188   - let dict2 = load_dict_as_set (path ^ filename2) in
189   - File.file_out filename_out (fun file ->
190   - compare_dicts_rec file (dict1,dict2))
191   -
192   -let compare_dicts_full path filename1 filename2 filename_out =
193   - let dict1 = load_dict_as_set_full (path ^ filename1) in
194   - let dict2 = load_dict_as_set_full (path ^ filename2) in
195   - File.file_out filename_out (fun file ->
196   - compare_dicts_rec file (dict1,dict2))
197   -
198   -(* Porównanie wersji słowników *)
199   -let _ =
200   -(* compare_dicts_full morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp_full.out"; *)
201   -(* compare_dicts_full morfeusz_path polimorf_filename2015 polimorf_filename "results/comparition_polimorf_full.out"; *)
202   -(* compare_dicts morfeusz_path sgjp_filename2015 sgjp_filename "results/comparition_sgjp.out"; *)
203   - ()
204   -
205   -let split_dict path filename =
206   - let dict = load_tab (path ^ filename) in
207   - File.file_out (path ^ "noun_" ^ filename) (fun noun_file ->
208   - File.file_out (path ^ "adj_" ^ filename) (fun adj_file ->
209   - File.file_out (path ^ "adv_" ^ filename) (fun adv_file ->
210   - File.file_out (path ^ "verb_" ^ filename) (fun verb_file ->
211   - File.file_out (path ^ "other_" ^ filename) (fun other_file ->
212   - Xlist.iter dict (fun (orth,lemma,interp) ->
213   - let cat,tags = match Xstring.split ":" interp with
214   - cat :: tags -> cat,tags
215   - | _ -> failwith ("split_dict: " ^ interp) in
216   - if cat = "subst" || cat = "depr" then
217   - fprintf noun_file "%s\t%s\t%s\n" orth lemma interp else
218   - if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then
219   - fprintf adj_file "%s\t%s\t%s\n" orth lemma interp else
220   - if cat = "adv" then
221   - fprintf adv_file "%s\t%s\t%s\n" orth lemma interp else
222   - if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
223   - cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then
224   - fprintf verb_file "%s\t%s\t%s\n" orth lemma interp else
225   - if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
226   - cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" || cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "" then
227   - fprintf other_file "%s\t%s\t%s\n" orth lemma interp else
228   - if cat = "cond" then () else
229   - print_endline cat))))))
230   -
231   -
232   -(* Podział słownika *)
233   -let _ =
234   -(* split_dict morfeusz_path sgjp_filename; *)
235   - ()
236 328  
237 329 let map_of_tab data =
238 330 Xlist.fold data StringMap.empty (fun map (orth,lemma,interp) ->
... ... @@ -294,128 +386,7 @@ let _ =
294 386 ()
295 387  
296 388 (**********************************************************************************)
297   -
298   -let kolwiek_lemmas = StringSet.of_list [
299   - (* adj *)
300   - "czyjkolwiek"; "czyjś"; "czyjże"; "jakiciś"; "jakikolwiek"; "jakisi"; "jakiś"; "jakiści";
301   - "jakiściś"; "jakiśkolwiek"; "jakiż"; "jakiżkolwiek"; "jakowyś"; "kijże"; "kiż"; "którykolwiek";
302   - "któryś"; "któryż"; "któryżkolwiek"; "niejakiś"; "takiż"; "takowyż"; "tenże"; "tyliż"; "ówże";
303   - (* noun *)
304   - "cokolwiek:s"; "cośkolwiek"; "cóżkolwiek"; "ktokolwiek"; "ktośkolwiek"; "któżkolwiek";
305   - "cociś"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "coże"; "cóż";
306   - "ktoś:s2"; "któż";
307   - (* adv *)
308   - "jakkolwiek"; "jakoś"; "małoż"; "niejakkolwiek"; "niejakoś"; (*"niemalże";*) ]
309   -
310   -let kolwiek_suffixes = [
311   - "żkolwiek"; "żekolwiek"; "śkolwiek"; "kolwiek"; "ż"; "że"; "ściś"; "ciś"; "ś"; "ści"; "sik"; "si"]
312   -
313   -let find_kolwiek_suffixes morfs =
314   - StringMap.mapi morfs (fun lemma interps ->
315   - if StringSet.mem kolwiek_lemmas lemma then
316   - Xlist.map interps (fun (orth,interp) ->
317   - (Xlist.fold kolwiek_suffixes orth (fun orth kolwiek_suf ->
318   - if Xstring.check_sufix kolwiek_suf orth then
319   - Xstring.cut_sufix kolwiek_suf orth
320   - else orth)), interp)
321   - else interps)
322   -
323   -
324   -let exceptional_lemmata = StringSet.of_list ([
325   - (* błąd w słowniku *)
326   - "ówże";
327   - (* wiele stemów *)
328   - "twój:a"; "swój"; "mój:a"; "wszystek";
329   - (* oboczności w stemie *)
330   - "co:s"; "cociś"; "cokolwiek:s"; "cosi"; "cosik"; "cosiś"; "coś:s"; "cościś"; "cośkolwiek"; "coże"; "cóż"; "cóżkolwiek";
331   - "kto"; "ktokolwiek"; "ktoś:s2"; "ktośkolwiek"; "któż"; "któżkolwiek"; "nikt"; "nic";
332   - "Angel"; "Apollo"; "Białystok"; "Bober"; "Dzięgiel"; "Engel"; "Gołąb:s2"; "Gózd"; "Hendel"; "Herschel"; "Jastrząb";
333   - "Kodrąb:s2"; "Kozioł"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeł"; "Różanystok"; "Schuster"; "Stępień"; "Słonim";
334   - "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydzięgiel"; "bedel"; "ber"; "białagłowa"; "białodrzew"; "ceter"; "deszcz";
335   - "drama"; "dziób:s1"; "dzięgiel"; "dżemper"; "falafel"; "grubodziób"; "harbajtel"; "harbejtel"; "harmider"; "imćpan";
336   - "iściec"; "jarząb:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureń"; "manczester";
337   - "nadpiersień"; "osep"; "otrząs"; "pedel"; "piksel"; "podpiersień"; "podziem"; "prezbiter"; "protokół"; "przedpiersień";
338   - "ratel"; "rondel:s2"; "rozpiór:s1"; "rozpiór:s2"; "rzeczpospolita"; "rzep:s2"; "rzepień"; "rzewień"; "rąb"; "sosrąb";
339   - "srebrnodrzew"; "swąd"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczełyko"; "woleoczko"; "włosień:s2";
340   - "zew"; "złotogłów"; "świreń"; "źreb"; "żółtodziób";
341   - "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
342   - "przechrzest"; "chrzest"; "półdziecko"; "roczek:s2"; "rok:s1"; "tydzień"; ""; ""; ""; ""; ""; "";
343   - (* oboczności w odmianie *)
344   - "niekażdy"; "każdy"; "niektóry:a"; "który"; "tenże"; "ten"; "tamten"; "kijże";
345   - "ucho:s2"; "dziecko"; "oko:s2"; "imię"; "nozdrze";
346   - "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
347   - "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
348   - "być"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
349   - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
350   - (* pozostawione *)
351   - "czyjże"; "czyjś"; "czyjkolwiek"; "kiż"; "ów"; "ow"; "on:a"; "ki";
352   - "Pia"; "jo-jo"; "+lecie"; "";
353   - "zagrząźć"; "zrzeć";
354   - (* niepełny paradygmat *)
355   - "zróść"; "zląc"; "zaróść"; "zaprząc"; "zaprzysiąc"; "zanieść:v2"; "zaląc"; "wzróść"; "wyróść"; "wyprząc"; "wyprzysiąc";
356   - "róść"; "sprzysiąc"; "sprząc"; "ugrząźć"; "uląc"; "upiec:v2"; "uprząc"; "uróść"; "wieść:v2"; "wprząc"; "wróść"; "wyląc";
357   - "powieść:v2"; "posiąc"; "przeląc"; "przeprząc"; "przeróść"; "przyprząc"; "przysiąc"; "przyróść"; "prząc"; "pójść:v2"; "rozprząc"; "rozróść";
358   - "krzywoprzysiąc"; "ląc"; "naróść"; "obróść"; "odprzysiąc"; "odprząc"; "odróść"; "oprzysiąc"; "podróść"; "pogrząźć"; "poprzysiąc"; "poróść";
359   - "dojść:v2"; "doprząc"; "doróść"; "dosiąc"; "grząźć"; "iść:v2";
360   - (* wiele stemów *)
361   - "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyżec"; "zawlec"; "zażec"; "zewlec"; "zwlec"; "zżec"; "żec";
362   - "podwlec"; "podżec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeżec"; "przyoblec"; "przywlec"; "przyżec"; "rozwlec"; "rozżec";
363   - "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeżreć";
364   - (* inne *)
365   - "liźć"; "iść:v1"; "wyniść"; "wynijść"; "wyjść"; "wniść"; "wnijść"; "wejść"; "ujść"; "rozejść"; "pójść:v1"; "przyjść"; "przejść:v2"; "przejść:v1"; "podejść"; "odejść"; "obejść:v2"; "obejść:v1"; "najść:v2"; "najść:v1"; "nadejść"; "dojść:v1";
366   - "roztworzyć:v2"; "przetworzyć:v2"; "otworzyć";
367   - "zsiąść:v2"; "zsiąść:v1"; "zesiąść"; "zasiąść"; "wysiąść"; "współposiąść"; "wsiąść"; "usiąść"; "siąść"; "rozsiąść"; "przysiąść"; "przesiąść"; "powsiąść"; "posiąść"; "podsiąść"; "osiąść"; "obsiąść"; "nasiąść"; "dosiąść";
368   - "źreć:v1"; "zniść"; "znijść"; "znajść"; "zejść"; "zejść"; "zajść:v2"; "zajść:v1"; "wzniść"; "wznijść"; "wzejść"
369 389 (*
370   - "moi"; "twoi";
371   - (*"AIDS"; "BGŻ"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPŻ";*) (*"Jokai"; "Jókai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
372   - (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
373   - "arcyksiężna"; "cornflakes"; "księżna"; (*"scrabble";*) "sms"; "teścina";
374   - "Wielkanoc"; "białagłowa"; "rzeczpospolita"; "imćpan";
375   - "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczełyko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
376   - "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziób"; "rozpiór:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
377   - "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
378   - "włosień:s2"; "deszcz"; "falafel"; "Krasnystaw";
379   - "Różanystok"; "Białystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
380   - "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
381   - "dziecko"; "oko:s2"; "ucho:s2"; "półdziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
382   - "idea"; "ręka"; "cześć:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
383   - "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
384   - "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
385   - "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
386   - "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
387   - "człowiek"; "półczłowiek"; "przedczłowiek"; "praczłowiek"; "nadczłowiek"; "git-człowiek"; ""; ""; ""; ""; ""; ""; ""; "";
388   - "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
389   - "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
390   - "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
391   - "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sędzia:s1"; "wicehrabia"; "współsędzia";
392   - "cieśla"; "bibliopola"; "świszczypałka"; "śwircałka"; "świerczałka"; "ścierciałka"; "tatka"; "sługa:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypałka"; "mężczyzna"; "klecha";
393   - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
394   - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
395   - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
396   - ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
397   - ] @ File.load_lines "data/obce.tab" @ File.load_lines "data/validated_adj.tab" @ File.load_lines "data/validated_noun.tab" @ File.load_lines "data/validated_verb.tab" @ File.load_lines "data/adv_nieodprzymiotnikowe.tab")
398   -
399   -let remove_exceptional_lemmata dict =
400   - StringMap.fold dict StringMap.empty (fun dict lemma interps ->
401   - if StringSet.mem exceptional_lemmata lemma then dict
402   - else StringMap.add dict lemma interps)
403   -
404   -let check_stem_generation stem_sel path filename =
405   - let dict = load_tab (path ^ filename) in
406   - let dict = map_of_tab dict in
407   - let dict = remove_exceptional_lemmata dict in
408   - let dict = find_kolwiek_suffixes dict in
409   - StringMap.iter dict (fun lemma forms ->
410   - (* let _ = Stem.generate_stem stem_sel lemma forms in FIXME*)
411   - ())
412   -
413   -(* Sprawdzenie działania stemowania *)
414   -let _ =
415   -(* check_stem_generation Stem.adj_stem_sel morfeusz_path ("odm_adj_" ^ sgjp_filename); *)
416   -(* check_stem_generation Stem.noun_stem_sel morfeusz_path ("odm_noun_" ^ sgjp_filename); *)
417   - ()
418   -
419 390 let remove_com_sup dict =
420 391 List.rev (Xlist.fold dict [] (fun l (orth,lemma,interp) ->
421 392 if Xstring.check_sufix ":com" interp || Xstring.check_sufix ":sup" interp then l else (orth,lemma,interp) :: l))
... ... @@ -470,19 +441,6 @@ let generate_adj_com_rules path filename adj_com_rules_filename =
470 441 StringMap.iter rules2 (fun rule (q,l) ->
471 442 fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))
472 443  
473   -let find_validated_lemata_adj_pos dict =
474   - let dict = map_of_tab dict in
475   - StringMap.fold dict [] (fun lemmata lemma forms ->
476   - if Rules.validate_entry lemma forms then lemma :: lemmata else lemmata)
477   -
478   -let find_validated_lemata_adj path filename =
479   - let interp_sel = load_interp_sel "data/interps.tab" in
480   - let dict = load_tab (path ^ filename) in
481   - let dict = split_into_groups interp_sel dict in
482   - if StringMap.size dict <> 3 then failwith ("generate_adj_rules: " ^
483   - String.concat " " (StringMap.fold dict [] (fun l s _ -> s :: l))) else
484   - find_validated_lemata_adj_pos (StringMap.find dict "adj")
485   -
486 444 let generate_adv_pos_rules rules_filename dict =
487 445 let dict = map_of_tab dict in
488 446 let dict = find_kolwiek_suffixes dict in
... ... @@ -575,12 +533,6 @@ let generate_verb_rules path filename rules_filename =
575 533 generate_verb_rules2 rules_filename (StringMap.find dict "verb");
576 534 ()
577 535  
578   -let find_validated_lemata_noun path filename =
579   - let dict = load_tab (path ^ filename) in
580   - let dict = map_of_tab dict in
581   - StringMap.fold dict [] (fun lemmata lemma forms ->
582   - if Rules.validate_entry lemma forms then lemma :: lemmata else lemmata)
583   -
584 536 let find_tags_lemata_noun path filename =
585 537 let dict = load_tab (path ^ filename) in
586 538 let dict = map_of_tab dict in
... ... @@ -591,19 +543,6 @@ let find_tags_lemata_noun path filename =
591 543 String.concat " " (Xlist.map l (fun (k,v) -> k ^ "=" ^ v))
592 544 )))))
593 545  
594   -let find_validated_lemata_verb2 dict =
595   - let dict = map_of_tab dict in
596   - StringMap.fold dict [] (fun lemmata lemma forms ->
597   - if Rules.validate_entry lemma forms then lemma :: lemmata else lemmata)
598   -
599   -let find_validated_lemata_verb path filename =
600   - let interp_sel = load_interp_sel "data/interps.tab" in
601   - let dict = load_tab (path ^ filename) in
602   - let dict = split_into_groups interp_sel dict in
603   - if StringMap.size dict <> 2 then failwith ("generate_verb_rules: " ^
604   - String.concat " " (StringMap.fold dict [] (fun l s _ -> s :: l))) else
605   - find_validated_lemata_verb2 (StringMap.find dict "verb")
606   -
607 546 let find_tags_lemata_verb2 dict =
608 547 let dict = map_of_tab dict in
609 548 StringMap.iter dict (fun lemma forms ->
... ... @@ -632,15 +571,6 @@ let _ =
632 571 ()
633 572  
634 573 let _ =
635   -(* let l = find_validated_lemata_adj morfeusz_path ("odm_adj_" ^ sgjp_filename) in *)
636   -(* let l = find_validated_lemata_noun morfeusz_path ("odm_noun_" ^ sgjp_filename) in *)
637   -(* let l = find_validated_lemata_verb morfeusz_path ("verb_" ^ sgjp_filename) in *)
638   -(* let l = find_validated_lemata_noun "data/" "nouns_ex.tab" in *)
639   -(* let l = find_validated_lemata_verb "data/" "verbs_ex.tab" in *)
640   -(* Xlist.iter l print_endline; *)
641   - ()
642   -
643   -let _ =
644 574 (* find_tags_lemata_verb "data/" "verbs_ex.tab"; *)
645 575 (* find_tags_lemata_noun "data/" "nouns_ex.tab"; *)
646 576 ()
... ... @@ -649,23 +579,6 @@ let _ =
649 579 (* Rules.print "results/rules/"; *)
650 580 (* Rules.print_compound_rules "results/compounds.dic" (Rules.create_rules ()); *)
651 581 ()
652   -(* FIXME
653   -let find_not_parsed_entries path filename out_filename =
654   - let rules = Rules.create_char_trees (Rules.create_rules ()) in
655   - (* let interp_sel = load_interp_sel "data/interps.tab" in *)
656   - let dict = load_tab (path ^ filename) in
657   - let not_parsed = Xlist.fold dict [] (fun not_parsed (orth,lemma,interp) ->
658   - (* printf "E %s\t%s\t%s\n" orth lemma interp; *)
659   - let simple_lemma = Rules.simplify_lemma lemma in
660   - let forms = Rules.CharTrees.find rules orth in
661   - (* printf "S %d\n" (Xlist.size forms); *)
662   - let forms = Xlist.fold forms [] (fun forms (stem,rule) ->
663   - (* printf "R %s\t%s\n" stem (Rules.string_of_rule rule); *)
664   - if stem ^ rule.Rules.set = simple_lemma then (stem,rule) :: forms else forms) in
665   - if forms = [] then (orth,lemma,interp) :: not_parsed else not_parsed) in
666   - File.file_out out_filename (fun file ->
667   - Xlist.iter not_parsed (fun (orth,lemma,interp) ->
668   - fprintf file "%s\t%s\t%s\n" orth lemma interp))
669 582  
670 583 let interp_rule_string tags interp =
671 584 let tags = Xlist.sort tags compare_tag in
... ... @@ -699,14 +612,9 @@ let print_interp_rules con_flag group_flag lemma_flag path filename out_filename
699 612 else interp_rules)) in
700 613 File.file_out out_filename (fun file ->
701 614 StringMap.iter interp_rules (fun k (q,l) ->
702   - fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) *)
  615 + fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l)))
703 616  
704 617 let _ =
705   - (* find_not_parsed_entries morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/not_parsed_odm_adj.tab"; *)
706   - (* find_not_parsed_entries "results/" "not_parsed_odm_adj.tab" "results/not_parsed_odm_adj2.tab"; *)
707   - (* find_not_parsed_entries morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/not_parsed_odm_noun.tab"; *)
708   - (* find_not_parsed_entries morfeusz_path ("adv_" ^ sgjp_filename) "results/not_parsed_adv.tab";*)
709   - (* find_not_parsed_entries morfeusz_path ("verb_" ^ sgjp_filename) "results/not_parsed_verb.tab"; *)
710 618 (* print_interp_rules true true true morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/interp_rules_odm_adj.tab"; *)
711 619 (* print_interp_rules false true true morfeusz_path ("odm_adj_" ^ sgjp_filename) "results/interp_rules_odm_adj2.tab"; *)
712 620 (* print_interp_rules true true true morfeusz_path ("adv_" ^ sgjp_filename) "results/interp_rules_adv.tab"; *)
... ... @@ -715,78 +623,4 @@ let _ =
715 623 (* print_interp_rules true true true morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/interp_rules_odm_noun.tab";
716 624 print_interp_rules true true false morfeusz_path ("odm_noun_" ^ sgjp_filename) "results/interp_rules_odm_noun2.tab"; *)
717 625 ()
718   -
719   -(***
720   -let expand_tags tags =
721   - if tags = "" then [] else
722   - List.flatten (Xlist.map (Xstring.split "|" tags) (fun tags ->
723   - let tags = Xlist.map (Xstring.split ":" tags) (Xstring.split "\\.") in
724   - Xlist.map (Xlist.multiply_list tags) (String.concat ":")))
725   -
726   -let prepare_rules l =
727   - Xlist.fold l [] (fun rules rule_set_name ->
728   - let rule_set = StringMap.find rule_map rule_set_name in
729   - Xlist.fold rule_set rules (fun rules (alternation_name, sufix, tags) ->
730   - let alternation = StringMap.find alternation_map alternation_name in
731   - Xlist.fold alternation rules (fun rules (a,b) ->
732   - (a ^ sufix, b, expand_tags tags) :: rules)))
733   -
734   -let prepare_rules_simple l =
735   - Xlist.fold l [] (fun rules rule_set_name ->
736   - let rule_set = StringMap.find rule_map rule_set_name in
737   - Xlist.fold rule_set rules (fun rules (alternation_name, sufix, tags) ->
738   - let alternation = StringMap.find alternation_map alternation_name in
739   - Xlist.fold alternation rules (fun rules (a,b) ->
740   - (a ^ sufix, b, [tags]) :: rules)))
741   -
742   -let rules_adj_flex = prepare_rules_simple ["ADJ-FLEX"]
743   -let rules_adj_lemma = prepare_rules ["ADJ-LEMMA"]
744   -
745   -let rules_a = prepare_rules ["NOUN-FLEX-GENERAL";"NOUN-FLEX-A"]
746   -let rules_noun_as_adj = prepare_rules ["NOUN-FLEX-GENERAL";"NOUN-ADJ-FLEX"]
747   -let rules_noun_lemma = prepare_rules ["NOUN-LEMMA"]
748   -
749   -let is_applicable_rule (a,_,_) s = check_sufix a s
750   -
751   -let apply_rule (a,b,_) s =
752   - (cut_sufix a s) ^ b
753   -
754   -let match_interp (_,_,l) s =
755   - Xlist.mem l s
756   -
757   -let get_interps (_,_,l) = l
758   -
759   -let apply_rules rules s =
760   - Xlist.fold rules [] (fun l rule ->
761   - if is_applicable_rule rule s then
762   - (apply_rule rule s, get_interps rule) :: l
763   - else l)
764   -
765   -let check_inflexion rules stem interps =
766   - StringMap.fold interps true (fun b interp orths ->
767   - Xlist.fold orths b (fun b orth ->
768   - let c = Xlist.fold rules false (fun c rule ->
769   - if is_applicable_rule rule orth && match_interp rule interp then
770   - if apply_rule rule orth = stem then true else c
771   - else c) in
772   - if c then b else false))
773   -
774   -let has_known_inflexion_noun stem interps =
775   - let b1 = check_inflexion rules_a stem interps in
776   - let b2 = check_inflexion rules_noun_as_adj stem interps in
777   - b1 || b2
778   -
779   -let has_known_inflexion_adj stem interps =
780   - let b = check_inflexion rules_adj_flex stem interps in
781   - b
782   -
783   -let select_inflexion rules stem interps =
784   - StringMap.fold interps StringMap.empty (fun interps interp orths ->
785   - let orths = Xlist.fold orths [] (fun orths orth ->
786   - let c = Xlist.fold rules false (fun c rule ->
787   - if is_applicable_rule rule orth && match_interp rule interp then
788   - if apply_rule rule orth = stem then true else c
789   - else c) in
790   - if c then orths else orth :: orths) in
791   - if orths = [] then interps else StringMap.add interps interp orths)
792   -***)
  626 +*)
... ...
guesser/makefile
... ... @@ -6,7 +6,7 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa
7 7  
8 8 all:
9   - $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) rules.ml stem.ml ruleGenerator.ml generate.ml
  9 + $(OCAMLOPT) -o generate $(OCAMLOPTFLAGS) types.ml rules.ml stem.ml ruleGenerator.ml dict.ml generate.ml
10 10  
11 11 lib:
12 12 $(OCAMLOPT) -linkall -a -o inflexion.cmxa $(INCLUDES) $(MODS3)
... ...
guesser/ruleGenerator.ml
1 1 open Xstd
2 2 open Printf
  3 +open Types
3 4  
4 5 let alternation_map = Rules.alternation_map
5 6  
... ... @@ -163,13 +164,13 @@ let generate_rule stem stem_pref orth =
163 164 let c,f = rule_code (a,b) in
164 165 if f then "\t" ^ c else sprintf "%s\t%s\t%s" c a b
165 166  
166   -let rec classify_entry lemma stem forms = function
  167 +let rec classify_entry entry = function
167 168 (class_interp,suf,cl) :: class_sel ->
168   - let l = Xlist.fold forms [] (fun l (orth,interp) ->
169   - if interp = class_interp then orth :: l else l) in
  169 + let l = Xlist.fold entry.forms [] (fun l form ->
  170 + if form.interp = class_interp then form.orth :: l else l) in
170 171 let b = Xlist.fold l false (fun b orth ->
171 172 if Xstring.check_sufix suf orth then true else b) in
172   - if b then cl else classify_entry lemma stem forms class_sel
  173 + if b then cl else classify_entry entry class_sel
173 174 (* let l = StringSet.to_list (Xlist.fold l StringSet.empty (fun set orth ->
174 175 if check_prefix stem orth then
175 176 StringSet.add set (cut_prefix stem orth)
... ... @@ -210,25 +211,21 @@ let entry_classes =
210 211 "subst:sg:nom:n2","um","UM";
211 212 ]
212 213  
213   -let generate_rules_entry cat rules lemma forms =
214   - let stem = Stem.generate_stem cat lemma forms in
215   - let stem_pref = Stem.cut_stem_sufix stem in
216   - let cl = classify_entry lemma stem forms entry_classes in
217   - let forms = Rules.select_not_validated lemma forms in
218   - Xlist.fold forms rules (fun rules (orth,interp) ->
219   - let rule = cl ^ "\t" ^ generate_rule stem stem_pref orth in
220   - let rules2 = try StringMap.find rules interp with Not_found -> StringMap.empty in
221   - let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
222   - StringMap.add rules interp rules2)
  214 +let generate_rules_entry rules entry =
  215 + let stem_pref = Stem.cut_stem_sufix entry.stem in
  216 + let cl = classify_entry entry entry_classes in
  217 + Xlist.fold entry.forms rules (fun rules form ->
  218 + let rule = cl ^ "\t" ^ generate_rule entry.stem stem_pref form.orth in
  219 + let rules2 = try StringMap.find rules form.interp with Not_found -> StringMap.empty in
  220 + let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
  221 + StringMap.add rules form.interp rules2)
223 222  
224   -let generate_rules_com_entry cat rules lemma forms =
225   - let stem = Stem.generate_stem cat lemma forms in
226   - let stem_pref = Stem.cut_stem_sufix stem in
227   -(* let cl = classify_entry lemma stem forms entry_classes in *)
228   - let forms = Rules.select_not_validated lemma forms in
229   - Xlist.fold forms rules (fun rules (orth,interp) ->
230   - if not (Xstring.check_sufix ":com" interp) then rules else
231   - let rule = "\t" ^ generate_rule stem stem_pref orth in
232   - let rules2 = try StringMap.find rules interp with Not_found -> StringMap.empty in
233   - let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
234   - StringMap.add rules interp rules2)
  223 +let generate_rules_com_entry rules entry =
  224 + let stem_pref = Stem.cut_stem_sufix entry.stem in
  225 +(* let cl = classify_entry entry entry_classes in *)
  226 + Xlist.fold entry.forms rules (fun rules form ->
  227 + if not (Xstring.check_sufix ":com" form.interp) then rules else
  228 + let rule = "\t" ^ generate_rule entry.stem stem_pref form.orth in
  229 + let rules2 = try StringMap.find rules form.interp with Not_found -> StringMap.empty in
  230 + let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
  231 + StringMap.add rules form.interp rules2)
... ...
guesser/rules.ml
... ... @@ -282,7 +282,7 @@ let interp_compound_rule_trees = CharTrees.create interp_compound_rules
282 282 (**********************************************************************************************)
283 283 (**********************************************************************************************)
284 284  
285   -let rec select_tag tag rev = function
  285 +(* let rec select_tag tag rev = function
286 286 [] -> "", rev
287 287 | (k,v) :: l -> if k = tag then v, rev @ l else select_tag tag ((k,v) :: rev) l
288 288  
... ... @@ -375,4 +375,4 @@ let string_of_rule rule =
375 375 let print_compound_rules filename rules =
376 376 File.file_out filename (fun file ->
377 377 Xlist.iter rules (fun rule ->
378   - fprintf file "%s\n" (string_of_rule rule)))
  378 + fprintf file "%s\n" (string_of_rule rule))) *)
... ...
guesser/stem.ml
1 1 open Xstd
2 2 open Printf
  3 +open Types
3 4  
4 5 (* Zakładam, że zbiór form należy do jednego leksemu *)
5 6  
... ... @@ -120,18 +121,18 @@ let simplify_lemma s =
120 121 | [s;_] -> s
121 122 | _ -> failwith "simplify_lemma"
122 123  
123   -let generate_stem cat lemma forms =
124   - let orth = simplify_lemma lemma in
125   - let lemma_stem_sel = try StringMap.find lemma_stem_sel cat with Not_found -> [] in
  124 +let generate_stem entry =
  125 + let orth = simplify_lemma entry.lemma in
  126 + let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in
126 127 let stems = Xlist.fold lemma_stem_sel StringMap.empty (fun stems sel ->
127 128 if is_applicable_sel sel orth then
128 129 StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
129 130 else stems) in
130   - let stems2 = Xlist.fold forms StringMap.empty (fun stems (orth,interp) ->
131   - let sels = try StringMap.find stem_sel interp with Not_found -> [] in
  131 + let stems2 = Xlist.fold entry.forms StringMap.empty (fun stems form ->
  132 + let sels = try StringMap.find stem_sel form.interp with Not_found -> [] in
132 133 Xlist.fold sels stems (fun stems sel ->
133   - if is_applicable_sel sel orth then
134   - StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
  134 + if is_applicable_sel sel form.orth then
  135 + StringMap.add_inc stems (apply_sel sel form.orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
135 136 else stems)) in
136 137 let stems = if StringMap.is_empty stems then stems2 else stems in
137 138 let stems,_ = StringMap.fold stems ([],max_int) (fun (stems,priority) stem p ->
... ... @@ -139,12 +140,12 @@ let generate_stem cat lemma forms =
139 140 if p > priority then stems,priority else
140 141 stem :: stems, priority) in
141 142 match stems with
142   - [] -> (*print_endline ("stem not found for " ^ lemma);
143   - Xlist.iter forms (fun (orth,interp) -> printf " %s\t%s\n" orth interp);*)
  143 + [] -> (*print_endline ("stem not found for " ^ entry.lemma);
  144 + Xlist.iter entry.forms (fun (form.orth,form.interp) -> printf " %s\t%s\n" form.orth form.interp);*)
144 145 ""
145 146 | [s] -> s
146   - | l -> print_endline ("many stems found for " ^ lemma ^ ": " ^ String.concat " " l); ""
147   - (*printf "\"%s\"; " lemma; ""*)
  147 + | l -> print_endline ("many stems found for " ^ entry.lemma ^ ": " ^ String.concat " " l); ""
  148 + (*printf "\"%s\"; " entry.lemma; ""*)
148 149  
149 150 let rec merge_digraph = function
150 151 [] -> []
... ... @@ -164,13 +165,6 @@ let rec merge_digraph = function
164 165 | "q" :: "u" :: l -> "qu" :: (merge_digraph l)
165 166 | s :: l -> s :: (merge_digraph l)
166 167  
167   -(*let text_to_chars s =
168   - (try UTF8.validate s with UTF8.Malformed_code -> failwith ("Invalid UTF8 string: " ^ s));
169   - let r = ref [] in
170   - UTF8.iter (fun c ->
171   - r := (UTF8.init 1 (fun _ -> c)) :: (!r)) s;
172   - merge_digraph (List.rev (!r))*)
173   -
174 168 let cut_stem_sufix s =
175 169 let l = Xunicode.utf8_chars_of_utf8_string (*text_to_chars*) s in
176 170 let l = match List.rev l with
... ...
guesser/types.ml 0 → 100644
  1 +type form = {orth: string; interp: string; freq: int; genre: string; validated: bool}
  2 +type entry = {lemma: string; cat: string; forms: form list; proper_type: string; ndm: bool; stem: string}
... ...