dict.ml 30.7 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582
open Xstd
open Printf
open Types

let get_form e =
  match e.forms with
    [form] -> form
  | _ -> failwith "get_form"

let load_tab filename =
  File.load_tab filename (function
      orth :: lemma :: interp :: _ ->
        {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type="";
         ndm=false; stem=""}
    | line -> failwith ("load_tab: " ^ (String.concat "\t" line)))

let load_tab_full filename =
  File.load_tab filename (function
      [orth; lemma; interp] ->
        {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type="";
         ndm=false; stem=""}
    | [orth; lemma; interp; proper_type] ->
        {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=""; validated=false}]; proper_type=proper_type;
         ndm=false; stem=""}
    | [orth; lemma; interp; proper_type; genre] ->
        {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=1; genre=genre; validated=false}]; proper_type=proper_type;
         ndm=false; stem=""}
    | line -> failwith ("load_tab_full: " ^ (String.concat "\t" line)))

let load_lu dict id path =
  let filename = path ^ "morf_rel_" ^ string_of_int id ^ "_lu.tab" in
  File.fold_tab filename dict (fun dict -> function
      [lemma1; lemma2] ->
        {lemma1=lemma1; lemma2=lemma2; rel_id=id; lu_stem=""; lu_validated=false;validated1=false;validated2=false} :: dict
    | line -> failwith ("load_lu: " ^ (String.concat "\t" line)))

let pos = StringSet.of_list [
  "subst";"adj";"adv";"interp";"num";"xxx";"prep";"fin";"praet";"qub";"inf";"interj";
  "brev";"numcol";"ppas";"pact";"adja";"conj";"ger";"pcon";"pant";"comp";"depr";
  "adjp";"imps";"impt";"pred";"bedzie";"burk";"aglt";"ppron12";"ppron3";"adjc";
  "winien";"siebie";"numcomp"
  ]

let rec find_pos rev = function
    s :: l -> if StringSet.mem pos s then List.rev rev, s :: l else find_pos (s :: rev) l
  | [] -> failwith "find_pos"

let split_lemma_interp s =
  let l = Xstring.split_delim ":" s in
  let lemma,interp = find_pos [List.hd l] (List.tl l) in
  String.concat ":" lemma, String.concat ":" interp

let rec remove_empties = function
    "" :: l -> remove_empties l
  | l -> l

let split_freq_orth s =
  match remove_empties (Xstring.split " " s) with
    freq :: l -> int_of_string freq, String.concat " " l
  | _ -> failwith "split_freq_orth"

let load_freq_tab filename =
  File.load_tab filename (function
      [freq_orth; lemma_interp] ->
        let freq,orth = split_freq_orth freq_orth in
        let lemma,interp = split_lemma_interp lemma_interp  in
        {lemma=lemma; cat=""; forms=[{orth=orth; interp=interp; freq=freq; genre=""; validated=false}]; proper_type="";
         ndm=false; stem=""}
    | line -> failwith ("load_freq_tab: " ^ (String.concat "\t" line)))

let proper_type_selector e = e.proper_type
let genre_selector e = (get_form e).genre
let interp_selector e = (get_form e).interp
let freq_selector e = (get_form e).freq

let print_quantities out_filename selector dict =
  let qmap = Xlist.fold dict StringQMap.empty (fun qmap entry ->
    StringQMap.add qmap (selector entry)) in
  File.file_out out_filename (fun file ->
    StringQMap.iter qmap (fun k v ->
      fprintf file "%6d\t%s\n" v k))

(**********************************************************************************)

let load_dict_as_set filename =
  let l = load_tab filename in
  List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp]))))

let load_dict_as_set_full filename =
  let l = load_tab_full filename in
  List.sort compare (StringSet.to_list (Xlist.fold l StringSet.empty (fun set entry ->
    let form = get_form entry in
    StringSet.add set (String.concat "\t" [form.orth;entry.lemma;form.interp;entry.proper_type;form.genre]))))

let rec compare_dicts_rec file = function
    [],[] -> ()
  | [],b :: lb -> fprintf file "> %s\n" b; compare_dicts_rec file ([],lb)
  | a :: la,[] -> fprintf file "< %s\n" a; compare_dicts_rec file (la,[])
  | a :: la, b :: lb ->
       if a = b then compare_dicts_rec file (la,lb) else
       if a < b then (fprintf file "< %s\n" a; compare_dicts_rec file (la,b :: lb)) else
       (fprintf file "> %s\n" b; compare_dicts_rec file (a :: la,lb))

let compare_dicts filename1 filename2 filename_out =
  let dict1 = load_dict_as_set filename1 in
  let dict2 = load_dict_as_set filename2 in
  File.file_out filename_out (fun file ->
    compare_dicts_rec file (dict1,dict2))

let compare_dicts_full filename1 filename2 filename_out =
  let dict1 = load_dict_as_set_full filename1 in
  let dict2 = load_dict_as_set_full filename2 in
  File.file_out filename_out (fun file ->
    compare_dicts_rec file (dict1,dict2))

(**********************************************************************************)

let find_entry_cat entry =
  let form = get_form entry in
  let cat,tags = match Xstring.split ":" form.interp with
      cat :: tags -> cat,tags
    | _ -> failwith ("find_entry_type: " ^ form.interp) in
  if cat = "subst" || cat = "depr" then "noun" else
  if cat = "adj" || cat = "adja"|| cat = "adjc"|| cat = "adjp" then "adj" else
  if cat = "adv" then "adv" else
  if cat = "inf" || cat = "praet"|| cat = "fin" || cat = "ppas" || cat = "pact" || cat = "pacta" ||
     cat = "impt" || cat = "imps" || cat = "pcon" || cat = "pant" || cat = "ger" || cat = "" then "verb" else
  if cat = "bedzie" || cat = "pred"|| cat = "prep" || cat = "num" || cat = "aglt" || cat = "winien" ||
     cat = "qub" || cat = "brev" || cat = "comp" || cat = "interj" || cat = "burk" ||
     cat = "conj" || cat = "ppron12" || cat = "ppron3" || cat = "numcomp" || cat = "" then "other" else
  if cat = "cond" then "cond" else
  failwith ("find_entry_cat: " ^ cat)

let assign_entry_cat dict =
  Xlist.rev_map dict (fun entry ->
    {entry with cat = find_entry_cat entry})

let split_dict in_path filename out_path =
  let dict = load_tab (in_path ^ filename) in
  let dict = List.rev (assign_entry_cat dict) in
  let filename = if Xstring.check_sufix ".gz" filename then
    Xstring.cut_sufix ".gz" filename else filename in
  File.file_out (out_path ^ "noun_" ^ filename) (fun noun_file ->
  File.file_out (out_path ^ "adj_" ^ filename) (fun adj_file ->
  File.file_out (out_path ^ "adv_" ^ filename) (fun adv_file ->
  File.file_out (out_path ^ "verb_" ^ filename) (fun verb_file ->
  File.file_out (out_path ^ "other_" ^ filename) (fun other_file ->
  Xlist.iter dict (fun entry ->
    let form = get_form entry in
    try
      let file = match entry.cat with
          "noun" -> noun_file
        | "adj" -> adj_file
        | "adv" -> adv_file
        | "verb" -> verb_file
        | "other" -> other_file
        | "cond" -> raise Not_found
        | _ -> failwith "split_dict" in
      fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp
    with Not_found -> ()))))))

let merge_entries dict =
  let dict = assign_entry_cat dict in
  let map = Xlist.fold dict StringMap.empty (fun map entry ->
    let form = get_form entry in
    let key =
      if entry.cat = "noun" then
        let gender = match Xstring.split ":" form.interp with
            ["depr";_;_;"m2"] -> "m1"
          | "depr" :: _ -> failwith ("merge_entries: " ^ form.interp)
          | [_;_;_;gender] -> gender
          | [_;_;_;gender;col] -> gender ^ ":" ^ col
          | _ -> failwith ("merge_entries: " ^ form.interp) in
        entry.lemma ^ "|" ^ entry.cat ^ "|" ^ gender
      else entry.lemma ^ "|" ^ entry.cat in
    StringMap.add_inc map key entry (fun e ->
      if entry.proper_type <> e.proper_type then
        failwith ("merge_entries: " ^ key ^ " " ^ entry.proper_type ^ " " ^ e.proper_type) else
      {e with forms = form :: e.forms})) in
  StringMap.fold map [] (fun dict _ e -> e :: dict)

let remove_cat cat dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.cat = cat then dict
    else entry :: dict)

(**********************************************************************************)

let mark_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.cat <> "noun" &&  entry.cat <> "adj" then entry :: dict else
    let map = Xlist.fold entry.forms StringMap.empty (fun map form ->
      StringMap.add_inc map form.interp (StringSet.singleton form.orth) (fun set -> StringSet.add set form.orth)) in
    let qmap = StringMap.fold map StringQMap.empty (fun qmap interp orths ->
      StringSet.fold orths qmap StringQMap.add) in
    let n = StringMap.size map in
    let found = StringQMap.fold qmap [] (fun found orth v ->
      if v = n then orth :: found else found) in
    match found with
      [] -> entry :: dict
    | [orth] ->
        let ndm,odm = Xlist.fold entry.forms ([],[]) (fun (ndm,odm) form ->
          if form.orth = orth then form :: ndm, odm else ndm, form :: odm) in
        let dict = {entry with forms=odm} :: dict in
        {entry with forms=ndm; ndm=true} :: dict
    | _ -> failwith ("mark_ndm: " ^ (String.concat " " found)))

let print_ndm filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      if entry.ndm then
        let orth = (List.hd entry.forms).orth in
        fprintf file "%s\t%s\t%s\n" orth entry.lemma entry.cat))

let remove_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.ndm then dict
    else entry :: dict)

let remove_not_ndm dict =
  Xlist.fold dict [] (fun dict entry ->
    if not entry.ndm then dict
    else entry :: dict)


let kolwiek_lemmas = StringSet.of_list [
  (* adj *)
  "czyjkolwiek"; "czyjล›"; "czyjลผe"; "jakiciล›"; "jakikolwiek"; "jakisi"; "jakiล›"; "jakiล›ci";
  "jakiล›ciล›"; "jakiล›kolwiek"; "jakiลผ"; "jakiลผkolwiek"; "jakowyล›"; "kijลผe"; "kiลผ"; "ktรณrykolwiek";
  "ktรณryล›"; "ktรณryลผ"; "ktรณryลผkolwiek"; "niejakiล›"; "takiลผ"; "takowyลผ"; "tenลผe"; "tyliลผ"; "รณwลผe";
  (* noun *)
  "cokolwiek:s"; "coล›kolwiek"; "cรณลผkolwiek"; "ktokolwiek"; "ktoล›kolwiek"; "ktรณลผkolwiek";
  "cociล›"; "cosi"; "cosik"; "cosiล›"; "coล›:s"; "coล›ciล›"; "coลผe"; "cรณลผ";
  "ktoล›:s2"; "ktรณลผ";
  (* adv *)
  "jakkolwiek"; "jakoล›"; "maล‚oลผ"; "niejakkolwiek"; "niejakoล›"; (*"niemalลผe";*) ]

let kolwiek_suffixes = [
  "ลผkolwiek"; "ลผekolwiek"; "ล›kolwiek"; "kolwiek"; "ลผ"; "ลผe"; "ล›ciล›"; "ciล›"; "ล›"; "ล›ci"; "sik"; "si"]

let find_kolwiek_suffixes dict =
  Xlist.rev_map dict (fun entry ->
    if StringSet.mem kolwiek_lemmas entry.lemma then
      {entry with forms=Xlist.map entry.forms (fun form ->
          {form with orth=Xlist.fold kolwiek_suffixes form.orth (fun orth kolwiek_suf ->
            if Xstring.check_sufix kolwiek_suf orth then
              Xstring.cut_sufix kolwiek_suf orth
            else orth)})}
    else entry)

let exceptional_lemmata = StringSet.of_list ([
  (* wiele stemรณw *)
  "Apollo"; "Aujeszky"; "Biaล‚ystok"; "Gรณzd"; "Krasnystaw"; "Rรณลผanystok"; "Wielkanoc"; "biaล‚agล‚owa";
  "deszcz"; "imฤ‡pan"; "iล›ciec"; "otrzฤ…s"; "rzeczpospolita"; "wilczeล‚yko"; "woleoczko";

  "prapraojciec"; "praojciec"; "ojciec"; "wspรณล‚brat"; "spรณล‚brat"; "pรณล‚brat"; "brat";
  "pรณล‚czล‚owiek"; "przedczล‚owiek"; "praczล‚owiek"; "nadczล‚owiek"; "git-czล‚owiek"; "czล‚owiek";
  "pรณล‚dziecko"; "+lecie"; "zimoziele"; "ziele"; "trรณjziele"; "nasienie";
  "ksiฤ…dz"; "dech"; "tydzieล„"; "roczek:s2"; "rok:s1"; "przechrzest"; "chrzest";
  "dziecko"; "ucho:s2"; "oko:s2"; "czeล›ฤ‡:s"; "jo-jo"; "Zabล‚otce"; "tysiฤ…c:s1"; "pรณล‚miesiฤ…c"; "miesiฤ…c"; ""; ""; "";
  "Pia"; "rฤ™ka"; "czล‚ek"; "Kozak:s1"; "bรณg"; "psubrat"; "pieniฤ…dz"; ""; ""; ""; "";
  "kto"; "ktokolwiek"; "ktoล›:s2"; "ktoล›kolwiek"; "ktรณลผ"; "ktรณลผkolwiek"; "nikt"; "nic";
  "co:s"; "cociล›"; "cokolwiek:s"; "cosi"; "cosik"; "cosiล›"; "coล›:s"; "coล›ciล›"; "coล›kolwiek"; "coลผe"; "cรณลผ"; "cรณลผkolwiek";
  "niebiosa"; "Wล‚ochy:s1"; "Niemcy"; "Wฤ™gry"; "Austro-Wฤ™gry"; ""; ""; ""; ""; ""; ""; "";
  "zajฤ…c:s1"; "tysiฤ…c:s2"; "wszyscy"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; "OIT"; ""; ""; ""; ""; ""; ""; ""; "";
  "bliscy"; "ojcowie"; "teล›ciowie"; "ichmoล›ciowie"; "wujkowie"; "staruszkowie"; "pradziadkowie"; ""; ""; ""; ""; "";
  "maล‚ลผonkowie"; "kochankowie"; "dziadkowie"; "rozwiedzeni"; "nieliczni"; "chrzestni"; ""; ""; ""; ""; ""; "";
  "starzy"; "wasi"; "nasi"; "najmล‚odsi"; "dzisiejsi"; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "stajnia"; "kuchnia:s"; "suknia"; "minisuknia"; "gล‚รณwnia"; "gล‚ownia"; "dลบwignia"; ""; ""; ""; "";
  "workowiล›nia"; "wiล›nia"; "soล›nia"; "laurowiล›nia"; "studnia"; "idea"; "imienie"; ""; ""; ""; ""; "";
  "makao"; "macao"; "kakao"; "Akademgorodok"; "yuppi"; "hippie"; "yuppie"; ""; ""; ""; ""; "";
  "Uj"; "PIT"; "ChAT"; "podczล‚owiek"; "nieczล‚owiek"; "czeล›ฤ‡"; "ktoล›"; "ktosik"; ""; ""; ""; "";
  "+รณwna"; "+yna"; "+ina"; "+anka"; "+owa"; "co"; "cokolwiek"; "coล›"; "coล›tam"; ""; ""; "";
  "zajฤ…c"; "tysiฤ…c"; "rok"; "roczek"; "oko"; "ucho"; "Wล‚ochy"; "niebiosy"; "wici"; ""; ""; "";
  "Andrรฉ"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";

  "zล‚y:a"; "dobry:a"; "przymaล‚y"; "maล‚y:a"; "duลผy"; "wielki:a";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "zwaฤ‡"; "wiedzieฤ‡"; "pรณjล›ฤ‡"; "przejล›ฤ‡"; "dojล›ฤ‡"; "zsiฤ…ล›ฤ‡"; "iล›ฤ‡"; ""; ""; ""; ""; "";
  "byฤ‡"; "zasล‚oniฤ‡"; "sล‚oniฤ‡"; "przysล‚oniฤ‡"; "przesล‚oniฤ‡"; "osล‚oniฤ‡"; "odsล‚oniฤ‡"; ""; ""; ""; ""; "";

  (*
  (* bล‚ฤ…d w sล‚owniku *)
  "รณwลผe";
  (* wiele stemรณw *)
  "twรณj:a"; "swรณj"; "mรณj:a"; "wszystek";
  (* obocznoล›ci w stemie *)
  "co:s"; "cociล›"; "cokolwiek:s"; "cosi"; "cosik"; "cosiล›"; "coล›:s"; "coล›ciล›"; "coล›kolwiek"; "coลผe"; "cรณลผ"; "cรณลผkolwiek";
  "kto"; "ktokolwiek"; "ktoล›:s2"; "ktoล›kolwiek"; "ktรณลผ"; "ktรณลผkolwiek"; "nikt"; "nic";
  "Angel"; "Apollo"; "Biaล‚ystok"; "Bober"; "Dziฤ™giel"; "Engel"; "Goล‚ฤ…b:s2"; "Gรณzd"; "Hendel"; "Herschel"; "Jastrzฤ…b";
  "Kodrฤ…b:s2"; "Kozioล‚"; "Krasnystaw"; "Majcher"; "Ob"; "Omulew"; "Orzeล‚"; "Rรณลผanystok"; "Schuster"; "Stฤ™pieล„"; "Sล‚onim";
  "Wielkanoc"; "achtel"; "archiprezbiter"; "arcydziฤ™giel"; "bedel"; "ber"; "biaล‚agล‚owa"; "biaล‚odrzew"; "ceter"; "deszcz";
  "drama"; "dziรณb:s1"; "dziฤ™giel"; "dลผemper"; "falafel"; "grubodziรณb"; "harbajtel"; "harbejtel"; "harmider"; "imฤ‡pan";
  "iล›ciec"; "jarzฤ…b:s2"; "kierdel"; "kimel"; "kiper:s1"; "klaster"; "kliper"; "kosodrzew"; "kureล„"; "manczester";
  "nadpiersieล„"; "osep"; "otrzฤ…s"; "pedel"; "piksel"; "podpiersieล„"; "podziem"; "prezbiter"; "protokรณล‚"; "przedpiersieล„";
  "ratel"; "rondel:s2"; "rozpiรณr:s1"; "rozpiรณr:s2"; "rzeczpospolita"; "rzep:s2"; "rzepieล„"; "rzewieล„"; "rฤ…b"; "sosrฤ…b";
  "srebrnodrzew"; "swฤ…d"; "szmermel"; "szpiegierz"; "ulster"; "wab:s2"; "wermiszel"; "wilczeล‚yko"; "woleoczko"; "wล‚osieล„:s2";
  "zew"; "zล‚otogล‚รณw"; "ล›wireล„"; "ลบreb"; "ลผรณล‚todziรณb";
  "czล‚owiek"; "pรณล‚czล‚owiek"; "przedczล‚owiek"; "praczล‚owiek"; "nadczล‚owiek"; "git-czล‚owiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "przechrzest"; "chrzest"; "pรณล‚dziecko"; "roczek:s2"; "rok:s1"; "tydzieล„"; ""; ""; ""; ""; ""; "";
  (* obocznoล›ci w odmianie *)
  "niekaลผdy"; "kaลผdy"; "niektรณry:a"; "ktรณry"; "tenลผe"; "ten"; "tamten"; "kijลผe";
  "ucho:s2"; "dziecko"; "oko:s2"; "imiฤ™"; "nozdrze";
  "ZHR"; "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART"; "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT";
  "NOT"; "LOT"; "KRRiT"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "byฤ‡"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  (* pozostawione *)
  "czyjลผe"; "czyjล›"; "czyjkolwiek"; "kiลผ"; "รณw"; "ow"; "on:a"; "ki";
  "Pia"; "jo-jo"; "+lecie"; "";
  "zagrzฤ…ลบฤ‡"; "zrzeฤ‡";
  (* niepeล‚ny paradygmat *)
  "zrรณล›ฤ‡"; "zlฤ…c"; "zarรณล›ฤ‡"; "zaprzฤ…c"; "zaprzysiฤ…c"; "zanieล›ฤ‡:v2"; "zalฤ…c"; "wzrรณล›ฤ‡"; "wyrรณล›ฤ‡"; "wyprzฤ…c"; "wyprzysiฤ…c";
  "rรณล›ฤ‡"; "sprzysiฤ…c"; "sprzฤ…c"; "ugrzฤ…ลบฤ‡"; "ulฤ…c"; "upiec:v2"; "uprzฤ…c"; "urรณล›ฤ‡"; "wieล›ฤ‡:v2"; "wprzฤ…c"; "wrรณล›ฤ‡"; "wylฤ…c";
  "powieล›ฤ‡:v2"; "posiฤ…c"; "przelฤ…c"; "przeprzฤ…c"; "przerรณล›ฤ‡"; "przyprzฤ…c"; "przysiฤ…c"; "przyrรณล›ฤ‡"; "przฤ…c"; "pรณjล›ฤ‡:v2"; "rozprzฤ…c"; "rozrรณล›ฤ‡";
  "krzywoprzysiฤ…c"; "lฤ…c"; "narรณล›ฤ‡"; "obrรณล›ฤ‡"; "odprzysiฤ…c"; "odprzฤ…c"; "odrรณล›ฤ‡"; "oprzysiฤ…c"; "podrรณล›ฤ‡"; "pogrzฤ…ลบฤ‡"; "poprzysiฤ…c"; "porรณล›ฤ‡";
  "dojล›ฤ‡:v2"; "doprzฤ…c"; "dorรณล›ฤ‡"; "dosiฤ…c"; "grzฤ…ลบฤ‡"; "iล›ฤ‡:v2";
  (* wiele stemรณw *)
  "uwlec"; "wewlec"; "wlec"; "wwlec"; "wywlec"; "wyลผec"; "zawlec"; "zaลผec"; "zewlec"; "zwlec"; "zลผec"; "ลผec";
  "podwlec"; "podลผec"; "powlec:v1"; "powlec:v2"; "przeoblec"; "przewlec"; "przeลผec"; "przyoblec"; "przywlec"; "przyลผec"; "rozwlec"; "rozลผec";
  "dowlec"; "nawlec"; "oblec:v2"; "obwlec"; "odwlec"; "owlec"; "zeลผreฤ‡";
  (* inne *)
  "liลบฤ‡"; "iล›ฤ‡:v1"; "wyniล›ฤ‡"; "wynijล›ฤ‡"; "wyjล›ฤ‡"; "wniล›ฤ‡"; "wnijล›ฤ‡"; "wejล›ฤ‡"; "ujล›ฤ‡"; "rozejล›ฤ‡"; "pรณjล›ฤ‡:v1"; "przyjล›ฤ‡"; "przejล›ฤ‡:v2"; "przejล›ฤ‡:v1"; "podejล›ฤ‡"; "odejล›ฤ‡"; "obejล›ฤ‡:v2"; "obejล›ฤ‡:v1"; "najล›ฤ‡:v2"; "najล›ฤ‡:v1"; "nadejล›ฤ‡"; "dojล›ฤ‡:v1";
  "roztworzyฤ‡:v2"; "przetworzyฤ‡:v2"; "otworzyฤ‡";
  "zsiฤ…ล›ฤ‡:v2"; "zsiฤ…ล›ฤ‡:v1"; "zesiฤ…ล›ฤ‡"; "zasiฤ…ล›ฤ‡"; "wysiฤ…ล›ฤ‡"; "wspรณล‚posiฤ…ล›ฤ‡"; "wsiฤ…ล›ฤ‡"; "usiฤ…ล›ฤ‡"; "siฤ…ล›ฤ‡"; "rozsiฤ…ล›ฤ‡"; "przysiฤ…ล›ฤ‡"; "przesiฤ…ล›ฤ‡"; "powsiฤ…ล›ฤ‡"; "posiฤ…ล›ฤ‡"; "podsiฤ…ล›ฤ‡"; "osiฤ…ล›ฤ‡"; "obsiฤ…ล›ฤ‡"; "nasiฤ…ล›ฤ‡"; "dosiฤ…ล›ฤ‡";
  "ลบreฤ‡:v1"; "zniล›ฤ‡"; "znijล›ฤ‡"; "znajล›ฤ‡"; "zejล›ฤ‡"; "zejล›ฤ‡"; "zajล›ฤ‡:v2"; "zajล›ฤ‡:v1"; "wzniล›ฤ‡"; "wznijล›ฤ‡"; "wzejล›ฤ‡"
(*
   "moi"; "twoi";
  (*"AIDS"; "BGลป"; "BWZ"; "BZ";*) (*"Bandtkie";*) (*"CRZZ"; "FPลป";*) (*"Jokai"; "Jรณkai"; "Linde";*)(* "MSZ"; "MWGzZ"; *)
  (*"NSZ"; "OPZZ";*) "Radetzky"; "Tagore"; (*"UNZ"; "URz"; "WBZ"; "ZSZ"; "ZWZ"; "ZZ";*) "aids";
  "arcyksiฤ™ลผna"; "cornflakes"; "ksiฤ™ลผna"; (*"scrabble";*) "sms"; "teล›cina";
  "Wielkanoc"; "biaล‚agล‚owa"; "rzeczpospolita"; "imฤ‡pan";
  "Ob"; "podziem"; "Pia"; "woleoczko"; "wilczeล‚yko"; "jo-jo"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Omulew"; "drama"; (*"Kayah";*) "ratel"; "grubodziรณb"; "rozpiรณr:s1"; "ceter"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "DJ"; "FIFA"; (*"manicure"; "Greenpeace"; "Google";*) ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "wล‚osieล„:s2"; "deszcz"; "falafel"; "Krasnystaw";
  "Rรณลผanystok"; "Biaล‚ystok"; "ZHR"; "rzep:s2"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "IKEA"; "makao"; "macao"; "kakao"; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "dziecko"; "oko:s2"; "ucho:s2"; "pรณล‚dziecko"; "b-cia"; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "idea"; "rฤ™ka"; "czeล›ฤ‡:s"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "ABBA"; "UEFA"; "FAMA"; "SABENA"; "MENA"; "APA"; "NASA"; "ANSA";
  "NAFTA"; "LETTA"; "ETA"; "ELTA"; "EFTA"; "CEFTA";
  "WAT"; "VAT"; "PAT"; "FAT"; "DAT"; "PAGART";
  "PIT:s2"; "PIT:s1"; "OIT:s2"; "OIT:s1"; "CIT"; "NOT"; "LOT"; "KRRiT";
  "czล‚owiek"; "pรณล‚czล‚owiek"; "przedczล‚owiek"; "praczล‚owiek"; "nadczล‚owiek"; "git-czล‚owiek"; ""; ""; ""; ""; ""; ""; ""; "";
  "szwa"; "hawanna"; "butaforia"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "Skopie"; "Mathea"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  "poema:s1"; "klima:s1"; "dylema"; "dilemma"; "apoftegma"; "aksjoma"; ""; ""; ""; ""; ""; ""; ""; "";
  "burgrabia"; "gograbia"; "grabia"; "hrabia"; "margrabia"; "murgrabia"; "sฤ™dzia:s1"; "wicehrabia"; "wspรณล‚sฤ™dzia";
  "cieล›la"; "bibliopola"; "ล›wiszczypaล‚ka"; "ล›wircaล‚ka"; "ล›wierczaล‚ka"; "ล›cierciaล‚ka"; "tatka"; "sล‚uga:s1"; "stupajka:s1"; "stepka"; "starowinka:s2"; "skurczypaล‚ka"; "mฤ™ลผczyzna"; "klecha";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";
  ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "";*)
  ] (*@ File.load_lines "../morphology/data/obce.tab"*) (* @
    File.load_lines "../morphology/data/validated_adj.tab" @ File.load_lines "../morphology/data/validated_noun.tab" @
    File.load_lines "../morphology/data/validated_verb.tab" @ File.load_lines "../morphology/data/adv_nieodprzymiotnikowe.tab" *))

let remove_exceptional_lemmata dict =
  Xlist.fold dict [] (fun dict entry ->
    if StringSet.mem exceptional_lemmata entry.lemma then dict
    else entry :: dict)

let generate_stem dict =
  Xlist.rev_map dict (fun entry ->
    {entry with stem=
      if entry.ndm then (List.hd entry.forms).orth else
      if entry.cat = "noun" || entry.cat = "adj" || entry.cat = "adv" || entry.cat = "verb" then
        Stem.generate_stem entry
      else ""})

let generate_stem_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with lu_stem=Stem.generate_stem_lu entry.lemma1 entry.lemma2})

let lowercase_lu dict =
  Xlist.rev_map dict (fun entry ->
    {entry with
      lemma1=Xunicode.lowercase_utf8_string entry.lemma1;
      lemma2=Xunicode.lowercase_utf8_string entry.lemma2})

let validate rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if stem ^ rule.set = simple_lemma then (stem,rule) :: candidates else candidates) in
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})

let validate_lu rules dict =
  Xlist.rev_map dict (fun entry ->
    let candidates1 = Rules.CharTrees.find rules entry.lemma1 in
    let candidates2 = Rules.CharTrees.find rules entry.lemma2 in
    let b = Xlist.fold candidates1 false (fun b (stem1,rule1) ->
      Xlist.fold candidates2 b (fun b (stem2,rule2) ->
        (* Printf.printf "%s %s %s %s\n%!" stem1 stem2 (string_of_rule rule1) (string_of_rule rule1); *)
        if stem1 ^ rule1.set = stem2 ^ rule2.set then true else b)) in
    (* if b then print_endline "validated"; *)
    let b1 = Xlist.fold candidates1 false (fun b (stem1,rule1) -> if stem1 = entry.lu_stem then true else b) in
    let b2 = Xlist.fold candidates2 false (fun b (stem2,rule2) -> if stem2 = entry.lu_stem then true else b) in
    {entry with lu_validated=b; validated1=b1; validated2=b2})

let validate_interp rules dict =
  Xlist.rev_map dict (fun entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    let forms = Xlist.rev_map entry.forms (fun form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if rule.star = Ndm && not entry.ndm then candidates else
        if rule.star <> Ndm && entry.ndm then candidates else
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then {form with validated=false} else {form with validated=true}) in
    {entry with forms=forms})

let remove_validated_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then forms else form :: forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let remove_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then forms else form :: forms) in
    if forms = [] then dict else entry :: dict)

let remove_validated_lu dict =
  Xlist.fold dict [] (fun dict entry ->
    if entry.lu_validated then dict else entry :: dict)

let remove_not_validated_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then form :: forms else forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let remove_not_validated_entries dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if form.validated then form :: forms else forms) in
    if Xlist.size forms <> Xlist.size entry.forms then dict else entry :: dict)

let print filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      Xlist.iter entry.forms (fun form ->
        fprintf file "%s\t%s\t%s\n" form.orth entry.lemma form.interp)))

let print_lemmata filename dict =
  File.file_out filename (fun file ->
    Xlist.iter dict (fun entry ->
      fprintf file "%s\n" entry.lemma))

let remove_sup_neg_forms dict =
  Xlist.fold dict [] (fun dict entry ->
    let forms = Xlist.fold entry.forms [] (fun forms form ->
      if Xstring.check_sufix ":neg" form.interp || Xstring.check_sufix ":sup" form.interp then
        forms else form :: forms) in
    if forms = [] then dict else {entry with forms=forms} :: dict)

let generate_rules rules path filename rules_filename =
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
  let dict = find_kolwiek_suffixes dict in
  let dict = remove_exceptional_lemmata dict in
  let dict = generate_stem dict in
  let dict = validate rules dict in
  let dict = remove_validated_forms dict in
  let dict = remove_sup_neg_forms dict in
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.generate_rules_entry entry) rules (fun rules (key,rule) ->
      let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
      let rules2 = StringMap.add_inc rules2 rule (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l) in
      StringMap.add rules key rules2)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))

let generate_rules_lu rules id path rules_filename =
  let dict = load_lu [] id path in
  let dict = lowercase_lu dict in
  let dict = generate_stem_lu dict in
  let dict = validate_lu rules dict in
  let dict = remove_validated_lu dict in
  (* let dict = remove_sup_neg_forms dict in *)
  let rules = Xlist.fold dict StringMap.empty (fun rules entry ->
    Xlist.fold (RuleGenerator.generate_rules_lu_entry entry) rules (fun rules (key,rule,lemma) ->
      let rules2 = try StringMap.find rules key with Not_found -> StringMap.empty in
      let rules2 = StringMap.add_inc rules2 rule (1,[lemma]) (fun (q,l) -> q+1, if q < 20 then lemma :: l else l) in
      StringMap.add rules key rules2)) in
  File.file_out rules_filename (fun file ->
    StringMap.iter rules (fun interp rules2 ->
      fprintf file "\n@RULES %s\n" interp;
      StringMap.iter rules2 (fun rule (q,l) ->
        fprintf file "\t%s\t# %d %s\n" rule q (String.concat " " l))))

let rec get_first n l =
  if n = 0 || l = [] then [] else
  List.hd l :: (get_first (n-1) (List.tl l))

let generate_interp_rules rules interp_rules con_flag group_flag lemma_flag path filename rules_filename =
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = mark_ndm dict in (* FIXME: remove_ndm? *)
  let dict = find_kolwiek_suffixes dict in
  let dict = remove_exceptional_lemmata dict in
  let dict = generate_stem dict in
  let dict = validate_interp interp_rules dict in
  let dict = remove_validated_forms dict in
  let interp_rules = Xlist.fold dict StringMap.empty (fun interp_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms interp_rules (fun interp_rules form ->
      let candidates = RuleGenerator.generate_interp_rules rules con_flag group_flag lemma_flag simple_lemma form in
      Xlist.fold candidates interp_rules (fun interp_rules cand ->
        (* StringMap.add_inc interp_rules cand (1,[entry.lemma]) (fun (q,l) -> q+1, if q < 20 then entry.lemma :: l else l)))) in *)
        StringMap.add_inc interp_rules cand (StringSet.singleton entry.lemma) (fun set -> StringSet.add set entry.lemma)))) in
  File.file_out rules_filename (fun file ->
    StringMap.iter interp_rules (fun k set(*q,l*) ->
      (* fprintf file "\t%s\t# %d %s\n" k q (String.concat " " l))) *)
      if StringSet.size set > 1000 then fprintf file "\t%s\t# %d %s\n" k (StringSet.size set) (String.concat " " (get_first 20 (List.rev (StringSet.to_list set))))))

let generate_ndm_rules dict =
  let freq_rules = Xlist.fold dict Rules.RuleQMap.empty (fun freq_rules entry ->
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let rule = {id=""; freq=0; star=Ndm; pref=""; find=""; set=""; tags=[]; interp=form.interp} in
      Rules.RuleQMap.add freq_rules rule)) in
   fst (Rules.RuleQMap.fold freq_rules (Rules.RuleQMap.empty,1) (fun (freq_rules,i) rule freq ->
     Rules.RuleQMap.add_val freq_rules {rule with id = "N" ^ string_of_int i} freq, i+1))

let generate_rule_frequencies rules path filename rules_filename =
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = remove_cat "cond" dict in
  let dict = mark_ndm dict in
  let freq_rules = generate_ndm_rules (remove_not_ndm dict) in
  let dict = remove_ndm dict in
  let dict = remove_exceptional_lemmata dict in
  let dict = generate_stem dict in
  let freq_rules = Xlist.fold dict freq_rules (fun freq_rules entry ->
    let simple_lemma = Stem.simplify_lemma entry.lemma in
    Xlist.fold entry.forms freq_rules (fun freq_rules form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then freq_rules else Rules.RuleQMap.add freq_rules (snd (List.hd candidates)))) in
  File.file_out rules_filename (fun file ->
    Rules.RuleQMap.iter freq_rules (fun rule freq ->
      fprintf file "%s\n" (Rules.string_of_freq_rule {rule with freq=freq})))

let generate_stem_dict rules_filename path filename out_filename =
  let rules = Rules.load_freq_rules rules_filename in
  let rules = Rules.CharTrees.create rules in
  let dict = load_tab (path ^ filename) in
  let dict = merge_entries dict in
  let dict = remove_cat "cond" dict in
  let dict = mark_ndm dict in
  let stems = Xlist.fold dict StringMap.empty (fun stems entry ->
    let simple_lemma,lemma_suf = Stem.simplify_lemma_full entry.lemma in
    Xlist.fold entry.forms stems (fun stems form ->
      let candidates = Rules.CharTrees.find rules form.orth in
      let candidates = Xlist.fold candidates [] (fun candidates (stem,rule) ->
        if rule.star = Ndm && not entry.ndm then candidates else
        if rule.star <> Ndm && entry.ndm then candidates else
        if stem ^ rule.set = simple_lemma && form.interp = rule.interp then
          (stem,rule) :: candidates else candidates) in
      if candidates = [] then stems else
      let stem,rule = List.hd candidates in
      StringMap.add_inc stems (stem ^ "\t" ^ lemma_suf) [rule.id] (fun l -> rule.id :: l))) in
  File.file_out out_filename (fun file ->
    StringMap.iter stems (fun stem ids ->
      fprintf file "%s\t%s\n" stem (String.concat " " ids)))