freqListInterps.ml 23.9 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016. *)

open Xstd

(* utils *)

let uniq lst =
        let seen = Hashtbl.create (List.length lst) in
        List.filter (fun x -> let tmp = not (Hashtbl.mem seen x) in
                Hashtbl.replace seen x ();
                tmp) lst

let rec join delim lst =
        (* Concatenate list of strings into one string, delimited by delim *)
        if lst = [] then ""
        else List.hd lst ^ (if List.tl lst = [] then ""
                            else delim ^ join delim (List.tl lst))

let slice lst b e =
        (* Given a list, return a slice from b to e *)
        let rec slice_step lst b e accm =
        if lst = [] || b >= e then []
        else if e = 0 then accm
        else if b > 0 then slice_step (List.tl lst) (b-1) (e-1) accm
        else if b = 0 then
                slice_step (List.tl lst) (b-1) (e-1) (List.hd lst :: accm)
        else slice_step lst 0 e accm
        in
        List.rev (slice_step lst b e [])

let str_contains str frag =
        try (Str.search_forward (Str.regexp frag) str 0; true)
        with Not_found -> false

let longest_str lst =
        let rec longest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) > winlen 
                then longest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else longest_str_step (List.tl lst) win winlen
        in
        longest_str_step lst "" 0

let shortest_str lst =
        let rec shortest_str_step lst win winlen =
                if lst = [] then win
                else if String.length (List.hd lst) < winlen 
                then shortest_str_step (List.tl lst) (List.hd lst)
                                                (String.length (List.hd lst))
                else shortest_str_step (List.tl lst) win winlen
        in
        shortest_str_step lst "" 999999

let most_samechars lst patrn =
        (* Return a string from lst that shares of the same characters in its
         * beginning with patrn. If no characters are the same, return the 
         * patrn. *)
        let rec samechars s1 s2 cnt =
                if s1 = "" || s2 = "" then cnt
                else if String.get s1 0 = String.get s2 0
                then samechars (String.sub s1 0 ((String.length s1)-1))
                (String.sub s1 0 ((String.length s1)-1)) (cnt+1)
                else cnt
        in
        let rec most_samechars_step lst patrn win winscr =
                if lst = [] then win
                else let score = (samechars (List.hd lst) patrn 0) in
                if score > winscr 
                then most_samechars_step (List.tl lst) patrn (List.hd lst) score
                else most_samechars_step (List.tl lst) patrn win winscr
        in
        most_samechars_step lst patrn patrn 0

let prefer pfs lst =
        (* Return first element from pfs that was found in lst. Throw Not_found
         * if nothing was found. pfs can't contain empty string. *)
        if lst = [] || pfs = [] then raise Not_found
        (* for each element from pfs, try to find it on lst *)
        else let findings = List.map
        (fun p -> try (List.find (fun x -> x = p) lst) with Not_found -> "")
        pfs
        in
        (* try to find some non-false finding: *)
        List.find (fun f -> String.length f > 0) findings
        (* Not_found uncatched /\ *)

let strrev s =
        (* Reverse a string. *)
        let rec strrev_step s ns =
                if s = "" then ns
                else strrev_step (String.sub s 1 ((String.length s)-1))
                                ((String.make 1 (String.get s 0)) ^ ns)
        in
        strrev_step s ""

let print_strlst lst =
        Printf.printf "[";
        Xlist.iter
        lst
        (fun e -> Printf.printf " %s " e);
        Printf.printf "]"

let strmap_contains m e = try (StringMap.find m e; true)
                                with Not_found -> false

let map_from_list lst idx_fun =
        (* return a StringMap, containing all entries sorted by string hash
         * obtained by applying idx_fun to given element of lst *)
        Xlist.fold lst StringMap.empty
        (fun m elem -> StringMap.add_inc m (idx_fun elem) [elem]
                (fun l -> elem :: l))

(* working code *)

let neutx_to_neut tag = Str.global_replace
                        (Str.regexp ("\(n[0-9]\.?\)*n[0-9]"))
                        "n" tag

let lists_of_tag tag =
                (Xlist.rev_map
                (Xstring.split ":" (neutx_to_neut tag))
                (fun elem ->
                Xstring.split "\." elem))

let variants_of tag = 
        (* variants_of takes a tag as string and returns all its variants as a
         * list of strings, unfolding all the variants *)
        let rec fold_as_str = fun l ->
                if l = [] then "" else (List.hd l)
                (* add the colon if needed: *)
                ^ (if (List.tl l) = [] then (fold_as_str (List.tl l))
                else (":" ^ fold_as_str (List.tl l)))
        in
        uniq
        (Xlist.rev_map
                (Xlist.multiply_list
                        (List.rev (* tag elems got reversed earlier *) 
                        (lists_of_tag tag)))
                fold_as_str)

let regexp_of_tag tg =
        Str.regexp
        (join ":" (Xlist.rev_map (lists_of_tag tg)
        (fun variants -> "\(\(" ^ (join "\|" variants) ^ "\)\.?\)+")))

let create_interp_map itp_list =
        (* create_interp takes itp_list and creates a string map, indexed by
         * short tags, to the long (full) tags *)
        Xlist.fold itp_list StringMap.empty
        (fun smap tag ->
                let tag = neutx_to_neut tag in
                (Xlist.fold
                (variants_of tag)
                smap
                (fun smap short_tag ->
                        (StringMap.add_inc smap short_tag [tag]
                        (fun l -> tag :: l)))))

type sgjp_entry = { sg_orth: string; sg_lemma: string; sg_interp: string}

let create_sgjp_map_nohyph fname =
        (* load SGJP from the fname, as a map indexed by word forms, removing
         * all entries that contain a hyphen *)
        let clean_lemma lm = List.hd (Xstring.split ":" lm) in
        map_from_list
        (List.filter (fun s -> not (str_contains s.sg_orth "-"))
                (File.load_tab fname
                (function [sg_orth; sg_lemma; sg_interp; _; _] ->
                        {sg_orth=sg_orth; sg_lemma=clean_lemma sg_lemma;
                        sg_interp=(neutx_to_neut sg_interp)}
                | [] -> failwith "Empty entry in SGJP file"
                | _::_ -> {sg_orth=""; sg_lemma=""; sg_interp=""})))
        (fun etr -> etr.sg_orth)

type freq_entry = { orth:string; lemma:string; interp:string; frq:string;
        compos:string; sgjp_status:string; word_type:string; corr:string;
        rule_id:string; cat:string }

let check_sgjp_interp etr sgjp =
        (* return interpretation for the entry, if possible, using sgjp (map) *)
        try (let retrieved = (StringMap.find sgjp etr.orth) in
            if List.length retrieved = 1
            && (List.hd retrieved).sg_lemma = etr.lemma
            then (List.hd retrieved).sg_interp
            else "AMBIG-" ^ etr.interp)
        with Not_found -> "AMBIG-" ^ etr.interp

let simplify_qub tg =
        (* change qub:nwok/wok into plain qub *)
        try (Str.search_forward (Str.regexp "qub") tg 0;
             List.hd(Xstring.split ":" tg))
        with Not_found -> tg

let resolve_num etr inps =
        (* if word form of etr is purely numerical, assign to it the broadest
         * interpretation *)
        if not (str_contains etr.interp "num:") then etr
        else if (Str.string_match (Str.regexp "^[123456789IVXLCM]+$")
        etr.orth 0)
        then { etr with interp=longest_str inps }
        else { etr with interp=shortest_str inps }

let resolve_verb etr inps =
        (* choose interpretation of a verb that shares most chars with the
         * original one (should resolve perf.imperf issues etc.) *)
        let verbs = Xlist.rev_map ["praet:"; "imps:"; "imp"; "fin"; "inf";
        "ger:"; "pact:"; "ppas:"] (str_contains etr.interp) in
        let truth x = if x = true then true else false in
        if List.exists truth verbs
        then {etr with interp=strrev (most_samechars (Xlist.rev_map inps strrev)
                                                (strrev (etr.interp)))}
        else etr

let generalize etr itp_map sgjp_map =
        (* given an entry from frequency list, try to assign it a generalized
         * tag using itp_map and sgjp_map *)
        try (match uniq (StringMap.find itp_map (etr.interp))
        with
        | [] -> etr
        | h::[] -> { etr with interp=h }
        | inps -> resolve_verb (resolve_num
                        { etr with interp=(check_sgjp_interp etr sgjp_map) }
                        inps) inps)
        with Not_found -> (* Printf.printf "not found %s\n" etr.interp;*)
        {etr with interp= (simplify_qub etr.interp)}

let print_list fname lst =
        (let out = open_out fname in
        (Xlist.iter lst (fun etr -> output_string out etr));
        close_out out)

let print_freq fname lst =
        print_list fname
        (Xlist.rev_map
        lst
        (fun etr -> Printf.sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
        etr.orth etr.lemma etr.interp etr.frq etr.compos etr.sgjp_status
        etr.word_type etr.corr etr.rule_id etr.cat))

let merge_entries lst =
        (* Given a list of freq entries, merge those that have the same (orth,
         * lemma, interp) triple if lowercased, summing the frequencies *)
        let lwr = Xunicode.lowercase_utf8_string in
        let etr_pnts = Xlist.fold lst StringMap.empty
        (fun m etr -> StringMap.add_inc m
                (lwr (join "~" [etr.orth; etr.lemma; etr.interp]))
                [etr] (fun l -> etr :: l))
        in
        StringMap.fold etr_pnts []
        (fun l k v -> let e = (List.hd v) in
        {orth=(lwr e.orth); lemma=(lwr e.lemma); interp=e.interp;
         (* sum the frequencies *)
         frq=string_of_int
                 (Xlist.fold v 0 (fun cnt vl -> cnt+(int_of_string vl.frq)));
         (* prefer the most general value of those parameters, among present in
          * all merged entries: *)
         compos=prefer ["COMPOS"; "COMPOS-ndm"; "COMPOS-*"; "COMPOS-ALT";
         "COMPOS-LWR"; "COMPOS-LWR-ndm"; "COMPOS-LWR-*"; "COMPOS-LWR-ALT";
         "NCOMPOS"] (Xlist.rev_map v (fun e -> e.compos));
         sgjp_status=prefer ["SGJP-EXACT"; "SGJP-LMM-UNCAPITAL";
         "SGJP-LMM-CAPITAL"; "SGJP-LMM-LOWER"; "SGJP-BTH-LOWER"; "NON-SGJP"]
         (Xlist.rev_map v (fun e -> e.sgjp_status));
         word_type=prefer ["CW"; "NCH"; "EXT"; "SYMB"; "ACRO"; "PN"; "SPEC";
         "NEOL"; "COMPD"; "WEB"] (Xlist.rev_map v (fun e -> e.word_type));
         corr=prefer ["CORR"; "CERR"; "DIAL"; "PHON"; "ERR"; "TAGD"; "PLTAN";
         "TAGE"; "ERR-TAGE"; "CERR-TAGE"] (Xlist.rev_map v (fun e -> e.corr));
         rule_id=""; cat=""}
         :: l)

let tag_rules alt_fname lst =
        (* tag freq entries with their rule ids - loading the exceptions from
         * alt_fname - currently rules ambiguity are resolved by choosing the
         * most specific rule *)
        let alts =
                Xlist.fold
                        (File.load_tab alt_fname (fun [orth; _; _] -> orth))
                        StringMap.empty (fun m e -> StringMap.add m e "")
        in
        Xlist.rev_map lst
        (fun etr ->
                if strmap_contains alts etr.orth then {etr with rule_id="ALT"}
                else if etr.compos="NCOMPOS" then etr
                else let rls = Xlist.filter
                        (Rules.CharTrees.find Inflexion.rules etr.orth)
                        (fun o -> match o with
                        (stem, rl) -> ((stem ^ rl.set) = etr.lemma)
                        && (Str.string_match (regexp_of_tag etr.interp)
                                (neutx_to_neut rl.interp) 0))
                in
                if List.length rls = 0 then
                (if etr.compos!="NCOMPOS" then
                ((*Printf.printf "can't find rule for COMPOS %s\n" etr.orth;*)
                {etr with compos="NCOMPOS"})
                else etr)
                (* extract the first rule from the list: *)
                else let hdrule = (match List.hd rls with (_, rl) -> rl) in
                (* if it's the only one, just return it: *)
                if List.length rls = 1 then {etr with rule_id=hdrule.id}
                else ((*Printf.printf "too many rules match %s (%s )\n"
                        etr.orth (Xlist.fold rls "" (fun str r -> str ^ " " ^
                               (match r with (_, r) -> r.id)));*)
                      (* select the rule with the longest "set" (agglutinant) *)
                      {etr with rule_id = (Xlist.fold rls hdrule 
                              (fun choice opt ->
                               match opt with (_, opt) ->
                               if String.length choice.set<String.length opt.set
                               then opt else choice)).id}))

let cat_of_tag tag = match List.hd (Xstring.split ":" tag) with
        | "subst" -> "noun" | "depr" -> "noun"
        | "adj" -> "adj" | "adja" -> "adj" | "adjc" -> "adj" | "adjp" -> "adj"
        | "adv" -> "adv"
        | "inf" -> "verb" | "praet"-> "verb" | "fin" -> "verb"
        | "ppas" -> "verb" | "pact" -> "verb" | "pacta" -> "verb"
        | "impt" -> "verb" | "imps" -> "verb" | "pcon" -> "verb"
        | "pant" -> "verb" | "ger" -> "verb"
        | "bedzie" -> "other" | "pred"-> "other" | "prep" -> "other"
        | "num" -> "other" | "aglt" -> "other" | "winien" -> "other"
        | "qub" -> "other" | "brev" -> "other" | "comp" -> "other"
        | "interj" -> "other" | "burk" -> "other" | "numcol" -> "other"
        | "conj" -> "other" | "ppron12" -> "other" | "ppron3" -> "other"
        | "ppron2" -> "other" | "ppron1" -> "other" | "ppron" -> "other"
        | "interp" -> "other" | "xxx" -> "other" | "siebie" -> "other"
        | "cond" -> "cond"
        | _ -> failwith
        (Printf.sprintf "unknown part of speech in %s" tag)

let tag_cats lst =
        Xlist.rev_map lst (fun etr -> {etr with cat = cat_of_tag etr.interp })

let sum_list_freq lst =
        Xlist.fold lst 0 (fun t etr -> t+(int_of_string etr.frq))

let map_interp_given_cat freq =
        (* map_interp_given_cat prepares a map from cat's to map of interps to
         * their (normalized->probability) scores *)
        let normalize scoretable =
                StringMap.fold scoretable StringMap.empty
                (fun sctb key vals ->
                        (* count total frequency for this cat *)
                        let total = float_of_int
                                (StringMap.fold vals 0 (fun t _ f -> t + f))
                        in
                        (StringMap.add sctb key
                                (* normalize for each interp *)
                                (StringMap.fold vals StringMap.empty
                                (fun newvals v s -> StringMap.add newvals v
                                 ((float_of_int s) /. total)))))
        in
        normalize
        (Xlist.fold freq StringMap.empty
        (fun m etr -> StringMap.add_inc m etr.cat
                (* create an empty map, and add the first entry: *)
                (StringMap.add StringMap.empty etr.interp
                        (int_of_string etr.frq))
                (* if map exists, add or increment entry for this interp: *)
                (fun itps -> StringMap.add_inc itps etr.interp
                          (int_of_string etr.frq)
                          (fun score -> score+(int_of_string etr.frq)))))

let model_prob freq_map freq_map_lmcat itp_given_cat float_total_freq orth interp cat = 
        (* P (form|lemma, cat, interp): *) (1.0
        *. ((((* P(lemma, cat): *)
            (float_of_int
                (if strmap_contains freq_map_lmcat
                (interp ^ "~" ^ cat)
                (* sum frequencies of entries with given lemma:cat: *)
                then sum_list_freq (StringMap.find freq_map_lmcat
                (interp ^ "~" ^ cat)) + 1
                else 1))
            /. float_total_freq)
            (* P(interp|cat) - precalculated probability: *)
            *. try (let itps_for_cat = StringMap.find itp_given_cat cat in
                StringMap.find itps_for_cat interp)
               with Not_found ->
                        ((*Printf.printf (*failwith*) "Can't find freq for interp: %s\n"
                        interp*) 0.0))
        /. (* P(form): *)
        (float_of_int
        (if strmap_contains freq_map orth
        then sum_list_freq (StringMap.find freq_map orth) + 1
        else 1)
        /. float_total_freq)))

let all_model_probs orth itp_list mod_prob_func =  
        (* for interps from itp_list, return a tuple (probability, interp) *)
        Xlist.map itp_list
        (fun itp -> ((mod_prob_func orth itp (cat_of_tag itp)), itp))

let normalize_probs probs =
        let total = Xlist.fold probs 0.0 (fun t (p, i) -> t +. p) in
        Xlist.map probs (fun (p, i) -> (p /. total, i))

let most_probable probs threshold =
        (* probs contains tuples (prob, interp), return interps such that
         * their accumulated probability minimally exceeds the threshold *)
        List.map
        (fun (p, i) -> i)
        (Xlist.fold (List.sort compare probs) []
        (fun lst (prob, itp) ->
                if (Xlist.fold lst 0.0 (fun sum (p, i) -> sum +. p)) > threshold
                then lst else (prob, itp) :: lst))

let eval_model threshold freq_map itp_lst float_total_freq mod_prob_func =
        StringMap.fold freq_map 0.0
        (fun accum form etrs ->
        let total_local_freq = float_of_int (sum_list_freq etrs) in
        accum +.
        (* weight of this form: *)
        (total_local_freq /. float_total_freq)
        *. ((float_of_int (sum_list_freq
                (* get the interps below the threshold *)
                (Xlist.fold (most_probable
                (normalize_probs (all_model_probs form itp_lst mod_prob_func))
                threshold)
                []
                (* get entries for these interps, so we'll sum their freqs *)
                (fun lst interp ->
                        try ((List.find (fun etr -> etr.interp = interp) etrs)
                        :: lst)
                        with _ -> lst))))
        /. total_local_freq))

let _ =
        let itp_list = File.load_tab "data/interps_general.tab"
        (function [_; tag; _] -> tag
        | [] -> failwith "Empty entry in the interp file"
        | _::_ -> failwith "Malformatted entry in the interp file")
        in
        let sgjp_map = create_sgjp_map_nohyph
                "../../NLP resources/sgjp-20160724.tab"
        in
        (* generalize frequency: *)
        let gen_freq =
        (tag_cats
        (tag_rules "../resources/SGJP/alt.tab"
        (merge_entries
          (let interp_map = (create_interp_map itp_list) in
          let freq = File.load_tab
          "../resources/NKJP1M/NKJP1M-tagged-frequency.tab"
          (function [o;l;i;f;c;s;w;cr] -> { orth=o; lemma=l; interp=i; frq=f;
          compos=c; sgjp_status=s; word_type=w; corr=cr; rule_id=""; cat=""}
          | [] -> failwith "Empty entry in the freq file"
          | _::_ -> failwith "Malformatted entry in the freq file") in
          (* after loading the list, prune it and perform generalization *)
          Xlist.rev_map
          (List.filter
            (* filter out errors and symbols *)
          (fun etr -> etr.corr = "CORR" && not (etr.word_type = "SYMB"
          || etr.word_type = "COMPD" || etr.word_type = "WEB"
          || etr.word_type = "ACRO"))
          freq)
          (fun etr -> generalize etr interp_map sgjp_map)))))
        in

        (* count the total frequency of all entries *)
        let total_freq = Xlist.fold gen_freq 0
        (fun tally etr -> tally + int_of_string etr.frq)
        in
        let float_total_freq = float_of_int total_freq
        in
        (* make a map of the freq, indexed by word forms *)
        let freq_map = map_from_list gen_freq (fun etr -> etr.orth)
        in
        (* and another by lemma:cat *)
        let freq_map_lmcat = map_from_list gen_freq
        (fun etr -> etr.lemma ^ "~" ^ etr.cat)
        in
        let freq_map_lmitp = map_from_list gen_freq
        (fun etr -> etr.lemma ^ "~" ^ etr.interp)
        in
        (* count P(interp|cat)'s *)
        let itp_given_cat = map_interp_given_cat gen_freq
        in 

        (* print the generalized frequency to file *)
        (print_freq "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
        gen_freq;

        (* print the number of forms from freq that are not NON-SGJP *)
        Printf.printf "All forms that are in freq and not NON-SGJP %d/%d\n"
        (Xlist.fold gen_freq 0 (fun tally etr -> if etr.sgjp_status = "NON-SGJP"
        then tally else tally+int_of_string etr.frq))
        total_freq;

        (* print the number of forms from freq that are present in SGJP, but not
         * in their correct interpretations *)
        (Printf.printf "Forms present in SGJP, w/o the correct interp: %d/%d\n"
        (Xlist.fold gen_freq 0
        (fun tally etr ->
        if strmap_contains sgjp_map etr.orth
        && List.length (Xlist.filter (StringMap.find sgjp_map etr.orth)
               (fun sg_etr -> sg_etr.sg_lemma = etr.lemma
                 && Str.string_match (regexp_of_tag etr.interp)
                            sg_etr.sg_interp 0))
        = 0
        then ((*Printf.printf "%s\n" etr.orth; (*FIXME printing 'traps' to stdout...*)*)
        (tally+int_of_string etr.frq)) else tally))
        total_freq);
       
        (* print intermediate probabilities used by the model *)
        print_list "doc/prob_lemmacat.txt"
        (StringMap.fold freq_map_lmcat []
        (fun lst lmcat vnts -> Printf.sprintf "%s\t%f\n"
        (Str.global_replace (Str.regexp "~") "\t" lmcat)
        ((float_of_int (sum_list_freq vnts)) /. float_total_freq) :: lst));

        print_list "doc/prob_itp_givencat.txt"
        (StringMap.fold itp_given_cat []
        (fun lst cat itps ->
                lst @
                StringMap.fold itps [] (fun ilst itp prob ->
                        Printf.sprintf "%s\t%s\t%f\n" itp cat prob :: ilst)));

        (* print a model for entries from SGJP *)
        (*print_list "../resources/NKJP1M/model.tab"
        (StringMap.fold sgjp_map []
        (fun whole_lst _ sg_etrs -> (Xlist.fold sg_etrs []
        (fun form_lst sg_etr ->
        let sg_etr_cat = (cat_of_tag sg_etr.sg_interp)
        in
        Printf.sprintf "%s\t%s\t%s\t%s\t%f\n"
        sg_etr.sg_orth sg_etr.sg_lemma sg_etr_cat sg_etr.sg_interp
        (model_prob freq_map freq_map_lmcat itp_given_cat float_total_freq 
        sg_etr.sg_orth sg_etr.sg_interp sg_etr_cat) :: form_lst)
        @ whole_lst)));*)
    

        (* count non-uniform forms in SGJP *) 
        (let out = open_out "doc/multi_forms.txt" in
        StringMap.iter 

        (* fold SGJP into a map (lemma~interp)->their occurences *) 
        (StringMap.fold sgjp_map StringMap.empty
        (fun m _ entries ->
        Xlist.fold entries m 
        (fun m e -> let ident = (e.sg_lemma ^ "~" ^ e.sg_interp) in
        StringMap.add_inc m ident [e.sg_orth] (fun l -> e.sg_orth :: l))))

        (* for each entry in this map, print it if has more >1 occurence*)
        (fun k fs -> let fs = uniq fs in
        if List.length fs > 1
        then Printf.fprintf out "\nMore than 1 form of %s: %s %s" k
        (Xlist.fold fs "" (fun str fm -> str ^ " " ^ fm))
        (* look up the freq and print the entries if relevant *)
        (join " "
                (List.map (fun etr -> "(from freq "^etr.orth^": "^etr.frq^")")
                (if strmap_contains freq_map_lmitp k
                then StringMap.find freq_map_lmitp k else [])))
        else ());
        close_out out);

        Printf.printf "Model evaluation: %f\n"
        (eval_model 0.95 freq_map itp_list
        float_total_freq
        (* model probability function: *)
        (model_prob freq_map freq_map_lmcat itp_given_cat float_total_freq)))