freqProbs.ml 5.98 KB
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)

open FreqUtils
open Xstd

let count_fsuf_freq freq rules_by_id =
        (* Return a map: fsuf (find) in rules -> frequency to the matching 
         * forms *)
        Xlist.fold freq StringMap.empty
        (fun map etr ->
                if etr.rule_id = "" || etr.rule_id = "ALT" then map
                else let fsuf =
                (* we receive a list here, so unpack the only element *)
                (match try (List.hd (StringMap.find rules_by_id etr.rule_id))
                with _ -> failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
                with
                | [id; fsuf] -> fsuf | _ -> failwith "bad entry in rule map")
                in
                StringMap.add_inc map fsuf (int_of_string etr.frq)
                (fun accum -> accum + (int_of_string etr.frq)))

let count_fsuf_cat_freq freq rules_by_id =
        (* Return a map: fsuf (find) in rules -> frequency to the matching 
         * forms *)
        Xlist.fold freq StringMap.empty
        (fun map etr ->
                if etr.rule_id = "" || etr.rule_id = "ALT" then map
                else if etr.cat = "" then failwith etr.orth
                else let fsuf =
                (* we receive a list here, so unpack the only element *)
                (match try (List.hd (StringMap.find rules_by_id etr.rule_id))
                with _ -> failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
                with
                | [id; fsuf] -> fsuf | _ -> failwith "bad entry in rule map")
                in
                StringMap.add_inc map (fsuf^"~+~"^etr.cat) (int_of_string etr.frq)
                (fun accum -> accum + (int_of_string etr.frq)))

let _ =
        let gen_freq =
          (List.filter
            (* filter out errors and symbols *)
          (fun etr -> etr.corr = "CORR" && not (etr.word_type = "SYMB"
          || etr.word_type = "COMPD" || etr.word_type = "WEB"
          || etr.word_type = "ACRO"))
          (File.load_tab
          "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
          (function [o;l;i;f;c;s;w;cr;ri;ct] -> { orth=o; lemma=l; interp=i;
          frq=f; compos=c; sgjp_status=s; word_type=w; corr=cr; rule_id=ri;
          cat=ct}
          | [] -> failwith "Empty entry in the freq file"
          | _::_ -> failwith "Malformatted entry in the freq file")))
        in
        (* count the total frequency of all entries *)
        let total_freq = Xlist.fold gen_freq 0
        (fun tally etr -> tally + int_of_string etr.frq)
        in
        let float_total_freq = float_of_int total_freq
        in

        (* split the gen_freq into in_sgjp i non_sgjp parts *)
        let in_sgjp_freq = (List.filter
        (fun etr -> etr.sgjp_status != "NON-SGJP")
        gen_freq) in
        let non_sgjp_freq = (List.filter
        (fun etr -> etr.sgjp_status = "NON-SGJP")
        gen_freq) in
        let float_total_insgjp_freq = float_of_int (sum_list_freq in_sgjp_freq)
        in
        let float_total_nonsgjp_freq = float_of_int (sum_list_freq non_sgjp_freq)
        in

        (* make a map of the freq, indexed by word forms *)
        let freq_map = map_from_list gen_freq (fun etr -> etr.orth)
        in
        (* and another by lemma:cat *)
        let freq_insgjp_map_lmcat = map_from_list in_sgjp_freq
        (fun etr -> etr.lemma ^ "~" ^ etr.cat)
        in
        let freq_insgjp_map_lmitp = map_from_list in_sgjp_freq
        (fun etr -> etr.lemma ^ "~" ^ etr.interp)
        in
        (* count P(interp|cat)'s *)
        let insgjp_itp_given_cat = map_interp_given_cat in_sgjp_freq
        in 

        (* prepare a map of rules: id -> rule entry (as Rules) *)
        let rules_by_id =
                (map_from_list
                (* load the freq_rules *)
                (File.load_tab "../resources/SGJP/freq_rules.tab"
                (function [id; _; _; _; fsuf; _; _] -> [id; fsuf]
                | _ -> failwith "bad entry in freq_rules.tab"))
                (* index by id *)
                (function [id; fsuf] -> id
                | _ -> failwith "error when making a map of freq rules"))
        in

        (* print probabilities to be used by the model for SGJP entries *)
        print_list "doc/prob_lemmacat.txt"
        (StringMap.fold freq_insgjp_map_lmcat []
        (fun lst lmcat vnts -> Printf.sprintf "%s\t%f\n"
        (Str.global_replace (Str.regexp "~") "\t" lmcat)
        ((float_of_int (sum_list_freq vnts)) /. float_total_insgjp_freq) :: lst));

        print_list "doc/prob_itp_givencat.txt"
        (StringMap.fold insgjp_itp_given_cat []
        (fun lst cat itps ->
                lst @
                StringMap.fold itps [] (fun ilst itp prob ->
                        Printf.sprintf "%s\t%s\t%f\n" itp cat prob :: ilst)));

        let fsuf_probs = count_fsuf_freq non_sgjp_freq rules_by_id in
        print_list "doc/prob_fsuf.txt"
        (List.sort (fun e1 e2 -> compare
        (StringMap.find fsuf_probs (List.hd (Xstring.split_delim "\\\t" e2)))
        (StringMap.find fsuf_probs (List.hd (Xstring.split_delim "\\\t" e1))))
        (StringMap.fold fsuf_probs []
        (fun lst fsuf frq -> Printf.sprintf "%s\t%f\t%d\n" fsuf
        ((float_of_int frq) /. float_total_insgjp_freq) frq :: lst)));

        let fsuf_cat_probs = count_fsuf_cat_freq non_sgjp_freq rules_by_id in
        print_list "doc/prob_fsuf_cat.txt"
        (List.sort (fun e1 e2 -> compare
        (StringMap.find fsuf_cat_probs
        (join "~+~" (slice (Xstring.split_delim "\\\t" e2) 0 2)))
        (StringMap.find fsuf_cat_probs
        (join "~+~" (slice (Xstring.split_delim "\\\t" e1) 0 2))))
        (StringMap.fold fsuf_cat_probs []
        (fun lst fsufcat_str frq ->
        let fsufcat = Str.split_delim (Str.regexp "~\+~") fsufcat_str in 
        try
        (Printf.sprintf "%s\t%s\t%f\t%d\n" (List.nth fsufcat 0) (List.nth fsufcat 1)
        ((float_of_int frq) /. float_total_insgjp_freq) frq) :: lst
        with _->(Printf.printf "problem with fsufcat %s -> %d\n"
        fsufcat_str frq; lst))))