(* morphology/freqProbs.ml *)
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)

open FreqUtils
open Xstd

let count_fsuf_freq freq rules_by_id =
  (* Return a map: fsuf (the "find" suffix of a rule) -> summed frequency of
   * the entries of [freq] whose rule carries that fsuf.
   *
   * [freq] is a list of frequency entries; [rules_by_id] maps a rule id to a
   * list of [id; fsuf] string lists, of which only the first is consulted.
   * Entries whose rule_id is "" or "ALT" are skipped.
   *
   * Raises Failure when an entry's rule id has no usable binding in
   * [rules_by_id], when the rule entry is not of the form [id; fsuf], or when
   * an entry's frq is not a valid integer literal (from int_of_string). *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else
        let rule =
          (* the map stores a list per id, so unpack its first element;
           * only the lookup failures are translated, anything else
           * propagates unchanged *)
          try List.hd (StringMap.find rules_by_id etr.rule_id)
          with Not_found | Failure _ ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule with
          | [_; fsuf] -> fsuf
          | _ -> failwith "bad entry in rule map"
        in
        (* parse the frequency once; it serves both as the initial value
         * and as the increment *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map fsuf frq (fun accum -> accum + frq))

let count_fsuf_cat_freq freq rules_by_id =
  (* Return a map: fsuf ^ "~+~" ^ cat -> summed frequency of the entries of
   * [freq] that have this category and whose rule carries this fsuf (the
   * "find" suffix of the rule).
   *
   * [freq] is a list of frequency entries; [rules_by_id] maps a rule id to a
   * list of [id; fsuf] string lists, of which only the first is consulted.
   * Entries whose rule_id is "" or "ALT" are skipped.
   *
   * Raises Failure when a non-skipped entry has an empty cat, when its rule
   * id has no usable binding in [rules_by_id], when the rule entry is not of
   * the form [id; fsuf], or when its frq is not a valid integer literal
   * (from int_of_string). *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else if etr.cat = "" then
        failwith (Printf.sprintf "empty cat for form %s" etr.orth)
      else
        let rule =
          (* the map stores a list per id, so unpack its first element;
           * only the lookup failures are translated, anything else
           * propagates unchanged *)
          try List.hd (StringMap.find rules_by_id etr.rule_id)
          with Not_found | Failure _ ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule with
          | [_; fsuf] -> fsuf
          | _ -> failwith "bad entry in rule map"
        in
        (* parse the frequency once; it serves both as the initial value
         * and as the increment *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map (fsuf ^ "~+~" ^ etr.cat) frq
          (fun accum -> accum + frq))

let _ =
  (* Script body: loads the NKJP1M generalized frequency list and the SGJP
   * frequency rules, then writes four tab-separated probability tables:
   *   doc/prob_lemmacat.txt, doc/prob_itp_givencat.txt,
   *   doc/prob_fsuf.txt and doc/prob_fsuf_cat.txt.
   * Raises Failure on malformed rows in either input file. *)

  (* keep only entries marked CORR whose word type is not one of these *)
  let excluded_word_types = ["SYMB"; "COMPD"; "WEB"; "ACRO"] in
  let gen_freq =
    List.filter
      (fun etr ->
        etr.corr = "CORR"
        && not (List.mem etr.word_type excluded_word_types))
      (File.load_tab
         "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
         (function
           | [o; l; i; f; c; s; w; cr; ri; ct] ->
             { orth = o; lemma = l; interp = i; frq = f; compos = c;
               sgjp_status = s; word_type = w; corr = cr; rule_id = ri;
               cat = ct }
           | [] -> failwith "Empty entry in the freq file"
           | _ :: _ -> failwith "Malformatted entry in the freq file"))
  in

  (* total frequency of all entries; currently not used by any output, kept
   * (underscore-prefixed) so the set of computations performed is unchanged *)
  let total_freq =
    Xlist.fold gen_freq 0 (fun tally etr -> tally + int_of_string etr.frq)
  in
  let _float_total_freq = float_of_int total_freq in

  (* split gen_freq into the in-SGJP and the non-SGJP part.
   * The test must be structural (<>): physical inequality (!=) holds for
   * every string read from the file, which would put all entries, NON-SGJP
   * ones included, into the in-SGJP part. *)
  let in_sgjp_freq, non_sgjp_freq =
    List.partition (fun etr -> etr.sgjp_status <> "NON-SGJP") gen_freq
  in
  let float_total_insgjp_freq = float_of_int (sum_list_freq in_sgjp_freq) in
  let _float_total_nonsgjp_freq =
    float_of_int (sum_list_freq non_sgjp_freq)
  in

  (* a map of all entries indexed by word form; currently unused *)
  let _freq_map = map_from_list gen_freq (fun etr -> etr.orth) in
  (* in-SGJP entries indexed by lemma~cat *)
  let freq_insgjp_map_lmcat =
    map_from_list in_sgjp_freq (fun etr -> etr.lemma ^ "~" ^ etr.cat)
  in
  (* in-SGJP entries indexed by lemma~interp; currently unused *)
  let _freq_insgjp_map_lmitp =
    map_from_list in_sgjp_freq (fun etr -> etr.lemma ^ "~" ^ etr.interp)
  in
  (* P(interp|cat) over the in-SGJP entries *)
  let insgjp_itp_given_cat = map_interp_given_cat in_sgjp_freq in

  (* a map of rules: id -> list of [id; fsuf] rows of freq_rules.tab *)
  let rules_by_id =
    map_from_list
      (File.load_tab "../resources/SGJP/freq_rules.tab"
         (function
           | [id; _; _; _; fsuf; _; _] -> [id; fsuf]
           | _ -> failwith "bad entry in freq_rules.tab"))
      (function
        | [id; _] -> id
        | _ -> failwith "error when making a map of freq rules")
  in

  (* order (frequency, line) rows by descending frequency and keep the
   * lines; rows with equal frequency stay in their incoming order *)
  let lines_by_freq_desc rows =
    List.map snd
      (List.stable_sort
         (fun (frq1, _) (frq2, _) -> compare (frq2 : int) frq1)
         rows)
  in

  (* probabilities to be used by the model for SGJP entries:
   * lemma <TAB> cat <TAB> share of the in-SGJP frequency *)
  let tilde = Str.regexp "~" in
  print_list "doc/prob_lemmacat.txt"
    (StringMap.fold freq_insgjp_map_lmcat []
       (fun lst lmcat vnts ->
         Printf.sprintf "%s\t%f\n"
           (Str.global_replace tilde "\t" lmcat)
           (float_of_int (sum_list_freq vnts) /. float_total_insgjp_freq)
         :: lst));

  (* interp <TAB> cat <TAB> P(interp|cat) *)
  print_list "doc/prob_itp_givencat.txt"
    (StringMap.fold insgjp_itp_given_cat []
       (fun lst cat itps ->
         lst
         @ StringMap.fold itps []
             (fun ilst itp prob ->
               Printf.sprintf "%s\t%s\t%f\n" itp cat prob :: ilst)));

  (* NOTE(review): the two suffix tables below are counted over the non-SGJP
   * entries but divided by the in-SGJP total, while the non-SGJP total is
   * computed above and never used. Confirm which denominator is intended. *)

  (* fsuf <TAB> probability <TAB> frequency, most frequent first *)
  let fsuf_probs = count_fsuf_freq non_sgjp_freq rules_by_id in
  print_list "doc/prob_fsuf.txt"
    (lines_by_freq_desc
       (StringMap.fold fsuf_probs []
          (fun lst fsuf frq ->
            ( frq,
              Printf.sprintf "%s\t%f\t%d\n" fsuf
                (float_of_int frq /. float_total_insgjp_freq)
                frq )
            :: lst)));

  (* fsuf <TAB> cat <TAB> probability <TAB> frequency, most frequent first;
   * keys that do not split into at least fsuf and cat are reported on
   * stdout and left out of the table *)
  let fsuf_cat_probs = count_fsuf_cat_freq non_sgjp_freq rules_by_id in
  let fsuf_cat_sep = Str.regexp "~\\+~" in
  print_list "doc/prob_fsuf_cat.txt"
    (lines_by_freq_desc
       (StringMap.fold fsuf_cat_probs []
          (fun lst fsufcat_str frq ->
            match Str.split_delim fsuf_cat_sep fsufcat_str with
            | fsuf :: cat :: _ ->
              ( frq,
                Printf.sprintf "%s\t%s\t%f\t%d\n" fsuf cat
                  (float_of_int frq /. float_total_insgjp_freq)
                  frq )
              :: lst
            | _ ->
              Printf.printf "problem with fsufcat %s -> %d\n"
                fsufcat_str frq;
              lst)))