freqStats.ml
10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)
open Xstd
open FreqUtils
let print_rules_freq rfreq rules_by_id fname =
(* given a rfreq (rule id -> its frequency), and index of rules by their ids
* and fname, print the rules from rfreq to file (in fname), adding to the
* original data from rules_by_id the frequency from rfreq as 3rd column *)
let string_of_star = function
| Types.Productive -> ""
| Types.Star -> "*"
| Types.Ndm -> "ndm" in
(print_list fname
(StringMap.fold rfreq []
(fun lst rid frq ->
if rid = "" then lst else
let (r_etr : Types.rule) =
List.hd (StringMap.find rules_by_id rid) in
Printf.sprintf "%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\n" r_etr.id
r_etr.freq frq (string_of_star r_etr.star)
r_etr.pref r_etr.find r_etr.set r_etr.interp:: lst)))
let model_prob freq_map freq_map_lmcat itp_given_cat float_total_freq orth interp cat =
(* P (form|lemma, cat, interp): *) (1.0
*. ((((* P(lemma, cat): *)
(float_of_int
(if strmap_contains freq_map_lmcat
(interp ^ "~" ^ cat)
(* sum frequencies of entries with given lemma:cat: *)
then sum_list_freq (StringMap.find freq_map_lmcat
(interp ^ "~" ^ cat)) + 1
else 1))
/. float_total_freq)
(* P(interp|cat) - precalculated probability: *)
*. try (let itps_for_cat = StringMap.find itp_given_cat cat in
StringMap.find itps_for_cat interp)
with Not_found ->
((*Printf.printf (*failwith*) "Can't find freq for interp: %s\n"
interp*) 0.0))
/. (* P(form): *)
(float_of_int
(if strmap_contains freq_map orth
then sum_list_freq (StringMap.find freq_map orth) + 1
else 1)
/. float_total_freq)))
let all_model_probs orth itp_list mod_prob_func =
(* for interps from itp_list, return a tuple (probability, interp) *)
Xlist.map itp_list
(fun itp -> ((mod_prob_func orth itp (cat_of_tag itp)), itp))
let normalize_probs probs =
let total = Xlist.fold probs 0.0 (fun t (p, i) -> t +. p) in
Xlist.map probs (fun (p, i) -> (p /. total, i))
let most_probable probs threshold =
(* probs contains tuples (prob, interp), return interps such that
* their accumulated probability minimally exceeds the threshold *)
List.map
(fun (p, i) -> i)
(Xlist.fold (List.sort compare probs) []
(fun lst (prob, itp) ->
if (Xlist.fold lst 0.0 (fun sum (p, i) -> sum +. p)) > threshold
then lst else (prob, itp) :: lst))
let eval_model threshold freq_map itp_lst float_total_freq mod_prob_func =
StringMap.fold freq_map 0.0
(fun accum form etrs ->
let total_local_freq = float_of_int (sum_list_freq etrs) in
accum +.
(* weight of this form: *)
(total_local_freq /. float_total_freq)
*. ((float_of_int (sum_list_freq
(* get the interps below the threshold *)
(Xlist.fold (most_probable
(normalize_probs (all_model_probs form itp_lst mod_prob_func))
threshold)
[]
(* get entries for these interps, so we'll sum their freqs *)
(fun lst interp ->
try ((List.find (fun etr -> etr.interp = interp) etrs)
:: lst)
with _ -> lst))))
/. total_local_freq))
let _ =
let itp_list = File.load_tab "data/interps_general.tab"
(function [_; tag; _] -> tag
| [] -> failwith "Empty entry in the interp file"
| _::_ -> failwith "Malformatted entry in the interp file")
in
let sgjp_map = create_sgjp_map_nohyph
"../../NLP resources/sgjp-20160724.tab"
in
let gen_freq =
(List.filter
(* filter out errors and symbols *)
(fun etr -> etr.corr = "CORR" && not (etr.word_type = "SYMB"
|| etr.word_type = "COMPD" || etr.word_type = "WEB"
|| etr.word_type = "ACRO"))
(File.load_tab
"../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
(function [o;l;i;f;c;s;w;cr;ri;ct] -> { orth=o; lemma=l; interp=i;
frq=f; compos=c; sgjp_status=s; word_type=w; corr=cr; rule_id=ri;
cat=ct}
| [] -> failwith "Empty entry in the freq file"
| _::_ -> failwith "Malformatted entry in the freq file")))
in
(* count the total frequency of all entries *)
let total_freq = Xlist.fold gen_freq 0
(fun tally etr -> tally + int_of_string etr.frq)
in
let float_total_freq = float_of_int total_freq
in
(* split the gen_freq into in_sgjp i non_sgjp parts *)
let in_sgjp_freq = (List.filter
(fun etr -> etr.sgjp_status != "NON-SGJP")
gen_freq) in
let non_sgjp_freq = (List.filter
(fun etr -> etr.sgjp_status = "NON-SGJP")
gen_freq) in
let float_total_insgjp_freq = float_of_int (sum_list_freq in_sgjp_freq)
in
let float_total_nonsgjp_freq = float_of_int (sum_list_freq non_sgjp_freq)
in
(* make a map of the freq, indexed by word forms *)
let freq_insgjp_map = map_from_list in_sgjp_freq (fun etr -> etr.orth)
in
(* and another by lemma:cat *)
let freq_insgjp_map_lmcat = map_from_list in_sgjp_freq
(fun etr -> etr.lemma ^ "~" ^ etr.cat)
in
let freq_insgjp_map_lmitp = map_from_list in_sgjp_freq
(fun etr -> etr.lemma ^ "~" ^ etr.interp)
in
(* count P(interp|cat)'s *)
let insgjp_itp_given_cat = map_interp_given_cat in_sgjp_freq
in
(* prepare a map of rules: id -> rule entry (as Rules) *)
let rules_by_id =
(map_from_list
(* load the freq_rules *)
(File.load_tab "../resources/SGJP/freq_rules.tab"
(function [id; _; _; _; fsuf; _; _] -> [id; fsuf]
| _ -> failwith "bad entry in freq_rules.tab"))
(* index by id *)
(function [id; fsuf] -> id
| _ -> failwith "error when making a map of freq rules"))
in
(** print the number of forms from freq that are not NON-SGJP *)
(Printf.printf "All forms that are in freq and not NON-SGJP %d/%d\n"
(sum_list_freq non_sgjp_freq)
total_freq;
(* print the number of forms from freq that are present in SGJP, but not
* in their correct interpretations *)
(Printf.printf "Forms present in SGJP, w/o the correct interp: %d/%d\n"
(Xlist.fold in_sgjp_freq 0
(fun tally etr ->
if strmap_contains sgjp_map etr.orth
&& List.length (Xlist.filter (StringMap.find sgjp_map etr.orth)
(fun sg_etr -> sg_etr.sg_lemma = etr.lemma
&& Str.string_match (regexp_of_tag etr.interp)
sg_etr.sg_interp 0))
= 0
then ((*Printf.printf "%s\n" etr.orth; (*FIXME printing 'traps' to stdout...*)*)
(tally+int_of_string etr.frq)) else tally))
total_freq);
(** count non-uniform forms in SGJP *)
(let out = open_out "doc/multi_forms.txt" in
StringMap.iter
(* fold SGJP into a map (lemma~interp)->their occurences *)
(StringMap.fold sgjp_map StringMap.empty
(fun m _ entries ->
Xlist.fold entries m
(fun m e -> let ident = (e.sg_lemma ^ "~" ^ e.sg_interp) in
StringMap.add_inc m ident [e.sg_orth] (fun l -> e.sg_orth :: l))))
(* for each entry in this map, print it if has more >1 occurence*)
(fun k fs -> let fs = uniq fs in
if List.length fs > 1
then Printf.fprintf out "\nMore than 1 form of %s: %s %s" k
(Xlist.fold fs "" (fun str fm -> str ^ " " ^ fm))
(* look up the freq and print the entries if relevant *)
(join " "
(List.map (fun etr -> "(from freq "^etr.orth^": "^etr.frq^")")
(if strmap_contains freq_insgjp_map_lmitp k
then StringMap.find freq_insgjp_map_lmitp k else [])))
else ());
close_out out);
(* rules productivity in the non-sgjp part *)
let rules_by_id = map_from_list
(Rules.load_freq_rules "../resources/SGJP/freq_rules.tab")
(fun r -> r.id)
in
let rules_freq = (Xlist.fold non_sgjp_freq StringMap.empty
(fun m etr -> StringMap.add_inc m etr.rule_id (int_of_string etr.frq)
(fun n -> n + (int_of_string etr.frq))))
in
(* split the rules_freq into parts containing: 1. rules that alternate
* the form from lemma, 2. as 1 but the form is the basic form of the
* lexeme 3. alternating rules *)
let (non_alt, non_alt_root, alt) = StringMap.fold rules_freq
(StringMap.empty, StringMap.empty, StringMap.empty)
(fun (non_alt, non_alt_root, alt) rid frq ->
if rid = "" then (non_alt, non_alt_root, alt)
else let (r_etr : Types.rule) =
List.hd (StringMap.find rules_by_id rid) in
if r_etr.find = r_etr.set
then if r_etr.find!="" && is_root_form_tag r_etr.interp
then (non_alt, StringMap.add non_alt_root rid frq, alt)
else (StringMap.add non_alt rid frq, non_alt_root, alt)
else (non_alt, non_alt_root, StringMap.add alt rid frq))
in
(print_rules_freq rules_freq rules_by_id "doc/rules_productivity.txt";
print_rules_freq non_alt rules_by_id "doc/rules_productivity_nalt.txt";
print_rules_freq non_alt_root rules_by_id "doc/rules_productivity_nalt_root.txt";
print_rules_freq alt rules_by_id "doc/rules_productivity_alt.txt");
Printf.printf "Model evaluation: %f\n"
(eval_model 0.95 freq_insgjp_map itp_list
float_total_insgjp_freq
(* model probability function: *)
(model_prob freq_insgjp_map freq_insgjp_map_lmcat insgjp_itp_given_cat
float_total_insgjp_freq)))