freqProbs.ml
5.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)
open FreqUtils
open Xstd
let count_fsuf_freq freq rules_by_id =
  (* Return a map: fsuf (the suffix stored for a rule in rules_by_id) ->
   * summed frequency of the entries of freq annotated with such a rule.
   *
   * freq        - list of frequency entries; rule_id and frq are read
   * rules_by_id - map: rule id -> list of [id; fsuf] rules
   *
   * Entries whose rule_id is empty or ALT are skipped.
   * Raises Failure when a rule id cannot be resolved, when the stored rule
   * is not an [id; fsuf] pair, or when frq is not an integer literal. *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else
        let rule =
          (* the map stores a list of rules under each id; take the first.
           * Only the two lookup failures are translated, so that unrelated
           * exceptions are not mislabelled as a missing rule. *)
          try List.hd (StringMap.find rules_by_id etr.rule_id)
          with Not_found | Failure _ ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule with
          | [_; fsuf] -> fsuf
          | _ -> failwith "bad entry in rule map"
        in
        (* parse the frequency once: it serves both as the value passed for
         * the key and as the increment applied to the accumulated value
         * (presumably used for a new and an existing key respectively;
         * add_inc is defined in Xstd) *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map fsuf frq (fun accum -> accum + frq))
let count_fsuf_cat_freq freq rules_by_id =
  (* Return a map: fsuf ^ "~+~" ^ cat -> summed frequency of the entries of
   * freq that carry this rule suffix and this category.
   *
   * freq        - list of frequency entries; rule_id, cat, orth and frq
   *               are read
   * rules_by_id - map: rule id -> list of [id; fsuf] rules
   *
   * Entries whose rule_id is empty or ALT are skipped.
   * Raises Failure (with the orth of the entry as the message) when an
   * entry has an empty cat, and Failure when a rule id cannot be resolved,
   * when the stored rule is not an [id; fsuf] pair, or when frq is not an
   * integer literal. *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else if etr.cat = "" then failwith etr.orth
      else
        let rule =
          (* the map stores a list of rules under each id; take the first.
           * Only the two lookup failures are translated, so that unrelated
           * exceptions are not mislabelled as a missing rule. *)
          try List.hd (StringMap.find rules_by_id etr.rule_id)
          with Not_found | Failure _ ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule with
          | [_; fsuf] -> fsuf
          | _ -> failwith "bad entry in rule map"
        in
        (* parse the frequency once: it serves both as the value passed for
         * the key and as the increment applied to the accumulated value
         * (presumably used for a new and an existing key respectively;
         * add_inc is defined in Xstd) *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map (fsuf ^ "~+~" ^ etr.cat) frq
          (fun accum -> accum + frq))
let _ =
  (* Driver: load the NKJP1M generalized frequency list and the SGJP
   * frequency rules, then write four probability tables under doc/:
   *   prob_lemmacat.txt, prob_itp_givencat.txt   (from in-SGJP entries)
   *   prob_fsuf.txt, prob_fsuf_cat.txt           (from non-SGJP entries)
   * Fails (Failure) on malformed rows in either input file. *)
  let gen_freq =
    List.filter
      (* filter out errors and symbols *)
      (fun etr -> etr.corr = "CORR" && not (etr.word_type = "SYMB"
        || etr.word_type = "COMPD" || etr.word_type = "WEB"
        || etr.word_type = "ACRO"))
      (File.load_tab
        "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
        (function [o;l;i;f;c;s;w;cr;ri;ct] -> { orth=o; lemma=l; interp=i;
            frq=f; compos=c; sgjp_status=s; word_type=w; corr=cr; rule_id=ri;
            cat=ct}
          | [] -> failwith "Empty entry in the freq file"
          | _::_ -> failwith "Malformatted entry in the freq file"))
  in
  (* split gen_freq into the in-SGJP and the non-SGJP part. The test must be
   * the structural <> : the physical != holds for any string freshly read
   * from the file, which would put every entry into the in-SGJP part. *)
  let in_sgjp_freq, non_sgjp_freq =
    List.partition (fun etr -> etr.sgjp_status <> "NON-SGJP") gen_freq
  in
  let float_total_insgjp_freq = float_of_int (sum_list_freq in_sgjp_freq)
  in
  let float_total_nonsgjp_freq = float_of_int (sum_list_freq non_sgjp_freq)
  in
  (* a map of the in-SGJP entries, indexed by lemma~cat *)
  let freq_insgjp_map_lmcat = map_from_list in_sgjp_freq
    (fun etr -> etr.lemma ^ "~" ^ etr.cat)
  in
  (* count the values of P(interp|cat) *)
  let insgjp_itp_given_cat = map_interp_given_cat in_sgjp_freq
  in
  (* prepare a map of rules: id -> list of [id; fsuf] *)
  let rules_by_id =
    map_from_list
      (* load the freq_rules, keeping only the id and the fsuf column *)
      (File.load_tab "../resources/SGJP/freq_rules.tab"
        (function [id; _; _; _; fsuf; _; _] -> [id; fsuf]
          | _ -> failwith "bad entry in freq_rules.tab"))
      (* index by id *)
      (function [id; _] -> id
        | _ -> failwith "error when making a map of freq rules")
  in
  (* Order (count, line) rows by descending count and keep the lines only.
   * List.sort is stable, so rows with equal counts keep their order. *)
  let by_descending_freq rows =
    List.map snd
      (List.sort (fun (frq1, _) (frq2, _) -> compare (frq2 : int) frq1) rows)
  in
  let tilde = Str.regexp "~" in
  (* matches the literal separator put between fsuf and cat in the keys of
   * the fsuf/cat map; the plus sign has to be quoted for Str *)
  let fsuf_cat_sep = Str.regexp "~\\+~" in
  (* print probabilities to be used by the model for SGJP entries *)
  print_list "doc/prob_lemmacat.txt"
    (StringMap.fold freq_insgjp_map_lmcat []
      (fun lst lmcat vnts ->
        Printf.sprintf "%s\t%f\n"
          (Str.global_replace tilde "\t" lmcat)
          ((float_of_int (sum_list_freq vnts)) /. float_total_insgjp_freq)
        :: lst));
  print_list "doc/prob_itp_givencat.txt"
    (StringMap.fold insgjp_itp_given_cat []
      (fun lst cat itps ->
        lst @
        StringMap.fold itps [] (fun ilst itp prob ->
          Printf.sprintf "%s\t%s\t%f\n" itp cat prob :: ilst)));
  (* The suffix statistics are counted over the non-SGJP entries, so they are
   * normalised by the non-SGJP total.
   * NOTE(review): this assumes the consumer of prob_fsuf*.txt expects
   * probabilities relative to the non-SGJP mass - confirm. *)
  let fsuf_probs = count_fsuf_freq non_sgjp_freq rules_by_id in
  print_list "doc/prob_fsuf.txt"
    (by_descending_freq
      (StringMap.fold fsuf_probs []
        (fun lst fsuf frq ->
          (frq, Printf.sprintf "%s\t%f\t%d\n" fsuf
            ((float_of_int frq) /. float_total_nonsgjp_freq) frq) :: lst)));
  let fsuf_cat_probs = count_fsuf_cat_freq non_sgjp_freq rules_by_id in
  print_list "doc/prob_fsuf_cat.txt"
    (by_descending_freq
      (StringMap.fold fsuf_cat_probs []
        (fun lst fsufcat_str frq ->
          match Str.split_delim fsuf_cat_sep fsufcat_str with
          | fsuf :: cat :: _ ->
            (frq, Printf.sprintf "%s\t%s\t%f\t%d\n" fsuf cat
              ((float_of_int frq) /. float_total_nonsgjp_freq) frq) :: lst
          | _ ->
            (* a key that does not split into fsuf and cat is reported on
             * stdout and left out of the table *)
            (Printf.printf "problem with fsufcat %s -> %d\n"
              fsufcat_str frq; lst))))