freqProbs.ml
4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)
open FreqUtils
open Xstd
let count_fsuf_freq freq rules_by_id =
  (* Return a map: fsuf of a rule -> summed frequency of the entries of
   * [freq] whose rule_id points at that rule.
   *
   * freq        - list of frequency entries; the fields rule_id and frq
   *               (a decimal integer kept as a string) are read.
   * rules_by_id - map: rule id -> list of [id; fsuf] lists; only the first
   *               element of the list stored under an id is consulted.
   *
   * Entries whose rule_id is empty or ALT do not contribute.
   * Raises Failure when the rule id is missing from [rules_by_id], when the
   * rule entry is not a two-element list, or when frq is not an integer. *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else
        (* Only a failed lookup is translated into the error below, so that
         * unrelated exceptions are not disguised as a missing rule.
         * NOTE(review): assumes StringMap.find signals a missing key with
         * Not_found - confirm against Xstd. *)
        let rule_variants =
          try StringMap.find rules_by_id etr.rule_id
          with Not_found ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule_variants with
          | [] -> failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
          | [_; fsuf] :: _ -> fsuf
          | _ :: _ -> failwith "bad entry in rule map"
        in
        (* parse the frequency once; it is both the initial value and the
         * increment *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map fsuf frq (fun accum -> accum + frq))
let count_fsuf_cat_freq freq rules_by_id =
  (* Return a map: fsuf~+~cat -> summed frequency of the entries of [freq]
   * that share both the fsuf of their rule and their cat. The key is the
   * fsuf, the literal separator ~+~ and the cat, concatenated.
   *
   * freq        - list of frequency entries; the fields rule_id, cat, orth
   *               and frq (a decimal integer kept as a string) are read.
   * rules_by_id - map: rule id -> list of [id; fsuf] lists; only the first
   *               element of the list stored under an id is consulted.
   *
   * Entries whose rule_id is empty or ALT do not contribute.
   * Raises Failure (carrying the entry's orth) when an entry that has a rule
   * has an empty cat; raises Failure when the rule id is missing from
   * [rules_by_id], when the rule entry is not a two-element list, or when
   * frq is not an integer. *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else if etr.cat = "" then failwith etr.orth
      else
        (* Only a failed lookup is translated into the error below, so that
         * unrelated exceptions are not disguised as a missing rule.
         * NOTE(review): assumes StringMap.find signals a missing key with
         * Not_found - confirm against Xstd. *)
        let rule_variants =
          try StringMap.find rules_by_id etr.rule_id
          with Not_found ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule_variants with
          | [] -> failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
          | [_; fsuf] :: _ -> fsuf
          | _ :: _ -> failwith "bad entry in rule map"
        in
        (* parse the frequency once; it is both the initial value and the
         * increment *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map (fsuf ^ "~+~" ^ etr.cat) frq
          (fun accum -> accum + frq))
let _ =
  (* Script body: load the generalized frequency list and the frequency
   * rules, then write four tab-separated probability tables under doc/.
   * Every relative frequency is a count divided by the total frequency of
   * all loaded entries. *)
  let gen_freq = File.load_tab
      "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
      (function
        | [o;l;i;f;c;s;w;cr;ri;ct] ->
          { orth=o; lemma=l; interp=i; frq=f; compos=c; sgjp_status=s;
            word_type=w; corr=cr; rule_id=ri; cat=ct }
        | [] -> failwith "Empty entry in the freq file"
        | _::_ -> failwith "Malformatted entry in the freq file")
  in
  (* count the total frequency of all entries *)
  let total_freq = Xlist.fold gen_freq 0
      (fun tally etr -> tally + int_of_string etr.frq)
  in
  let float_total_freq = float_of_int total_freq
  in
  (* share of the total frequency taken by a raw count *)
  let rel_freq frq = float_of_int frq /. float_total_freq
  in
  (* make a map of the freq, indexed by word forms *)
  (* NOTE(review): freq_map and freq_map_lmitp are not used below; they are
   * kept because map_from_list is defined elsewhere - drop them once it is
   * confirmed to have no side effects. *)
  let freq_map = map_from_list gen_freq (fun etr -> etr.orth)
  in
  (* and another by lemma:cat *)
  let freq_map_lmcat = map_from_list gen_freq
      (fun etr -> etr.lemma ^ "~" ^ etr.cat)
  in
  let freq_map_lmitp = map_from_list gen_freq
      (fun etr -> etr.lemma ^ "~" ^ etr.interp)
  in
  (* count P(interp|cat)'s *)
  let itp_given_cat = map_interp_given_cat gen_freq
  in
  (* prepare a map of rules: id -> rule entry (as Rules) *)
  let rules_by_id =
    (map_from_list
       (* load the freq_rules *)
       (File.load_tab "../resources/SGJP/freq_rules.tab"
          (function [id; _; _; _; fsuf; _; _] -> [id; fsuf]
                  | _ -> failwith "bad entry in freq_rules.tab"))
       (* index by id *)
       (function [id; fsuf] -> id
               | _ -> failwith "error when making a map of freq rules"))
  in
  (* regexps are compiled once here rather than once per map entry *)
  let tilde = Str.regexp "~" in
  (* regexp_string matches the separator literally, so the + needs no
   * escaping *)
  let fsuf_cat_sep = Str.regexp_string "~+~" in
  (* print probabilities to be used by the model for SGJP entries *)
  (* every ~ in the lemma~cat key becomes a tab in the output line *)
  print_list "doc/prob_lemmacat.txt"
    (StringMap.fold freq_map_lmcat []
       (fun lst lmcat vnts ->
          Printf.sprintf "%s\t%f\n"
            (Str.global_replace tilde "\t" lmcat)
            (rel_freq (sum_list_freq vnts)) :: lst));
  (* Lines are gathered per cat and pushed onto a reversed accumulator that
   * is flipped once at the end; this yields the same order as appending
   * each cat's lines to the end of the list, without re-copying the whole
   * list for every cat. *)
  print_list "doc/prob_itp_givencat.txt"
    (List.rev
       (StringMap.fold itp_given_cat []
          (fun acc cat itps ->
             let cat_lines =
               StringMap.fold itps [] (fun ilst itp prob ->
                   Printf.sprintf "%s\t%s\t%f\n" itp cat prob :: ilst)
             in
             List.rev_append cat_lines acc)));
  let fsuf_probs = count_fsuf_freq gen_freq rules_by_id in
  print_list "doc/prob_fsuf.txt"
    (StringMap.fold fsuf_probs []
       (fun lst fsuf frq ->
          Printf.sprintf "%s\t%f\t%d\n" fsuf (rel_freq frq) frq :: lst));
  let fsuf_cat_probs = count_fsuf_cat_freq gen_freq rules_by_id in
  print_list "doc/prob_fsuf_cat.txt"
    (StringMap.fold fsuf_cat_probs []
       (fun lst fsufcat_str frq ->
          (* split the fsuf~+~cat key back into its two parts; a key that
           * does not yield both is reported on stdout and skipped *)
          match Str.split_delim fsuf_cat_sep fsufcat_str with
          | fsuf :: cat :: _ ->
            Printf.sprintf "%s\t%s\t%f\t%d\n" fsuf cat (rel_freq frq) frq
            :: lst
          | _ ->
            Printf.printf "problem with fsufcat %s -> %d\n" fsufcat_str frq;
            lst))