|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
|
(* Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Dec 2016-Jan 2017. *)
open FreqUtils
open Xstd
let count_fsuf_freq freq rules_by_id =
  (* Return a map: fsuf (the find-suffix of a rule) -> summed frequency of
   * the entries of [freq] whose rule carries that fsuf.
   *
   * freq        - list of frequency entries; only the [rule_id] and [frq]
   *               fields are read here
   * rules_by_id - map: rule id -> list of rows of the form [id; fsuf]
   *
   * Entries whose rule_id is "" or "ALT" are skipped.
   * Raises Failure when the rule id is absent from [rules_by_id], when the
   * row found is not of the form [id; fsuf], or (through int_of_string)
   * when [frq] is not an integer literal. *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else
        let rule =
          (* the map keeps a list of rows per id; only the first is used.
           * Catch just the lookup failures (missing key, empty list) so
           * that unrelated exceptions are not relabelled. *)
          try List.hd (StringMap.find rules_by_id etr.rule_id)
          with Not_found | Failure _ ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule with
          | [_; fsuf] -> fsuf
          | _ -> failwith "bad entry in rule map"
        in
        (* parse once; used both as the initial value and as the increment *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map fsuf frq (fun accum -> accum + frq))
let count_fsuf_cat_freq freq rules_by_id =
  (* Return a map: "fsuf~+~cat" -> summed frequency of the entries of
   * [freq] whose rule carries that fsuf and whose [cat] field is that cat.
   * The key is the rule's find-suffix, the separator "~+~" and the entry's
   * category, concatenated.
   *
   * freq        - list of frequency entries; the [rule_id], [cat], [orth]
   *               and [frq] fields are read here
   * rules_by_id - map: rule id -> list of rows of the form [id; fsuf]
   *
   * Entries whose rule_id is "" or "ALT" are skipped.
   * Raises Failure with the entry's [orth] as message when a non-skipped
   * entry has an empty [cat]; also raises Failure when the rule id is
   * absent from [rules_by_id], when the row found is not of the form
   * [id; fsuf], or (through int_of_string) when [frq] is not an integer
   * literal. *)
  Xlist.fold freq StringMap.empty
    (fun map etr ->
      if etr.rule_id = "" || etr.rule_id = "ALT" then map
      else if etr.cat = "" then failwith etr.orth
      else
        let rule =
          (* the map keeps a list of rows per id; only the first is used.
           * Catch just the lookup failures (missing key, empty list) so
           * that unrelated exceptions are not relabelled. *)
          try List.hd (StringMap.find rules_by_id etr.rule_id)
          with Not_found | Failure _ ->
            failwith (Printf.sprintf "can't find rule %s" etr.rule_id)
        in
        let fsuf =
          match rule with
          | [_; fsuf] -> fsuf
          | _ -> failwith "bad entry in rule map"
        in
        (* parse once; used both as the initial value and as the increment *)
        let frq = int_of_string etr.frq in
        StringMap.add_inc map (fsuf ^ "~+~" ^ etr.cat) frq
          (fun accum -> accum + frq))
let _ =
  (* Script body: load the NKJP1M generalized frequency list, split it into
   * entries known to SGJP and entries marked NON-SGJP, and write four
   * tab-separated tables under doc/ :
   *   prob_lemmacat.txt    - lemma, cat, relative frequency (in-SGJP part)
   *   prob_itp_givencat.txt- interp, cat, value from map_interp_given_cat
   *   prob_fsuf.txt        - fsuf, relative frequency, raw count
   *   prob_fsuf_cat.txt    - fsuf, cat, relative frequency, raw count
   * The last two are computed over the NON-SGJP part and sorted by raw
   * count, highest first.
   * Paths are relative, so the working directory matters.
   * print_list, sum_list_freq, map_from_list and map_interp_given_cat are
   * helpers defined outside this file (presumably in FreqUtils - confirm). *)

  (* word types excluded from all statistics *)
  let skipped_word_types = ["SYMB"; "COMPD"; "WEB"; "ACRO"] in
  let gen_freq =
    List.filter
      (* filter out errors and symbols *)
      (fun etr ->
        etr.corr = "CORR" && not (List.mem etr.word_type skipped_word_types))
      (File.load_tab
         "../resources/NKJP1M/NKJP1M-generalized-frequency.tab"
         (function
           | [o; l; i; f; c; s; w; cr; ri; ct] ->
             { orth=o; lemma=l; interp=i; frq=f; compos=c; sgjp_status=s;
               word_type=w; corr=cr; rule_id=ri; cat=ct }
           | [] -> failwith "Empty entry in the freq file"
           | _ :: _ -> failwith "Malformatted entry in the freq file"))
  in
  (* split gen_freq into its NON-SGJP and in-SGJP parts.  The test must be
   * structural: physical inequality on strings is true for practically any
   * pair of values, which would put every entry into the in-SGJP part. *)
  let non_sgjp_freq, in_sgjp_freq =
    List.partition (fun etr -> etr.sgjp_status = "NON-SGJP") gen_freq
  in
  let float_total_insgjp_freq = float_of_int (sum_list_freq in_sgjp_freq) in
  (* in-SGJP entries indexed by "lemma~cat" *)
  let freq_insgjp_map_lmcat =
    map_from_list in_sgjp_freq (fun etr -> etr.lemma ^ "~" ^ etr.cat)
  in
  (* count P(interp|cat) values *)
  let insgjp_itp_given_cat = map_interp_given_cat in_sgjp_freq in
  (* prepare a map of rules: id -> list of [id; fsuf] rows *)
  let rules_by_id =
    map_from_list
      (* load the freq_rules, keeping the 1st and 5th column of 7 *)
      (File.load_tab "../resources/SGJP/freq_rules.tab"
         (function
           | [id; _; _; _; fsuf; _; _] -> [id; fsuf]
           | _ -> failwith "bad entry in freq_rules.tab"))
      (* index by id *)
      (function
        | [id; _] -> id
        | _ -> failwith "error when making a map of freq rules")
  in
  (* print probabilities to be used by the model for SGJP entries *)
  let tilde = Str.regexp "~" in
  print_list "doc/prob_lemmacat.txt"
    (StringMap.fold freq_insgjp_map_lmcat []
       (fun lst lmcat vnts ->
         Printf.sprintf "%s\t%f\n"
           (* the "lemma~cat" key becomes two tab-separated columns *)
           (Str.global_replace tilde "\t" lmcat)
           (float_of_int (sum_list_freq vnts) /. float_total_insgjp_freq)
         :: lst));
  print_list "doc/prob_itp_givencat.txt"
    (StringMap.fold insgjp_itp_given_cat []
       (fun lst cat itps ->
         lst
         @ StringMap.fold itps []
             (fun ilst itp prob ->
               Printf.sprintf "%s\t%s\t%f\n" itp cat prob :: ilst)));
  (* NOTE(review): the two fsuf tables below are counted over the NON-SGJP
   * entries but normalised by the in-SGJP total, exactly as before.  The
   * previous revision also computed an unused NON-SGJP total
   * (sum_list_freq non_sgjp_freq), which may have been the intended
   * denominator - confirm before relying on these values. *)
  let fsuf_probs = count_fsuf_freq non_sgjp_freq rules_by_id in
  (* sort the (fsuf, count) pairs first and format afterwards, so the
   * comparator never has to re-parse an output line *)
  let fsuf_rows =
    List.sort
      (fun (_, frq1) (_, frq2) -> compare frq2 frq1)
      (StringMap.fold fsuf_probs [] (fun lst fsuf frq -> (fsuf, frq) :: lst))
  in
  print_list "doc/prob_fsuf.txt"
    (List.map
       (fun (fsuf, frq) ->
         Printf.sprintf "%s\t%f\t%d\n" fsuf
           (float_of_int frq /. float_total_insgjp_freq) frq)
       fsuf_rows);
  let fsuf_cat_probs = count_fsuf_cat_freq non_sgjp_freq rules_by_id in
  (* literal separator used by count_fsuf_cat_freq when building its keys *)
  let fsufcat_sep = Str.regexp_string "~+~" in
  let fsuf_cat_rows =
    List.sort
      (fun (_, _, frq1) (_, _, frq2) -> compare frq2 frq1)
      (StringMap.fold fsuf_cat_probs []
         (fun lst fsufcat_str frq ->
           match Str.split_delim fsufcat_sep fsufcat_str with
           | fsuf :: cat :: _ -> (fsuf, cat, frq) :: lst
           | _ ->
             (* key without a separator: report it and leave it out *)
             (Printf.printf "problem with fsufcat %s -> %d\n"
                fsufcat_str frq;
              lst)))
  in
  print_list "doc/prob_fsuf_cat.txt"
    (List.map
       (fun (fsuf, cat, frq) ->
         Printf.sprintf "%s\t%s\t%f\t%d\n" fsuf cat
           (float_of_int frq /. float_total_insgjp_freq) frq)
       fsuf_cat_rows)
|