stem.ml
7.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
open Xstd
open Printf
open Types
(* Zakładam, że zbiór form należy do jednego leksemu *)
let adj_stem_sel = [
1,"adj:sg:nom.voc:f:pos", "a","";
1,"adj:sg:nom:f:pos", "a","";
]
let noun_stem_sel =
List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f";"p1";"p2";"p3";"m1:pt";"n:col";"n:ncol";"n:pt";"";""] (fun gender -> [
1,"subst:pl:loc:" ^ gender, "’ach","";
1,"subst:pl:loc:" ^ gender, "-ach","";
1,"subst:pl:loc:" ^ gender, "-etach","";
1,"subst:pl:loc:" ^ gender, "-otach","";
2,"subst:pl:dat:" ^ gender, "om","";
2,"subst:pl:loc:" ^ gender, "ach","";
2,"subst:pl:loc:" ^ gender, "ych","";
2,"subst:pl:loc:" ^ gender, "bich","bi";
2,"subst:pl:loc:" ^ gender, "cich","ci";
2,"subst:pl:loc:" ^ gender, "dzich","dzi";
2,"subst:pl:loc:" ^ gender, "fich","fi";
2,"subst:pl:loc:" ^ gender, "mich","mi";
2,"subst:pl:loc:" ^ gender, "nich","ni";
2,"subst:pl:loc:" ^ gender, "pich","pi";
2,"subst:pl:loc:" ^ gender, "sich","si";
2,"subst:pl:loc:" ^ gender, "wich","wi";
2,"subst:pl:loc:" ^ gender, "zich","zi";
2,"subst:pl:loc:" ^ gender, "kich","k";
2,"subst:pl:loc:" ^ gender, "gich","g";
2,"subst:pl:loc:" ^ gender, "lich","l";
2,"subst:pl:loc:" ^ gender, "żich","żi";
3,"subst:sg:gen:" ^ gender, "kiego","k";
3,"subst:sg:gen:" ^ gender, "ojego","oj";
3,"subst:sg:gen:" ^ gender, "nego","n";
3,"subst:sg:gen:" ^ gender, "tego","t";
3,"subst:sg:gen:" ^ gender, "wego","w";
3,"subst:sg:gen:" ^ gender, "siego","si";
3,"subst:sg:gen:" ^ gender, "ojej","oj";
])) @
[3,"subst:pl:inst:p1", "wem","w";
3,"subst:pl:inst:m1:pt", "wem","w";
3,"subst:pl:nom:m1", "owie","";
4,"subst:pl:gen:p1", "oich","oj";
]
let verb_stem_sel2 =
List.flatten (Xlist.map ["imperf";"perf";"imperf.perf"] (fun aspect -> [
4,"praet:sg:f:" ^ aspect, "kła","k";
4,"praet:sg:f:" ^ aspect, "gła","g";
4,"praet:sg:f:" ^ aspect, "zła","z";
4,"praet:sg:f:" ^ aspect, "sła","s";
4,"praet:sg:f:" ^ aspect, "zła","z";
4,"praet:sg:f:" ^ aspect, "dła","d";
4,"praet:sg:f:" ^ aspect, "tła","t";
4,"praet:sg:f:" ^ aspect, "bła","b";
4,"praet:sg:f:" ^ aspect, "łła","ł";
4,"praet:sg:f:" ^ aspect, "rła","r";
5,"inf:" ^ aspect, "ieć","";
6,"inf:" ^ aspect, "eć","";
(* 3,"ppas:sg:nom.voc:m1.m2.m3:" ^ aspect ^ ":aff", "ty",""; *)
(* 3,"praaaet:sg:f:" ^ aspect, "zła","z"; *)
]))
let prepare_stem_sel map stem_sel =
Xlist.fold stem_sel map (fun map (priority,tags,a,b) ->
StringMap.add_inc map tags [a,b,priority] (fun l -> (a,b,priority) :: l))
let stem_sel =
let map = prepare_stem_sel StringMap.empty adj_stem_sel in
let map = prepare_stem_sel map noun_stem_sel in
let map = prepare_stem_sel map verb_stem_sel2 in
map
let adv_stem_sel = [
"o","",1;
"wie","w",1;
"nie","n",1;
"dze","g",1;
"le","ł",1;
"cie","t",1;
"dzie","d",1;
"mie","m",1;
"rze","r",1;
"ce","k",1;
]
let verb_stem_sel = [
"ować","",1;
"owywać","",1;
"iwać","",1;
"ywać","",2;
"awać","",1;
"ać","",3;
"nąć","",1;
"ąć","",2;
(* "eć","e",1; *)
"ić","",1;
"yć","",1;
"uć","u",1;
(* "ć","",2; *)
]
let lemma_stem_sel =
let map = StringMap.add StringMap.empty "adv" adv_stem_sel in
let map = StringMap.add map "verb" verb_stem_sel in
map
let is_applicable_sel (pat,_,_) s = Xstring.check_sufix pat s
let apply_sel (pat,set,_) s =
(Xstring.cut_sufix pat s) ^ set
let get_priority (_,_,p) = p
let simplify_lemma s =
match Xstring.split ":" s with
[s] -> s
| [s;_] -> s
| _ -> failwith "simplify_lemma"
let simplify_lemma_full s =
match Xstring.split ":" s with
[s] -> s,""
| [s;t] -> s,t
| _ -> failwith "simplify_lemma"
let generate_stem entry =
let orth = simplify_lemma entry.lemma in
let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in
let stems = Xlist.fold lemma_stem_sel StringMap.empty (fun stems sel ->
if is_applicable_sel sel orth then
StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
else stems) in
let stems2 = Xlist.fold entry.forms StringMap.empty (fun stems form ->
let sels = try StringMap.find stem_sel form.interp with Not_found -> [] in
Xlist.fold sels stems (fun stems sel ->
if is_applicable_sel sel form.orth then
StringMap.add_inc stems (apply_sel sel form.orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
else stems)) in
let stems = if StringMap.is_empty stems then stems2 else stems in
let stems,_ = StringMap.fold stems ([],max_int) (fun (stems,priority) stem p ->
if p < priority then [stem],p else
if p > priority then stems,priority else
stem :: stems, priority) in
match stems with
[] -> (*print_endline ("stem not found for " ^ entry.lemma);
Xlist.iter entry.forms (fun (form.orth,form.interp) -> printf " %s\t%s\n" form.orth form.interp);*)
""
| [s] -> s
| l -> print_endline ("many stems found for " ^ entry.lemma ^ ": " ^ String.concat " " l); ""
(*printf "\"%s\"; " entry.lemma; ""*)
let phon_generate_stem entry =
let stems = Xlist.fold entry.phon_lemma StringMap.empty (fun stems orth ->
let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in
Xlist.fold lemma_stem_sel stems (fun stems sel ->
if is_applicable_sel sel orth then
StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
else stems)) in
let stems2 = Xlist.fold entry.forms StringMap.empty (fun stems form ->
Xlist.fold form.phon_orth stems (fun stems orth ->
let sels = try StringMap.find stem_sel form.interp with Not_found -> [] in
Xlist.fold sels stems (fun stems sel ->
if is_applicable_sel sel orth then
StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
else stems))) in
let stems = if StringMap.is_empty stems then stems2 else stems in
let stems,_ = StringMap.fold stems ([],max_int) (fun (stems,priority) stem p ->
if p < priority then [stem],p else
if p > priority then stems,priority else
stem :: stems, priority) in
(match stems with
[] -> print_endline ("stem not found for " ^ entry.lemma)
| [_] -> ()
| l -> print_endline ("many stems found for " ^ entry.lemma ^ ": " ^ String.concat " " l));
stems
let rec merge_digraph = function
[] -> []
| "b" :: "'" :: l -> "b'" :: (merge_digraph l)
| "f" :: "'" :: l -> "f'" :: (merge_digraph l)
| "c" :: "h" :: l -> "ch" :: (merge_digraph l)
| "c" :: "z" :: l -> "cz" :: (merge_digraph l)
| "d" :: "h" :: l -> "dh" :: (merge_digraph l)
| "d" :: "z" :: l -> "dz" :: (merge_digraph l)
| "d" :: "ź" :: l -> "dź" :: (merge_digraph l)
| "d" :: "ż" :: l -> "dż" :: (merge_digraph l)
| "g" :: "h" :: l -> "gh" :: (merge_digraph l)
| "n" :: "h" :: l -> "nh" :: (merge_digraph l)
| "r" :: "h" :: l -> "rh" :: (merge_digraph l)
| "r" :: "z" :: l -> "rz" :: (merge_digraph l)
| "s" :: "z" :: l -> "sz" :: (merge_digraph l)
| "q" :: "u" :: l -> "qu" :: (merge_digraph l)
| s :: l -> s :: (merge_digraph l)
let cut_stem_sufix s =
let l = Xunicode.utf8_chars_of_utf8_string s in
let l = match List.rev l with
"i" :: _ :: l -> l
| "j" :: _ :: l -> l
| _ :: l -> l
| _ -> [] in
String.concat "" (List.rev l)
let rec longest_common_prefix rev = function
x1 :: l1, x2 :: l2 -> if x1 = x2 then longest_common_prefix (x1 :: rev) (l1,l2) else List.rev rev
| _ -> List.rev rev
let generate_stem_lu lemma orth =
let l = longest_common_prefix [] (Xunicode.utf8_chars_of_utf8_string lemma,Xunicode.utf8_chars_of_utf8_string orth) in
let stem = String.concat "" l in
(* Printf.printf "%s %s %s\n%!" lemma orth stem; *)
stem