stem.ml
6.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
(*
* ENIAMmorphology, a morphological analyser and a guesser for Polish
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open Printf
open ENIAMmorphologyTypes
(* Zakładam, że zbiór form należy do jednego leksemu *)
let adj_stem_sel = [
1,"adj:sg:nom.voc:f:pos", "a","";
1,"adj:sg:nom:f:pos", "a","";
]
let noun_stem_sel =
List.flatten (Xlist.map ["m1";"m2";"m3";"n1";"n2";"f";"p1";"p2";"p3";"m1:pt";"n:col";"n:ncol";"n:pt";"";""] (fun gender -> [
1,"subst:pl:loc:" ^ gender, "’ach","";
1,"subst:pl:loc:" ^ gender, "-ach","";
1,"subst:pl:loc:" ^ gender, "-etach","";
1,"subst:pl:loc:" ^ gender, "-otach","";
2,"subst:pl:dat:" ^ gender, "om","";
2,"subst:pl:loc:" ^ gender, "ach","";
2,"subst:pl:loc:" ^ gender, "ych","";
2,"subst:pl:loc:" ^ gender, "bich","bi";
2,"subst:pl:loc:" ^ gender, "cich","ci";
2,"subst:pl:loc:" ^ gender, "dzich","dzi";
2,"subst:pl:loc:" ^ gender, "fich","fi";
2,"subst:pl:loc:" ^ gender, "mich","mi";
2,"subst:pl:loc:" ^ gender, "nich","ni";
2,"subst:pl:loc:" ^ gender, "pich","pi";
2,"subst:pl:loc:" ^ gender, "sich","si";
2,"subst:pl:loc:" ^ gender, "wich","wi";
2,"subst:pl:loc:" ^ gender, "zich","zi";
2,"subst:pl:loc:" ^ gender, "kich","k";
2,"subst:pl:loc:" ^ gender, "gich","g";
2,"subst:pl:loc:" ^ gender, "lich","l";
2,"subst:pl:loc:" ^ gender, "żich","żi";
3,"subst:sg:gen:" ^ gender, "kiego","k";
3,"subst:sg:gen:" ^ gender, "ojego","oj";
3,"subst:sg:gen:" ^ gender, "nego","n";
3,"subst:sg:gen:" ^ gender, "tego","t";
3,"subst:sg:gen:" ^ gender, "wego","w";
3,"subst:sg:gen:" ^ gender, "siego","si";
3,"subst:sg:gen:" ^ gender, "ojej","oj";
])) @
[3,"subst:pl:inst:p1", "wem","w";
3,"subst:pl:inst:m1:pt", "wem","w";
3,"subst:pl:nom:m1", "owie","";
4,"subst:pl:gen:p1", "oich","oj";
]
let verb_stem_sel2 =
List.flatten (Xlist.map ["imperf";"perf";"imperf.perf"] (fun aspect -> [
4,"praet:sg:f:" ^ aspect, "kła","k";
4,"praet:sg:f:" ^ aspect, "gła","g";
4,"praet:sg:f:" ^ aspect, "zła","z";
4,"praet:sg:f:" ^ aspect, "sła","s";
4,"praet:sg:f:" ^ aspect, "zła","z";
4,"praet:sg:f:" ^ aspect, "dła","d";
4,"praet:sg:f:" ^ aspect, "tła","t";
4,"praet:sg:f:" ^ aspect, "bła","b";
4,"praet:sg:f:" ^ aspect, "łła","ł";
4,"praet:sg:f:" ^ aspect, "rła","r";
5,"inf:" ^ aspect, "ieć","";
6,"inf:" ^ aspect, "eć","";
(* 3,"ppas:sg:nom.voc:m1.m2.m3:" ^ aspect ^ ":aff", "ty",""; *)
(* 3,"praaaet:sg:f:" ^ aspect, "zła","z"; *)
]))
let prepare_stem_sel map stem_sel =
Xlist.fold stem_sel map (fun map (priority,tags,a,b) ->
StringMap.add_inc map tags [a,b,priority] (fun l -> (a,b,priority) :: l))
let stem_sel =
let map = prepare_stem_sel StringMap.empty adj_stem_sel in
let map = prepare_stem_sel map noun_stem_sel in
let map = prepare_stem_sel map verb_stem_sel2 in
map
let adv_stem_sel = [
"o","",1;
"wie","w",1;
"nie","n",1;
"dze","g",1;
"le","ł",1;
"cie","t",1;
"dzie","d",1;
"mie","m",1;
"rze","r",1;
"ce","k",1;
]
let verb_stem_sel = [
"ować","",1;
"owywać","",1;
"iwać","",1;
"ywać","",2;
"awać","",1;
"ać","",3;
"nąć","",1;
"ąć","",2;
(* "eć","e",1; *)
"ić","",1;
"yć","",1;
"uć","u",1;
(* "ć","",2; *)
]
let lemma_stem_sel =
let map = StringMap.add StringMap.empty "adv" adv_stem_sel in
let map = StringMap.add map "verb" verb_stem_sel in
map
let is_applicable_sel (pat,_,_) s = Xstring.check_sufix pat s
let apply_sel (pat,set,_) s =
(Xstring.cut_sufix pat s) ^ set
let get_priority (_,_,p) = p
let simplify_lemma s =
match Xstring.split ":" s with
[s] -> s
| [s;_] -> s
| _ -> failwith "simplify_lemma"
let simplify_lemma_full s =
match Xstring.split ":" s with
[s] -> s,""
| [s;t] -> s,t
| _ -> failwith "simplify_lemma"
let generate_stem entry =
let orth = simplify_lemma entry.lemma in
let lemma_stem_sel = try StringMap.find lemma_stem_sel entry.cat with Not_found -> [] in
let stems = Xlist.fold lemma_stem_sel StringMap.empty (fun stems sel ->
if is_applicable_sel sel orth then
StringMap.add_inc stems (apply_sel sel orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
else stems) in
let stems2 = Xlist.fold entry.forms StringMap.empty (fun stems form ->
let sels = try StringMap.find stem_sel form.interp with Not_found -> [] in
Xlist.fold sels stems (fun stems sel ->
if is_applicable_sel sel form.orth then
StringMap.add_inc stems (apply_sel sel form.orth) (get_priority sel) (fun priority -> min priority (get_priority sel))
else stems)) in
let stems = if StringMap.is_empty stems then stems2 else stems in
let stems,_ = StringMap.fold stems ([],max_int) (fun (stems,priority) stem p ->
if p < priority then [stem],p else
if p > priority then stems,priority else
stem :: stems, priority) in
match stems with
[] -> (*print_endline ("stem not found for " ^ entry.lemma);
Xlist.iter entry.forms (fun form -> printf " %s\t%s\n" form.orth form.interp); *)
""
| [s] -> if s = "" then print_endline ("empty stem found for " ^ entry.lemma);
s
| l -> print_endline ("many stems found for " ^ entry.lemma ^ ": " ^ String.concat " " l); ""
(*printf "\"%s\"; " entry.lemma; ""*)
let cut_stem_sufix s =
let l = Xunicode.utf8_chars_of_utf8_string s in
let l = match List.rev l with
"i" :: _ :: l -> l
| "j" :: _ :: l -> l
| _ :: l -> l
| _ -> [] in
String.concat "" (List.rev l)
let rec longest_common_prefix rev = function
x1 :: l1, x2 :: l2 -> if x1 = x2 then longest_common_prefix (x1 :: rev) (l1,l2) else List.rev rev
| _ -> List.rev rev
let generate_stem_lu lemma orth =
let l = longest_common_prefix [] (Xunicode.utf8_chars_of_utf8_string lemma,Xunicode.utf8_chars_of_utf8_string orth) in
let stem = String.concat "" l in
(* Printf.printf "%s %s %s\n%!" lemma orth stem; *)
stem