ENIAMinflexion.ml
9.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
(*
* ENIAMmorphology, a morphological analyser and a guesser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMmorphologyTypes
open Xstd
(* Builds the stem dictionary from the tab-separated file [filename].
   Every row must hold exactly four fields: stem, lemma suffix, aspect and a
   space-separated list of ids.  Each row contributes one
   [(lemma_suf, aspect, ids)] triple, with [ids] converted to a [StringSet.t],
   stored under its stem through [StringMap.add_inc] (presumably: insert the
   singleton list for a new stem, otherwise cons onto the existing list --
   confirm in Xstd).
   Raises [Failure] quoting the offending row when the field count differs. *)
let load_stems filename =
  let add_row acc fields =
    match fields with
    | [stem; lemma_suf; aspect; id_field] ->
      let entry = lemma_suf, aspect, StringSet.of_list (Xstring.split " " id_field) in
      StringMap.add_inc acc stem [entry] (fun earlier -> entry :: earlier)
    | _ -> failwith ("load_stems: " ^ String.concat " " fields) in
  File.fold_tab filename StringMap.empty add_row
(* Reads the tab-separated dictionary [filename] through [File.load_tab].
   A row needs at least three fields -- orthographic form, lemma and
   interpretation; any further fields are ignored.  Each row becomes an entry
   derived from [empty_entry] carrying the lemma and a single form derived
   from [empty_form].
   Raises [Failure] quoting the row when it has fewer than three fields. *)
let load_tab filename =
  let entry_of_fields fields =
    match fields with
    | orth :: lemma :: interp :: _ ->
      let form = {empty_form with orth=orth; interp=interp} in
      {empty_entry with lemma=lemma; forms=[form]}
    | _ -> failwith ("load_tab: " ^ (String.concat "\t" fields)) in
  File.load_tab filename entry_of_fields
(* Splits a lemma of the shape base:suffix into the pair [(base, suffix)].
   A lemma without a colon yields an empty suffix.
   Raises [Failure "simplify_lemma"] when [Xstring.split] returns anything
   other than one or two pieces. *)
let simplify_lemma s =
  let pieces = Xstring.split ":" s in
  match pieces with
  | [base] -> (base, "")
  | [base; suffix] -> (base, suffix)
  | [] | _ :: _ :: _ :: _ -> failwith "simplify_lemma"
(* Provenance of an interpretation, as assigned elsewhere in this file:
   [LemmaAlt] for entries taken from the alt table, [LemmaVal] when the stem
   is listed for the applied rule, [LemmNotVal] when it is not, and
   [TokNotFound] for the fallback produced when nothing matched. *)
type status = LemmaVal | LemmaAlt | LemmNotVal | TokNotFound

(* Textual name of a status; identical to the constructor name. *)
let string_of_status status =
  match status with
  | LemmaVal -> "LemmaVal"
  | LemmaAlt -> "LemmaAlt"
  | LemmNotVal -> "LemmNotVal"
  | TokNotFound -> "TokNotFound"
(* One interpretation of an orthographic form.
   - [lemma], [lemma_suf]: the lemma and its optional suffix; the printers in
     this file join them with a colon when [lemma_suf] is non-empty.
   - [interp]: the interpretation string.
   - [freq]: frequency copied from the applied rule, or 1 for alt-table and
     fallback entries.
   - [status]: how the lemma was obtained (see [status]).
   - [star]: star marker copied from the applied rule, or [Star] for
     alt-table and fallback entries.
   - [tags]: key/value pairs copied from the applied rule. *)
type t = {lemma: string; lemma_suf: string; interp: string; freq: int; status: status; star: star; tags: (string * string) list}
(* Formats one interpretation as a tab-separated line with six columns:
   lemma (with the suffix appended after a colon when present),
   interpretation, frequency, status, star and the tags written as
   space-separated key=value pairs. *)
let string_of_interpretation t =
  let full_lemma =
    match t.lemma_suf with
    | "" -> t.lemma
    | suffix -> t.lemma ^ ":" ^ suffix in
  let status = string_of_status t.status in
  let star = ENIAMmorphologyRules.string_of_star t.star in
  let tag_pairs = Xlist.map t.tags (fun (key, value) -> key ^ "=" ^ value) in
  Printf.sprintf "%s\t%s\t%d\t%s\t%s\t%s"
    full_lemma t.interp t.freq status star (String.concat " " tag_pairs)
(* Formats a list of interpretations, one [string_of_interpretation] line per
   element, joined with newlines (no trailing newline). *)
let string_of_interpretations l =
  let lines = Xlist.map l string_of_interpretation in
  String.concat "\n" lines
(* Converts one interpretation into an XML element named t.  The record
   fields become attributes (lemma, lemma_suf, interp, freq, status, star)
   and every tag becomes a child element named attr with name and value
   attributes. *)
let xml_of_interpretation t =
  let attributes = [
    "lemma", t.lemma;
    "lemma_suf", t.lemma_suf;
    "interp", t.interp;
    "freq", string_of_int t.freq;
    "status", string_of_status t.status;
    "star", ENIAMmorphologyRules.string_of_star t.star;
  ] in
  let children =
    Xlist.map t.tags (fun (key, value) ->
      Xml.Element("attr", ["name", key; "value", value], [])) in
  Xml.Element("t", attributes, children)
(* Wraps the result of an analysis in XML.  With an empty [msg] the
   interpretations [l] are emitted inside a data element; otherwise an error
   element holding [msg] is returned and [l] is ignored. *)
let xml_of_interpretations l msg =
  match msg with
  | "" -> Xml.Element("data", [], Xlist.map l xml_of_interpretation)
  | _ -> Xml.Element("error", [], [Xml.PCData msg])
(* Formats one interpretation as an HTML table row with six cells: lemma
   (with the suffix appended after a colon when present), interpretation,
   frequency, status, star and the tags as space-separated key=value pairs.
   The values are inserted as they are, without HTML escaping. *)
let html_of_interpretation t =
  let full_lemma =
    match t.lemma_suf with
    | "" -> t.lemma
    | suffix -> t.lemma ^ ":" ^ suffix in
  let status = string_of_status t.status in
  let star = ENIAMmorphologyRules.string_of_star t.star in
  let tag_pairs = Xlist.map t.tags (fun (key, value) -> key ^ "=" ^ value) in
  Printf.sprintf "<tr><td>%s</td><td>%s</td><td>%d</td><td>%s</td><td>%s</td><td>%s</td></tr>"
    full_lemma t.interp t.freq status star (String.concat " " tag_pairs)
(* Renders the interpretations [l] as an HTML table: a header row followed by
   one [html_of_interpretation] row per element, in the order of [l].
   When [msg] is non-empty it is returned unchanged instead of a table and
   [l] is ignored.
   The header used to carry a stray closing td tag after the star column,
   which produced malformed markup; it now has exactly six balanced cells.
   NOTE(review): [msg] is emitted without HTML escaping -- confirm that
   callers only pass trusted text. *)
let html_of_interpretations l msg =
  if msg <> "" then msg else
  let header =
    "<table><tr><td><b>lemma</b></td><td><b>interp</b></td><td><b>freq</b></td><td><b>status</b></td><td><b>star</b></td><td><b>attrs</b></td></tr>" in
  (* rev_map followed by rev keeps the rows in input order *)
  let rows = List.rev (Xlist.rev_map l html_of_interpretation) in
  header ^ String.concat "\n" rows ^ "</table>"
(* Extends the map [alt] with the dictionary stored in [alt_filename].
   Every form of every loaded entry is registered under its orthographic form
   as a pair [(true, interpretation)]: the lemma is split with
   [simplify_lemma], the frequency is 1, the status is [LemmaAlt], the star is
   [Star] and there are no tags.  Returns the extended map.
   Propagates the [Failure] raised by [load_tab] or [simplify_lemma]. *)
let prepare_alt alt alt_filename =
  let entries = load_tab alt_filename in
  Xlist.fold entries alt (fun acc entry ->
    Xlist.fold entry.forms acc (fun acc form ->
      let base, suffix = simplify_lemma entry.lemma in
      let interpretation =
        {lemma=base; lemma_suf=suffix; interp=form.interp; freq=1;
         status=LemmaAlt; star=Star; tags=[]} in
      let item = true, interpretation in
      StringMap.add_inc acc form.orth [item] (fun existing -> item :: existing)))
(* Loads the frequency rules from [rules_filename] and compiles them with
   [ENIAMmorphologyRules.CharTrees.create]. *)
let prepare_rules rules_filename =
  ENIAMmorphologyRules.CharTrees.create
    (ENIAMmorphologyRules.load_freq_rules rules_filename)
(* Loads the tab-separated file [wyglos_filename] and compiles it with
   [ENIAMmorphologyRules.CharTrees.create].  Every row must hold exactly six
   fields; only the first three are used: the frequency (parsed as an
   integer), a value stored in the rule's [interp] field and the string
   stored in its [find] field.  The remaining three fields are ignored.
   Raises [Failure] for a row with a different field count (and whatever
   [int_of_string] raises for a malformed frequency). *)
let prepare_wyglos wyglos_filename =
  let rule_of_fields fields =
    match fields with
    | [freq; con; s; _; _; _] ->
      {empty_rule with freq=int_of_string freq; find=s; interp=con}
    | _ -> failwith ("prepare_wyglos: " ^ (String.concat "\t" fields)) in
  let collected =
    File.fold_tab wyglos_filename [] (fun acc fields -> rule_of_fields fields :: acc) in
  ENIAMmorphologyRules.CharTrees.create collected
(* Module-level resources; all start out empty and are filled by [initialize]. *)

(* Interpretations from the alt dictionaries, keyed by orthographic form. *)
let alt : (bool * t) list StringMap.t ref = ref StringMap.empty

(* For each stem: the (lemma suffix, aspect, rule ids) triples read by [load_stems]. *)
let stems : (string * string * StringSet.t) list StringMap.t ref = ref StringMap.empty

(* Compiled inflexion rules used by [get_interpretations]. *)
let rules : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list ref = ref []

(* Compiled rules consulted by [check_fluency] through [char_tree_mem]. *)
let wyglos : (StringMap.key * ENIAMmorphologyRules.CharTrees.t) list ref = ref []
(* Loads every resource into the module-level references.  Must run before
   [get_interpretations] can return anything but the fallback result, since
   all references start out empty.  The file names are not defined in this
   file (presumably they come from ENIAMmorphologyTypes -- confirm).
   Any exception raised by a loader propagates; references assigned before
   the failure keep their new contents. *)
let initialize () =
(* the base alt dictionary is loaded first, then the supplement is merged into it *)
alt := prepare_alt StringMap.empty alt_filename;
alt := prepare_alt !alt alt_supplement_filename;
stems := load_stems stem_filename;
rules := prepare_rules rules_filename;
wyglos := prepare_wyglos wyglos_filename
(* let initialize () =
alt := prepare_alt StringMap.empty "resources/alt.tab";
stems := load_stems "resources/stem.tab";
rules := prepare_rules "resources/freq_rules.tab";
wyglos := prepare_wyglos "resources/wyglos.tab" *)
(* Splits [interp] on the delimiter pattern imperf\.perf with
   [Xstring.split_delim] and joins the pieces back with [aspect] in between,
   i.e. substitutes [aspect] for every occurrence of the delimiter
   (presumably the pattern is a regular expression -- confirm in Xstd). *)
let manage_aspect aspect interp =
  String.concat aspect (Xstring.split_delim "imperf\\.perf" interp)
(* match Xstring.split_delim "imperf\\.perf" interp with
[s] -> s
| [s;t] -> s ^ aspect ^ t
| _ -> failwith ("manage_aspect: " ^ interp) *)
(* let has_vovel_sufix s =
let n = String.length s in
let a = String.get s (n-1) in
if a = 'a' || a = 'e' || a = 'i' || a = 'o' || a = 'u' || a = 'y' then true else
let a = String.sub s (n-2) 2 in
if a = "ó" || a = "ą" || a = "ę" then true else
false *)
(* let has_vovel_sufix = function
"a" :: _ -> true
| "ą" :: _ -> true
| "e" :: _ -> true
| "ę" :: _ -> true
| "i" :: _ -> true
| "o" :: _ -> true
| "ó" :: _ -> true
| "u" :: _ -> true
| "y" :: _ -> true
| _ -> false *)
(* Returns [false] when joining [s] and [t] would split a two-letter cluster
   across the boundary, judged from the last byte of [s] and the start of [t]:
   c, d, r or s followed by z; c followed by h; d followed by ź or ż (the
   latter two are compared as two-byte UTF-8 prefixes of [t]).
   Returns [true] in every other case, in particular when either string is
   empty. *)
let check_diftongs s t =
  if s = "" || t = "" then true
  else
    let last = String.get s (String.length s - 1) in
    match last, String.get t 0 with
    | ('c' | 'd' | 'r' | 's'), 'z' -> false
    | 'c', 'h' -> false
    | 'd', _ ->
      (* ź and ż take two bytes, so a shorter [t] cannot start with them *)
      if String.length t < 2 then true
      else
        let head = String.sub t 0 2 in
        not (head = "ź" || head = "ż")
    | _ -> true
(* let check_diftongs = function
"c" :: _, "z" :: _ -> false
| "d" :: _, "z" :: _ -> false
| "r" :: _, "z" :: _ -> false
| "s" :: _, "z" :: _ -> false
| "d" :: _, "ź" :: _ -> false
| "d" :: _, "ż" :: _ -> false
| "c" :: _, "h" :: _ -> false
| _ -> true *)
(* Tells whether looking [s] up in [tree] yields at least one rule whose
   [interp] field has the expected value: e when [lcon2] is e or ′e, and the
   empty string otherwise. *)
let char_tree_mem tree lcon2 s =
  let wanted = if lcon2 = "e" || lcon2 = "′e" then "e" else "" in
  let matches = ENIAMmorphologyRules.CharTrees.find tree s in
  Xlist.fold matches false (fun found (_, rule) -> found || rule.interp = wanted)
(* Tells whether [s] starts with an uppercase letter: an ASCII letter A-Z or
   a Polish capital with a diacritic.  The latter occupy more than one byte
   in UTF-8, so they are matched as string prefixes rather than as chars.
   The empty string yields [false].
   Ą, Ę, Ó and Ź were missing from the list of recognised capitals, so strings
   starting with them were reported as not uppercase; all nine Polish
   capitals with diacritics are now covered. *)
let is_uppercase s =
  if s = "" then false else
  let c = String.get s 0 in
  if 'A' <= c && c <= 'Z' then true else
  List.exists (fun capital -> Xstring.check_prefix capital s)
    ["Ą"; "Ć"; "Ę"; "Ł"; "Ń"; "Ó"; "Ś"; "Ź"; "Ż"]
(* Heuristic filter telling whether applying [rule] to [stem] gives a
   plausible analysis.  The checks run in this order:
   - the boundary between [stem] and [rule.find] must pass [check_diftongs];
   - a rule whose [set] equals its [find] is accepted outright;
   - a rule is accepted unless its cat tag is noun and its lemma tag is ε;
   - for such noun rules: with star Aux, Aux2 or Acro the rule is accepted
     exactly when the lcon2 tag is empty; with any other star it is accepted
     when lcon2 is empty and [rule.find] starts with an uppercase letter;
     otherwise the candidate lemma [stem ^ rule.set] is looked up in the
     [wyglos] trees through [char_tree_mem]. *)
let check_fluency stem rule =
  if not (check_diftongs stem rule.find) then false
  else if rule.set = rule.find then true
  else begin
    (* tags are read only after the cheap checks above, as before *)
    let cat = ENIAMmorphologyRules.get_tag rule.tags "cat" in
    let lemma = ENIAMmorphologyRules.get_tag rule.tags "lemma" in
    let lcon2 = ENIAMmorphologyRules.get_tag rule.tags "lcon2" in
    if not (cat = "noun" && lemma = "ε") then true
    else
      let special_star =
        rule.star = Aux || rule.star = Aux2 || rule.star = Acro in
      if special_star && lcon2 <> "" then false
      else if (special_star || is_uppercase rule.find) && lcon2 = "" then true
      else char_tree_mem !wyglos lcon2 (stem ^ rule.set)
  end
(* Keeps the payloads of the candidates flagged [true].  When no candidate
   is flagged, the payloads of all candidates are returned instead.
   Note that the flagged payloads come out in reverse input order (they are
   accumulated by consing), whereas the fallback goes through [Xlist.map]. *)
let select_fluent candidates =
  let fluent =
    Xlist.fold candidates [] (fun acc -> function
      | (true, payload) -> payload :: acc
      | (false, _) -> acc) in
  match fluent with
  | [] -> Xlist.map candidates snd
  | _ :: _ -> fluent
(* Computes the interpretations of the orthographic form [orth].
   The starting set is whatever the [alt] map holds for [orth].  Then every
   (stem, rule) pair found for [orth] in the [rules] trees is examined:
   - the stem entries from [stems] whose id set contains [rule.id] each give
     a [LemmaVal] interpretation flagged as fluent, with the aspect
     substituted into the rule interpretation by [manage_aspect];
   - when there is no such entry, rules with star [Star] are dropped and any
     other rule gives a [LemmNotVal] interpretation flagged with the result
     of [check_fluency].
   The flagged candidates are filtered by [select_fluent]; when nothing is
   left, a single [TokNotFound] interpretation with lemma [orth] and
   interpretation unk is returned. *)
let get_interpretations orth =
  let candidates = ENIAMmorphologyRules.CharTrees.find !rules orth in
  let from_alt = try StringMap.find !alt orth with Not_found -> [] in
  let analysed = Xlist.fold candidates from_alt (fun acc (stem,rule) ->
    (* evaluated for every candidate, even when the flag ends up unused *)
    let fluency = check_fluency stem rule in
    let stem_entries = try StringMap.find !stems stem with Not_found -> [] in
    let validated = Xlist.fold stem_entries [] (fun kept (lemma_suf,aspect,ids) ->
      if StringSet.mem ids rule.id then (lemma_suf,aspect) :: kept else kept) in
    match validated with
    | [] ->
      if rule.star = Star then acc
      else
        let guess =
          {lemma=stem ^ rule.set; lemma_suf=""; interp=rule.interp; freq=rule.freq;
           status=LemmNotVal; star=rule.star; tags=rule.tags} in
        (fluency, guess) :: acc
    | _ :: _ ->
      Xlist.fold validated acc (fun acc (lemma_suf,aspect) ->
        let confirmed =
          {lemma=stem ^ rule.set; lemma_suf=lemma_suf;
           interp=manage_aspect aspect rule.interp; freq=rule.freq;
           status=LemmaVal; star=rule.star; tags=rule.tags} in
        (true, confirmed) :: acc)) in
  match select_fluent analysed with
  | [] ->
    [{lemma=orth; lemma_suf=""; interp="unk"; freq=1; status=TokNotFound; star=Star; tags=[]}]
  | selected -> selected
(* Exception-free wrapper around [get_interpretations].  Returns the
   interpretations paired with an empty message on success; when any
   exception is raised, returns an empty list paired with the text of the
   exception as produced by [Printexc.to_string]. *)
let catch_get_interpretations form =
  try (get_interpretations form, "")
  with exn -> ([], Printexc.to_string exn)