inflexion.ml
13.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open Printf
let load_alt filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
List.rev (Xlist.fold l [] (fun l line ->
if String.length line = 0 then l else
if String.get line 0 = '#' then l else
match Str.split_delim (Str.regexp "\t") line with
[orth; lemma; interp] -> (orth,lemma,interp) :: l
| _ -> failwith ("load_alt: " ^ line)))
let load_dict filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
List.rev (Xlist.fold l [] (fun l line ->
if String.length line = 0 then l else
if String.get line 0 = '#' then l else
match Str.split_delim (Str.regexp "\t") line with
[stem; lemma_suf2; rule_names] -> (stem,lemma_suf2,Str.split (Str.regexp " ") rule_names) :: l
| _ -> failwith ("load_dict: " ^ line)))
let load_rules filename =
let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
List.rev (Xlist.fold l [] (fun l line ->
if String.length line = 0 then l else
if String.get line 0 = '#' then l else
match Str.split_delim (Str.regexp "\t") line with
[rule_name; quantity; lemma_suf; orth_suf; interp] -> (rule_name,int_of_string quantity,lemma_suf,orth_suf,interp) :: l
| _ -> failwith ("load_rules: " ^ line)))
let make_rules_map rules =
Xlist.fold rules StringMap.empty (fun rules (rule_name,quantity,lemma_suf,orth_suf,interp) ->
let rules2 = try StringMap.find rules orth_suf with Not_found -> StringMap.empty in
let rules2 = StringMap.add rules2 rule_name (lemma_suf,interp) in
StringMap.add rules orth_suf rules2)
module OrderedChar = struct
type t = char
let compare = compare
end
module CharMap = Xmap.Make(OrderedChar)
type char_tree = N of char_tree CharMap.t * (string * string list) list * (string * string) list
(* następne możliwości * (lemma_suf2 * lista reguł) list * lista alt *)
type char_tree_suf = M of char_tree_suf CharMap.t * (string * int * string * string) list
(* następne możliwości * (rule_name * lemma_suf * interp) list *)
let empty_char_tree = N(CharMap.empty,[],[])
let empty_char_tree_suf = M(CharMap.empty,[])
let rec add_path_dict stem i n lemma_suf2 rule_names (N(map,rules,alts)) =
if i = n then N(map,(lemma_suf2,rule_names) :: rules,alts) else
let tree = try CharMap.find map (String.get stem i) with Not_found -> empty_char_tree in
let tree = add_path_dict stem (i+1) n lemma_suf2 rule_names tree in
N(CharMap.add map (String.get stem i) tree,rules,alts)
let rec add_path_alt stem i n lemma interp (N(map,rules,alts)) =
if i = n then N(map,rules,(lemma,interp) :: alts) else
let tree = try CharMap.find map (String.get stem i) with Not_found -> empty_char_tree in
let tree = add_path_alt stem (i+1) n lemma interp tree in
N(CharMap.add map (String.get stem i) tree,rules,alts)
let make_char_tree dict alt =
let tree = Xlist.fold dict empty_char_tree (fun tree (stem,lemma_suf2,rule_names) ->
add_path_dict stem 0 (String.length stem) lemma_suf2 rule_names tree) in
Xlist.fold alt tree (fun tree (orth,lemma,interp) ->
add_path_alt orth 0 (String.length orth) lemma interp tree)
let rec add_path_rules rule_name quantity orth_suf i lemma_suf interp (M(map,rules)) =
if i = -1 then M(map,(rule_name,quantity,lemma_suf,interp) :: rules) else
let tree = try CharMap.find map (String.get orth_suf i) with Not_found -> empty_char_tree_suf in
let tree = add_path_rules rule_name quantity orth_suf (i-1) lemma_suf interp tree in
M(CharMap.add map (String.get orth_suf i) tree,rules)
let make_char_tree_suf rules =
let tree = Xlist.fold rules empty_char_tree_suf (fun tree (rule_name,quantity,lemma_suf,orth_suf,interp) ->
add_path_rules rule_name quantity orth_suf (String.length orth_suf - 1) lemma_suf interp tree) in
tree
let rec find_char_tree_rec i n orth (N(map,rules,alts)) =
if i = n then [orth,"",rules,alts] else
let l = try find_char_tree_rec (i+1) n orth (CharMap.find map (String.get orth i)) with Not_found -> [] in
(String.sub orth 0 i,String.sub orth i (n-i),rules,[]) :: l
let find_char_tree tree rules orth =
let l = find_char_tree_rec 0 (String.length orth) orth tree in
Xlist.fold l [] (fun found (stem,suf,rule_names,alts) ->
let rules2 = try StringMap.find rules suf with Not_found -> StringMap.empty in
let found = alts @ found in
Xlist.fold rule_names found (fun found (lemma_suf2,rule_names2) ->
Xlist.fold rule_names2 found (fun found rule_name ->
try
let lemma_suf,interp = StringMap.find rules2 rule_name in
let lemma = if lemma_suf2 = "" then stem ^ lemma_suf else stem ^ lemma_suf ^ ":" ^ lemma_suf2 in
(lemma,interp) :: found
with Not_found -> found)))
let rec find_char_tree_suf_rec i orth (M(map,rules)) =
if i = 0 then Xlist.fold rules [] (fun l (rule_name,quantity,lemma_suf,interp) -> ("", rule_name, quantity, lemma_suf, interp, i) :: l) else
let l = try find_char_tree_suf_rec (i-1) orth (CharMap.find map (String.get orth (i-1))) with Not_found -> [] in
Xlist.fold rules l (fun l (rule_name, quantity, lemma_suf,interp) ->
(String.sub orth 0 i, rule_name, quantity, String.sub orth 0 i ^ lemma_suf, interp, i) :: l)
let find_char_tree_suf rules_tree stem_map alt_map orth =
let alt_l = Xlist.rev_map (try StringMap.find alt_map orth with Not_found -> []) (fun (lemma,interp) -> lemma,interp,1,[]) in
let l = find_char_tree_suf_rec (String.length orth) orth rules_tree in
let found = Xlist.fold l alt_l (fun found (stem,rule_name,quantity,lemma,interp,i) ->
try
let rule_names = StringMap.find stem_map stem in
if StringSet.mem rule_names rule_name then (lemma,interp,1,[]) :: found else found (* FIXME: czy na pewno nie dodawać reguł niepasujących? to powoduje że lemat tak samo brzmiący a mający inną odmianę nie zostanie rozpoznany *)
with Not_found -> if quantity < 100 || (String.length orth = i && stem = lemma) then found else (lemma,interp,quantity,["lemma not validated"]) :: found) in (* FIXME: ucięcie żadkich reguł powinno być inaczej sterowane *)
(* if found = [] then [orth,"unk",1,["token not found"]] else *)
let found = (orth,"unk",1,["token not found"]) :: found in
let valid = Xlist.fold found [] (fun valid -> function
lemma,interp,quantity,[] -> (lemma,interp,quantity,[]) :: valid
| _ -> valid) in
if valid = [] then found else valid
let prepare_inflexion alt_filename dict_filename rules_filename =
let alt = load_alt alt_filename in
let dict = load_dict dict_filename in
let rules = load_rules rules_filename in
let tree = make_char_tree dict alt in
let rules = make_rules_map rules in
tree,rules
let tree,rules =
(* prepare_inflexion (morfeusz_path ^ Paths.alt_adj) (morfeusz_path ^ Paths.dict_adj) (morfeusz_path ^ Paths.rules_adj) *)
(* prepare_inflexion (morfeusz_path ^ Paths.alt_all) (morfeusz_path ^ Paths.dict_all) (morfeusz_path ^ Paths.rules_all) *)
empty_char_tree,StringMap.empty
let make_alt_map alt =
Xlist.fold alt StringMap.empty (fun alt_map (orth,lemma,interp) ->
StringMap.add_inc alt_map orth [lemma,interp] (fun l -> (lemma,interp) :: l))
let prepare_inflexion_suf alt_filename dict_filename rules_filename =
let alt = load_alt alt_filename in
let rules = load_rules rules_filename in
let rules_tree = make_char_tree_suf rules in
let alt_map = make_alt_map alt in
let dict = load_dict dict_filename in
let stem_map = Xlist.fold dict StringMap.empty (fun stem_map (stem,lemma_suf2,rule_names) ->
StringMap.add_inc stem_map stem (StringSet.of_list rule_names) (fun set -> Xlist.fold rule_names set StringSet.add)) in
alt_map,rules_tree,stem_map
let alt_map,rules_tree,stem_map =
prepare_inflexion_suf (Paths.sgjp_path ^ Paths.alt_all) (Paths.sgjp_path ^ Paths.dict_all) (Paths.sgjp_path ^ Paths.rules_all)
let check_prefix pat s =
let n = String.length pat in
if n > String.length s then false else
String.sub s 0 n = pat
let cut_prefix pat s =
let i = String.length pat in
let n = String.length s in
if i >= n then "" else
try String.sub s i (n-i) with _ -> failwith ("cut_prefix: " ^ s ^ " " ^ string_of_int i)
let check_sufix pat s =
let n = String.length pat in
let m = String.length s in
if n > m then false else
String.sub s (m-n) n = pat
let cut_sufix pat s =
let i = String.length pat in
let n = String.length s in
try String.sub s 0 (n-i) with _ -> failwith ("cut_sufix: " ^ s)
let rec select_interp_sufix pat = function
[] -> []
| (lemma,interp) :: l -> if check_sufix pat interp then (lemma,interp) :: (select_interp_sufix pat l) else select_interp_sufix pat l
let rec select_interp_sufix_suf pat = function
[] -> []
| (lemma,interp,quantity,attrs) :: l -> if check_sufix pat interp then (lemma,interp,quantity,attrs) :: (select_interp_sufix_suf pat l) else select_interp_sufix_suf pat l
let rec remove_interp_sufix pat = function
[] -> []
| (lemma,interp) :: l -> if check_sufix pat interp then remove_interp_sufix pat l else (lemma,interp) :: (remove_interp_sufix pat l)
let rec remove_interp_sufix_suf pat = function
[] -> []
| (lemma,interp,quantity,attrs) :: l ->
if interp = "adv:sup" then (lemma,interp,quantity,attrs) :: (remove_interp_sufix_suf pat l) else (* FIXME: zaślepka, wymaga poprawienia algorytmu generowania słowników *)
if check_sufix pat interp then remove_interp_sufix_suf pat l else (lemma,interp,quantity,attrs) :: (remove_interp_sufix_suf pat l)
let get_interpretations orth =
(if check_prefix "naj" orth then select_interp_sufix ":sup" (find_char_tree tree rules (cut_prefix "naj" orth)) else []) @
(if check_prefix "nie" orth then select_interp_sufix ":neg" (find_char_tree tree rules (cut_prefix "nie" orth)) else []) @
(remove_interp_sufix ":neg" (remove_interp_sufix ":sup" (find_char_tree tree rules orth)))
let get_interpretations_suf orth = (* FIXME: nie działa dla adv:sup pisanych z wielkiej litery np Najdoskonalej Najlepiej *)
if orth = "siebie" then ["siebie","siebie:acc.gen",1,[]] else
if orth = "sobie" then ["siebie","siebie:dat.loc",1,[]] else
if orth = "sobą" then ["siebie","siebie:inst",1,[]] else
(if check_prefix "naj" orth then select_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "naj" orth)) else []) @
(if check_prefix "nie" orth then select_interp_sufix_suf ":neg" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "nie" orth)) else []) @
(if check_prefix "Naj" orth then select_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "Naj" orth)) else []) @
(if check_prefix "Nie" orth then select_interp_sufix_suf ":neg" (find_char_tree_suf rules_tree stem_map alt_map (cut_prefix "Nie" orth)) else []) @
(remove_interp_sufix_suf ":neg" (remove_interp_sufix_suf ":sup" (find_char_tree_suf rules_tree stem_map alt_map orth)))
(* Testy *)
let print_interpretations l =
Xlist.iter (Xlist.sort l compare) (fun (lemma,interp) ->
printf "%s\t%s\n" lemma interp)
(*let _ =
let l = get_interpretations "życzliwą" in
print_interpretations l;
let l = get_interpretations "żyźniejszego" in
print_interpretations l;
let l = get_interpretations "zwiśli" in
print_interpretations l;
let l = get_interpretations "najzieleńsza" in
print_interpretations l;
let l = get_interpretations "najtandetniejsza" in
print_interpretations l;
let l = get_interpretations "nieżelazny" in
print_interpretations l;
()*)
(*let sgjp_filename = "sgjp-20151020.tab"
let polimorf_filename = "polimorf-20151020.tab"
let _ =
let interp_sel = Morf.load_interp_sel "data/interps.tab" in
print_endline "loading sgjp";
let sgjp = Morf.load_tab (morfeusz_path ^ sgjp_filename) in
print_endline "loading polimorf";
let polimorf = Morf.load_tab (morfeusz_path ^ polimorf_filename) in
print_endline "merging";
let dicts = Morf.merge_dicts [sgjp;polimorf] in
let adj_interp_sel = StringMap.find interp_sel "adj" in
let adj_sup_interp_sel = StringMap.find interp_sel "adj-sup" in
(* let dicts = Morf.remove_prefix dicts "naj" adj_sup_interp_sel in *)
print_endline "preparing queries";
let queries = StringMap.fold dicts StringMap.empty (fun queries lemma interps ->
let interps = Morf.select_interps interps (adj_interp_sel @ adj_sup_interp_sel) in
StringMap.fold interps queries (fun queries interp orths ->
Xlist.fold orths queries (fun queries orth ->
let s = lemma ^ "\t" ^ interp in
StringMap.add_inc queries orth (StringSet.singleton s) (fun set -> StringSet.add set s)))) in
print_endline "testing";
StringMap.iter queries (fun orth set ->
let set = Xlist.fold (get_interpretations orth) set (fun set (lemma,interp) ->
let s = lemma ^ "\t" ^ interp in
if StringSet.mem set s then StringSet.remove set s else (
printf "excessing interpretation: %s\t%s" orth s;
set)) in
if StringSet.is_empty set then () else
StringSet.iter set (fun s ->
printf "lacking interpretation: %s\t%s" orth s))*)