ENIAMplWordnet.ml
18.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
(*
* ENIAMplWordnet, a converter for Polish Wordnet "Słowosieć".
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAMplWordnetTypes
let process_unit = function
Xml.Element("unit-id",[],[Xml.PCData s]) -> int_of_string s, empty_lu
| node -> failwith ("process_unit " ^ (Xml.to_string node))
let process_tests = function
Xml.Element("test",["text",text;"pos",pos],[]) -> text,pos
| node -> failwith ("process_tests " ^ (Xml.to_string node))
let process_abstract = function
"true" -> true
| "false" -> false
| _ -> failwith "process_abstract"
(* funkcja zwraca:
lexical-unit map - wiąże leksemy z identyfikatorami
synset map
lexicalrelations
synsetrelations
relationtypes map
*)
let process_entry (lumap,synmap,lr,sr,rtmap) = function
Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"workstate",workstate;
"source",source;"variant",variant],[]) ->
let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc="";
lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 2") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"desc",desc;"workstate",workstate;
"source",source;"variant",variant],[]) ->
let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc=desc;
lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 3") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"definition",definition;"desc",desc;
"abstract",abstract],units) ->
let units = Xlist.map units process_unit in
let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition=definition;
syn_desc=desc; syn_abstract=process_abstract abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"desc",desc;
"abstract",abstract],units) ->
let units = Xlist.map units process_unit in
let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition="";
syn_desc=desc; syn_abstract=process_abstract abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("lexicalrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
let lr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: lr in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synsetrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
let sr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: sr in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"reverse",reverse;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"reverse",reverse;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| node -> print_endline (Xml.to_string node); failwith "process_entry 1"
let load_data filename =
match try Xml.parse_file filename with Xml.Error e -> failwith ("load_data Xml.Error " ^ Xml.error e) with
Xml.Element("array-list",_,entries) ->
Xlist.fold entries (IntMap.empty,IntMap.empty,[],[],IntMap.empty) process_entry
| node -> failwith ("load_data " ^ (Xml.to_string node))
let check_lu_syn_consistency lumap synmap =
let set = IntMap.fold lumap IntSet.empty (fun set id _ ->
if IntSet.mem set id then failwith "check_lu_syn_consistency 1" else
IntSet.add set id) in
let set = IntMap.fold synmap set (fun set _ syn ->
Xlist.fold syn.syn_units set (fun set (id,_) ->
if not (IntSet.mem set id) then failwith "check_lu_syn_consistency 2" else
IntSet.remove set id)) in
if not (IntSet.is_empty set) then failwith "check_lu_syn_consistency 3" else
()
let merge_lu_syn lumap synmap =
IntMap.map synmap (fun syn ->
let units = Xlist.map syn.syn_units (fun (id,_) -> id, IntMap.find lumap id) in
let pos = match StringSet.to_list (Xlist.fold units StringSet.empty (fun set (_,lu) ->
StringSet.add set lu.lu_pos)) with
[] -> failwith "merge_lu_syn: empty synset"
| [pos] -> pos
| _ -> failwith "merge_lu_syn: inconsistent pos" in
{syn with syn_units=units; syn_pos=pos})
let set_lu_syn lumap synmap =
IntMap.fold synmap lumap (fun lumap syn_id syn ->
Xlist.fold syn.syn_units lumap (fun lumap (id,_) ->
let lu = try IntMap.find lumap id with Not_found -> failwith "set_lu_syn" in
if lu.lu_syn <> -1 then failwith "set_lu_syn" else
IntMap.add lumap id {lu with lu_syn=syn_id}))
let count_relations qmap rtmap rels =
Xlist.fold rels qmap (fun qmap rel ->
if not (IntMap.mem rtmap rel.r_relation) then print_endline ("unknown relation: " ^ string_of_int rel.r_relation);
IntQMap.add qmap rel.r_relation)
let lu_name lu =
lu.lu_name ^ "-" ^ lu.lu_variant
let syn_name syn =
String.concat ", " (Xlist.map syn.syn_units (fun (_,lu) -> lu_name lu))
let syn_name_single syn =
if syn.syn_units = [] then "empty" else
lu_name (snd (List.hd syn.syn_units))
let pwn_pos = ["czasownik pwn"; "przymiotnik pwn"; "przysłówek pwn"; "rzeczownik pwn"]
let is_pwn_lu lu =
Xlist.mem pwn_pos lu.lu_pos
let is_pwn_syn syn =
Xlist.mem pwn_pos syn.syn_pos
let get_pos_lu lu = lu.lu_pos
let get_pos_syn syn = syn.syn_pos
let add_pwn_qmap map rel parent child =
let s = Printf.sprintf "%s-%s" parent child in
IntMap.add_inc map rel.r_relation (StringQMap.add StringQMap.empty s) (fun qmap -> StringQMap.add qmap s)
let test_pwn_elem is_pwn_fun map elem =
try
if is_pwn_fun (IntMap.find map elem) then "en" else "pl"
with Not_found -> "NF"
let test_pos_elem get_pos_fun map elem =
try
get_pos_fun (IntMap.find map elem)
with Not_found -> "NF"
let count_pwn_relation qmap lumap synmap rtmap rels t =
Xlist.fold rels qmap (fun qmap rel ->
match (*(IntMap.find rtmap rel.r_relation).rt_type,*)t with
(*"relacja pomiędzy synsetami",*)"sr" -> add_pwn_qmap qmap rel (test_pwn_elem is_pwn_syn synmap rel.r_parent) (test_pwn_elem is_pwn_syn synmap rel.r_child)
| (*"relacja leksykalna",*)"lr" -> add_pwn_qmap qmap rel (test_pwn_elem is_pwn_lu lumap rel.r_parent) (test_pwn_elem is_pwn_lu lumap rel.r_child)
(* | "relacja synonimii" -> qmap *)
| _ -> failwith "count_pwn_relation")
let count_pos_relation qmap lumap synmap rtmap rels t =
Xlist.fold rels qmap (fun qmap rel ->
match (*(IntMap.find rtmap rel.r_relation).rt_type,*)t with
(*"relacja pomiędzy synsetami",*)"sr" -> add_pwn_qmap qmap rel (test_pos_elem get_pos_syn synmap rel.r_parent) (test_pos_elem get_pos_syn synmap rel.r_child)
| (*"relacja leksykalna",*)"lr" -> add_pwn_qmap qmap rel (test_pos_elem get_pos_lu lumap rel.r_parent) (test_pos_elem get_pos_lu lumap rel.r_child)
(* | "relacja synonimii" -> qmap *)
| _ -> failwith "count_pwn_relation")
let select_plWordnet lumap synmap lr sr rtmap =
let lr = Xlist.fold lr [] (fun lr rel ->
if test_pwn_elem is_pwn_lu lumap rel.r_parent = "pl" &&
test_pwn_elem is_pwn_lu lumap rel.r_child = "pl" &&
IntSet.mem pl_pl_relations rel.r_relation then rel :: lr else lr) in
let sr = Xlist.fold sr [] (fun sr rel ->
if test_pwn_elem is_pwn_syn synmap rel.r_parent = "pl" &&
test_pwn_elem is_pwn_syn synmap rel.r_child = "pl" &&
IntSet.mem pl_pl_relations rel.r_relation then rel :: sr else sr) in
let lumap = IntMap.fold lumap IntMap.empty (fun lumap id lu ->
if is_pwn_lu lu then lumap else IntMap.add lumap id lu) in
let synmap = IntMap.fold synmap IntMap.empty (fun synmap id syn ->
if is_pwn_syn syn then synmap else IntMap.add synmap id syn) in
let rtmap = IntMap.fold rtmap IntMap.empty (fun rtmap id rt ->
if IntSet.mem pl_pl_relations id then IntMap.add rtmap id rt else rtmap) in
lumap,synmap,lr,sr,rtmap
let create_relation_map rel_id rels =
Xlist.fold rels Relation.empty (fun graph r ->
if r.r_relation = rel_id then
Relation.add_new graph r.r_parent r.r_child 0
else graph)
let create_relation_maps rel_maps rels =
Xlist.fold rels rel_maps (fun graphs r ->
let graph = try IntMap.find graphs r.r_relation with Not_found -> Relation.empty in
let graph = Relation.add_new graph r.r_parent r.r_child 0 in
IntMap.add graphs r.r_relation graph)
let create_relation_map_lex lumap rel_id rels =
Xlist.fold rels Relation.empty (fun graph r ->
if r.r_relation = rel_id then
let parent = (IntMap.find lumap r.r_parent).lu_syn in
let child = (IntMap.find lumap r.r_child).lu_syn in
Relation.add graph parent child 0
else graph)
let create_relation_maps_lex rel_maps lumap rels =
Xlist.fold rels rel_maps (fun graphs r ->
let graph = try IntMap.find graphs r.r_relation with Not_found -> Relation.empty in
let parent = (IntMap.find lumap r.r_parent).lu_syn in
let child = (IntMap.find lumap r.r_child).lu_syn in
let graph = Relation.add graph parent child 0 in
IntMap.add graphs r.r_relation graph)
let assign_no_hipo synmap hipo =
IntMap.mapi synmap (fun id syn ->
{syn with syn_no_hipo=IntSet.size (Relation.find_descendants hipo id)})
let check_rel_class_coverage rel_maps rel_sets =
let set = Xlist.fold (List.tl rel_sets) (List.hd rel_sets) IntSet.union in
IntMap.iter rel_maps (fun rel_id _ ->
if not (IntSet.mem set rel_id) then Printf.printf "only in rel_maps: %d\n" rel_id);
IntSet.iter set (fun rel_id ->
if not (IntMap.mem rel_maps rel_id) then Printf.printf "only in rel_sets: %d\n" rel_id)
let get_syn_id synmap lu_name lu_variant =
let found = IntMap.fold synmap [] (fun found id syn ->
Xlist.fold syn.syn_units found (fun found (_,lu) ->
if lu.lu_name = lu_name && lu.lu_variant = lu_variant then
id :: found else found)) in
match found with
[] -> failwith "get_syn_id: not found"
| [id] -> id
| _ -> failwith "get_syn_id: multiple id found"
let add_relations rel_maps rev_rel_maps ex_hipo relations =
Xlist.fold relations ex_hipo (fun ex_hipo (cost,dir,rel_ids) ->
Xlist.fold rel_ids ex_hipo (fun ex_hipo rel_id ->
let graph = IntMap.find (if dir = Straight then rel_maps else rev_rel_maps) rel_id in
IntMap.fold graph ex_hipo (fun ex_hipo parent children ->
IntMap.fold children ex_hipo (fun ex_hipo child _ ->
Relation.add_inc ex_hipo parent child cost min))))
let add_hipo_extensions synmap rel_maps rev_rel_maps ex_hipo hipo_extensions =
Xlist.fold hipo_extensions ex_hipo (fun ex_hipo (cost,lu_name,lu_variant,dir,rel_ids) ->
let hiper_id = get_syn_id synmap lu_name lu_variant in
Xlist.fold rel_ids ex_hipo (fun ex_hipo rel_id ->
let graph = IntMap.find (if dir = Parent then rel_maps else rev_rel_maps) rel_id in
IntMap.fold graph ex_hipo (fun ex_hipo hipo_id _ ->
Relation.add_inc ex_hipo hipo_id hiper_id cost min)))
let add_hipo_extensions2 synmap ex_hipo hipo_extensions =
Xlist.fold hipo_extensions ex_hipo (fun ex_hipo (cost,lu_name,lu_variant,poss) ->
let hiper_id = get_syn_id synmap lu_name lu_variant in
IntMap.fold synmap ex_hipo (fun ex_hipo hipo_id syn ->
if Xlist.mem poss syn.syn_pos then Relation.add_inc ex_hipo hipo_id hiper_id cost min else ex_hipo))
let create_ex_hipo synmap rel_maps rev_rel_maps =
let ex_hipo = add_relations rel_maps rev_rel_maps IntMap.empty hipo_relations in
let ex_hipo = add_hipo_extensions synmap rel_maps rev_rel_maps ex_hipo hipo_extensions in
let ex_hipo = add_hipo_extensions2 synmap ex_hipo hipo_extensions2 in
ex_hipo
let rec get_hipero_rec found ex_hipo id cost =
let cost2 = try IntMap.find found id with Not_found -> max_int in
if cost2 <= cost || cost > 7 then found else
let found = IntMap.add found id cost in
let map = try IntMap.find ex_hipo id with Not_found -> IntMap.empty in
IntMap.fold map found (fun found id2 cost2 ->
get_hipero_rec found ex_hipo id2 (cost + cost2))
let get_hipero ex_hipo syn_id =
get_hipero_rec IntMap.empty ex_hipo syn_id 0
let select_big_synsets synmap threshold =
IntMap.fold synmap IntSet.empty (fun selected id syn ->
if syn.syn_no_hipo >= threshold then IntSet.add selected id else selected)
let print_subtree synmap ex_hipo path lu_name lu_variant =
let syn_id = get_syn_id synmap lu_name lu_variant in
let tree = Relation.descendants_tree ex_hipo syn_id 0 in
File.file_out (path ^ lu_name ^ "-" ^ lu_variant ^ ".txt") (fun file ->
Relation.print_tree file tree (fun syn_id cost ->
let syn = IntMap.find synmap syn_id in
let abstract = if syn.syn_abstract then "*" else "" in
Printf.sprintf "%d %s%s" syn.syn_no_hipo abstract (syn_name syn)));
File.file_out (path ^ lu_name ^ "-" ^ lu_variant ^ ".xml") (fun file ->
Relation.print_tree_xml file tree (fun syn_id cost ->
let syn = IntMap.find synmap syn_id in
["name",syn_name syn;
"size",string_of_int syn.syn_no_hipo] @
(if syn.syn_abstract then ["abstract","true"] else [])))
(* w semimport/plWordnet.ml była jeszcze procedura wypisująca poddrzewa słowosieci scalone z Walentym *)
let print_subtree_graph synmap hipo path lu_name lu_variant threshold =
let syn_id = get_syn_id synmap lu_name lu_variant in
let big = select_big_synsets synmap threshold in
let hipo = Relation.select hipo (fun parent child cost -> IntSet.mem big parent && IntSet.mem big child) in
let descendants = Relation.find_descendants hipo syn_id in
let hipo2 = Relation.select hipo (fun parent child cost -> IntSet.mem descendants parent || IntSet.mem descendants child) in
Relation.print_graph path (lu_name ^ "-" ^ lu_variant) true hipo2 (fun id ->
let syn = IntMap.find synmap id in
Printf.sprintf "%s\\n%d" (syn_name_single syn) syn.syn_no_hipo) (fun _ -> "")
let rt_names = ["type"; "reverse"; "name"; "description"; "posstr"; "display"; "shortcut"; "autoreverse"; "pwn"; "tests"]
let string_of_tests tests =
String.concat " " (Xlist.map tests (fun (t,p) -> "(" ^ t ^ "," ^ p ^ ")"))
let string_of_rt rt =
Printf.sprintf "\"%s\";\"%d\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" rt.rt_type rt.rt_reverse rt.rt_name rt.rt_description rt.rt_posstr
rt.rt_display rt.rt_shortcut rt.rt_autoreverse rt.rt_pwn (string_of_tests rt.rt_tests)
let print_rt_map filename rel_count rtmap =
File.file_out filename (fun file ->
Printf.fprintf file "id;quantity;%s\n" (String.concat ";" rt_names);
IntMap.iter rtmap (fun id rt ->
Printf.fprintf file "%d;%d;%s\n" id (try IntQMap.find rel_count id with Not_found -> 0) (string_of_rt rt)))