ENIAMplWordnetAnalyze.ml
12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
(*
* ENIAMplWordnet, a converter for Polish Wordnet "Słowosieć".
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
(* UWAGA: pole rt_type nie jest wypełnione poprawnie w Słowosieci 3.0 *)
open ENIAMplWordnetTypes
open Xstd
open Printf
let print_stringqmap file qmap =
StringQMap.iter qmap (fun k v ->
fprintf file "%6d %s\n" v k)
let print_lexical_units_field path name selector lumap =
File.file_out (path ^ name ^ ".tab") (fun file ->
let qmap = IntMap.fold lumap StringQMap.empty (fun qmap _ lu ->
StringQMap.add qmap (selector lu)) in
StringQMap.iter qmap (fun k v ->
fprintf file "%6d %s\n" v k))
let print_lexical_units_fields path lumap =
print_lexical_units_field path "pos" (fun lu -> lu.lu_pos) lumap;
print_lexical_units_field path "tagcount " (fun lu -> lu.lu_tagcount) lumap;
print_lexical_units_field path "domain" (fun lu -> lu.lu_domain) lumap;
print_lexical_units_field path "desc" (fun lu -> lu.lu_desc) lumap;
print_lexical_units_field path "workstate" (fun lu -> lu.lu_workstate) lumap;
print_lexical_units_field path "source" (fun lu -> lu.lu_source) lumap;
()
let print_lexical_units_full path lumap =
let map = IntMap.fold lumap StringMap.empty (fun map _ lu ->
StringMap.add_inc map lu.lu_pos [lu] (fun l -> lu :: l)) in
StringMap.iter map (fun pos l ->
File.file_out (path ^ pos ^ ".tab") (fun file ->
Xlist.iter l (fun lu ->
fprintf file "%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" lu.lu_id lu.lu_name lu.lu_variant lu.lu_tagcount lu.lu_domain lu.lu_desc lu.lu_workstate lu.lu_source)))
let rt_names = ["type"; "reverse"; "name"; "description"; "posstr"; "display"; "shortcut"; "autoreverse"; "pwn"; "tests"]
let string_of_tests tests =
String.concat " " (Xlist.map tests (fun (t,p) -> "(" ^ t ^ "," ^ p ^ ")"))
let string_of_rt rt =
sprintf "\"%s\";\"%d\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" rt.rt_type rt.rt_reverse rt.rt_name rt.rt_description rt.rt_posstr
rt.rt_display rt.rt_shortcut rt.rt_autoreverse rt.rt_pwn (string_of_tests rt.rt_tests)
let print_rt_map filename rel_count rtmap =
File.file_out filename (fun file ->
fprintf file "id;quantity;%s\n" (String.concat ";" rt_names);
IntMap.iter rtmap (fun id rt ->
fprintf file "%d;%d;%s\n" id (try IntQMap.find rel_count id with Not_found -> 0) (string_of_rt rt)))
let print_lu_relation filename lumap attr =
File.file_out filename (fun file ->
IntMap.iter attr (fun id1 l ->
Xlist.iter l (fun id2 ->
let lu1 = IntMap.find lumap id1 in
let lu2 = IntMap.find lumap id2 in
fprintf file "%s\t%s\n" (ENIAMplWordnet.lu_name lu1) (ENIAMplWordnet.lu_name lu2))))
let print_syn_relation filename synmap attr =
File.file_out filename (fun file ->
IntMap.iter attr (fun id1 l ->
Xlist.iter l (fun id2 ->
let syn1 = IntMap.find synmap id1 in
let syn2 = IntMap.find synmap id2 in
fprintf file "%s\t%s\n" (ENIAMplWordnet.syn_name syn1) (ENIAMplWordnet.syn_name syn2))))
let print_lu_relations path lumap lr =
let relations = Xlist.fold lr IntMap.empty (fun relations r ->
let map = try IntMap.find relations r.r_relation with Not_found -> IntMap.empty in
let map = IntMap.add_inc map r.r_parent [r.r_child] (fun l -> r.r_child :: l) in
IntMap.add relations r.r_relation map) in
IntMap.iter relations (fun id relation ->
print_lu_relation (path ^ string_of_int id ^ "_lu.tab") lumap relation)
let print_syn_relations path synmap sr =
let relations = Xlist.fold sr IntMap.empty (fun relations r ->
let map = try IntMap.find relations r.r_relation with Not_found -> IntMap.empty in
let map = IntMap.add_inc map r.r_parent [r.r_child] (fun l -> r.r_child :: l) in
IntMap.add relations r.r_relation map) in
IntMap.iter relations (fun id relation ->
print_syn_relation (path ^ string_of_int id ^ "_syn.tab") synmap relation)
let print_pwn_relations filename rel_pwn_count =
File.file_out filename (fun file ->
IntMap.iter rel_pwn_count (fun id qmap ->
let l = StringQMap.fold qmap [] (fun l k v -> (sprintf "%s:%d" k v) :: l) in
let s = String.concat "\t" (Xlist.sort l compare) in
fprintf file "%d\t%s\n" id s))
let print_synset_names filename rel_maps rev_rel_maps synmap selected =
File.file_out filename (fun file ->
IntSet.iter selected (fun id ->
let l = IntMap.fold rel_maps [] (fun l rel_id graph ->
if Relation.mem_parent graph id then rel_id :: l else l) in
let rels = String.concat "," (Xlist.map (List.rev l) string_of_int) in
let l = IntMap.fold rev_rel_maps [] (fun l rel_id graph ->
if Relation.mem_parent graph id then rel_id :: l else l) in
let rev_rels = String.concat "," (Xlist.map (List.rev l) string_of_int) in
let syn = IntMap.find synmap id in
let names = ENIAMplWordnet.syn_name syn in
fprintf file "%d\t%d\t%s\t%s\t%s\t%s\n" syn.syn_no_hipo id syn.syn_pos rels rev_rels names))
let print_synset_id synmap lu_name lu_variant =
printf "name=%s variant=%s --> syn_id=%d\n" lu_name lu_variant (ENIAMplWordnet.get_syn_id synmap lu_name lu_variant)
let print_hipero synmap ex_hipo lu_name lu_variant =
let syn_id = ENIAMplWordnet.get_syn_id synmap lu_name lu_variant in
let hipero = ENIAMplWordnet.get_hipero ex_hipo syn_id in
IntMap.iter hipero (fun id cost ->
printf "%d %s\n" cost (ENIAMplWordnet.syn_name (IntMap.find synmap id)))
let print_subtree_cost synmap ex_hipo path lu_name lu_variant =
let syn_id = ENIAMplWordnet.get_syn_id synmap lu_name lu_variant in
let tree = Relation.descendants_tree ex_hipo syn_id 0 in
File.file_out (path ^ lu_name ^ "-" ^ lu_variant ^ ".txt") (fun file ->
Relation.print_tree file tree (fun syn_id cost -> sprintf "%d %s" cost (ENIAMplWordnet.syn_name (IntMap.find synmap syn_id))))(*;
Relation.print_tree_as_graph "results/" "hipo_graph" tree (fun syn_id -> ENIAMplWordnet.syn_name_single (IntMap.find synmap syn_id)) string_of_int*)
let print_subtree synmap ex_hipo path lu_name lu_variant =
let syn_id = ENIAMplWordnet.get_syn_id synmap lu_name lu_variant in
let tree = Relation.descendants_tree ex_hipo syn_id 0 in
File.file_out (path ^ lu_name ^ "-" ^ lu_variant ^ ".txt") (fun file ->
Relation.print_tree file tree (fun syn_id cost ->
let syn = IntMap.find synmap syn_id in
let abstract = if syn.syn_abstract then "*" else "" in
sprintf "%d %s%s" syn.syn_no_hipo abstract (ENIAMplWordnet.syn_name syn)));
File.file_out (path ^ lu_name ^ "-" ^ lu_variant ^ ".xml") (fun file ->
Relation.print_tree_xml file tree (fun syn_id cost ->
let syn = IntMap.find synmap syn_id in
["name",ENIAMplWordnet.syn_name syn;
"size",string_of_int syn.syn_no_hipo] @
(if syn.syn_abstract then ["abstract","true"] else [])))
(* w semimport/plWordnet.ml była jeszcze procedura wypisująca poddrzewa słowosieci scalone z Walentym *)
let print_subtree_graph synmap hipo path lu_name lu_variant threshold =
let syn_id = ENIAMplWordnet.get_syn_id synmap lu_name lu_variant in
let tree = Relation.descendants_tree ex_hipo syn_id 0 in
let big_synset_ids = PlWordnet.select_big_synsets synmap threshold in
let hipo = Graph.select_node_set hipo big_synset_ids in
let chidlren_ids = Graph.get_children_ids hipo IntSet.empty syn_id in
let hipo2 = Graph.select_node_set hipo chidlren_ids in
Graph.print_graph path (lu_name ^ "-" ^ lu_variant) hipo2 (fun id ->
let syn = IntMap.find synmap id in
Printf.sprintf "%s\\n%d" (PlWordnet.syn_name_single syn) syn.PlWordnet.syn_no_hipo);
()
let hipo_roots = [
(* "coś", "1", 50;
"CZASOWNIK", "1", 50;
"GERUNDIUM", "1", 50;(*"organizm",40*)
"istota", "1", 50;
"rezultat", "1", 40;
"całość", "1", 30;
"czynność", "1", 30;
"cecha", "1", 30;
"zjawisko", "1", 30;
"materia", "3", 30;
"część", "3", 20;
"środek", "1", 20;
"okoliczności", "1", 20;
"miejsce", "1", 20;
"informacja", "1", 20;
"ilość", "1", 10;
"ciąg", "4", 10;
"wydarzenie", "1", 10;
"proces o charakterze fizykalnym polegający na ruchu", "1", 10;
"negatywny", "5", 10;
"pozytywny", "2", 10;
"przestrzeń", "1", 10;
"działalność", "1", 10;
"podmiot", "3", 10;
"czyn", "1", 10;
"inny", "3", 10;
"inny", "1", 10;
"dobrze", "5", 10;
"sprecyzowany", "1", 10;
"niezwykle", "1", 10;
"własność", "1", 10;
"okres", "1", 10;
"podobny", "1", 10;
"inaczej", "1", 10;
"negatywnie", "2", 10;
"domena", "2", 10;
"zależność", "3", 10;
"rzecz", "6", 10;
"tkanka", "1", 10;
"właściwość", "2", 10;
"meritum", "1", 10;
"zachowanie", "8", 10;
"rzecz", "2", 10;
"typ", "1", 10;
"przedmiot", "4", 10;
"rzadkość", "1", 10;
"jakiś", "1", 10;*)
"byt", "1", 10;
]
let _ =
if Array.length Sys.argv < 2 then print_endline "missing argument" else (
let lumap,synmap,lr,sr,rtmap = ENIAMplWordnet.load_data Sys.argv.(1) in
print_endline "data loaded";
(*print_lexical_units_fields "results/lufield_" lumap;
print_lexical_units_full "results/lu_" lumap;
ENIAMplWordnet.check_lu_syn_consistency lumap synmap; *)
let synmap = ENIAMplWordnet.merge_lu_syn lumap synmap in
let lumap = ENIAMplWordnet.set_lu_syn lumap synmap in
(* let lu_syn = ENIAMplWordnet.make_lu_syn_map synmap in *)
(* Wykrycie i usunięcie angielskiego wordnetu *)
(*let rel_count = ENIAMplWordnet.count_relations IntQMap.empty rtmap lr in
let rel_count = ENIAMplWordnet.count_relations rel_count rtmap sr in
print_rt_map "results/rt.csv" rel_count rtmap;
let rel_pwn_count = ENIAMplWordnet.count_pwn_relation IntMap.empty lumap synmap rtmap lr "lr" in
let rel_pwn_count = ENIAMplWordnet.count_pwn_relation rel_pwn_count lumap synmap rtmap sr "sr" in
print_pwn_relations "results/pwn_relations.csv" rel_pwn_count;*)
let lumap,synmap,lr,sr,rtmap = ENIAMplWordnet.select_plWordnet lumap synmap lr sr rtmap in
(*print_lu_relations "results/rels/rel_" lumap lr;
print_syn_relations "results/rels/rel_" synmap sr;
let rel_pos_count = ENIAMplWordnet.count_pos_relation IntMap.empty lumap synmap rtmap lr "lr" in
let rel_pos_count = ENIAMplWordnet.count_pos_relation rel_pos_count lumap synmap rtmap sr "sr" in
print_pwn_relations "results/pos_relations.csv" rel_pos_count; *)
(* Utworzenie uogólnionej hiponimii *)
let hipero = ENIAMplWordnet.create_relation_map 10 sr in (* klucz (parent) jest hiponimem *)
let hipo = ENIAMplWordnet.create_relation_map 11 sr in (* klucz (parent) jest hiperonimem *)
let synmap = ENIAMplWordnet.assign_no_hipo synmap hipo in
(* ENIAMplWordnet.test_rel_pos_consistency synmap hipero; *)
(* Relation.test_reverse hipero hipo;
Relation.test_reverse hipo hipero; *)
let rel_maps = ENIAMplWordnet.create_relation_maps IntMap.empty sr in
let rel_maps = ENIAMplWordnet.create_relation_maps_lex rel_maps lumap lr in
(* ENIAMplWordnet.check_rel_class_coverage rel_maps [morf_relations;semimorf_relations;nomorf_relations]; *)
(* ENIAMplWordnet.check_rel_class_coverage rel_maps [instance_relations;synonymy_relations;attr_relations;has_relations;antonymy_relations;caus_relations]; *)
let rev_rel_maps = IntMap.map rel_maps Relation.reverse in
(* let ex_hipo = ENIAMplWordnet.create_ex_hipo synmap rel_maps rev_rel_maps in *)
(* let max_syn = Relation.select_childless (IntMap.fold synmap IntSet.empty (fun set id _ -> IntSet.add set id)) ex_hipo in
print_synset_names "results/max_syn.tab" rel_maps rev_rel_maps synmap max_syn; *)
(* print_hipero synmap ex_hipo "płatek" "2"; *)
(* print_subtree synmap ex_hipo "results/ex_tree_" "płatek" "2"; *)
(* Analiza struktury hiponimii *)
(* let max_syn = Relation.select_childless (IntMap.fold synmap IntSet.empty (fun set id _ -> IntSet.add set id)) hipero in *)
(* print_synset_names "results/max_syn_hipo.tab" rel_maps rev_rel_maps synmap max_syn; *)
Xlist.iter hipo_roots (fun (name,variant,_) ->
print_subtree synmap hipo "results/tree_" name variant);
Xlist.iter hipo_roots (fun (name,variant,threshold) ->
print_subtree_graph synmap hipo "results/tree_" name variant threshold);
(* let hipo = Relation.assign_costs hipo (fun id -> (IntMap.find synmap id).syn_no_hipo) in *)
())