plWordnet.ml
31.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
(********************************************************)
(* *)
(* Copyright 2014 Wojciech Jaworski. *)
(* *)
(* All rights reserved. *)
(* *)
(********************************************************)
open Xstd
(*let nexus_path = "/home/yacheu/Dokumenty/Badania/Jezyk i Umysl/Przetwarzanie Języka Naturalnego/zasoby/"
let toshiba_ub_path = "/home/wjaworski/Dokumenty/zasoby/"
let get_host_name () =
let chan = Unix.open_process_in "uname -n" in
input_line chan
let zasoby_path =
match get_host_name () with
"nexus" -> nexus_path
| "toshiba-UB" -> toshiba_ub_path
(* | "mozart" -> "." *)
| s -> failwith ("unknown host: " ^ s)*)
let zasoby_path = "../../NLP resources/"
let plwordnet_filename = zasoby_path ^ "Słowosieć/plwordnet-3.0.xml"
type lu = {lu_id: int; lu_name: string; lu_pos: string; lu_tagcount: string; lu_domain: string; lu_desc: string;
lu_workstate: string; lu_source: string; lu_variant: string; lu_syn: int}
type syn = {syn_workstate: string; syn_split: string; syn_owner: string; syn_definition: string;
syn_desc: string; syn_abstract: string; syn_units: (int * lu) list; syn_pos: string; syn_no_hipo: int; syn_domain: string}
type rels = {r_parent: int; r_child: int; r_relation: int; r_valid: string; r_owner: string}
type rt = {rt_type: string; rt_reverse: int; rt_name: string; rt_description: string;
rt_posstr: string; rt_display: string; rt_shortcut: string; rt_autoreverse: string; rt_pwn: string; rt_tests: (string * string) list}
let empty_lu = {lu_id=(-1); lu_name=""; lu_pos=""; lu_tagcount=""; lu_domain=""; lu_desc="";
lu_workstate=""; lu_source=""; lu_variant=""; lu_syn=(-1)}
let process_unit = function
Xml.Element("unit-id",[],[Xml.PCData s]) -> int_of_string s, empty_lu
| node -> failwith ("process_unit " ^ (Xml.to_string node))
let process_tests = function
Xml.Element("test",["text",text;"pos",pos],[]) -> text,pos
| node -> failwith ("process_tests " ^ (Xml.to_string node))
(* funkcja zwraca:
lexical-unit map - wiąże leksemy z identyfikatorami
synset map
lexicalrelations
synsetrelations
relationtypes map
*)
let process_entry (lumap,synmap,lr,sr,rtmap) = function
Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"workstate",workstate;
"source",source;"variant",variant],[]) ->
let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc="";
lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 2") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"desc",desc;"workstate",workstate;
"source",source;"variant",variant],[]) ->
let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc=desc;
lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 3") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"definition",definition;"desc",desc;
"abstract",abstract],units) ->
let units = Xlist.map units process_unit in
let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition=definition;
syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"desc",desc;
"abstract",abstract],units) ->
let units = Xlist.map units process_unit in
let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition="";
syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("lexicalrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
let lr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: lr in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synsetrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
let sr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: sr in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"reverse",reverse;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"reverse",reverse;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| node -> print_endline (Xml.to_string node); failwith "process_entry 1"
let load_data filename =
match try Xml.parse_file filename with Xml.Error e -> failwith ("load_data Xml.Error " ^ Xml.error e) with
Xml.Element("array-list",_,entries) ->
Xlist.fold entries (IntMap.empty,IntMap.empty,[],[],IntMap.empty) process_entry
| node -> failwith ("load_data " ^ (Xml.to_string node))
let merge_lu_syn lumap synmap =
IntMap.map synmap (fun syn ->
let units = Xlist.map syn.syn_units (fun (id,_) -> id, IntMap.find lumap id) in
let pos = match StringSet.to_list (Xlist.fold units StringSet.empty (fun set (_,lu) ->
StringSet.add set lu.lu_pos)) with
[] -> failwith "merge_lu_syn: empty synset"
| [pos] -> pos
| _ -> failwith "merge_lu_syn: inconsistent pos" in
{syn with syn_units=units; syn_pos=pos})
let merge_lu_syn2 lumap synmap =
IntMap.fold synmap lumap (fun lumap syn_id syn ->
Xlist.fold syn.syn_units lumap (fun lumap (id,_) ->
let lu = try IntMap.find lumap id with Not_found -> failwith "merge_lu_syn2" in
if lu.lu_syn <> -1 then failwith "merge_lu_syn2" else
IntMap.add lumap id {lu with lu_syn=syn_id}))
let create_relation_map rel_id rels =
Xlist.fold rels Graph.empty (fun graph r ->
if r.r_relation = rel_id then
Graph.add graph r.r_parent r.r_child
else graph)
let assign_no_hipo synmap hipo =
IntMap.mapi synmap (fun id syn ->
{syn with syn_no_hipo=IntSet.size (Graph.get_children_ids hipo IntSet.empty id)})
let select_pos synmap pos =
IntMap.fold synmap IntSet.empty (fun selected id syn ->
if syn.syn_pos = pos then IntSet.add selected id else selected)
let select_big_synsets synmap threshold =
IntMap.fold synmap IntSet.empty (fun selected id syn ->
if syn.syn_no_hipo >= threshold then IntSet.add selected id else selected)
let lu_name lu =
lu.lu_name ^ "-" ^ lu.lu_variant
let syn_name syn =
String.concat ", " (Xlist.map syn.syn_units (fun (_,lu) -> lu_name lu))
let syn_name_single syn =
if syn.syn_units = [] then "empty" else
lu_name (snd (List.hd syn.syn_units))
let print_synset_names filename synmap selected =
File.file_out filename (fun file ->
IntSet.iter selected (fun id ->
let syn = IntMap.find synmap id in
let names = syn_name syn in
Printf.fprintf file "%d\t%d\t%s\t%s\n" syn.syn_no_hipo id syn.syn_pos names))
(**************************************************)
(*
let string_of_tests tests =
String.concat " " (Xlist.map tests (fun (t,p) -> "(" ^ t ^ "," ^ p ^ ")"))
let string_of_units units =
String.concat " " (Xlist.map units fst)
let string_of_lu lu =
Printf.sprintf "\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" lu.lu_name lu.lu_pos lu.lu_tagcount lu.lu_domain
lu.lu_desc lu.lu_workstate lu.lu_source lu.lu_variant
let string_of_syn syn =
Printf.sprintf "\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" syn.syn_workstate syn.syn_split
syn.syn_owner syn.syn_definition syn.syn_desc syn.syn_abstract (string_of_units syn.syn_units)
let string_of_rt rt =
Printf.sprintf "\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" rt.rt_type rt.rt_reverse rt.rt_name rt.rt_description rt.rt_posstr
rt.rt_display rt.rt_shortcut rt.rt_autoreverse rt.rt_pwn (string_of_tests rt.rt_tests)
let lu_names = ["name"; "pos"; "tagcount"; "domain"; "desc"; "workstate"; "source"; "variant"]
let syn_names = ["workstate"; "split"; "owner"; "definition"; "desc"; "abstract"; "units"]
let rt_names = ["type"; "reverse"; "name"; "description"; "posstr"; "display"; "shortcut"; "autoreverse"; "pwn"; "tests"]
let rel_names = ["parent"; "child"; "valid"; "owner"]
let print_lu_map filename lumap =
File.file_out filename (fun file ->
Printf.fprintf file "id;%s\n" (String.concat ";" lu_names);
StringMap.iter lumap (fun id lu ->
Printf.fprintf file "%s;%s\n" id (string_of_lu lu)))
let print_syn_map filename synmap =
File.file_out filename (fun file ->
Printf.fprintf file "id;%s\n" (String.concat ";" syn_names);
StringMap.iter synmap (fun id syn ->
Printf.fprintf file "%s;%s\n" id (string_of_syn syn)))
let print_rt_map filename rel_count rtmap =
File.file_out filename (fun file ->
Printf.fprintf file "id;quantity;%s\n" (String.concat ";" rt_names);
StringMap.iter rtmap (fun id rt ->
Printf.fprintf file "%s;%d;%s\n" id (try StringQMap.find rel_count id with Not_found -> 0) (string_of_rt rt)))
let print_rels filename rel_id rels =
File.file_out filename (fun file ->
Printf.fprintf file "%s\n" (String.concat ";" rel_names);
Xlist.iter rels (fun r ->
if r.r_relation = rel_id then
Printf.fprintf file "%s;%s;%s;%s\n" r.r_parent r.r_child r.r_valid r.r_owner))
let print_stringqmap file name qmap =
Printf.fprintf file "%s\n" name;
StringQMap.iter qmap (fun k v ->
Printf.fprintf file "%6d %s\n" v k)
let check_lexical_units_fields lumap =
let pos = StringMap.fold lumap StringQMap.empty (fun pos _ lu ->
StringQMap.add pos lu.lu_pos) in
print_stringqmap stdout "wartości lu_pos" pos;
(* let tagcount = StringMap.fold lumap StringQMap.empty (fun tagcount _ lu ->
StringQMap.add tagcount lu.lu_tagcount) in
print_stringqmap stdout "wartości lu_tagcount" tagcount; *)
let domain = StringMap.fold lumap StringQMap.empty (fun domain _ lu ->
StringQMap.add domain lu.lu_domain) in
print_stringqmap stdout "wartości lu_domain" domain;
let desc = StringMap.fold lumap StringQMap.empty (fun desc _ lu ->
StringQMap.add desc lu.lu_desc) in
print_stringqmap stdout "wartości lu_desc" desc;
let workstate = StringMap.fold lumap StringQMap.empty (fun workstate _ lu ->
StringQMap.add workstate lu.lu_workstate) in
print_stringqmap stdout "wartości lu_workstate" workstate;
let source = StringMap.fold lumap StringQMap.empty (fun source _ lu ->
StringQMap.add source lu.lu_source) in
print_stringqmap stdout "wartości lu_source" source;
()
let check_lu_syn_consistency lumap synmap =
let set = StringMap.fold lumap StringSet.empty (fun set id _ ->
if StringSet.mem set id then failwith "check_lu_syn_consistency 1" else
StringSet.add set id) in
let set = StringMap.fold synmap set (fun set _ syn ->
Xlist.fold syn.syn_units set (fun set (id,_) ->
if not (StringSet.mem set id) then failwith "check_lu_syn_consistency 2" else
StringSet.remove set id)) in
if not (StringSet.is_empty set) then failwith "check_lu_syn_consistency 3" else
()
let count_relations qmap rtmap rels =
Xlist.fold rels qmap (fun qmap rel ->
if not (StringMap.mem rtmap rel.r_relation) then print_endline ("unknown relation: " ^ rel.r_relation);
StringQMap.add qmap rel.r_relation)
let print_lu_relation filename lumap attr =
File.file_out filename (fun file ->
StringMap.iter attr (fun id1 l ->
Xlist.iter l (fun id2 ->
let lu1 = StringMap.find lumap id1 in
let lu2 = StringMap.find lumap id2 in
Printf.fprintf file "%s\t%s\n" (lu_name lu1) (lu_name lu2))))
let print_syn_relation filename synmap attr =
File.file_out filename (fun file ->
StringMap.iter attr (fun id1 l ->
Xlist.iter l (fun id2 ->
let syn1 = StringMap.find synmap id1 in
let syn2 = StringMap.find synmap id2 in
Printf.fprintf file "%s\t%s\n" (syn_name syn1) (syn_name syn2))))
let print_lu_relations path lumap lr =
let relations = Xlist.fold lr StringMap.empty (fun relations r ->
let map = try StringMap.find relations r.r_relation with Not_found -> StringMap.empty in
let map = StringMap.add_inc map r.r_parent [r.r_child] (fun l -> r.r_child :: l) in
StringMap.add relations r.r_relation map) in
StringMap.iter relations (fun id relation ->
print_lu_relation (path ^ id ^ ".tab") lumap relation)
let print_syn_relations path synmap sr =
let relations = Xlist.fold sr StringMap.empty (fun relations r ->
let map = try StringMap.find relations r.r_relation with Not_found -> StringMap.empty in
let map = StringMap.add_inc map r.r_parent [r.r_child] (fun l -> r.r_child :: l) in
StringMap.add relations r.r_relation map) in
StringMap.iter relations (fun id relation ->
print_syn_relation (path ^ id ^ ".tab") synmap relation)
let create_relation_map_lex lu_syn rel_id rels =
Xlist.fold rels Graph.empty (fun graph r ->
if r.r_relation = rel_id then
let parent = StringMap.find lu_syn r.r_parent in
let child = StringMap.find lu_syn r.r_child in
Graph.add graph (int_of_string r.r_parent) (int_of_string r.r_child)
else graph)
let test_reverse_rel ra rb =
StringMap.iter ra (fun ida l ->
Xlist.iter l (fun idb ->
try
let l = StringMap.find rb idb in
if Xlist.mem l ida then ()
else print_endline ("test_reverse_rel a: " ^ ida)
with Not_found -> print_endline ("test_reverse_rel b: " ^ idb)))
let pwn_pos = ["czasownik pwn"; "przymiotnik pwn"; "przysłówek pwn"; "rzeczownik pwn"]
let remove_pwn synmap =
StringMap.fold synmap StringMap.empty (fun synmap id syn ->
if Xlist.mem pwn_pos syn.syn_pos then synmap else StringMap.add synmap id syn)
(*let get_maximal_not_isolated_synsets synmap hipero hipo =
let set = get_maximal_synsets synmap hipero in
let set = StringSet.fold set StringSet.empty (fun set id ->
if StringMap.mem hipo id then StringSet.add set id else set) in
set*)
let rev_relation map =
StringMap.fold map StringMap.empty (fun map id l ->
Xlist.fold l map (fun map id2 ->
StringMap.add_inc map id2 [id] (fun l -> id :: l)))
let sum_relation map map2 =
StringMap.fold map2 map (fun map id l ->
StringMap.add_inc map id l (fun l2 -> l @ l2))
let synset_name synmap lumap id =
let syn = try StringMap.find synmap id with Not_found -> failwith "synset_name" in
let names = Xlist.map syn.syn_units (fun (luid,_) ->
let lu = try StringMap.find lumap luid with Not_found -> failwith "synset_name" in
lu.lu_name ^ " " ^ lu.lu_variant) in
String.concat ", " names
(* let synset_name_with_args walenty synmap lumap id =
let syn = try StringMap.find synmap id with Not_found -> failwith "synset_name_with_args" in
let names = Xlist.map syn.syn_units (fun luid ->
let lu = try StringMap.find lumap luid with Not_found -> failwith "synset_name_with_args" in
lu.lu_name, lu.lu_variant) in
String.concat ", " (Xlist.map names (fun (name,variant) ->
let args = try StringSet.to_list (StringMap.find walenty name) with Not_found -> [] in
if args = [] then name ^ " " ^ variant else
name ^ " " ^ variant ^ " <B>" ^ String.concat " " args ^ "</B>"))
*)
let is_synset_abstract synmap id =
let syn = try StringMap.find synmap id with Not_found -> failwith "is_synset_abstract" in
match syn.syn_abstract with
"true" -> true
| "false" -> false
| _ -> failwith "is_synset_abstract"
(*
let count_relation_map rmap =
let qmap = StringMap.fold rmap IntQMap.empty (fun qmap _ l ->
IntQMap.add qmap (Xlist.size l)) in
IntQMap.iter qmap (fun len q ->
Printf.printf "%6d %2d\n" q len)
*)
let rec create_spaces n =
if n = 0 then "" else " " ^ (create_spaces (n - 1))
let rec is_synset_abstract2_rec tree synmap id visited =
if is_synset_abstract synmap id then true,visited else
if StringSet.mem visited id then false,visited else
let visited = StringSet.add visited id in
Xlist.fold (try StringMap.find tree id with Not_found -> []) (false,visited) (fun (b,visited) id2 ->
let v,visited = is_synset_abstract2_rec tree synmap id2 visited in
v || b, visited)
let is_synset_abstract2 tree synmap id =
fst (is_synset_abstract2_rec tree synmap id StringSet.empty)
let rec print_subtree_rec file treshold tree synmap lumap level abs_parent id visited =
if StringSet.mem visited id then (
if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= treshold || abs_parent then
Printf.fprintf file "%s%d VISITED %s%s\n" (create_spaces level)
(StringSet.size (get_subtree tree id StringSet.empty))
(if is_synset_abstract synmap id then "*" else "")
(synset_name synmap lumap id);
visited
) else (
if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= treshold || abs_parent then
Printf.fprintf file "%s%d %s%s\n" (create_spaces level)
(StringSet.size (get_subtree tree id StringSet.empty))
(if is_synset_abstract synmap id then "*" else "")
(synset_name synmap lumap id);
Xlist.fold (try StringMap.find tree id with Not_found -> []) (StringSet.add visited id) (fun visited id2 ->
print_subtree_rec file treshold tree synmap lumap (level+2) (is_synset_abstract synmap id) id2 visited))
let print_subtree filename treshold tree synmap lumap id =
File.file_out filename (fun file ->
ignore(print_subtree_rec file treshold tree synmap lumap 0 true id StringSet.empty))
let rec create_subtree_xml tree synmap lumap id visited =
let name = synset_name synmap lumap id in
let size = StringSet.size (get_subtree tree id StringSet.empty) in
let abstract = if is_synset_abstract synmap id then ["abstract","true"] else [] in
if StringSet.mem visited id then
Xml.Element("node",("name",name) :: ("visited","true") :: ("size",string_of_int size) :: abstract,[]), visited
else
let l,visited = Xlist.fold (try StringMap.find tree id with Not_found -> []) ([],StringSet.add visited id) (fun (l,visited) id2 ->
let xml,visited = create_subtree_xml tree synmap lumap id2 visited in
xml :: l, visited) in
Xml.Element("node",("name",name) :: ("size",string_of_int size) :: abstract,l),visited
let print_subtree_xml filename tree synmap lumap id =
File.file_out filename (fun file ->
let xml,_ = create_subtree_xml tree synmap lumap id StringSet.empty in
Printf.fprintf file "%s" (Xml.to_string_fmt xml))
let rec find_connected_components_rec rel threshold synmap set conn id =
(* if id = "28358" then print_endline "find_connected_components_rec 1"; *)
if StringSet.mem set id || (StringMap.find synmap id).syn_no_hipo < threshold then conn,set else (
(* if id = "28358" then print_endline "find_connected_components_rec 2"; *)
let conn = StringSet.add conn id in
let set = StringSet.add set id in
let l = try StringMap.find rel id with Not_found -> [] in
(* if id = "28358" then Printf.printf "find_connected_components_rec 3: |conn|=%d\n%!" (StringSet.size conn); *)
Xlist.fold l (conn,set) (fun (conn,set) id ->
find_connected_components_rec rel threshold synmap set conn id))
let find_connected_components rel threshold synmap =
let l,_ = StringMap.fold rel ([],StringSet.empty) (fun (l,set) id _ ->
if StringSet.mem set id then l,set else
let conn,set = find_connected_components_rec rel threshold synmap set StringSet.empty id in
(* if StringSet.mem conn "28358" then print_endline "find_connected_components 1"; *)
conn :: l, set) in
l
let has_syn_above_threshold synmap threshold conn =
StringSet.fold conn false (fun b id ->
if (StringMap.find synmap id).syn_no_hipo >= threshold then true else b)
let remove_conn l id =
Xlist.fold l [] (fun l conn ->
if StringSet.mem conn id then l else conn :: l)
let select_conn l id =
Xlist.fold l [] (fun l conn ->
if StringSet.mem conn id then conn :: l else l)
let print_hipo_graph path name threshold synmap hipo conn =
ignore (Xlist.fold conn 1 (fun n conn ->
let name = name ^ "_" ^ string_of_int n in
if has_syn_above_threshold synmap threshold conn then (
File.file_out (path ^ name ^ ".gv") (fun file ->
Printf.fprintf file "digraph G {\n node [shape=box]\n";(* "rankdir = LR\n";*)
StringMap.iter synmap (fun id syn ->
if StringSet.mem conn id && syn.syn_no_hipo >= threshold then
Printf.fprintf file " %s [label=\"%s\\n%d\"]\n" id (syn_name_single syn) syn.syn_no_hipo);
StringMap.iter hipo (fun id1 l ->
if StringSet.mem conn id1 && (StringMap.find synmap id1).syn_no_hipo >= threshold then
Xlist.iter l (fun id2 ->
if (StringMap.find synmap id2).syn_no_hipo >= threshold then
Printf.fprintf file " %s -> %s\n" id1 id2));
Printf.fprintf file "}\n");
Sys.chdir path;
ignore (Sys.command ("dot -Tpng " ^ name ^ ".gv -o " ^ name ^ ".png"));
Sys.chdir "..";
n+1) else n))
(*
let rec create_spaces_html n =
if n = 0 then "" else " " ^ (create_spaces_html (n - 1))
let create_args_map walenty synmap lumap =
StringMap.fold synmap StringMap.empty (fun map id syn ->
let names = Xlist.map syn.syn_units (fun luid ->
let lu = try StringMap.find lumap luid with Not_found -> failwith "find_args" in
lu.lu_name) in
let args = Xlist.fold names StringSet.empty (fun set name ->
try
StringSet.union set (StringMap.find walenty name)
with Not_found -> set) in
if StringSet.is_empty args then map else
StringMap.add map id args)
let rec create_args_tree_map_rec tree argmap id map =
if StringMap.mem map id then map,StringMap.find map id else
let args = try StringMap.find argmap id with Not_found -> StringSet.empty in
let map,args = Xlist.fold (try StringMap.find tree id with Not_found -> []) (map,args) (fun (map,args) id2 ->
let map2,args2 = create_args_tree_map_rec tree argmap id2 map in
map2, StringSet.union args args2) in
StringMap.add map id args, args
let create_args_tree_map tree argmap id =
fst (create_args_tree_map_rec tree argmap id StringMap.empty)
let string_of_args argmap argtreemap id =
let args = StringSet.to_list (try StringMap.find argtreemap id with Not_found -> StringSet.empty) in
let local_args = try StringMap.find argmap id with Not_found -> StringSet.empty in
let args = Xlist.map args (fun arg ->
if StringSet.mem local_args arg then "<B>" ^ arg ^ "</B>"
else arg) in
String.concat " " args
let string_of_args2 argmap id =
let args = StringSet.to_list (try StringMap.find argmap id with Not_found -> StringSet.empty) in
"<B>" ^ String.concat " " args ^ "</B>"
(*let find_args walenty tree synmap lumap id =
let syn = try StringMap.find synmap id with Not_found -> failwith "find_args" in
let names = Xlist.map syn.syn_units (fun luid ->
let lu = try StringMap.find lumap luid with Not_found -> failwith "find_args" in
lu.lu_name) in
let args = Xlist.fold names StringSet.empty (fun set name ->
try
StringSet.union set (StringMap.find walenty name)
with Not_found -> set) in
String.concat ", " (StringSet.to_list args) *)
let rec print_arg_subtree_rec file argmap argtreemap tree synmap lumap level abs_parent id visited =
if StringSet.mem visited id then (
if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= 100 || abs_parent then
Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d VISITED %s%s %s<BR>\n" (create_spaces_html level) id
(StringSet.size (get_subtree tree id StringSet.empty))
(if is_synset_abstract synmap id then "*" else "")
(synset_name synmap lumap id)
(string_of_args argmap argtreemap id);
visited
) else (
if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= 100 || abs_parent then
Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d %s%s %s<BR>\n" (create_spaces_html level) id
(StringSet.size (get_subtree tree id StringSet.empty))
(if is_synset_abstract synmap id then "*" else "")
(synset_name synmap lumap id)
(string_of_args argmap argtreemap id);
Xlist.fold (try StringMap.find tree id with Not_found -> []) (StringSet.add visited id) (fun visited id2 ->
print_arg_subtree_rec file argmap argtreemap tree synmap lumap (level+2) (is_synset_abstract synmap id) id2 visited))
let print_html_header file =
Printf.fprintf file "<HTML><HEAD><META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\"></HEAD><BODY>\n"
let print_html_trailer file =
Printf.fprintf file "</BODY></HTML>\n"
let print_arg_subtree filename argmap argtreemap tree synmap lumap id =
File.file_out filename (fun file ->
print_html_header file;
ignore(print_arg_subtree_rec file argmap argtreemap tree synmap lumap 0 true id StringSet.empty);
print_html_trailer file)
let has_interesting_arg argtreemap id =
try
StringSet.size (StringMap.find argtreemap id) > 0
with Not_found -> false
let rec print_arg_subtree_rec2 file argmap argtreemap walenty tree synmap lumap level abs_parent id visited =
if StringSet.mem visited id then (
if has_interesting_arg argtreemap id then
Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d VISITED %s%s<BR>\n" (create_spaces_html level) id
(StringSet.size (get_subtree tree id StringSet.empty))
(if is_synset_abstract synmap id then "*" else "")
(synset_name_with_args walenty synmap lumap id)(*
(string_of_args2 argmap id)*);
visited
) else (
if has_interesting_arg argtreemap id then
Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d %s%s<BR>\n" (create_spaces_html level) id
(StringSet.size (get_subtree tree id StringSet.empty))
(if is_synset_abstract synmap id then "*" else "")
(synset_name_with_args walenty synmap lumap id)(*
(string_of_args2 argmap id)*);
Xlist.fold (try StringMap.find tree id with Not_found -> []) (StringSet.add visited id) (fun visited id2 ->
print_arg_subtree_rec2 file argmap argtreemap walenty tree synmap lumap (level+2) (is_synset_abstract synmap id) id2 visited))
let print_arg_subtree2 filename argmap argtreemap walenty tree synmap lumap id =
File.file_out filename (fun file ->
print_html_header file;
ignore(print_arg_subtree_rec2 file argmap argtreemap walenty tree synmap lumap 0 true id StringSet.empty);
print_html_trailer file)
let compare_lu_id (x,_,_) (y,_,_) = compare (int_of_string x) (int_of_string y)
let print_lexical_units path lumap =
let map = StringMap.fold lumap StringMap.empty (fun map id lu ->
StringMap.add_inc map lu.lu_pos [id,lu.lu_name,lu.lu_variant] (fun l -> (id,lu.lu_name,lu.lu_variant) :: l)) in
StringMap.iter map (fun pos l ->
File.file_out (path ^ pos ^ ".tab") (fun file ->
Xlist.iter (Xlist.sort l compare_lu_id) (fun (id,lemma,variant) ->
Printf.fprintf file "%s\t%s\t%s\n" id lemma variant)))
*)
let print_lexical_units_full path lumap =
let map = StringMap.fold lumap StringMap.empty (fun map id lu ->
StringMap.add_inc map lu.lu_pos [id,lu] (fun l -> (id,lu) :: l)) in
StringMap.iter map (fun pos l ->
File.file_out (path ^ pos ^ ".tab") (fun file ->
Xlist.iter l (fun (id,lu) ->
Printf.fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" id lu.lu_name lu.lu_tagcount lu.lu_domain lu.lu_desc lu.lu_workstate lu.lu_source lu.lu_variant)))
(*
let print_synsets filename synmap =
File.file_out filename (fun file ->
StringMap.iter synmap (fun id syn ->
Printf.fprintf file "%s\t%s\n" id (String.concat " " syn.syn_units)))
let print_hipero filename hipero =
File.file_out filename (fun file ->
StringMap.iter hipero (fun id l ->
Printf.fprintf file "%s\t%s\n" id (String.concat " " l)))
*)
(*let xml_fold filename s f =
File.file_in filename (fun file ->
let scanbuf = Scanf.Scanning.from_function (fun () -> input_char file) in
let (*size*)_ =
try
Scanf.bscanf scanbuf "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<array-list owner=\"\" date=\"2013-07-24 11.45.32\" version=\"WordnetLoom 1.64.0\">\n"
with
_ -> failwith ("xml_fold " ^ filename ^ ": invalid header") in
let lex = Lexing.from_function (fun buf len -> input file buf 0 len) in
let par = XmlParser.make () in
XmlParser.prove par false;
XmlParser.check_eof par false;
let r = ref s in
let i = ref 0 in
try
while true do
try
let graph = (*of_xml*) (XmlParser.parse par (XmlParser.SLexbuf lex)) in
r := f (!r) graph;
incr i;
with Parsing.Parse_error -> failwith ("xml_fold " ^ filename ^ ": Parse_error")
done;
!r
with
Xml.Error e -> (
print_endline ("xml_fold Xml.Error " ^ Xml.error e);
Printf.printf "%d\n" (!i);
(*if !i = size then*) !r
(*else failwith ("xml_fold " ^ filename ^ ": invalid file contents")*)))*)
*)