ENIAMplWordnet.ml
8.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
(*
* ENIAMplWordnet, a converter for Polish Wordnet "Słowosieć".
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAMplWordnetTypes
let process_unit = function
Xml.Element("unit-id",[],[Xml.PCData s]) -> int_of_string s, empty_lu
| node -> failwith ("process_unit " ^ (Xml.to_string node))
let process_tests = function
Xml.Element("test",["text",text;"pos",pos],[]) -> text,pos
| node -> failwith ("process_tests " ^ (Xml.to_string node))
(* funkcja zwraca:
lexical-unit map - wiąże leksemy z identyfikatorami
synset map
lexicalrelations
synsetrelations
relationtypes map
*)
let process_entry (lumap,synmap,lr,sr,rtmap) = function
Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"workstate",workstate;
"source",source;"variant",variant],[]) ->
let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc="";
lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 2") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"desc",desc;"workstate",workstate;
"source",source;"variant",variant],[]) ->
let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc=desc;
lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 3") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"definition",definition;"desc",desc;
"abstract",abstract],units) ->
let units = Xlist.map units process_unit in
let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition=definition;
syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"desc",desc;
"abstract",abstract],units) ->
let units = Xlist.map units process_unit in
let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition="";
syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("lexicalrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
let lr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: lr in
lumap,synmap,lr,sr,rtmap
| Xml.Element("synsetrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
let sr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: sr in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"reverse",reverse;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"reverse",reverse;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"name",name;"description",description;
"posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
"pwn",pwn],tests) ->
let tests = Xlist.map tests process_tests in
let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
(fun _ -> failwith "process_entry 5") in
lumap,synmap,lr,sr,rtmap
| node -> print_endline (Xml.to_string node); failwith "process_entry 1"
let load_data filename =
match try Xml.parse_file filename with Xml.Error e -> failwith ("load_data Xml.Error " ^ Xml.error e) with
Xml.Element("array-list",_,entries) ->
Xlist.fold entries (IntMap.empty,IntMap.empty,[],[],IntMap.empty) process_entry
| node -> failwith ("load_data " ^ (Xml.to_string node))
let check_lu_syn_consistency lumap synmap =
let set = IntMap.fold lumap IntSet.empty (fun set id _ ->
if IntSet.mem set id then failwith "check_lu_syn_consistency 1" else
IntSet.add set id) in
let set = IntMap.fold synmap set (fun set _ syn ->
Xlist.fold syn.syn_units set (fun set (id,_) ->
if not (IntSet.mem set id) then failwith "check_lu_syn_consistency 2" else
IntSet.remove set id)) in
if not (IntSet.is_empty set) then failwith "check_lu_syn_consistency 3" else
()
let merge_lu_syn lumap synmap =
IntMap.map synmap (fun syn ->
let units = Xlist.map syn.syn_units (fun (id,_) -> id, IntMap.find lumap id) in
let pos = match StringSet.to_list (Xlist.fold units StringSet.empty (fun set (_,lu) ->
StringSet.add set lu.lu_pos)) with
[] -> failwith "merge_lu_syn: empty synset"
| [pos] -> pos
| _ -> failwith "merge_lu_syn: inconsistent pos" in
{syn with syn_units=units; syn_pos=pos})
let set_lu_syn lumap synmap =
IntMap.fold synmap lumap (fun lumap syn_id syn ->
Xlist.fold syn.syn_units lumap (fun lumap (id,_) ->
let lu = try IntMap.find lumap id with Not_found -> failwith "set_lu_syn" in
if lu.lu_syn <> -1 then failwith "set_lu_syn" else
IntMap.add lumap id {lu with lu_syn=syn_id}))