ENIAMwalConnect.ml
13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
(*
* ENIAMwalenty, an interface for Polish Valence Dictionary "Walenty".
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
let rec parse_comp = function
"int",[] -> ENIAMwalTypes.Int,[]
| "rel",[] -> ENIAMwalTypes.Rel,[]
| "int",l -> ENIAMwalTypes.Int, Xlist.map l (fun s -> ENIAMwalTypes.Comp s)
| "rel",l -> ENIAMwalTypes.Rel, Xlist.map l (fun s -> ENIAMwalTypes.Comp s)
| s,[] -> ENIAMwalTypes.CompTypeUndef,[ENIAMwalTypes.Comp s]
| _ -> failwith "parse_comp"
let rec morf_of_phrase = function
NP c -> ENIAMwalTypes.Phrase (ENIAMwalTypes.NP(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
| PrepNP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.PrepNP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
| AdjP c -> ENIAMwalTypes.Phrase (ENIAMwalTypes.AdjP(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
| PrepAdjP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.PrepAdjP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
| ComprepNP prep -> ENIAMwalTypes.Phrase (ENIAMwalTypes.ComprepNP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep]))
| CP(co) -> ENIAMwalTypes.PhraseComp(ENIAMwalTypes.Cp,parse_comp co)
| NCP(c,co) -> ENIAMwalTypes.PhraseComp(ENIAMwalTypes.Ncp(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]),parse_comp co)
| PrepNCP(prep,c,co) -> ENIAMwalTypes.PhraseComp(ENIAMwalTypes.Prepncp(ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]),parse_comp co)
| InfP(a) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.InfP(ENIAMwalParser.parse_aspect [ENIAMwalTypes.Text a]))
| XP(mode,phrases) -> ENIAMwalTypes.PhraseAbbr(ENIAMwalTypes.Xp(fst (ENIAMwalParser.parse_mode [ENIAMwalTypes.Text mode])), Xlist.map phrases morf_of_phrase)
| AdvP mode -> ENIAMwalTypes.PhraseAbbr(ENIAMwalTypes.Advp(fst (ENIAMwalParser.parse_mode [ENIAMwalTypes.Text mode])), [])
| NumP(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.NumP(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
| PrepNumP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.PrepNumP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
| ComparP prep -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.ComparP(ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep]),[])
| Nonch -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.Nonch,[])
| Or -> ENIAMwalTypes.Phrase ENIAMwalTypes.Or
| Refl -> ENIAMwalTypes.Phrase (ENIAMwalTypes.Lex "się")
| Recip -> ENIAMwalTypes.Phrase (ENIAMwalTypes.Lex "się")
| E -> ENIAMwalTypes.E ENIAMwalTypes.Null
| DistrP -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.Distrp,[])
| PossP -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.Possp,[])
| FixedP(_,s) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.FixedP s)
| Lex lex -> (*print_endline "lex";*) ENIAMwalTypes.Phrase (ENIAMwalTypes.Null) (* FIXME: ni *)
| Null -> ENIAMwalTypes.Phrase (ENIAMwalTypes.Null)
| _ -> failwith "morf_of_phrase"
(* | GerP(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
| PrepGerP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
| PpasP(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
| PrepPpasP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
| PPact(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
| PrepPactP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
| Qub -> ENIAMwalTypes.Phrase (ENIAMwalTypes.*)
open ENIAMwalTypes
let process_phrases phrases =
Xlist.fold phrases StringMap.empty (fun phrases (id,phrase) ->
let id =
match id with
{hash=false; numbers=[(*_;_;_;*)id]; suffix="phr"} -> id
| _ -> failwith "process_phrases" in
StringMap.add phrases id phrase)
let process_positions positions =
Xlist.fold positions StringMap.empty (fun positions position ->
let id =
match position.psn_id with
{hash=false; numbers=[(*_;_;*)id]; suffix="psn"} -> id
| _ -> failwith "process_positions" in
let r,cr,ce = ENIAMwalParser.parse_roles (position.gf :: position.control) in
let phrases = process_phrases position.phrases in
StringMap.add positions id (r,cr,ce,phrases))
let process_schemata schemata =
Xlist.fold schemata StringMap.empty (fun schemata schema ->
let id =
match schema.sch_id with
{hash=false; numbers=[(*_;*)id]; suffix="sch"} -> id
| _ -> failwith "process_schemata" in
let schema_atrs = DefaultAtrs([],
ENIAMwalParser.parse_refl [Text schema.reflexiveMark],
ENIAMwalParser.parse_opinion schema.opinion,
ENIAMwalParser.parse_negation [Text schema.negativity],
ENIAMwalParser.parse_pred [Text schema.predicativity],
ENIAMwalParser.parse_aspect [Text schema.aspect]) in
let positions = process_positions schema.positions in
StringMap.add schemata id (schema_atrs,positions))
let add_meanings meanings = function
DefaultAtrs(_,r,o,n,p,a) -> DefaultAtrs(meanings,r,o,n,p,a)
| _ -> failwith "add_meanings"
let process_arguments arguments =
Xlist.fold arguments StringMap.empty (fun arguments argument ->
let id =
match argument.arg_id with
{hash=false; numbers=[(*_;_;*)id]; suffix="arg"} -> id
| _ -> failwith "process_arguments" in
StringMap.add arguments id (argument.role,argument.role_attribute,argument.sel_prefs))
let get_meaning_id meaning =
match parse_full_id meaning with
{hash=true; numbers=[_;id]; suffix="mng"} -> id
| _ -> failwith "get_meaning_id"
let get_schema_id alt =
try
match parse_full_id (List.hd ((List.hd alt.connections).phrases)) with
{hash=true; numbers=[_;id;_;_]; suffix="phr"} -> id
| _ -> failwith "get_schema_id 1"
with _ -> failwith "get_schema_id 2"
let get_frame_id alt =
try
match parse_full_id ((List.hd alt.connections).argument) with
{hash=true; numbers=[_;id;_]; suffix="arg"} -> id
| _ -> failwith "get_frame_id"
with _ -> failwith "get_frame_id"
let get_argument_id arg =
match parse_full_id arg with
{hash=true; numbers=[_;_;id]; suffix="arg"} -> id
| _ -> failwith "get_argument_id"
let get_position_id phrases =
try
match parse_full_id (List.hd phrases) with
{hash=true; numbers=[_;_;id;_]; suffix="phr"} -> id
| _ -> failwith "get_position_id"
with _ -> failwith "get_position_id"
let get_phrase_id arg =
match parse_full_id arg with
{hash=true; numbers=[_;_;_;id]; suffix="phr"} -> id
| _ -> failwith "get_phrase_id"
let process_frames frames =
Xlist.fold frames StringMap.empty (fun frames frame ->
let id =
match frame.frm_id with
{hash=false; numbers=[(*_;*)id]; suffix="frm"} -> id
| _ -> failwith "process_frames" in
let arguments = process_arguments frame.arguments in
let meaning_ids = Xlist.map frame.meanings get_meaning_id in
StringMap.add frames id (meaning_ids,arguments))
let process_meanings meanings =
Xlist.fold meanings StringMap.empty (fun meanings meaning ->
let id =
match meaning.mng_id with
{hash=false; numbers=[(*_;*)id]; suffix="mng"} -> id
| _ -> failwith "process_meanings" in
StringMap.add meanings id (meaning.name ^ " " ^ meaning.variant))
let process_sel_pref = function
Numeric s -> (try ENIAMplWordnet.synset_name s with Not_found -> "unknown")
| Symbol s -> s
| Relation(s,t) -> "REL" (* FIXME *)
let connect entry =
let schemata = process_schemata entry.schemata in
let frames = process_frames entry.frames in
let meanings = process_meanings entry.meanings in
Xlist.fold entry.alternations [] (fun found alt ->
if alt.connections = [] then found else
let schema_id = get_schema_id alt in
let frame_id = get_frame_id alt in
let schema_atrs,positions = StringMap.find schemata schema_id in
let meaning_ids,arguments = StringMap.find frames frame_id in
let positions = Xlist.fold alt.connections [] (fun positions2 conn ->
let argument_id = get_argument_id conn.argument in
let position_id = get_position_id conn.phrases in
let r,cr,ce,phrases = StringMap.find positions position_id in
let phrases = Xlist.fold conn.phrases [] (fun phrases2 id ->
let phrase_id = get_phrase_id id in
try StringMap.find phrases phrase_id :: phrases2
with Not_found -> (*Printf.printf "%s\n%!" entry.form_orth;*)phrases2) in
let role,role_attribute,sel_prefs = StringMap.find arguments argument_id in
let sel_prefs = Xlist.map (List.flatten sel_prefs) process_sel_pref in
{gf=r; role=role; role_attr=role_attribute; sel_prefs=sel_prefs;
cr=cr; ce=ce; dir=Both; morfs=Xlist.map phrases morf_of_phrase} :: positions2) in
let meanings = List.rev (Xlist.fold meaning_ids [] (fun l id ->
(StringMap.find meanings id) :: l)) in
let schema_atrs = add_meanings meanings schema_atrs in
(entry.form_orth,entry.form_pos,Frame(schema_atrs,positions)) :: found)
let connect2 entry =
let schemata = process_schemata entry.schemata in
StringMap.fold schemata [] (fun found _ (schema_atrs,positions) ->
let positions = StringMap.fold positions [] (fun positions2 _ (r,cr,ce,phrases) ->
let phrases = StringMap.fold phrases [] (fun phrases2 _ phrase -> phrase :: phrases2) in
{gf=r; role=""; role_attr=""; sel_prefs=[];
cr=cr; ce=ce; dir=Both; morfs=Xlist.map phrases morf_of_phrase} :: positions2) in
(entry.form_orth,entry.form_pos,Frame(schema_atrs,positions)) :: found)
let load_walenty2 () =
let walenty = load_walenty walenty_filename in
Xlist.fold walenty StringMap.empty (fun walenty entry ->
if entry.frames = [] then Xlist.fold (connect2 entry) walenty (fun walenty (lemma,pos,frame) ->
let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
StringMap.add walenty pos map)
else Xlist.fold (connect entry) walenty (fun walenty (lemma,pos,frame) ->
let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
StringMap.add walenty pos map))
let print_stringqmap filename qmap =
let l = StringQMap.fold qmap [] (fun l k v -> (v,k) :: l) in
File.file_out filename (fun file ->
Xlist.iter (Xlist.sort l compare) (fun (v,k) ->
Printf.fprintf file "%5d %s\n" v k))
let sel_prefs_quantities walenty =
Xlist.fold walenty StringQMap.empty (fun quant e ->
Xlist.fold e.frames quant (fun quant f ->
Xlist.fold f.arguments quant (fun quant a ->
Xlist.fold a.sel_prefs quant (fun quant l ->
Xlist.fold l quant (fun quant -> function
Numeric s ->
let name = try ENIAMplWordnet.synset_name s with Not_found -> "unknown" in
StringQMap.add quant ("N " ^ s ^ " " ^ name)
| Symbol s -> StringQMap.add quant ("S " ^ s)
| Relation(s,t) -> StringQMap.add quant ("R " ^ s ^ " | " ^ t))))))
(*let _ =
let walenty = load_walenty walenty_filename in
let quant = sel_prefs_quantities walenty in
print_stringqmap "results/quant_sel_prefs.txt" quant*)
let print_entry filename lex =
match Xml.parse_file filename with
Xml.Element("TEI", _,
[Xml.Element("teiHeader",_,_) ;
Xml.Element("text",[],[Xml.Element("body",[],entries)])]) ->
Xlist.iter entries (function
Xml.Element("entry",_,Xml.Element("form", [], [Xml.Element("orth",[],[Xml.PCData orth]);_]) :: xml :: _) ->
if orth = lex then print_endline (Xml.to_string_fmt xml)
| _ -> failwith "print_entry")
| _ -> failwith "print_entry"
(*let _ =
print_entry walenty_filename "bębnić"*)
let print_full_entry filename lex =
match Xml.parse_file filename with
Xml.Element("TEI", _,
[Xml.Element("teiHeader",_,_) ;
Xml.Element("text",[],[Xml.Element("body",[],entries)])]) ->
Xlist.iter entries (function
Xml.Element("entry",e,Xml.Element("form",[],[Xml.Element("orth",[],[Xml.PCData orth]);p]) :: syn :: _ :: l) ->
let xml = Xml.Element("entry",e,Xml.Element("form",[],[Xml.Element("orth",[],[Xml.PCData orth]);p]) :: syn :: l) in
if orth = lex then print_endline (Xml.to_string_fmt xml)
| _ -> failwith "print_full_entry")
| _ -> failwith "print_full_entry"
(*let _ =
print_full_entry walenty_filename "bębnić"*)
(*let _ =
let walenty = load_walenty2 () in
let frames_sem = try StringMap.find (StringMap.find walenty "verb") "bębnić" with Not_found -> failwith "walTEI" in
Xlist.iter frames_sem (fun frame ->
print_endline (WalStringOf.frame "bębnić" frame))*)