ENIAMwalGenerate.ml
16.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
(*
* ENIAMwalenty, a converter for Polish Valence Dictionary "Walenty".
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMwalTypes
open Xstd
let correct_walenty entry =
if entry.form_orth = "podobać" then
{entry with schemata=Xlist.map entry.schemata (fun s ->
{s with positions=Xlist.map s.positions (fun p ->
if p.gf=SUBJ then {p with morfs=List.flatten (Xlist.map p.morfs (function
MorfId 126 -> []
| m -> [m]))}
else p)})}
else entry
let load_walenty walenty_filename expands_filename =
let walenty,phrases = ENIAMwalTEI.load_walenty walenty_filename in
let walenty = Xlist.rev_map walenty correct_walenty in
let expands = ENIAMwalTEI.load_expands expands_filename in
let meanings =
Xlist.fold walenty IntMap.empty (fun meanings entry ->
Xlist.fold entry.meanings meanings (fun meanings meaning ->
IntMap.add meanings meaning.mng_id meaning)) in
let connected_walenty =
Xlist.fold walenty Entries.empty (fun connected_walenty e ->
let entries = ENIAMwalConnect.connect e in
Entries.add_inc_list connected_walenty e.form_pos e.form_orth entries) in
let schemata_walenty =
Xlist.fold walenty Entries.empty (fun schemata_walenty e ->
let entries = ENIAMwalConnect.schemata e in
Entries.add_inc_list schemata_walenty e.form_pos e.form_orth entries) in
let expands,compreps,subtypes,equivs,adv_types =
ENIAMwalRealizations.load_realizations (expands,ENIAMwalTEI.subtypes,ENIAMwalTEI.equivs) in
let phrases =
IntMap.map phrases (fun morf ->
let morf = ENIAMwalRealizations.expand_schema_morf expands morf in
let morfs = ENIAMwalRealizations.expand_subtypes_morf subtypes morf in
let morf = List.flatten (Xlist.map morfs (ENIAMwalRealizations.expand_equivs_morf equivs)) in
morf) in
let compreps = Xlist.map compreps (fun (lemma,morfs) ->
lemma, ENIAMwalLex.expand_lexicalizations_morfs morfs) in
let entries = ENIAMwalLex.extract_lex_entries_comprepnp [] compreps in
let phrases,entries =
IntMap.fold phrases (IntMap.empty,entries) (fun (phrases,entries) id morfs ->
let morfs = ENIAMwalLex.expand_lexicalizations_morfs morfs in
let morfs,entries = Xlist.fold morfs ([],entries) ENIAMwalLex.extract_lex_entries in
IntMap.add phrases id morfs, entries) in
let entries = Xlist.fold entries Entries.empty (fun entries (pos,lemma,entry) ->
Entries.add_inc entries pos lemma entry) in
let entries = Entries.map2 entries (fun pos lemma entries -> EntrySet.to_list (EntrySet.of_list entries)) in
let entries = Entries.flatten_map entries (fun pos lemma entry ->
ENIAMwalLex.expand_restr [] lemma pos entry) in
(* let entries =
StringMap.mapi entries (fun pos entries2 ->
StringMap.mapi entries2 (fun lemma entries3 ->
EntrySet.fold entries3 [] (fun entries3 entry ->
(ENIAMwalLex.expand_restr [] lemma pos entry) @ entries3))) in *)
connected_walenty, schemata_walenty, phrases, entries, meanings, adv_types
let print_entries filename entries =
File.file_out filename (fun file ->
Entries.iter entries (fun pos lemma entry ->
Printf.fprintf file "%s\t%s\t%s\n" pos lemma (ENIAMwalStringOf.lex_entry entry)))
let print_phrases filename phrases =
File.file_out filename (fun file ->
IntMap.iter phrases (fun id morfs ->
let morfs = Xlist.map morfs ENIAMwalStringOf.morf in
Printf.fprintf file "%d\t%s\n" id (String.concat "\t" morfs)))
let print_schemata filename schemata =
File.file_out filename (fun file ->
Entries.iter schemata (fun pos lemma (opinion,(n,p,a),schema) ->
Printf.fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" pos lemma
(ENIAMwalStringOf.opinion opinion)
(ENIAMwalStringOf.negation n)
(ENIAMwalStringOf.pred p)
(ENIAMwalStringOf.aspect a)
(ENIAMwalStringOf.simple_schema schema)))
let print_connected filename connected =
File.file_out filename (fun file ->
Entries.iter connected (fun pos lemma (sopinion,fopinion,meanings,(n,p,a),schema) ->
Printf.fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" pos lemma
(ENIAMwalStringOf.opinion sopinion)
(ENIAMwalStringOf.opinion fopinion)
(String.concat "," (Xlist.map meanings string_of_int))
(ENIAMwalStringOf.negation n)
(ENIAMwalStringOf.pred p)
(ENIAMwalStringOf.aspect a)
(ENIAMwalStringOf.connected_schema schema)))
let split_tokens s =
let l = List.flatten (Xlist.map (Str.full_split (Str.regexp " \\|,\\|-") s) (function
Str.Delim " " -> []
| Str.Delim s -> [s]
| Str.Text s -> [s])) in
String.concat " " l
let print_fixed filename fixed =
File.file_out filename (fun file ->
StringSet.iter fixed (fun s ->
let t = split_tokens s in
Printf.fprintf file "%s\t%s\tfixed\n" t s))
let print_adv_types filename adv_types =
File.file_out filename (fun file ->
Xlist.iter adv_types (fun (m,l) ->
Xlist.iter l (fun s ->
Printf.fprintf file "%s\t%s\n" s m)))
let add_fixed fixed = function
Phrase (FixedP s) -> StringSet.add fixed s
| SimpleLexArg(s,FIXED) -> StringSet.add fixed s
| LexArg(_,s,FIXED) -> StringSet.add fixed s
| _ -> fixed
let find_fixed_schema fixed schema =
Xlist.fold schema fixed (fun schema p ->
Xlist.fold p.morfs fixed add_fixed)
let find_fixed phrases entries =
let fixed = IntMap.fold phrases StringSet.empty (fun fixed _ morfs ->
Xlist.fold morfs fixed add_fixed) in
Entries.fold entries fixed (fun fixed pos lemma -> function
SimpleLexEntry(s,"fixed") -> StringSet.add fixed s
| SimpleLexEntry(s,_) -> fixed
| LexEntry(_,s,"fixed",_,schema) -> find_fixed_schema (StringSet.add fixed s) schema
| LexEntry(_,_,_,_,schema) -> find_fixed_schema fixed schema
| ComprepNPEntry(_,_,schema) -> find_fixed_schema fixed schema)
let print_meanings filename meanings =
File.file_out filename (fun file ->
IntMap.iter meanings (fun _ m ->
Printf.fprintf file "%d\t%s\t%s\t%d\t%s\n" m.mng_id m.name m.variant m.plwnluid m.gloss))
(* let connected_walenty, schemata_walenty, phrases, entries, meanings = load_walenty
"/home/yacheu/Dokumenty/NLP resources/Walenty/walenty_20170311.xml"
"/home/yacheu/Dokumenty/NLP resources/Walenty/phrase_types_expand_20170311.xml" *)
(* Generowanie zasobów *)
let _ =
if Array.length Sys.argv < 3 then print_endline "missing argument" else (
let connected_walenty, schemata_walenty, phrases, entries, meanings, adv_types = load_walenty Sys.argv.(1) Sys.argv.(2) in
print_entries "resources/entries.tab" entries;
print_phrases "resources/phrases.tab" phrases;
print_schemata "resources/schemata.tab" schemata_walenty;
print_connected "resources/connected.tab" connected_walenty;
print_fixed "resources/fixed.tab" (find_fixed phrases entries);
print_meanings "resources/meanings.tab" meanings;
print_adv_types "resources/adv_modes.tab" adv_types;
())
(* Test wczytywania Walentego TEI *)
(* let _ =
let walenty,phrases = ENIAMwalTEI.load_walenty "/home/yacheu/Dokumenty/NLP resources/Walenty/walenty_20170311.xml" in
let n = Xlist.fold ENIAMwalTEI.walenty 0 (fun n e -> let l = connect e in n + Xlist.size l) in
let m = Xlist.fold ENIAMwalTEI.walenty 0 (fun n e -> let l = schemata e in n + Xlist.size l) in
Printf.printf "%d connected\n%d schemata\n|phrases|=%d\n" n m (IntMap.size phrases);
() *)
(* Test unikalności indeksów sensów *)
(* let _ =
let walenty,phrases = ENIAMwalTEI.load_walenty "/home/yacheu/Dokumenty/NLP resources/Walenty/walenty_20170311.xml" in
Xlist.fold walenty IntMap.empty (fun map e ->
Xlist.fold e.meanings map (fun map m ->
IntMap.add_inc map m.mng_id m (fun m1 -> if m1 = m then m else failwith "meaning"))) *)
(*
(* let insert_phrases phrases = function
Frame(atrs,s) -> Frame(atrs,Xlist.map s (fun p ->
{p with morfs=Xlist.map p.morfs (function
MorfId id -> (try IntMap.find phrases id with Not_found -> failwith "insert_phrases")
| _ -> failwith "insert_phrases")}))
| _ -> failwith "insert_phrases: ni"
let print_entry pos_map pos orth =
let orth_map = try StringMap.find pos_map pos with Not_found -> StringMap.empty in
let frames = try StringMap.find orth_map orth with Not_found -> [] in
Xlist.iter frames (fun frame ->
let frame = insert_phrases ENIAMwalTEI.phrases frame in
print_endline (ENIAMwalStringOf.frame orth frame)) *)
(* Wypisanie hasła *)
(* let _ =
print_entry connected_walenty "verb" "brudzić";
() *)
(* let has_nontrivial_lex = function
Frame(atrs,s) -> Xlist.fold s false (fun b p ->
if p.role = "Lemma" && p.role_attr = "" then b else
Xlist.fold p.morfs b (fun b -> function
MorfId id -> failwith "has_nontrivial_lex"
| LexPhrase _ -> true
(* | LexRPhrase _ -> true
| LexPhraseMode _ -> true *)
| _ -> b))
| _ -> failwith "has_nontrivial_lex: ni" *)
(* Leksykalizacje nie wchodzące do lematu *)
(* let _ =
StringMap.iter connected_walenty (fun _ orth_map ->
StringMap.iter orth_map (fun orth frames ->
Xlist.iter frames (fun frame ->
let frame = insert_phrases ENIAMwalTEI.phrases frame in
if has_nontrivial_lex frame then
print_endline (ENIAMwalStringOf.frame orth frame)))) *)
let simplify_frame_verb = function
Phrase(NP(Case "dat")) -> []
| Phrase(NP(Case "inst")) -> []
| Phrase(PrepNP _) -> []
| Phrase(ComprepNP _) -> []
| Phrase(AdvP) -> []
| t -> [t]
let simplify_frame_noun = function
Phrase(NP(Case "gen")) -> []
| Phrase(NP(Case "nom")) -> []
| Phrase(NP(CaseAgr)) -> []
| Phrase(PrepNP _) -> []
| Phrase(ComprepNP _) -> []
| Phrase(AdjP CaseAgr) -> []
| PhraseComp(Ncp(Case "gen"),_)
| PhraseComp(Prepncp(_,_),_) -> []
| PhraseAbbr(Possp,[]) -> []
| t -> [t]
let simplify_frame_adj = function
| t -> [t]
let simplify_frame_adv = function
| t -> [t]
(* let simplify_frame pos = function
Frame(atrs,s) ->
let schema = Xlist.fold s [] (fun schema p ->
let morfs = Xlist.fold p.morfs [] (fun morfs morf ->
match pos with
"verb" -> simplify_frame_verb morf @ morfs
| "noun" -> simplify_frame_noun morf @ morfs
| "adj" -> simplify_frame_adj morf @ morfs
| "adv" -> simplify_frame_adv morf @ morfs
| _ -> failwith "simplify_frame") in
if morfs = [] then schema else
{p with ce=[]; cr=[]; morfs=morfs} :: schema) in
if schema = [] then [] else [Frame(atrs,schema)]
| _ -> failwith "simplify_frame: ni" *)
(* Uproszczone schematy *)
(* let _ =
StringMap.iter schemata_walenty (fun pos orth_map ->
if pos = "noun" then
StringMap.iter orth_map (fun orth frames ->
Xlist.iter frames (fun frame ->
let frame = insert_phrases ENIAMwalTEI.phrases frame in
let frames = simplify_frame pos frame in
Xlist.iter frames (fun frame -> print_endline (ENIAMwalStringOf.frame orth frame))))) *)
(* let has_mode_coordination = function
Frame(atrs,s) -> Xlist.fold s false (fun b p ->
let n = Xlist.fold p.morfs 0 (fun n -> function
MorfId id -> failwith "has_nontrivial_lex"
| PhraseAbbr(Advp _,_) -> n+1
| PhraseAbbr(Xp _,_) -> n+1
(* | LexPhraseMode _ -> n+1 FIXME*)
| _ -> n) in
if n>1 then true else b)
| _ -> failwith "has_nontrivial_lex: ni" *)
(* Koordynacja z mode *)
(* let _ =
StringMap.iter schemata_walenty(*connected_walenty*) (fun _ orth_map ->
StringMap.iter orth_map (fun orth frames ->
Xlist.iter frames (fun frame ->
let frame = insert_phrases ENIAMwalTEI.phrases frame in
if has_mode_coordination frame then
print_endline (ENIAMwalStringOf.frame orth frame)))) *)
(* let get_entry orth pos *)
(*
let load_walenty2 () =
let walenty = load_walenty walenty_filename in
Xlist.fold walenty StringMap.empty (fun walenty entry ->
if entry.frames = [] then Xlist.fold (connect2 entry) walenty (fun walenty (lemma,pos,frame) ->
let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
StringMap.add walenty pos map)
else Xlist.fold (connect entry) walenty (fun walenty (lemma,pos,frame) ->
let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
StringMap.add walenty pos map))
let print_stringqmap filename qmap =
let l = StringQMap.fold qmap [] (fun l k v -> (v,k) :: l) in
File.file_out filename (fun file ->
Xlist.iter (Xlist.sort l compare) (fun (v,k) ->
Printf.fprintf file "%5d %s\n" v k))
let sel_prefs_quantities walenty =
Xlist.fold walenty StringQMap.empty (fun quant e ->
Xlist.fold e.frames quant (fun quant f ->
Xlist.fold f.arguments quant (fun quant a ->
Xlist.fold a.sel_prefs quant (fun quant l ->
Xlist.fold l quant (fun quant -> function
Numeric s ->
let name = try ENIAMplWordnet.synset_name s with Not_found -> "unknown" in
StringQMap.add quant ("N " ^ s ^ " " ^ name)
| Symbol s -> StringQMap.add quant ("S " ^ s)
| Relation(s,t) -> StringQMap.add quant ("R " ^ s ^ " | " ^ t))))))
*)
(*let _ =
let walenty = load_walenty walenty_filename in
let quant = sel_prefs_quantities walenty in
print_stringqmap "results/quant_sel_prefs.txt" quant*)
(*let _ =
let walenty = load_walenty2 () in
let frames_sem = try StringMap.find (StringMap.find walenty "verb") "bębnić" with Not_found -> failwith "walTEI" in
Xlist.iter frames_sem (fun frame ->
print_endline (WalStringOf.frame "bębnić" frame))*)
(* Wypisanie realizacji *)
(* let _ =
Xlist.iter ENIAMwalTEI.expands (fun (id,morf,l) ->
Printf.printf "%d %s:\n" id (ENIAMwalStringOf.morf morf);
Xlist.iter l (fun morf -> Printf.printf " %s\n" (ENIAMwalStringOf.morf morf))) *)
(* Wypisanie realizacji po przetworzeniu *)
(* let _ =
AbbrMap.iter expands (fun morf l ->
Printf.printf "%s:\n" (ENIAMwalStringOf.phrase_abbr morf);
Xlist.iter l (fun morf -> Printf.printf " %s\n" (ENIAMwalStringOf.morf morf))) *)
let has_realization = function
PhraseAbbr _ -> true
| PhraseComp _ -> true
| _ -> false
(* Wypisanie fraz, które podlegają rozwijaniu *)
(*let _ =
IntMap.iter ENIAMwalTEI.phrases (fun i morf ->
if has_realization morf then
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)
(* Wypisanie fraz, które podlegają rozwijaniu *)
(* let _ =
IntMap.iter phrases (fun i morf ->
if has_realization morf then
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)
(* let test_phrases = [17088; 17133; 1642]
let _ =
Xlist.iter test_phrases (fun i ->
let m1 = IntMap.find ENIAMwalTEI.phrases i in
let m2 = IntMap.find phrases i in
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m1);
Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m2)) *)
(* let print_entries entries =
StringMap.iter entries (fun pos entries2 ->
StringMap.iter entries2 (fun lemma entries3 ->
EntrySet.iter entries3 (fun entry ->
Printf.printf "%s: %s: %s\n" pos lemma (ENIAMwalStringOf.entry entry)))) *)
(* let _ = print_entries entries *)
*)