LCGvalence.ml
16.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMwalTypes
open LCGtypes
open Printf
open Xstd
let rec list_assoc2 x = function
(s,a,b) :: l -> if x = s then a,b else list_assoc2 x l
| [] -> raise Not_found
let meaning_weight = -1.
let prepare_senses lemma meanings senses =
match meanings,senses with
[],[] -> [lemma, ["ALL"],0.] (* FIXME *)
| [],_ ->
Xlist.map senses (fun (sense,hipero,weight) ->
if hipero = ["0"] then sense,["0"],weight else
sense,(if hipero = [] then ["ALL"] else hipero),weight)
| _,[] -> Xlist.map meanings (fun meaning -> meaning, ["ALL"],meaning_weight)
| _,_ ->
Xlist.map meanings (fun meaning ->
let hipero,weight = try list_assoc2 meaning senses with Not_found -> [],meaning_weight in
if hipero = ["0"] then meaning,["0"],weight else meaning,(if hipero = [] then ["ALL"] else hipero),weight)
let extract_meaning lemma = function
DefaultAtrs(m,r,o,neg,p,a) -> m,lemma,DefaultAtrs([],r,o,neg,p,a)
| EmptyAtrs m -> m,lemma,EmptyAtrs []
| NounAtrs(m,nsyn,s(*,typ*)) -> m,lemma,NounAtrs([],nsyn,s(*,typ*))
| AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> m,lemma,AdjAtrs([],c,adjsyn(*,adjsem,typ*))
| PersAtrs(m,le,neg,mo,t,au,a) -> m,le,PersAtrs([],le,neg,mo,t,au,a)
| GerAtrs(m,le,neg,a) -> m,le,GerAtrs([],le,neg,a)
| NonPersAtrs(m,le,role,role_attr,neg,a) -> m,le,NonPersAtrs([],le,role,role_attr,neg,a)
| _ -> failwith "extract_meaning"
let extract_roles = function
NonPersAtrs(m,le,role,role_attr,neg,a) -> role,role_attr
| _ -> failwith "extract_roles"
let get_lemma = function
ENIAMtokenizerTypes.Lemma(lemma,cat,_) -> lemma,cat
| ENIAMtokenizerTypes.Interp lemma -> lemma,"interp"
| _ -> "",""
let prepare_valence tokens lex_sems =
let valence = Array.make (ExtArray.size tokens) [] in
Int.iter 1 (ExtArray.size tokens - 1) (fun id ->
let c = ExtArray.get tokens id in
let d = ExtArray.get lex_sems id in
let lemma,cat = get_lemma c.ENIAMtokenizerTypes.token in
let lemma = if lemma = "<or-sentence>" (*|| lemma = ":s" || lemma = "„s"*) then "pro-komunikować" else lemma in
if lemma = "" then () else
let prep_valence =
if cat = "prep" then
(* (0,lemma,StringSet.empty,0.,"NOSEM","",Frame(EmptyAtrs[],[])) :: *)
match d.ENIAMlexSemanticsTypes.semantics with
ENIAMlexSemanticsTypes.Normal -> []
| ENIAMlexSemanticsTypes.PrepSemantics l -> (* FIXME: uzgadnianie cases *)
Xlist.rev_map l (fun (case,lrole,lrole_attr,hipero,sel_prefs) ->
0,lemma,hipero,0.,lrole,lrole_attr,Frame(EmptyAtrs[],[]))
| _ -> failwith "prepare_valence"
else [] in
let valence2 = if d.ENIAMlexSemanticsTypes.valence = [] then [0,Frame(EmptyAtrs[],[])] else d.ENIAMlexSemanticsTypes.valence in
let lrole,lrole_attr = d.ENIAMlexSemanticsTypes.lroles in
valence.(id) <- prep_valence @ List.flatten (Xlist.map valence2 (function
fnum,Frame(attrs,schema) ->
let meanings,lemma,attrs = extract_meaning lemma attrs in
let lrole,lrole_attr =
if cat = "pact" || cat = "ppas" then extract_roles attrs else
if cat = "pcon" then "Con","" else
if cat = "pant" then "Ant","" else
d.ENIAMlexSemanticsTypes.lroles in
Xlist.map (prepare_senses lemma meanings d.ENIAMlexSemanticsTypes.senses) (fun (meaning,hipero,weight) ->
let hipero = if cat = "conj" then ["0"] else hipero in
fnum,meaning,StringSet.of_list hipero,weight,lrole,lrole_attr,
Frame(attrs,Xlist.map schema (fun s ->
(* let s = if s.sel_prefs=[] then (print_endline ("prepare_valence empty sel_prefs: " ^ lemma ^ " " ^ cat); {s with sel_prefs=["ALL"]}) else s in *)
if s.role="" && s.gf <> ADJUNCT && s.gf <> NOSEM then (
printf "%d: %s\n%!" fnum (ENIAMwalStringOf.frame lemma (Frame(attrs,schema)));
failwith ("prepare_valence empty role: " ^ lemma ^ " " ^ cat)) else
{s with morfs=List.sort compare s.morfs})))
| fnum,(LexFrame _ as frame) -> [fnum,"lex",StringSet.empty,0.,lrole,lrole_attr,frame]
| fnum,(ComprepFrame _ as frame) -> [fnum,"comprep",StringSet.empty,0.,lrole,lrole_attr,frame])));
valence
(*let create_pro_frames t =
[0,t.pred,StringSet.singleton "0",0.,"","",Frame(EmptyAtrs[],[])]*)
let get_fnum t =
let x = try Xlist.assoc t.attrs "FNUM" with Not_found -> Val "0" in
(match x with
Val s -> (try int_of_string s with _ -> failwith "get_fnum 1")
| _ -> failwith "get_fnum 2")
let select_frames l t =
(* printf "a1 pred=%s\n" t.pred; *)
let fnum = get_fnum t in
let l = Xlist.fold l [] (fun l (n,meaning,hipero,weight,lrole,lrole_attr,frame) ->
if n = fnum then (meaning,hipero,weight,lrole,lrole_attr,frame) :: l else l) in
(* printf "a2 pred=%s\n" t.pred; *)
match l with
[] -> "",[]
| [_,_,_,_,_,LexFrame _] -> "",[]
| [_,_,_,_,_,ComprepFrame _] -> "",[]
| _ -> LCGreductions.get_variant_label (),
fst (Xlist.fold l ([],1) (fun (l,i) t -> (string_of_int i, t) :: l, i+1))
let rec get_arg_refs found = function
Variant(_,l) -> Xlist.fold l found (fun found (_,t) -> get_arg_refs found t)
| Ref i -> i :: found
| t -> failwith ("get_arg_refs: " ^ LCGstringOf.linear_term 0 t)
let rec match_position = function
a :: la, b :: lb ->
if a = b then match_position (la,b :: lb) else
if a > b then match_position (a :: la,lb)
else false
| [],_ -> true
| _,[] -> false
let mark_sem_morfs morfs =
Xlist.map morfs (function
| Phrase(PrepNP(_,prep,c)) -> Phrase(PrepNP(Sem,prep,c))
| Phrase(PrepAdjP(_,prep,c)) -> Phrase(PrepAdjP(Sem,prep,c))
| Phrase(PrepNumP(_,prep,c)) -> Phrase(PrepNumP(Sem,prep,c))
| Phrase(ComprepNP(_,prep)) -> Phrase(ComprepNP(Sem,prep))
| Phrase(ComparNP(_,prep,c)) -> Phrase(ComparNP(Sem,prep,c))
| Phrase(ComparPP(_,prep)) -> Phrase(ComparPP(Sem,prep))
| Phrase(PrepNCP(_,prep,c,ct,co)) -> Phrase(PrepNCP(Sem,prep,c,ct,co))
| t -> t)
let mark_nosem_morf = function
Phrase(PrepNP(_,prep,c)) -> Phrase(PrepNP(NoSem,prep,c))
| Phrase(PrepAdjP(_,prep,c)) -> Phrase(PrepAdjP(NoSem,prep,c))
| Phrase(PrepNumP(_,prep,c)) -> Phrase(PrepNumP(NoSem,prep,c))
| Phrase(ComprepNP(_,prep)) -> Phrase(ComprepNP(NoSem,prep))
| Phrase(ComparNP(_,prep,c)) -> Phrase(ComparNP(NoSem,prep,c))
| Phrase(ComparPP(_,prep)) -> Phrase(ComparPP(NoSem,prep))
| Phrase(PrepNCP(_,prep,c,ct,co)) -> Phrase(PrepNCP(NoSem,prep,c,ct,co))
| t -> failwith "mark_nosem_morf"
let rec is_nosem_morf = function
Phrase(PrepNP(NoSem,prep,c)) -> true
| Phrase(PrepAdjP(NoSem,prep,c)) -> true
| Phrase(PrepNumP(NoSem,prep,c)) -> true
| Phrase(ComprepNP(NoSem,prep)) -> true
| Phrase(ComparNP(NoSem,prep,c)) -> true
| Phrase(ComparPP(NoSem,prep)) -> true
| Phrase(PrepNCP(NoSem,prep,c,ct,co)) -> true
| _ -> false
let rec is_sem_morf = function
Phrase(PrepNP(Sem,prep,c)) -> true
| Phrase(PrepAdjP(Sem,prep,c)) -> true
| Phrase(PrepNumP(Sem,prep,c)) -> true
| Phrase(ComprepNP(Sem,prep)) -> true
| Phrase(ComparNP(Sem,prep,c)) -> true
| Phrase(ComparPP(Sem,prep)) -> true
| Phrase(PrepNCP(Sem,prep,c,ct,co)) -> true
| _ -> false
let rec exclude_sem_morfs = function
[] -> []
| morf :: morfs -> if is_sem_morf morf then exclude_sem_morfs morfs else morf :: exclude_sem_morfs morfs
(* UWAGA: dopasowywane ramy są preselekcjonowane, więc wszystkie argumenty muszą się maczować *)
let match_args_pos modifications nodes e i schema t =
(* printf "match_args_pos\n"; *)
(* if schema = [] then schema else *)
let refs = get_arg_refs [] t in
let gfl,morfs = Xlist.fold refs ([],[]) (fun (gfl,morfs) i -> nodes.(i).agf :: gfl, nodes.(i).amorf :: morfs) in
if gfl = [] then failwith "match_args_pos: empty gfl" else
let gf = Xlist.fold (List.tl gfl) (List.hd gfl) (fun gf gf2 -> if gf = gf2 then gf else failwith "match_args_pos 2") in
(* if gf = NOSEM || gf = NOGF then schema else
if gf = CORE then schema else (* FIXME: semantyka dla core *)*)
let morfs = exclude_sem_morfs morfs in
let schema,selected =
if morfs = [] then schema,[] else
let morfs = List.sort compare morfs in
(* printf "gf=%s morfs=%s\n%!" (ENIAMwalStringOf.gf gf) (String.concat ";" (Xlist.map morfs ENIAMwalStringOf.morf)); *)
Xlist.fold schema ([],[]) (fun (schema,selected) pos ->
(* printf "pos.gf=%s pos.morfs=%s\n%!" (ENIAMwalStringOf.gf pos.gf) (String.concat ";" (Xlist.map pos.morfs ENIAMwalStringOf.morf)); *)
if gf = pos.gf || (gf = ADJUNCT && pos.gf=ARG) then
if match_position (morfs,(*mark_sem_morfs*) pos.morfs) then schema, pos :: selected else pos :: schema, selected
else pos :: schema, selected) in
(match selected with
[] -> (*if gf = ARG then failwith "match_args_pos 3" else*)
Xlist.iter refs (fun r ->
modifications.(r) <- StringMap.add modifications.(r) (e ^ i)
LCGrenderer.empty_schema_field(*{gf=ADJUNCT; role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; dir=Both; morfs=[]}*) )
(* FIXME: przy kilku pasujących pozycjach wybieram pierwszą a nich, do poprawienia przy okazji porządków z walencją
np walencja leksemu "godzina":
1: : : common: time: Poss,T|{null;np(gen);nump(gen)}+Arg,T|{null;np(gen);nump(gen)};
1: : : common: time: Temp,T|{null;np(gen);nump(gen)}]; *)
| pos :: _ -> Xlist.iter refs (fun r -> (* FIXME: gdzieś tu trzeba wstawić uzupełnianie brakujących ról dla argumentów i adjunctów *) (* FIXME: tu pewnie trzeba będzie wstawić rolę dla adjunctów, które pojawią się w wyniku niespełniania SEL-PREFS *)
modifications.(r) <- StringMap.add modifications.(r) (e ^ i) pos)
(*| _ -> failwith "match_args_pos 4"*));
schema
let rec match_args_tuple modifications nodes e i schema = function
Tuple l ->
Xlist.fold l schema (fun schema t ->
match_args_tuple modifications nodes e i schema t)
| t -> match_args_pos modifications nodes e i schema t
let match_args modifications nodes e i t = function
Frame(_,schema) -> ignore (match_args_tuple modifications nodes e i schema t.args)
| LexFrame _ -> failwith "match_args"
| ComprepFrame _ -> failwith "match_args"
let rec assign_frames_and_senses_rec modifications valence nodes t =
(* printf "pred=%s id=%d\n" t.pred t.id; *)
(* if t.id = 0 then failwith ("assign_frames_and_senses_rec: t.id=0 pred=" ^ t.pred) else *) (* FIXME: zbadać po co było to zabezpieczenie *)
let e,node_valence = select_frames ((*if t.id >= Array.length valence then create_pro_frames t else*) valence.(t.id)) t in
let node_valence = if node_valence <> [] then node_valence else ["1",(t.pred,StringSet.empty,0.,"","",Frame(EmptyAtrs[],[]))] in
let l = Xlist.map node_valence (fun (i,(meaning,hipero,weight,lrole,lrole_attr,frame)) ->
let t = if lrole = "" then t else {t with arole=lrole;arole_attr=lrole_attr} in (* FIXME: pomijam to, że role dla rzeczowników dotyczą tylko inst *)
(* let t = if lrole = "NOSEM" then {t with amorf=mark_nosem_morf t.amorf} else t in *)
if t.args <> Dot then match_args modifications nodes e i t frame;
(* printf "meaning=%s\n" meaning; *)
i,Node{t with meaning=meaning;
hipero=hipero;
meaning_weight=weight}) in
Variant(e,l)
let rec is_nosem_morfs morfs =
let sem = Xlist.fold morfs false (fun b m -> b || is_sem_morf m) in
let nosem = Xlist.fold morfs false (fun b m -> b || is_nosem_morf m) in
nosem && not sem (* FIXME: pewne uproszczenie, ale liczę, że nie ma wielu ramek z xp koordynowanym z prep *)
let has_tuple = function
Tuple _ -> true
| _ -> false
let rec apply_modifications_rec (*tokens*) pos = function
Variant(e,l) -> Variant(e,List.rev (Xlist.fold l [] (fun l (i,t) ->
let t = apply_modifications_rec (*tokens*) pos t in
(*if t = Dot then l else*) (i, (*apply_modifications_rec (*tokens*) pos*) t) :: l)))
| Node t ->
if is_sem_morf t.amorf then Node t else (* FIXME: czy to jest poprawne? *)
let t = if is_nosem_morf t.amorf then {t with agf=ARG} else t in
Node{t with position=pos}
| _ -> failwith "apply_modifications_rec"
(* let empty_pos = {gf=ADJUNCT; role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; dir=Both; morfs=[]} (* FIXME: jaka GF? *) *)
let apply_modifications (*tokens*) modifications nodes references =
Int.iter 1 (Array.length references - 1) (fun r ->
(* if StringMap.is_empty modifications.(r) then failwith ("apply_modifications: " ^ nodes.(r).pred) else *)
references.(r) <- Choice(StringMap.map modifications.(r) (fun pos ->
apply_modifications_rec (*tokens*) pos references.(r))))
let rec extract_nosem rev = function
[] -> List.rev rev, false
| ("NOSEM",Val "+") :: l -> (List.rev rev) @ l, true
| x :: l -> extract_nosem (x :: rev) l
let get_nodes = function
Node t ->
let attrs,b = extract_nosem [] t.attrs in
(* let t = if t.pred = "<query1>" || t.pred = "<query2>" || t.pred = "<query3>" || t.pred = "<query4>" || t.pred = "<query5>" || t.pred = "<query6>" then {t with agf=CORE} else t in *)
let t = if t.pred = "<conll_root>" || t.pred = "<sentence>" || t.pred = "pro-komunikować" then {t with agf=CORE} else t in (* FIXME: przetestować na mowie niezależnej *)
if t.agf = NOGF then failwith ("get_nodes agf=NOGF: " ^ t.pred) else
if b then {t with amorf=mark_nosem_morf t.amorf; attrs=attrs} else t
| _ -> failwith "get_nodes"
let rec propagate_nosem_selprefs modifications ei = function
Choice choice -> Choice(StringMap.map choice (propagate_nosem_selprefs modifications ""))
| Variant(e,l) -> Variant(e,Xlist.map l (fun (i,t) -> i, propagate_nosem_selprefs modifications (e ^ i) t))
| Node t ->
if (t.cat = "prep" && t.arole = "NOSEM") || t.cat = "num" then
let refs = IntSet.of_list (get_arg_refs [] t.args) in
IntSet.iter refs (fun r ->
modifications.(r) <- StringMap.add_inc modifications.(r) ei t.position.ENIAMwalTypes.sel_prefs (fun l ->
if l = t.position.ENIAMwalTypes.sel_prefs then l else failwith ("propagate_nosem_selprefs 1: [" ^ String.concat ";" l ^ "] [" ^ String.concat ";" t.position.ENIAMwalTypes.sel_prefs ^ "]")));
Node{t with position= {t.position with ENIAMwalTypes.sel_prefs = []}}
else Node t
| _ -> failwith "propagate_nosem_selprefs 2"
let rec apply_modifications2_rec mods = function
Variant(e,l) -> Variant(e,Xlist.map l (fun (i,t) -> i, apply_modifications2_rec mods t))
| Node t ->
if t.position.ENIAMwalTypes.sel_prefs <> [] then failwith "apply_modifications2_rec" else
Node{t with position={t.position with ENIAMwalTypes.sel_prefs=mods}}
| _ -> failwith "apply_modifications2_rec"
let apply_modifications2 modifications references =
Int.iter 1 (Array.length references - 1) (fun r ->
if not (StringMap.is_empty modifications.(r)) then
match references.(r) with
Choice choice ->
references.(r) <- Choice(StringMap.mapi choice (fun ei t ->
try apply_modifications2_rec (StringMap.find modifications.(r) ei) t with Not_found -> t))
| _ -> failwith "apply_modifications2")
let assign_frames_and_senses tokens lex_sems references =
let modifications = Array.make (Array.length references) StringMap.empty in
let valence = prepare_valence tokens lex_sems in
let nodes = Array.map get_nodes references in
let references = Array.map (assign_frames_and_senses_rec modifications valence nodes) nodes in
apply_modifications (*tokens*) modifications nodes references;
(* let modifications = Array.make (Array.length references) StringMap.empty in
Int.iter 0 (Array.length references - 1) (fun r -> references.(r) <- propagate_nosem_selprefs modifications "" references.(r)); (* FIXME: propagowanie preferencji selekcyjnych więcej niż jeden poziom w głąb nie działa *)
apply_modifications2 modifications references;
Int.iter 0 (Array.length references - 1) (fun r -> references.(r) <- propagate_nosem_selprefs modifications "" references.(r));
apply_modifications2 modifications references;
Int.iter 0 (Array.length references - 1) (fun r -> references.(r) <- propagate_nosem_selprefs modifications "" references.(r));
apply_modifications2 modifications references; *)
references