ENIAMadjuncts.ml
16.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
(*
* ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMwalTypes
open Xstd
let simplify_position_verb mode l = function (* FIXME: dodać czyszczenie E Pro *)
NP(Case "dat") -> l
| NP(Case "inst") -> l
| NCP(Case "dat",_,_) -> l
| NCP(Case "inst",_,_) -> l
| CP _ -> l
| PrepNP _ -> l
| PrepAdjP _ -> l
| ComprepNP _ -> l
| ComparP _ -> l
| PrepNCP _ -> l
| AdvP _ -> l
| Or -> l
| SimpleLexArg("się",QUB) -> l
| E Or -> l
| E (CP(CompTypeUndef,CompUndef)) -> l
| E (PrepNP(prep,Case case)) -> l
| E (PrepNCP(prep,Case case,CompTypeUndef,CompUndef)) -> l
| NP(Case "gen") as t -> if mode = "temp" then l else t :: l
| NP(Case "acc") as t -> if mode = "dur" then l else t :: l
| t -> t :: l
let simplify_position_noun mode l = function
NP(Case "gen") -> l
| NP(Case "nom") -> l
| NCP(Case "gen",_,_) -> l
| NP(CaseAgr) -> l
| AdjP AllAgr -> l
| PrepNP _ -> l
| ComprepNP _ -> l
| ComparP _ -> l
| PrepNCP _ -> l
| t -> t :: l
let simplify_position_adj mode l = function
AdvP _ -> l
| ComparP _ -> l
| t -> t :: l
let simplify_position_adv mode l = function
AdvP _ -> l (* FIXME: czy na pewno zostawić swobodę modyfikowania przysłówka? *)
| t -> t :: l
(*
let simplify_position pos l s =
let morfs = match pos with
"verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb)
| "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun)
| "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj)
| "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv)
| _ -> s.morfs in
match morfs with
[] -> l
| [Phrase Null] -> l
| _ -> {s with morfs=morfs} :: l
let simplify_schemata pos schemata =
let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) ->
let schema = List.sort compare (Xlist.fold schema [] (fun l s ->
let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in
if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else
(* if s.cr <> [] || s.ce <> [] then s :: l else *)
simplify_position pos l s)) in
StringMap.add_inc schemata (ENIAMwalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
StringMap.fold schemata [] (fun l _ s -> s :: l)
let simplify_schemata2 pos schemata =
let simplify_position_fun = match pos with
"verb" -> simplify_position_verb2
| "noun" -> simplify_position_noun
| "adj" -> simplify_position_adj
| "adv" -> simplify_position_adv
| _ -> (fun l x -> x :: l) in
let morfs = Xlist.fold schemata [] (fun morfs schema ->
Xlist.fold schema morfs (fun morfs s ->
Xlist.fold s.morfs morfs simplify_position_fun)) in
let morfs = Xlist.fold morfs StringMap.empty (fun map s ->
StringMap.add map (ENIAMwalStringOf.morf s) s) in
let schema = StringMap.fold morfs [] (fun schema _ morf ->
{gf=ARG; role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[];
dir=Both; morfs=[Phrase Null;morf]} :: schema) in
schema*)
(*let rec classify_phrase = function
NP _ as phrase -> ENIAMwalStringOf.phrase phrase
| PrepNP _ as phrase -> ENIAMwalStringOf.phrase phrase
| AdjP _ as phrase -> ENIAMwalStringOf.phrase phrase
| PrepAdjP _ as phrase -> ENIAMwalStringOf.phrase phrase
| ComprepNP _ as phrase -> ENIAMwalStringOf.phrase phrase
| ComparP _ as phrase -> ENIAMwalStringOf.phrase phrase
| CP _ -> "cp"
| NCP(case,_,_) -> ENIAMwalStringOf.phrase (NP case)
| PrepNCP(prep,case,_,_) -> ENIAMwalStringOf.phrase (PrepNP(prep,case))
| InfP _ -> "infp"
| AdvP _ -> "advp"
| FixedP _ as phrase -> ENIAMwalStringOf.phrase phrase
| Or -> "or"
| E Or -> "or"
| E phrase -> classify_phrase phrase
| SimpleLexArg("się",QUB) -> "się"
(* | SimpleLexArg _ -> "lex" *)
(* | LexArg _ -> "lex" *)
| SimpleLexArg _ as phrase -> ENIAMwalStringOf.phrase phrase
| LexArg _ as phrase -> ENIAMwalStringOf.phrase phrase
| phrase -> print_endline ("classify_phrase: " ^ ENIAMwalStringOf.phrase phrase); "other"
let classify_position pos p =
let l = (*StringSet.to_list*) (Xlist.fold p.morfs StringSet.empty (fun set morf ->
StringSet.add set ((*classify_phrase*)ENIAMwalStringOf.phrase morf))) in
(* match l with
[] -> "empty"
| [c] -> c
(* | ["np(gen)"; "np(acc)"] -> "np(str)"
| ["np(gen)"; "infp"] -> "np(gen)-infp"
| ["np(acc)"; "infp"] -> "np(acc)-infp" *)
| _ -> let c = String.concat " " l in if pos="adv" then print_endline c; c *)
l*)
module OrderedPhrase = struct
type t = phrase
let compare = compare
end
module PhraseSet = Xset.Make(OrderedPhrase)
let remove_adjuncts_schema pos lemma schema =
let simplify_position_fun = match pos with
"verb" -> simplify_position_verb
| "noun" -> simplify_position_noun
| "adj" -> simplify_position_adj
| "adv" -> simplify_position_adv
| _ -> (fun _ l x -> x :: l) in
List.flatten (Xlist.map schema (fun p ->
let morfs = Xlist.fold p.morfs [] (simplify_position_fun (String.concat " " p.mode)) in
if morfs = [] then [] else [PhraseSet.of_list morfs]))
(* let schema2 = List.flatten (Xlist.map schema1 (fun p ->
let p = {p with morfs = Xlist.fold p.morfs [] (simplify_position_fun (String.concat " " p.mode))} in
let c = classify_position pos p in
if StringSet.is_empty c (*"empty"*) then [] else [c,[p]])) in
(* let sum = Xlist.fold schema2 StringSet.empty (fun set (c,p) -> StringSet.union set c) in
let n = Xlist.fold schema2 0 (fun n (c,p) -> n + StringSet.size c) in
if StringSet.size sum <> n (*&& pos = "noun"*) then (*Printf.printf "%s %s %s\n" pos lemma (ENIAMwalStringOf.schema schema1);*)
Printf.printf "%s %s %s\n" pos lemma (String.concat "+" (Xlist.map schema2 (fun (c,p) -> String.concat "," p.mode ^ "{" ^ String.concat ";" (StringSet.to_list c) ^ "}"))); *)
(* let set = Xlist.fold schema2 StringSet.empty (fun set (c,_) ->
StringSet.add set c) in
(* if StringSet.mem set "np(acc)" && StringSet.mem set "infp" then
Printf.printf "%s %s %s\n" pos lemma (ENIAMwalStringOf.schema schema1); *)
let schema2 = if StringSet.mem set "np(gen)" && StringSet.mem set "np(acc)" then schema2 else
Xlist.map schema2 (function
"np(gen)",p -> "np(str)",p
| "np(acc)",p -> "np(str)",p
| c,p -> c,p) in*)
schema2 *)
let is_disjunctive schema =
let sum = Xlist.fold schema PhraseSet.empty (fun set morfs -> PhraseSet.union set morfs) in
let n = Xlist.fold schema 0 (fun n morfs -> n + PhraseSet.size morfs) in
PhraseSet.size sum = n
let rec find_overlapping morfs rev = function
morfs2 :: schema ->
if PhraseSet.is_empty (PhraseSet.intersection morfs morfs2) then find_overlapping morfs (morfs2 :: rev) schema
else morfs2, List.rev rev @ schema
| [] -> raise Not_found
let rec merge_schemata_rec cont = function
[] -> cont
| [schema] -> schema :: cont
| schema1 :: schema2 :: schemata ->
let sum_schema,diff_schema = Xlist.fold schema1 ([],schema2) (fun (sum_schema,diff_schema) morfs ->
try
let morfs2,diff_schema = find_overlapping morfs [] diff_schema in
(PhraseSet.union morfs morfs2) :: sum_schema,diff_schema
with Not_found -> morfs :: sum_schema,diff_schema) in
let schema = sum_schema @ diff_schema in
if is_disjunctive schema then ((*print_endline "A";*) merge_schemata_rec cont (schema :: schemata))
else ((*print_endline "B";*) merge_schemata_rec (schema :: cont) schemata)
let rec merge_schemata schemata =
let cont,schemata = Xlist.fold schemata ([],[]) (fun (cont,schemata) schema ->
if is_disjunctive schema then cont, schema :: schemata else ((*print_endline "C";*) schema :: cont, schemata)) in
merge_schemata_rec cont schemata
open ENIAM_LCGlexiconTypes
let nie_vebs = StringSet.of_list ["fin";"bedzie";"praet";"winien";"impt";
"imps";"pred";"inf";"pcon";"pant"]
let imp_aux = StringSet.of_list ["niech";"niechaj";"niechże";"niechajże"]
let rec check_selector_lex_constraints lexemes pos = function
[] -> true
| (Negation,Eq,["neg"]) :: selectors ->
if not (StringSet.mem lexemes "nie") && (StringSet.mem nie_vebs pos) then false
else check_selector_lex_constraints lexemes pos selectors
| (Mood,Eq,["conditional"]) :: selectors ->
if not (StringSet.mem lexemes "by") && (pos = "praet" || pos = "winien") then false
else check_selector_lex_constraints lexemes pos selectors
| (Mood,Eq,["imperative"]) :: selectors ->
if StringSet.is_empty (StringSet.intersection lexemes imp_aux) && pos = "fin" then false
else check_selector_lex_constraints lexemes pos selectors
| _ :: selectors -> check_selector_lex_constraints lexemes pos selectors
module OrderedSelector = struct
type t = (selector * selector_relation * string list) list
let compare = compare
end
module SelectorMap = Xmap.Make(OrderedSelector)
let simplify_schemata lexemes pos pos2 lemma schemata =
(* Xlist.iter schemata (fun (selectors,schema) -> if pos2="verb" then Printf.printf "A %s %s [%s] %s\n" pos lemma (ENIAMcategoriesPL.string_of_selectors selectors) (ENIAMwalStringOf.schema schema)); *)
let map = Xlist.fold schemata SelectorMap.empty (fun map (selectors,schema) ->
SelectorMap.add_inc map selectors [schema] (fun l -> schema :: l)) in
let map = SelectorMap.fold map SelectorMap.empty (fun map selectors schemata ->
if check_selector_lex_constraints lexemes pos selectors then SelectorMap.add map selectors schemata else map) in
let schemata = SelectorMap.fold map [] (fun new_schemata selectors schemata ->
(selectors,Xlist.map schemata (remove_adjuncts_schema pos2 lemma)) :: new_schemata) in
(* Xlist.iter schemata (fun (_,schemata) ->
Xlist.iter schemata (fun schema -> if pos2="verb" then Printf.printf "%s %s %s\n" pos lemma (ENIAMwalStringOf.schema schema))); *)
(* Xlist.iter schemata (fun (selectors,schemata) -> Xlist.iter schemata (fun schema ->
if pos2="verb" then Printf.printf "B %s %s [%s] %s\n" pos lemma (ENIAMcategoriesPL.string_of_selectors selectors) (String.concat "+" (Xlist.map schema (fun morfs ->
"{" ^ String.concat ";" (PhraseSet.fold morfs [] (fun l m -> ENIAMwalStringOf.phrase m :: l)) ^ "}"))))); *)
let schemata = List.flatten (Xlist.map schemata (fun (selectors,schemata) ->
Xlist.map (merge_schemata schemata) (fun schema -> selectors,Xlist.map schema PhraseSet.to_list))) in
(* Xlist.iter schemata (fun (selectors,schema) ->
if pos2="verb" then Printf.printf "C %s %s [%s] %s\n" pos lemma (ENIAMcategoriesPL.string_of_selectors selectors) (String.concat "+" (Xlist.map schema (fun morfs ->
"{" ^ String.concat ";" (PhraseSet.fold morfs [] (fun l m -> ENIAMwalStringOf.phrase m :: l)) ^ "}")))); *)
schemata
let add_adjuncts preps compreps compars pos2 (selectors,cat,(*has_context,*)local_schema,schema,distant_schema) =
let compreps = Xlist.rev_map compreps ENIAMwalRenderer.render_comprep in
let prepnps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_prepnp prep cases) in
let prepadjps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_prepadjp prep cases) in
let compars = Xlist.rev_map compars ENIAMwalRenderer.render_compar in
match pos2 with
"verb" -> [selectors,cat,(*has_context,*)local_schema,schema @ ENIAMwalRenderer.verb_adjuncts_simp @ prepnps @ prepadjps @ compreps @ compars,distant_schema]
| "noun" -> [
[Nsyn,Eq,["proper"]] @ selectors,cat,(*has_context,*)local_schema,ENIAMwalRenderer.proper_noun_adjuncts_simp @ prepnps @ compreps @ compars,distant_schema;
[Nsyn,Eq,["common"];Nsem,Eq,["measure"]] @ selectors,cat,(*has_context,*)local_schema,ENIAMwalRenderer.measure_noun_adjuncts_simp @ prepnps @ compreps @ compars,distant_schema;
[Nsyn,Eq,["common"];Nsem,Neq,["measure"]] @ selectors,cat,(*has_context,*)local_schema,ENIAMwalRenderer.common_noun_adjuncts_simp @ prepnps @ compreps @ compars,distant_schema]
| "adj" -> [selectors,cat,(*has_context,*)local_schema,schema @ ENIAMwalRenderer.adj_adjuncts_simp @ compars,distant_schema]
| "adv" -> [selectors,cat,(*has_context,*)local_schema,schema @ ENIAMwalRenderer.adv_adjuncts_simp @ compars,distant_schema]
| _ -> []
open ENIAMlexSemanticsTypes
let add_subj_cr cr positions =
Xlist.map positions (fun p ->
if p.gf = SUBJ then {p with cr=cr :: p.cr} else p)
let add_connected_adjuncts preps compreps compars pos2 frame =
let compreps = Xlist.rev_map compreps ENIAMwalRenderer.render_connected_comprep in
let prepnps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_connected_prepnp prep cases) in
let prepadjps = Xlist.rev_map preps (fun (prep,cases) -> ENIAMwalRenderer.render_connected_prepadjp prep cases) in
let compars = Xlist.rev_map compars ENIAMwalRenderer.render_connected_compar in
match pos2 with
"verb" -> [{frame with positions=(add_subj_cr "3" frame.positions) @ ENIAMwalRenderer.verb_connected_adjuncts_simp @ prepnps @ prepadjps @ compreps @ compars}]
| "noun" -> [
{frame with selectors=[Nsyn,Eq,["proper"]] @ frame.selectors; positions=ENIAMwalRenderer.proper_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars};
{frame with selectors=[Nsyn,Eq,["common"];Nsem,Eq,["measure"]] @ frame.selectors; positions=ENIAMwalRenderer.measure_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars};
{frame with selectors=[Nsyn,Eq,["common"];Nsem,Neq,["measure"]] @ frame.selectors; positions=frame.positions @ ENIAMwalRenderer.common_noun_connected_adjuncts_simp @ prepnps @ compreps @ compars}]
| "adj" -> [{frame with positions=frame.positions @ ENIAMwalRenderer.adj_connected_adjuncts_simp @ compars}]
| "adv" -> [{frame with positions=frame.positions @ ENIAMwalRenderer.adv_connected_adjuncts_simp @ compars}]
| _ -> []
(* let _ =
let schemata,entries = ENIAMvalence.prepare_all_valence ENIAMwalParser.phrases ENIAMwalParser.schemata ENIAMwalParser.entries in
let _ = Entries.map2 schemata (fun pos lemma schemata -> simplify_schemata pos (ENIAMvalence.simplify_pos pos) lemma schemata) in
() *)
(*
let default_frames = Xlist.fold [ (* FIXME: poprawić domyślne ramki po ustaleniu adjunctów *)
"verb",(ReflEmpty,Domyslny,NegationUndef,PredNA,AspectUndef,"subj{np(str)}+obj{np(str)}"); (* FIXME: dodać ramkę z refl *)
"noun",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"{possp}+{adjp(agr)}");
"adj",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"");
"adv",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"");
"empty",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"");
"date",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"{null;lex(np(gen),sg,'rok',natr)}");
"date2",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"{null;lex(np(gen),sg,'rok',atr1({adjp(agr)}))}"); (* FIXME: wskazać możliwe podrzędniki *)
"day",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,""
(*"{lex(np(gen),sg,XOR('styczeń','luty','marzec','kwiecień','maj','czerwiec','lipiec','sierpień','wrzesień','październik','litopad','grudzień'),atr1({np(gen)}))}"*)); (* FIXME: wskazać możliwe podrzędniki *)
"hour",(ReflEmpty,Domyslny,NegationNA,PredNA,AspectNA,"{null;lex(advp(temp),pos,'rano',natr)}");
] StringMap.empty (fun map (k,(refl,opinion,negation,pred,aspect,schema)) ->
StringMap.add map k (Frame(DefaultAtrs([],refl,opinion,negation,pred,aspect),prepare_schema expands subtypes equivs schema)))
*)