ENIAM_LCGlexicon.ml
13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
(*
* ENIAM_LCGlexicon is a library that provides LCG lexicon for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGtypes
open ENIAM_LCGlexiconTypes
open ENIAMcategoriesPL
(* [find_selector s selectors] looks for the first entry of [selectors] whose
   selector equals [s] and returns the first element of its value list.
   Fails with "find_selector 1" when that entry does not use the [Eq] relation
   or has an empty value list, and with "find_selector 2" when no entry for
   [s] exists. *)
let rec find_selector s selectors =
  match selectors with
  | [] -> failwith "find_selector 2"
  | (name, rel, values) :: rest ->
    if name <> s then find_selector s rest
    else
      (match rel, values with
       | Eq, first :: _ -> first
       | _ -> failwith "find_selector 1")
(* [get_syntax rev rule] removes the first [Syntax] item from [rule] and
   returns its payload together with the remaining items in their original
   order.  [rev] accumulates the items already skipped, in reverse order;
   callers start with an empty list.
   Fails with "get_syntax" when [rule] has no [Syntax] item. *)
let rec get_syntax rev rule =
  match rule with
  | [] -> failwith "get_syntax"
  | Syntax found :: rest -> (found, List.rev_append rev rest)
  | other :: rest -> get_syntax (other :: rev) rest
(* [get_quant rev rule] removes the first [Quant] item from [rule] and returns
   its payload together with the remaining items in their original order.
   [rev] accumulates the items already skipped, in reverse order.
   When [rule] has no [Quant] item the payload is the empty list and the rule
   is returned unchanged. *)
let rec get_quant rev rule =
  match rule with
  | [] -> ([], List.rev rev)
  | Quant found :: rest -> (found, List.rev_append rev rest)
  | other :: rest -> get_quant (other :: rev) rest
(* [get_bracket rev rule] reports whether [rule] contains a [Bracket] item.
   Returns [true] and the rule without its first [Bracket] item, or [false]
   and the rule unchanged.  [rev] accumulates the items already skipped, in
   reverse order. *)
let rec get_bracket rev rule =
  match rule with
  | [] -> (false, List.rev rev)
  | Bracket :: rest -> (true, List.rev_append rev rest)
  | other :: rest -> get_bracket (other :: rev) rest
(* [get_raised rev rule] removes the first [Raised] item from [rule] and
   returns its payload together with the remaining items in their original
   order.  [rev] accumulates the items already skipped, in reverse order.
   Raises [Not_found] when [rule] has no [Raised] item. *)
let rec get_raised rev rule =
  match rule with
  | [] -> raise Not_found
  | Raised found :: rest -> (found, List.rev_append rev rest)
  | other :: rest -> get_raised (other :: rev) rest
(* [merge_quant pos_quants quants] combines the default quantifiers
   [pos_quants] with the explicit ones in [quants].
   The result lists every selector of [pos_quants] in its original order,
   with its value replaced by the one from [quants] when present, followed by
   the entries of [quants] whose selector does not occur in [pos_quants]
   (in the iteration order of [SelectorMap.fold]). *)
let merge_quant pos_quants quants =
  let overrides =
    Xlist.fold quants SelectorMap.empty (fun acc (sel, value) ->
      SelectorMap.add acc sel value) in
  (* Emit one default selector, taking the overriding value if there is one
     and removing it from the set of overrides still to be emitted. *)
  let pick (acc, pending) (sel, default) =
    if SelectorMap.mem pending sel then
      ((sel, SelectorMap.find pending sel) :: acc, SelectorMap.remove pending sel)
    else
      ((sel, default) :: acc, pending) in
  let merged_rev, leftover = Xlist.fold pos_quants ([], overrides) pick in
  let all_rev =
    SelectorMap.fold leftover merged_rev (fun acc sel value ->
      (sel, value) :: acc) in
  List.rev all_rev
(* [assign_quantifiers (selectors, rule, weight)] splits a lexicon entry into
   its bracket flag, quantifier list and syntax.
   The value of the [Pos] selector is looked up in [pos_categories]; each
   category found there gets the default value [Top], which explicit [Quant]
   values from the rule override (see [merge_quant]).
   Returns [selectors, (bracket, quant, syntax), (rest, weight)] where [rest]
   is the rule without its [Syntax], [Quant] and [Bracket] items.
   Fails with "assign_quantifiers: <pos>" when the part of speech is unknown;
   failures of [find_selector] and [get_syntax] propagate. *)
let assign_quantifiers (selectors,rule,weight) =
  let pos = find_selector Pos selectors in
  let default_quant =
    let names =
      try StringMap.find pos_categories pos
      with Not_found -> failwith ("assign_quantifiers: " ^ pos) in
    Xlist.map names (fun name -> (name, Top)) in
  let syntax, rest = get_syntax [] rule in
  let explicit_quant, rest = get_quant [] rest in
  let bracket, rest = get_bracket [] rest in
  let quant = merge_quant default_quant explicit_quant in
  (selectors, (bracket, quant, syntax), (rest, weight))
(* [assign_semantics (selectors, (bracket, quant, syntax), (rule, weight))]
   replaces the leftover rule items by a semantics description.
   When [rule] contains a [Raised] item the result is [RaisedSem] built from
   the quantified selectors and the payload of that item; the item has to be
   the only one left, otherwise the function fails with "assign_semantics".
   When there is no [Raised] item the result is [BasicSem] built from the
   quantified selectors; in that case the remaining items are not inspected. *)
let assign_semantics (selectors,(bracket,quant,syntax),(rule,weight)) =
  let quant_names = Xlist.map quant fst in
  let raised_opt = try Some (get_raised [] rule) with Not_found -> None in
  let semantics =
    match raised_opt with
    | None -> BasicSem quant_names
    | Some (raised, []) -> RaisedSem (quant_names, raised)
    | Some (_, _ :: _) -> failwith "assign_semantics" in
  (selectors, (bracket, quant, syntax), (semantics, weight))
(* [extract_category pat rev selectors] finds the first triple of [selectors]
   whose first component equals [pat] and returns its relation, its value and
   the list without that triple (original order preserved).  [rev]
   accumulates the triples already skipped, in reverse order; callers start
   with an empty list.
   Raises [Not_found] when no triple matches. *)
let rec extract_category pat rev selectors =
  match selectors with
  | [] -> raise Not_found
  | ((cat, rel, v) as entry) :: rest ->
    if cat <> pat then extract_category pat (entry :: rev) rest
    else (rel, v, List.rev_append rev rest)
(* [dict_of_grammar grammar] indexes the grammar entries by part of speech.
   The result maps every value of an entry's [Pos] selector to a pair:
   a map from lemma to the rules restricted to that lemma, and the list of
   rules that carry no lemma restriction.  The [Pos] and [Lemma] selectors are
   removed from the selectors stored with each rule; new rules are prepended
   to the lists.
   Fails with "dict_of_grammar 1" when an entry has no [Pos] selector and with
   "dict_of_grammar 2" when [Pos] or [Lemma] uses a relation other than [Eq]. *)
let dict_of_grammar grammar =
  let index_entry dict (selectors,(bracket,quant,syntax),semantics) =
    let pos_rel, poss, selectors =
      try extract_category Pos [] selectors
      with Not_found -> failwith "dict_of_grammar 1" in
    (* A missing [Lemma] selector means the rule applies to every lemma. *)
    let lemma_rel, lemmas, selectors =
      try extract_category Lemma [] selectors
      with Not_found -> (Eq, [], selectors) in
    if not (pos_rel = Eq && lemma_rel = Eq) then failwith "dict_of_grammar 2"
    else begin
      let rule = (selectors, (bracket, quant, syntax), semantics) in
      let register dict pos =
        let by_lemma, generic =
          try StringMap.find dict pos
          with Not_found -> (StringMap.empty, []) in
        let entry =
          match lemmas with
          | [] -> (by_lemma, rule :: generic)
          | _ :: _ ->
            let by_lemma =
              Xlist.fold lemmas by_lemma (fun acc lemma ->
                StringMap.add_inc acc lemma [rule] (fun rules -> rule :: rules)) in
            (by_lemma, generic) in
        StringMap.add dict pos entry in
      Xlist.fold poss dict register
    end in
  Xlist.fold grammar StringMap.empty index_entry
(* [make_rules filename] loads the lexicon from [filename] with
   [ENIAM_LCGlexiconParser.load_lexicon], runs [assign_quantifiers] over all
   entries, then [assign_semantics] over all entries, and finally indexes the
   result with [dict_of_grammar]. *)
let make_rules filename =
  (* Order-preserving map expressed through [Xlist.rev_map]. *)
  let apply_all step entries = List.rev (Xlist.rev_map entries step) in
  let raw = ENIAM_LCGlexiconParser.load_lexicon filename in
  let quantified = apply_all assign_quantifiers raw in
  let with_semantics = apply_all assign_semantics quantified in
  dict_of_grammar with_semantics
(* [find_rules rules cats] selects the rules applicable to [cats].
   The rules stored under [cats.pos] are fetched; rules registered for
   [cats.lemma], if any, are placed before the lemma-independent ones.  Each
   candidate is kept only if [apply_selectors cats selectors] succeeds, and is
   returned with the categories produced by that call; candidates for which it
   raises [Not_found] are dropped.  The result is accumulated by prepending,
   so it is in reverse candidate order.
   Fails with "find_rules 1" when no rules exist for [cats.pos]. *)
let find_rules rules cats =
  let lex_rules, generic_rules =
    try StringMap.find rules cats.pos
    with Not_found -> failwith "find_rules 1" in
  let candidates =
    try StringMap.find lex_rules cats.lemma @ generic_rules
    with Not_found -> generic_rules in
  let keep_matching acc (selectors, syntax, semantics) =
    match (try Some (apply_selectors cats selectors) with Not_found -> None) with
    | Some restricted -> (restricted, syntax, semantics) :: acc
    | None -> acc in
  Xlist.fold candidates [] keep_matching
(* [assign_valence valence rules] instantiates the "schema" variable of rules.
   A rule whose syntax has no occurrence of "schema" (according to
   [ENIAM_LCGrenderer.count_avar]) is passed through unchanged.  Any other
   rule is expanded into one rule per element [(selectors, schema)] of
   [valence]: the categories are narrowed with [apply_selectors] and the
   schema is substituted into the syntax; elements for which this raises
   [Not_found] are skipped.  Results are accumulated by prepending. *)
let assign_valence valence rules =
  let expand acc (cats,(bracket,quant,syntax),semantics) =
    if ENIAM_LCGrenderer.count_avar "schema" syntax <= 0 then
      (cats,(bracket,quant,syntax),semantics) :: acc
    else
      let instantiate acc (selectors,schema) =
        try
          let cats = apply_selectors cats selectors in
          let syntax =
            ENIAM_LCGrenderer.substitute_schema "schema" schema syntax in
          (cats,(bracket,quant,syntax),semantics) :: acc
        with Not_found -> acc in
      Xlist.fold valence acc instantiate in
  Xlist.fold rules [] expand
(* Variant labels for the selectors that [get_label] treats specially.
   [get_labels] fills every field with a label obtained from
   [ENIAM_LCGreductions.get_variant_label]. *)
type labels = {
number: string; (* label returned by [get_label] for the [Number] selector *)
case: string; (* label returned by [get_label] for the [Case] selector *)
gender: string; (* label returned by [get_label] for the [Gender] selector *)
person: string; (* label returned by [get_label] for the [Person] selector *)
aspect: string; (* label returned by [get_label] for the [Aspect] selector *)
}
(* [get_label e selector] returns the label stored in [e] for the selectors
   [Number], [Case], [Gender], [Person] and [Aspect].  For any other selector
   it returns the result of a new call to
   [ENIAM_LCGreductions.get_variant_label]. *)
let get_label e selector =
  match selector with
  | Aspect -> e.aspect
  | Person -> e.person
  | Gender -> e.gender
  | Case -> e.case
  | Number -> e.number
  | _ -> ENIAM_LCGreductions.get_variant_label ()
(* [get_labels ()] builds a [labels] record in which every field holds the
   result of its own call to [ENIAM_LCGreductions.get_variant_label ()].
   NOTE(review): OCaml does not specify the evaluation order of record field
   expressions, so if [get_variant_label] is stateful (presumably it hands
   out fresh labels - confirm), which field receives which label depends on
   the compiler.  Keep the record expression as is unless that is intended
   to change. *)
let get_labels () = {
number=ENIAM_LCGreductions.get_variant_label ();
case=ENIAM_LCGreductions.get_variant_label ();
gender=ENIAM_LCGreductions.get_variant_label ();
person=ENIAM_LCGreductions.get_variant_label ();
aspect=ENIAM_LCGreductions.get_variant_label ();
}
(* [make_quantification e rules] wraps the syntax of every rule in its
   quantifiers and in a bracket marker.
   For each [(selector, restriction)] of the rule's quantifier list a
   [WithVar] is built around the syntax, using the selector name from
   [string_of_selector] and the label from [get_label e]; the first
   quantifier of the list ends up outermost.  A restriction equal to [Top] is
   replaced by [ENIAM_LCGrenderer.make_quant_restriction] applied to the
   values [match_selector] returns for the rule's categories.
   The result is finally wrapped in [ENIAM_LCGtypes.Bracket] with both flags
   equal to the rule's bracket flag. *)
let make_quantification e rules =
  let quantify cats body (cat, restriction) =
    let restriction =
      match restriction with
      | Top -> ENIAM_LCGrenderer.make_quant_restriction (match_selector cats cat)
      | _ -> restriction in
    let var_name = string_of_selector cat in
    let label = get_label e cat in
    WithVar (var_name, restriction, label, body) in
  Xlist.map rules (fun (cats,(bracket,quant,syntax),semantics) ->
    (* Fold over the reversed list so that the first quantifier is applied
       last and therefore becomes the outermost one. *)
    let quantified = Xlist.fold (List.rev quant) syntax (quantify cats) in
    (cats, ENIAM_LCGtypes.Bracket (bracket, bracket, quantified), semantics))
(* let translate_negation = function
(Negation:negation) -> ["neg"]
| Aff -> ["aff"]
| NegationUndef -> ["aff";"neg"]
| NegationNA -> []
let translate_aspect = function
(Aspect s:aspect) -> [s]
| AspectUndef -> ["imperf";"perf"]
| AspectNA -> []
let translate_case = function
(Case s:case) -> [s]
| CaseUndef -> all_cases
| _ -> failwith "translate_case"
let translate_nsem = function
Common s -> [s]
| Time -> ["time"]
let define_valence_selectors = function
DefaultAtrs(m,r,o,neg,p,a) -> failwith "apply_valence_selectors"
| EmptyAtrs m -> []
| NounAtrs(m,nsyn,nsem) -> [Nsyn,Eq,[nsyn];Nsem,Eq,translate_nsem nsem]
| AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> [Case,Eq,translate_case c]
| PersAtrs(m,le,neg,mo,t,au,a) -> [Negation,Eq,translate_negation neg;Mood,Eq,[mo];Tense,Eq,[t];Aspect,Eq,translate_aspect a]
| GerAtrs(m,le,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
| NonPersAtrs(m,le,role,role_attr,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
| ComprepAtrs _ -> failwith "apply_valence_selectors" *)
(* FIXME: X arguments *)
(* let render_schema schema =
Xlist.map schema (function
{morfs=[Multi args]} as s -> LCGrenderer.dir_of_dir s.dir, Maybe(Plus(Xlist.map args LCGrenderer.make_arg_phrase))
| s -> LCGrenderer.dir_of_dir s.dir, Plus(Xlist.map s.morfs (LCGrenderer.make_arg []))) *)
(* let assign_valence valence rules =
Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence);
if LCGrenderer.count_avar "schema" syntax > 0 then
Xlist.fold valence l (fun l -> function
Frame(attr,schema) ->
(try
let selectors = define_valence_selectors attr in
let cats = apply_selectors cats selectors in
(cats,(bracket,quant,substitute_schema "schema" (render_schema schema) syntax,semantics),weight) :: l
with Not_found -> l)
| _ -> l)
else (cats,(bracket,quant,syntax,semantics),weight) :: l) *)
(* FIXME: problem with attributes when the order of governors (orig. rzędników) is swapped *)
(* [make_node id orth lemma pos syntax weight cat_list is_raised] builds a
   node from [ENIAM_LCGrenderer.empty_node].
   [orth], [lemma], [pos], [weight] and [id] are stored as given, [args] is
   set to [Dot].  The symbol is produced from [syntax] by
   [ENIAM_LCGrenderer.make_raised_symbol] when [is_raised] is true and by
   [ENIAM_LCGrenderer.make_symbol] otherwise.  The attribute list contains,
   in the order of [cat_list], one [(name, SubstVar variable)] pair for every
   selector that has an attribute in the table below. *)
let make_node id orth lemma pos syntax weight cat_list is_raised =
  (* Attribute name and substitution variable contributed by a selector, or
     [None] when the selector adds no attribute.  Every selector is listed
     explicitly; there is no catch-all case. *)
  let attr_of_selector = function
    | Cat -> Some ("CAT", "cat")
    | Number -> Some ("NUM", "number")
    | Case -> Some ("CASE", "case")
    | Gender -> Some ("GEND", "gender")
    | Person -> Some ("PERS", "person")
    | Grad -> Some ("GRAD", "grad")
    | Acm -> Some ("ACM", "acm")
    | Aspect -> Some ("ASPECT", "aspect")
    | Negation -> Some ("NEGATION", "negation")
    | Mood -> Some ("MOOD", "mood")
    | Tense -> Some ("TENSE", "tense")
    | Nsyn -> Some ("NSYN", "nsyn")
    | Nsem -> Some ("NSEM", "nsem")
    | Ctype -> Some ("CTYPE", "ctype")
    | Lemma | Pos | Pos2 | Praep
    | Inumber | Igender | Iperson | Nperson | Plemma
    | Unumber | Ucase | Ugender | Uperson -> None in
  let attrs_rev =
    Xlist.fold cat_list [] (fun acc selector ->
      match attr_of_selector selector with
      | Some (name, var) -> (name, SubstVar var) :: acc
      | None -> acc) in
  let symbol =
    match is_raised with
    | true -> ENIAM_LCGrenderer.make_raised_symbol syntax
    | false -> ENIAM_LCGrenderer.make_symbol syntax in
  {ENIAM_LCGrenderer.empty_node with
    orth; lemma; pos; symbol; weight; id;
    attrs = List.rev attrs_rev; args = Dot}
(* [make_term id orth rules] turns every rule [(cats, syntax, (semantics,
   weight))] into the pair returned by [ENIAM_LCGrenderer.simplify] for the
   syntax and the term built for it.
   Before each rule is processed the renderer state is reinitialised with
   [reset_variable_names] and [add_variable_numbers].
   [BasicSem] yields a term from [ENIAM_LCGrenderer.make_term] on a single
   node; [RaisedSem] yields a term from [ENIAM_LCGrenderer.make_raised_term]
   on a raised inner node and a non-raised outer node.  The rule weight is
   used for the nodes as is.
   Fails with "make_term: ni" for any other kind of semantics. *)
let make_term id orth rules =
  let node_for cats syntax weight cat_list is_raised =
    make_node id orth cats.lemma cats.pos syntax weight cat_list is_raised in
  let build (cats, syntax, (semantics, weight)) =
    ENIAM_LCGrenderer.reset_variable_names ();
    ENIAM_LCGrenderer.add_variable_numbers ();
    let term =
      match semantics with
      | BasicSem cat_list ->
        let node = node_for cats syntax weight cat_list false in
        ENIAM_LCGrenderer.make_term node syntax
      | RaisedSem (cat_list, outer_cat_list) ->
        (* FIXME: which attributes should the outer node have (in particular,
           what weight)? *)
        let node = node_for cats syntax weight cat_list true in
        let outer_node = node_for cats syntax weight outer_cat_list false in
        ENIAM_LCGrenderer.make_raised_term node outer_node syntax
      | _ -> failwith "make_term: ni" in
    ENIAM_LCGrenderer.simplify (syntax, term) in
  Xlist.map rules build
(*cats,bracket,quant,syntax,Dot*)
(* [create_entries rules id orth cats valence] produces the lexical entries
   for one token given its alternative category records [cats].
   A record with part of speech "interp" and lemma "<clause>" or "</clause>"
   contributes a single [BracketSet] entry ([Forward] and [Backward]
   respectively) with [Dot] as its term.  For every other record a new set of
   labels is drawn with [get_labels] and the entries are obtained by running
   [find_rules], [assign_valence], [make_quantification] and [make_term] in
   that order.  The entries of later records are placed before those of
   earlier ones. *)
let create_entries rules id orth cats valence =
  let add_entries acc cats =
    match cats.pos, cats.lemma with
    | "interp", "<clause>" -> (BracketSet Forward, Dot) :: acc
    | "interp", "</clause>" -> (BracketSet Backward, Dot) :: acc
    | _ ->
      (* Labels are drawn before the rule lookup, as the lookup may fail. *)
      let e = get_labels () in
      let entries =
        make_term id orth
          (make_quantification e
             (assign_valence valence (find_rules rules cats))) in
      entries @ acc in
  Xlist.fold cats [] add_entries
(*
(* FIXME: fix this and add a testing module *)
module OrderedIntInt = struct
type t = int * int
let compare = compare
end
module IntIntSet = Xset.Make(OrderedIntInt)
let create (paths,last) tokens lex_sems =
(* uni_weight := 0.; *)
let chart = LCGchart.make last in
let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
let token = ExtArray.get tokens id in
let lex_sem = ExtArray.get lex_sems id in
(* if t.weight < -0.9 || Xlist.mem t.attrs "notvalidated proper" || Xlist.mem t.attrs "lemmatized as lowercase" then chart else *)
let chart = LCGchart.add_inc chart lnode rnode (Tensor[Atom ("[" ^ token.ENIAMtokenizerTypes.orth ^ "]")], Dot) 0 in
LCGchart.add_inc_list chart lnode rnode (create_entries (*tokens lex_sems*) id (token:ENIAMtokenizerTypes.token_record) lex_sem (*false*)) 0) in
let set = Xlist.fold paths IntIntSet.empty (fun set (_,lnode,rnode) -> IntIntSet.add set (lnode,rnode)) in
let chart = IntIntSet.fold set chart (fun chart (i,j) -> LCGchart.make_unique chart i j) in
chart
*)