ENIAM_LCGrendererPL.ml
12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
(*
* ENIAM_LCGgrammarPL is a library that provides LCG lexicon form Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGgrammarPLtypes
open ENIAM_LCGtypes
let rec extract_category pat rev = function
(cat,rel,v) :: l -> if cat = pat then rel,v,(List.rev rev @ l) else extract_category pat ((cat,rel,v) :: rev) l
| [] -> raise Not_found
let dict_of_grammar grammar =
print_endline "dict_of_grammar";
Xlist.fold grammar StringMap.empty (fun dict (selectors,(bracket,quant,syntax,semantics),weight) ->
let pos_rel,poss,selectors = try extract_category Pos [] selectors with Not_found -> failwith "dict_of_grammar 1" in
let lemma_rel,lemmas,selectors = try extract_category Lemma [] selectors with Not_found -> Eq,[],selectors in
if pos_rel <> Eq || lemma_rel <> Eq then failwith "dict_of_grammar 2" else
let rule = selectors,(bracket,quant,syntax,semantics),weight in
Xlist.fold poss dict (fun dict pos ->
let dict2,l = try StringMap.find dict pos with Not_found -> StringMap.empty,[] in
let dict2,l =
if lemmas = [] then dict2,rule :: l else
Xlist.fold lemmas dict2 (fun dict2 lemma ->
StringMap.add_inc dict2 lemma [rule] (fun l -> rule :: l)),l in
StringMap.add dict pos (dict2,l)))
let rules = dict_of_grammar ENIAM_LCGlexiconPL.grammar
let match_selector cats = function
Lemma -> [cats.lemma]
(* | NewLemma -> [] *)
| Number -> cats.numbers
| Case -> cats.cases
| Gender -> cats.genders
| Person -> cats.persons
| Grad -> cats.grads
| Praep -> cats.praeps
| Acm -> cats.acms
| Aspect -> cats.aspects
| Negation -> cats.negations
| Mood -> cats.moods
| Tense -> cats.tenses
| Nsyn -> cats.nsyn
| Nsem -> cats.nsem
| c -> failwith ("match_selector: " ^ string_of_cat c)
let set_selector cats vals = function
Number -> {cats with numbers=vals}
| Case -> {cats with cases=vals}
| Gender -> {cats with genders=vals}
| Person -> {cats with persons=vals}
| Grad -> {cats with grads=vals}
| Praep -> {cats with praeps=vals}
| Acm -> {cats with acms=vals}
| Aspect -> {cats with aspects=vals}
| Negation -> {cats with negations=vals}
| Mood -> {cats with moods=vals}
| Tense -> {cats with tenses=vals}
| Nsyn -> {cats with nsyn=vals}
| Nsem -> {cats with nsem=vals}
| c -> failwith ("set_selector: " ^ string_of_cat c)
let rec apply_selectors cats = function
[] -> cats
| (sel,Eq,vals) :: l ->
let vals = StringSet.intersection (StringSet.of_list (match_selector cats sel)) (StringSet.of_list vals) in
if StringSet.is_empty vals then raise Not_found else
apply_selectors (set_selector cats (StringSet.to_list vals) sel) l
| (sel,Neq,vals) :: l ->
let vals = StringSet.difference (StringSet.of_list (match_selector cats sel)) (StringSet.of_list vals) in
if StringSet.is_empty vals then raise Not_found else
apply_selectors (set_selector cats (StringSet.to_list vals) sel) l
(* let translate_negation = function
(Negation:negation) -> ["neg"]
| Aff -> ["aff"]
| NegationUndef -> ["aff";"neg"]
| NegationNA -> []
let translate_aspect = function
(Aspect s:aspect) -> [s]
| AspectUndef -> ["imperf";"perf"]
| AspectNA -> []
let translate_case = function
(Case s:case) -> [s]
| CaseUndef -> all_cases
| _ -> failwith "translate_case"
let translate_nsem = function
Common s -> [s]
| Time -> ["time"]
let define_valence_selectors = function
DefaultAtrs(m,r,o,neg,p,a) -> failwith "apply_valence_selectors"
| EmptyAtrs m -> []
| NounAtrs(m,nsyn,nsem) -> [Nsyn,Eq,[nsyn];Nsem,Eq,translate_nsem nsem]
| AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> [Case,Eq,translate_case c]
| PersAtrs(m,le,neg,mo,t,au,a) -> [Negation,Eq,translate_negation neg;Mood,Eq,[mo];Tense,Eq,[t];Aspect,Eq,translate_aspect a]
| GerAtrs(m,le,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
| NonPersAtrs(m,le,role,role_attr,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
| ComprepAtrs _ -> failwith "apply_valence_selectors" *)
let find_rules rules cats =
let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith "find_rules 1" in
let rules = try StringMap.find lex_rules cats.lemma @ rules with Not_found -> rules in
Xlist.fold rules [] (fun rules (selectors,(bracket,quant,syntax,semantics),weight) ->
try
let cats = apply_selectors cats selectors in
(cats,(bracket,quant,syntax,semantics),weight) :: rules
with Not_found -> rules)
(* let render_schema schema =
Xlist.map schema (function
{morfs=[Multi args]} as s -> LCGrenderer.dir_of_dir s.dir, Maybe(Plus(Xlist.map args LCGrenderer.make_arg_phrase))
| s -> LCGrenderer.dir_of_dir s.dir, Plus(Xlist.map s.morfs (LCGrenderer.make_arg []))) *)
(* FIXME: pomijam NewLemma *)
(* let assign_valence valence rules =
Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence);
if LCGrenderer.count_avar "schema" syntax > 0 then
Xlist.fold valence l (fun l -> function
Frame(attr,schema) ->
(try
let selectors = define_valence_selectors attr in
let cats = apply_selectors cats selectors in
(cats,(bracket,quant,substitute_schema "schema" (render_schema schema) syntax,semantics),weight) :: l
with Not_found -> l)
| _ -> l)
else (cats,(bracket,quant,syntax,semantics),weight) :: l) *)
let assign_valence valence rules =
Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
(* Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence); *)
if ENIAM_LCGrenderer.count_avar "schema" syntax > 0 then
Xlist.fold valence l (fun l (selectors,schema) ->
try
let cats = apply_selectors cats selectors in
(cats,(bracket,quant,ENIAM_LCGrenderer.substitute_schema "schema" schema syntax,semantics),weight) :: l
with Not_found -> l)
else (cats,(bracket,quant,syntax,semantics),weight) :: l)
(* FIXME: ustawienie wartości symbol i arg_symbol *)
let make_node id lemma cat weight cat_list =
let attrs = Xlist.fold cat_list(*Xlist.rev_map quant fst*) [] (fun attrs -> function
| Lemma -> attrs
| Number -> ("NUM",SubstVar "number") :: attrs
| Case -> ("CASE",SubstVar "case") :: attrs
| Gender -> ("GEND",SubstVar "gender") :: attrs
| Person -> ("PERS",SubstVar "person") :: attrs
| Grad -> ("GRAD",SubstVar "grad") :: attrs
| Praep -> attrs
| Acm -> ("ACM",SubstVar "acm") :: attrs
| Aspect -> ("ASPECT", SubstVar "aspect") :: attrs
| Negation -> ("NEGATION",SubstVar "negation") :: attrs
| Mood -> ("MOOD", SubstVar "mood") :: attrs
| Tense -> ("TENSE", SubstVar "tense") :: attrs
| Nsyn -> ("NSYN", SubstVar "nsyn") :: attrs
| Nsem -> ("NSEM", SubstVar "nsem") :: attrs
| Ctype -> ("CTYPE", SubstVar "ctype") :: attrs
(* | "lex" -> ("LEX",Val "+") :: attrs *)
| s -> failwith ("make_node: " ^ (string_of_cat s))) in
{ENIAM_LCGrenderer.empty_node with pred=lemma; cat=cat; weight=weight; id=id; attrs=List.rev attrs; args=Dot}
(* FIXME: przenieść niezależne od języka procedury do ENIAM_LCGrenderer *)
let variable_name_ref = ref []
let rec add_variable_name = function
[] -> ["a"]
| "z" :: l -> "a" :: add_variable_name l
| s :: l -> String.make 1 (Char.chr (Char.code (String.get s 0) + 1)) :: l
let get_variable_name () =
variable_name_ref := add_variable_name (!variable_name_ref);
String.concat "" (List.rev (!variable_name_ref))
let rec make_term_arg = function
Tensor l -> let v = get_variable_name () in v, Cut(Var v)
| Plus l -> let v = get_variable_name () in v, Case(Var v,Xlist.map l make_term_arg)
(* | Imp(s,d,t2) -> *)
| One -> get_variable_name (), Dot
| Maybe s ->
let v,arg = make_term_arg s in
let w = get_variable_name () in
w, Fix(Var w,Lambda(v,arg))
| _ -> failwith "make_term_arg"
let add_args node args =
{node with args=Tuple(node.args :: args)}
let rec make_term_imp node = function
| Imp(s,d,t2) ->
let v,arg = make_term_arg t2 in
Lambda(v,make_term_imp (add_args node [arg]) s)
| ImpSet(s,l) ->
let vars,args = List.split (Xlist.map l (fun (_,t) -> make_term_arg t)) in
LambdaSet(vars,make_term_imp (add_args node args) s)
| Tensor l -> Node node
| _ -> failwith "make_term_imp"
let make_term id token rules =
Xlist.map rules (fun (cats,(bracket,quant,syntax,semantics),weight) ->
match semantics with
BasicSem cat_list ->
let node = make_node id cats.lemma cats.pos (weight+.token.ENIAMtokenizerTypes.weight) cat_list in
cats,bracket,quant,syntax,make_term_imp node syntax
| _ -> failwith "make_term: ni")
(*cats,bracket,quant,syntax,Dot*)
type labels = {
number: string;
case: string;
gender: string;
person: string;
aspect: string;
}
let get_label e = function
Number -> e.number
| Case -> e.case
| Gender -> e.gender
| Person -> e.person
| Aspect -> e.aspect
| _ -> ENIAM_LCGreductions.get_variant_label ()
let get_labels () = {
number=ENIAM_LCGreductions.get_variant_label ();
case=ENIAM_LCGreductions.get_variant_label ();
gender=ENIAM_LCGreductions.get_variant_label ();
person=ENIAM_LCGreductions.get_variant_label ();
aspect=ENIAM_LCGreductions.get_variant_label ();
}
let make_quantification e rules =
Xlist.map rules (fun (cats,bracket,quant,syntax,semantics) ->
let syntax,semantics = Xlist.fold (List.rev quant) (syntax,semantics) (fun (syntax,semantics) (cat,t) ->
let t = if t = Top then ENIAM_LCGlexiconPL.make_symbol (match_selector cats cat) else t in
let category = string_of_cat cat in
ENIAM_LCGrenderer.simplify_withvar (WithVar(category,t,get_label e cat,syntax), VariantVar(category,semantics))) in
if bracket then Bracket(true,true,syntax),semantics else Bracket(false,false,syntax),semantics)
(* FIXME: poprawić i dodać moduł testujący *)
(*
let create_entries id token lex_sem =
Xlist.fold lex_sem.cats [] (fun l cats ->
variable_name_ref := [];
if cats.pos="interp" && cats.lemma="<clause>" then (BracketSet(Forward),Dot) :: l else
if cats.pos="interp" && cats.lemma="</clause>" then (BracketSet(Backward),Dot) :: l else
let e = get_labels () in
(* print_endline "create_entries 1"; *)
let rules = find_rules rules cats in
(* print_endline "create_entries 2"; *)
let rules = assign_valence lex_sem.very_simple_valence rules in
(* print_endline "create_entries 3"; *)
let rules = make_term id token rules in
(* print_endline "create_entries 4"; *)
let rules = make_quantification e rules in
(* print_endline "create_entries 5"; *)
rules @ l)
module OrderedIntInt = struct
type t = int * int
let compare = compare
end
module IntIntSet = Xset.Make(OrderedIntInt)
let create (paths,last) tokens lex_sems =
(* uni_weight := 0.; *)
let chart = LCGchart.make last in
let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
let token = ExtArray.get tokens id in
let lex_sem = ExtArray.get lex_sems id in
(* if t.weight < -0.9 || Xlist.mem t.attrs "notvalidated proper" || Xlist.mem t.attrs "lemmatized as lowercase" then chart else *)
let chart = LCGchart.add_inc chart lnode rnode (Tensor[Atom ("[" ^ token.ENIAMtokenizerTypes.orth ^ "]")], Dot) 0 in
LCGchart.add_inc_list chart lnode rnode (create_entries (*tokens lex_sems*) id (token:ENIAMtokenizerTypes.token_record) lex_sem (*false*)) 0) in
let set = Xlist.fold paths IntIntSet.empty (fun set (_,lnode,rnode) -> IntIntSet.add set (lnode,rnode)) in
let chart = IntIntSet.fold set chart (fun chart (i,j) -> LCGchart.make_unique chart i j) in
chart
*)