ENIAM_LCGlexiconParser.ml
15.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
(*
* ENIAM_LCGlexicon is a library that provides LCG lexicon form Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGtypes
open ENIAM_LCGlexiconTypes
open ENIAMcategoriesPL
let rec get_first n = function
[] -> []
| s :: l -> if n = 0 then [] else s :: (get_first (n-1) l)
let print_prefix n l =
print_endline (String.concat " " (get_first n l))
let remove_comments line =
try
let n = String.index line '#' in
String.sub line 0 n
with Not_found -> line
let rec parse_phrase_names_rec rev = function
"@WEIGHTS" :: tokens -> List.rev rev, "@WEIGHTS" :: tokens
| "@LEXICON" :: tokens -> List.rev rev, "@LEXICON" :: tokens
| t :: tokens -> parse_phrase_names_rec (t :: rev) tokens
| [] -> failwith "parse_phrase_names_rec: unexpexted end of input"
let parse_phrase_names = function
"@PHRASE_NAMES" :: tokens -> parse_phrase_names_rec [] tokens
| s :: _ -> failwith ("parse_phrase_names: '@PHRASE_NAMES' expected while '" ^ s ^ "' found")
| [] -> failwith "parse_phrase_names: unexpexted end of input"
let rec parse_weights_rec weights = function
"@LEXICON" :: tokens -> weights, "@LEXICON" :: tokens
| w :: "=" :: n :: tokens -> parse_weights_rec (StringMap.add weights w (float_of_string n)) tokens
| s :: _ -> failwith ("parse_weights_rec: unexpexted token '" ^ s ^ "'")
| [] -> failwith "parse_weights_rec: unexpexted end of input"
let parse_weights = function
"@WEIGHTS" :: tokens -> parse_weights_rec StringMap.empty tokens
| "@LEXICON" :: tokens -> StringMap.empty, "@LEXICON" :: tokens
| s :: _ -> failwith ("parse_weights: '@WEIGHTS' expected while '" ^ s ^ "' found")
| [] -> failwith "parse_weights: unexpexted end of input"
let rec split_semic found rev = function
"lemma" :: "=" :: ";" :: l -> split_semic found (";" :: "=" :: "lemma" :: rev) l
| ";" :: l -> split_semic (List.rev rev :: found) [] l
| s :: l -> split_semic found (s :: rev) l
| [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)
let rec split_colon found rev = function
"lemma" :: "=" :: ":" :: l -> split_colon found (":" :: "=" :: "lemma" :: rev) l
| ":" :: l -> split_colon (List.rev rev :: found) [] l
| s :: l -> split_colon found (s :: rev) l
| [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)
let rec split_comma found rev = function
"lemma" :: "=" :: "," :: l -> split_comma found ("," :: "=" :: "lemma" :: rev) l
| "," :: l -> split_comma (List.rev rev :: found) [] l
| s :: l -> split_comma found (s :: rev) l
| [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)
let rec find_right_bracket rev = function
"]" :: l -> List.rev rev, l
| s :: l -> find_right_bracket (s :: rev) l
| [] -> failwith "find_right_bracket"
let match_selectors = function
s :: l -> (try selector_of_string s,l with _ -> failwith ("match_selectors: " ^ s))
| [] -> failwith "match_selectors: empty"
let match_relation = function
(* cat,"=" :: "=" :: l -> cat,StrictEq,l *)
| cat,"!" :: "=" :: l -> cat,Neq,l
| cat,"=" :: l -> cat,Eq,l
| cat,s :: l -> failwith ("match_relation: " ^ (String.concat " " (s :: l)))
| cat,[] -> failwith "match_relation: empty"
let rec split_mid rev = function
[s] -> List.rev (s :: rev)
| s :: "|" :: l -> split_mid (s :: rev) l
| [] -> failwith "split_mid: empty"
| l -> failwith ("split_mid: " ^ (String.concat " " l))
let rec check_value selector l =
let vals = try SelectorMap.find selector_values selector
with Not_found -> failwith ("check_value: invalid selector " ^ string_of_selector selector) in
if vals = [] then () else
Xlist.iter l (fun s ->
if not (Xlist.mem vals s) then
failwith ("check_value: invalid selector " ^ string_of_selector selector ^ "=" ^ s));
l
let match_value = function
cat,rel,[s] -> cat,rel,[s]
| cat,rel,[] -> failwith "match_value: empty"
| cat,rel,l -> cat,rel, check_value cat (split_mid [] l)
let parse_selectors l =
(* print_endline s; *)
(* let l = Xlist.map (Str.full_split (Str.regexp "|\\|,\\|=\\|!") s) (function
Str.Text s -> s
| Str.Delim s -> s) in *)
let ll = split_comma [] [] l in
let l = Xlist.rev_map ll match_selectors in
let l = Xlist.rev_map l match_relation in
let l = Xlist.rev_map l match_value in
l
let manage_lemmata = function
"lemma" :: "=" :: ":" :: "," :: tokens -> ["lemma";"=";":";","],tokens
| "lemma" :: "=" :: ":" :: s :: "," :: tokens -> ["lemma";"=";":"^s;","],tokens
| "lemma" :: "=" :: "<" :: "/" :: s :: "," :: tokens -> ["lemma";"=";"</"^s;","],tokens
| tokens -> [],tokens
type syntax =
A of string
| B of internal_grammar_symbol
| C of grammar_symbol
| D of direction * grammar_symbol
| E of (direction * grammar_symbol) list
let make_atoms phrase_names =
SelectorMap.fold selector_values (StringSet.of_list phrase_names) (fun atoms _ l ->
Xlist.fold l atoms StringSet.add)
let operators = StringSet.of_list [
"*"; "+"; "/"; "|"; "\\"; "("; ")"; ","; "{"; "}"; "?"]
let find_internal_grammar_symbols atoms = function
| "T" -> B Top
| "1" -> C One
| "schema" -> D(Both,Tensor[AVar "schema"])
| "adjuncts" -> D(Both,Tensor[AVar "adjuncts"])
| s -> if StringSet.mem selector_names s then B (AVar s) else
if StringSet.mem atoms s then B (Atom s) else
if StringSet.mem operators s then A s else
failwith ("find_internal_grammar_symbols: unknown symbol " ^ s)
let rec find_tensor = function
B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: A "*" :: B s8 :: l -> failwith "find_tensor 1"
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: l -> C (Tensor[s1;s2;s3;s4;s5;s6;s7]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: l -> C (Tensor[s1;s2;s3;s4;s5;s6]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: l -> C (Tensor[s1;s2;s3;s4;s5]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: l -> C (Tensor[s1;s2;s3;s4]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: l -> C (Tensor[s1;s2;s3]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: l -> C (Tensor[s1;s2]) :: find_tensor l
| B s1 :: l -> C (Tensor[s1]) :: find_tensor l
| A "*" :: _ -> failwith "find_tensor 2: unexpected '*'"
| t :: l -> t :: find_tensor l
| [] -> []
let rec find_plus = function
C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: A "+" :: C s6 :: A "+" :: C s7 :: l -> failwith "find_plus 1"
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: A "+" :: C s6 :: l -> C (Plus[s1;s2;s3;s4;s5;s6]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: l -> C (Plus[s1;s2;s3;s4;s5]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: l -> C (Plus[s1;s2;s3;s4]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: l -> C (Plus[s1;s2;s3]) :: find_plus l
| C s1 :: A "+" :: C s2 :: l -> C (Plus[s1;s2]) :: find_plus l
| A "+" :: _ -> failwith "find_plus 2: unexpected '+'"
| t :: l -> t :: find_plus l
| [] -> []
let rec find_paren = function
A "(" :: C s :: A ")" :: l -> C s :: find_paren l
| s :: l -> s :: find_paren l
| [] -> []
let rec find_imp = function
| C s1 :: A "/" :: C s2 :: l -> C (Imp(s1,Forward,s2)) :: find_imp l
| C s1 :: A "|" :: C s2 :: l -> C (Imp(s1,Both,s2)) :: find_imp l
| C s1 :: A "\\" :: C s2 :: l -> C (Imp(s1,Backward,s2)) :: find_imp l
| s :: l -> s :: find_imp l
| [] -> []
let rec find_maybe = function
| A "?" :: C s2 :: l -> C (Maybe s2) :: find_maybe l
| A "?" :: _ -> failwith "find_maybe 1: unexpected '?'"
| s :: l -> s :: find_maybe l
| [] -> []
let rec find_mult_imp = function
| A "{" :: A "/" :: C s2 :: l -> A "{" :: D (Forward,s2) :: find_mult_imp l
| A "{" :: A "|" :: C s2 :: l -> A "{" :: D (Both,s2) :: find_mult_imp l
| A "{" :: A "\\" :: C s2 :: l -> A "{" :: D (Backward,s2) :: find_mult_imp l
| A "," :: A "/" :: C s2 :: l -> A "," :: D (Forward,s2) :: find_mult_imp l
| A "," :: A "|" :: C s2 :: l -> A "," :: D (Both,s2) :: find_mult_imp l
| A "," :: A "\\" :: C s2 :: l -> A "," :: D (Backward,s2) :: find_mult_imp l
| A "/" :: _ -> failwith "find_mult_imp 1: unexpected '/'"
| A "|" :: _ -> failwith "find_mult_imp 2: unexpected '|'"
| A "\\" :: _ -> failwith "find_mult_imp 3: unexpected '\\'"
| A "(" :: _ -> failwith "find_mult_imp 4: unexpected '('"
| A ")" :: _ -> failwith "find_mult_imp 5: unexpected ')'"
| s :: l -> s :: find_mult_imp l
| [] -> []
let rec find_mult = function
A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: l -> failwith "find_mult 1"
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "}" :: l -> E[s1,t1;s2,t2] :: find_mult l
| A "{" :: D(s1,t1) :: A "}" :: l -> E[s1,t1] :: find_mult l
| A "{" :: _ -> failwith "find_mult 2: unexpected '{'"
| A "}" :: _ -> failwith "find_mult 3: unexpected '}'"
| A "," :: _ -> failwith "find_mult 4: unexpected ','"
| t :: l -> t :: find_mult l
| [] -> []
let rec apply_mult = function
C s :: E t :: l -> apply_mult (C (ImpSet(s,t)) :: l)
| [C s] -> C s
| _ -> failwith "apply_mult"
let parse_syntax atoms l =
(* print_endline s; *)
(* let l = Xlist.map (Str.full_split (Str.regexp "?\\|}\\|{\\|,\\|*\\|/\\|+\\|)\\|(\\||\\|\\") s) (function
Str.Text s -> s
| Str.Delim s -> s) in *)
let l = List.rev (Xlist.rev_map l (find_internal_grammar_symbols atoms)) in
let l = find_tensor l in
let l = find_plus l in
let l = find_paren l in
let l = find_maybe l in
let l = find_imp l in
let l = find_paren l in
let l = find_imp l in
let l = find_paren l in
let l = find_imp l in
let l = find_paren l in
let l = find_mult_imp l in
let l = find_mult l in
match apply_mult l with
C s -> s
| _ -> failwith "parse_syntax"
let check_quant_range cat l =
let set = StringSet.of_list (
try SelectorMap.find selector_values cat
with Not_found -> failwith ("check_quant_range: " ^ string_of_selector cat)) in
if StringSet.is_empty set then () else
Xlist.iter l (fun v ->
if not (StringSet.mem set v) then
failwith ("check_quant_range: " ^ string_of_selector cat ^ "=" ^ v))
let parse_quant_range = function
_,["0"] -> Zero
| _,["T"] -> Top
| _,["all_numbers"] -> ENIAM_LCGrenderer.make_quant_restriction all_numbers
| _,["all_cases"] -> ENIAM_LCGrenderer.make_quant_restriction all_cases
| _,["all_genders"] -> ENIAM_LCGrenderer.make_quant_restriction all_genders
| _,["all_persons"] -> ENIAM_LCGrenderer.make_quant_restriction all_persons
| cat,l ->
let l = Xstring.split "&" (String.concat "" l) in
check_quant_range cat l;
ENIAM_LCGrenderer.make_quant_restriction l
let parse_quantifiers tokens =
Xlist.map (split_comma [] [] tokens) (function
cat :: "=" :: tokens ->
let cat = selector_of_string cat in
cat, parse_quant_range (cat,tokens)
| t :: _ -> failwith ("parse_quantifiers: unexpected token '" ^ t ^ "'")
| [] -> failwith "parse_quantifiers: no token")
let parse_raised tokens =
Xlist.map (split_comma [] [] tokens) (function
[cat] -> selector_of_string cat
| t :: _ -> failwith ("parse_raised: unexpected token '" ^ t ^ "'")
| [] -> failwith "parse_raised: no token")
let rec find_syntax_end rev = function
("BRACKET" :: _) as tokens -> List.rev rev, tokens
| ("QUANT" :: "[" :: _) as tokens -> List.rev rev, tokens
| ("RAISED" :: "[" :: _) as tokens -> List.rev rev, tokens
| ("SEM" :: "[" :: _) as tokens -> List.rev rev, tokens
| s :: tokens -> find_syntax_end (s :: rev) tokens
| [] -> List.rev rev, []
let parse_sem_term sem_term = String.concat "" sem_term
let rec parse_rule atoms = function
"BRACKET" :: tokens -> Bracket :: parse_rule atoms tokens
| "QUANT" :: "[" :: tokens ->
let quant,tokens = find_right_bracket [] tokens in
Quant(parse_quantifiers quant) :: parse_rule atoms tokens
| "RAISED" :: "[" :: tokens ->
let raised,tokens = find_right_bracket [] tokens in
Raised(parse_raised raised) :: parse_rule atoms tokens
| "SEM" :: "[" :: tokens ->
let sem_term,tokens = find_right_bracket [] tokens in
Sem(parse_sem_term sem_term) :: parse_rule atoms tokens
| [] -> []
| tokens ->
let syntax,tokens = find_syntax_end [] tokens in
(* print_prefix 100 tokens; *)
Syntax(parse_syntax atoms syntax) :: parse_rule atoms tokens
let parse_entry atoms weights tokens =
let prefix,tokens = manage_lemmata tokens in
let selectors, rule, weight =
match split_colon [] [] tokens with
[selectors;rule] -> selectors, rule, 0.
| [selectors;rule;[weight]] -> selectors, rule,
(try StringMap.find weights weight
with Not_found -> failwith ("parse_entry: unknown weight symbol '" ^ weight ^ "'"))
| _ -> failwith ("parse_entry: invalid number of ':' in entry " ^ (String.concat " " tokens)) in
let selectors = parse_selectors (prefix @ selectors) in
let rule = parse_rule atoms rule in
selectors, rule, weight
let parse_lexicon atoms weights = function
"@LEXICON" :: tokens ->
let entries = split_semic [] [] tokens in
List.rev (Xlist.rev_map entries (parse_entry atoms weights))
| s :: _ -> failwith ("parse_lexicon: '@LEXICON' expected while '" ^ s ^ "' found")
| [] -> failwith "parse_lexicon: unexpexted end of input"
let load_lexicon filename =
let lines = File.load_lines filename in
let lines = List.rev (Xlist.rev_map lines remove_comments) in
let tokens = List.flatten (Xlist.rev_map lines (fun line ->
Xlist.rev_map (Str.full_split
(Str.regexp "\\]\\| \\|\t\\|\r\\|\\?\\|:\\|;\\|&\\|!\\|=\\|}\\|{\\|,\\|\\*\\|/\\|\\+\\|)\\|(\\||\\|\\[\\|\\") line) (function
Str.Text s -> s
| Str.Delim s -> s))) in
let tokens = Xlist.fold tokens [] (fun tokens -> function
" " -> tokens
| "\t" -> tokens
| "\r" -> tokens
| t -> t :: tokens) in
let phrase_names,tokens = parse_phrase_names tokens in
let atoms = make_atoms phrase_names in
let weights,tokens = parse_weights tokens in
let lexicon = parse_lexicon atoms weights tokens in
lexicon