ENIAM_LCGlexiconParser.ml
21.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
(*
* ENIAM_LCGlexicon is a library that provides LCG lexicon form Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Xstd
open ENIAM_LCGtypes
open ENIAM_LCGlexiconTypes
open ENIAMcategoriesPL
exception ParseError of string * string * int
let rec get_first n = function
[] -> []
| s :: l -> if n = 0 then [] else s :: (get_first (n-1) l)
let print_prefix n l =
print_endline (String.concat " " (get_first n l))
let remove_comments line =
try
let n = String.index line '#' in
String.sub line 0 n
with Not_found -> line
let rec parse_phrase_names_rec i0 rev = function
(i,"@WEIGHTS") :: tokens -> i, List.rev rev, (i,"@WEIGHTS") :: tokens
| (i,"@LEXICON") :: tokens -> i, List.rev rev, (i,"@LEXICON") :: tokens
| (i,t) :: tokens -> parse_phrase_names_rec i0 ((i,t) :: rev) tokens
| [] -> raise (ParseError("parse_phrase_names_rec", "unexpexted end of input", i0))
let parse_phrase_names i0 = function
(i,"@PHRASE_NAMES") :: tokens -> parse_phrase_names_rec i [] tokens
| (i,s) :: _ -> raise (ParseError("parse_phrase_names", "'@PHRASE_NAMES' expected while '" ^ s ^ "' found", i))
| [] -> raise (ParseError("parse_phrase_names", "unexpexted end of input", i0))
let rec parse_weights_rec i0 weights = function
(i,"@LEXICON") :: tokens -> i, weights, (i,"@LEXICON") :: tokens
| (_,w) :: (_,"=") :: (i,n) :: tokens -> parse_weights_rec i (StringMap.add weights w (float_of_string n)) tokens
| (i,s) :: _ -> raise (ParseError("parse_weights_rec", "unexpexted token '" ^ s ^ "'", i))
| [] -> raise (ParseError("parse_weights_rec", "unexpexted end of input", i0))
let parse_weights i0 = function
(i,"@WEIGHTS") :: tokens -> parse_weights_rec i StringMap.empty tokens
| (i,"@LEXICON") :: tokens -> i, StringMap.empty, (i,"@LEXICON") :: tokens
| (i,s) :: _ -> raise (ParseError("parse_weights", "'@WEIGHTS' expected while '" ^ s ^ "' found", i))
| [] -> raise (ParseError("parse_weights", "unexpexted end of input", i0))
let rec split_semic i0 found rev = function
(i1,"lemma") :: (i2,"=") :: (i3,";") :: l -> split_semic (if rev = [] then i1 else i0) found ((i1,";") :: (i2,"=") :: (i3,"lemma") :: rev) l
| (i,";") :: l -> split_semic i ((i0, List.rev rev) :: found) [] l
| (i,s) :: l -> split_semic (if rev = [] then i else i0) found ((i,s) :: rev) l
| [] -> if rev = [] then List.rev found else List.rev ((i0, List.rev rev) :: found)
let rec split_colon found rev = function
(i1,"lemma") :: (i2,"=") :: (i3,":") :: l -> split_colon found ((i1,":") :: (i2,"=") :: (i3,"lemma") :: rev) l
| (_,":") :: l -> split_colon (List.rev rev :: found) [] l
| (i,s) :: l -> split_colon found ((i,s) :: rev) l
| [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)
let rec split_comma i0 found rev = function
(i1,"lemma") :: (i2,"=") :: (i3,",") :: l -> split_comma (if rev = [] then i1 else i0) found ((i1,",") :: (i2,"=") :: (i3,"lemma") :: rev) l
| (i,",") :: l -> split_comma i ((i0, List.rev rev) :: found) [] l
| (i,s) :: l -> split_comma (if rev = [] then i else i0) found ((i,s) :: rev) l
| [] -> if rev = [] then List.rev found else List.rev ((i0, List.rev rev) :: found)
let catch_selector_of_string i proc s =
try selector_of_string s
with _ -> raise (ParseError(proc, "unknown selector: " ^ s, i))
let match_selectors = function
i0,(i,s) :: l -> i,catch_selector_of_string i "match_selectors" s,l
| i0,[] -> raise (ParseError("match_selectors", "empty", i0))
let match_relation = function
(* cat,"=" :: "=" :: l -> cat,StrictEq,l *)
| i,cat,(_,"!") :: (_,"=") :: l -> i,cat,Neq,l
| i,cat,(_,"=") :: l -> i,cat,Eq,l
| _,cat,(i,s) :: l -> raise (ParseError("match_relation", "relation symbol not found: " ^ String.concat " " (s :: Xlist.map l snd), i))
| i,cat,[] -> raise (ParseError("match_relation", "empty", i))
let rec split_mid i0 rev = function
[i,s] -> List.rev ((i,s) :: rev)
| (i1,s) :: (i2,"|") :: (i3,"|") :: l -> raise (ParseError("split_mid", "duplicated delimeter found", i2))
| (i1,s) :: (i2,"|") :: l -> split_mid i2 ((i1,s) :: rev) l
| [] -> raise (ParseError("split_mid", "empty", i0))
| (i,s) :: l -> raise (ParseError("split_mid", "delimiter not found: " ^ String.concat " " (s :: Xlist.map l snd), i))
let rec check_value i0 selector l =
let vals = try SelectorMap.find selector_values selector
with Not_found -> raise (ParseError("check_value", "invalid selector: " ^ string_of_selector selector, i0)) in
if vals = [] then () else
Xlist.iter l (fun (i,s) ->
if not (Xlist.mem vals s) then
raise (ParseError("check_value", "invalid selector: " ^ string_of_selector selector ^ "=" ^ s, i)));
Xlist.map l snd
let match_value = function
i,cat,rel,[s] -> cat,rel, check_value i cat [s]
| i,cat,rel,[] -> raise (ParseError("match_value", "empty", i))
| i,cat,rel,l -> cat,rel, check_value i cat (split_mid i [] l)
let parse_selectors i0 l =
(* print_endline s; *)
(* let l = Xlist.map (Str.full_split (Str.regexp "|\\|,\\|=\\|!") s) (function
Str.Text s -> s
| Str.Delim s -> s) in *)
let ll = split_comma i0 [] [] l in
let l = Xlist.rev_map ll match_selectors in
let l = Xlist.rev_map l match_relation in
let l = Xlist.rev_map l match_value in
l
let manage_lemmata = function
(i1,"lemma") :: (i2,"=") :: (i3,":") :: (i4,",") :: tokens -> [i1,"lemma";i2,"=";i3,":";i4,","],tokens
| (i1,"lemma") :: (i2,"=") :: (i3,":") :: (i4,s) :: (i5,",") :: tokens -> [i1,"lemma";i2,"=";i3,":"^s;i5,","],tokens
| (i1,"lemma") :: (i2,"=") :: (i3,"<") :: (i4,"/") :: (i5,s) :: (i6,",") :: tokens -> [i1,"lemma";i2,"=";i3,"</"^s;i6,","],tokens
| tokens -> [],tokens
type syntax =
A of string
| B of internal_grammar_symbol
| C of grammar_symbol
| D of direction * grammar_symbol
| E of (direction * grammar_symbol) list
let make_atoms phrase_names =
SelectorMap.fold selector_values (StringSet.of_list (Xlist.rev_map phrase_names snd)) (fun atoms _ l ->
Xlist.fold l atoms StringSet.add)
let rec find_right_bracket i0 rev = function
(_,"]") :: l -> List.rev rev, l
| (i,s) :: l -> find_right_bracket i ((i,s) :: rev) l
| [] -> raise (ParseError("find_right_bracket", "empty", i0))
let operators = StringSet.of_list [
"*"; "+"; "/"; "|"; "\\"; "("; ")"; ","; "{"; "}"; "?"]
let find_internal_grammar_symbols atoms = function
| i,"T" -> i,B Top
| i,"1" -> i,C One
| i,"schema" -> i,D(Both,Tensor[AVar "schema"])
| i,"local-schema" -> i,D(Both,Tensor[AVar "local-schema"])
| i,"distant-schema" -> i,D(Both,Tensor[AVar "distant-schema"])
| i,"adjuncts" -> i,D(Both,Tensor[AVar "adjuncts"])
| i,s -> if StringSet.mem selector_names s then i,B (AVar s) else
if StringSet.mem atoms s then i,B (Atom s) else
if StringSet.mem operators s then i,A s else
raise (ParseError("find_internal_grammar_symbols", "unknown symbol " ^ s, i))
let rec find_tensor2 rev = function
(_,B s1) :: (_,A "*") :: (i,B s2) :: l -> find_tensor2 (s1 :: rev) ((i,B s2) :: l)
| (_,B s1) :: l -> List.rev (s1 :: rev), l
| (i,t) :: l -> raise (ParseError("find_tensor2", "", i))
| [] -> failwith "find_tensor2"
let rec find_tensor = function
(* B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: A "*" :: B s8 :: l -> failwith "find_tensor 1"
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: l -> C (Tensor[s1;s2;s3;s4;s5;s6;s7]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: l -> C (Tensor[s1;s2;s3;s4;s5;s6]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: l -> C (Tensor[s1;s2;s3;s4;s5]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: l -> C (Tensor[s1;s2;s3;s4]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: l -> C (Tensor[s1;s2;s3]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: l -> C (Tensor[s1;s2]) :: find_tensor l *)
| (i,B s1) :: l -> let sl,l = find_tensor2 [] ((i,B s1) :: l) in (i,C (Tensor sl)) :: find_tensor l
| (i,A "*") :: _ -> raise (ParseError("find_tensor", "unexpected '*'", i))
| t :: l -> t :: find_tensor l
| [] -> []
let rec find_plus2 rev = function
(_,C s1) :: (_,A "+") :: (i,C s2) :: l -> find_plus2 (s1 :: rev) ((i,C s2) :: l)
| (_,C s1) :: l -> List.rev (s1 :: rev), l
| (i,t) :: l -> raise (ParseError("find_plus2", "", i))
| [] -> failwith "find_plus2"
let rec find_plus = function
(* C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: A "+" :: C s6 :: A "+" :: C s7 :: l -> failwith "find_plus 1"
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: A "+" :: C s6 :: l -> C (Plus[s1;s2;s3;s4;s5;s6]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: l -> C (Plus[s1;s2;s3;s4;s5]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: l -> C (Plus[s1;s2;s3;s4]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: l -> C (Plus[s1;s2;s3]) :: find_plus l *)
| (i1,C s1) :: (i2,A "+") :: (i3,C s2) :: l -> let sl,l = find_plus2 [] ((i1,C s1) :: (i2,A "+") :: (i3,C s2) :: l) in (i1,C (Plus sl)) :: find_plus l
| (i,A "+") :: _ -> raise (ParseError("find_plus 2", "unexpected '+'", i))
| t :: l -> t :: find_plus l
| [] -> []
let rec find_paren = function
(_,A "(") :: (i,C s) :: (_,A ")") :: l -> (i,C s) :: find_paren l
| (i,s) :: l -> (i,s) :: find_paren l
| [] -> []
let rec find_imp = function
| (i,C s1) :: (_,A "/") :: (_,C s2) :: l -> (i,C (Imp(s1,Forward,s2))) :: find_imp l
| (i,C s1) :: (_,A "|") :: (_,C s2) :: l -> (i,C (Imp(s1,Both,s2))) :: find_imp l
| (i,C s1) :: (_,A "\\") :: (_,C s2) :: l -> (i,C (Imp(s1,Backward,s2))) :: find_imp l
| (i,s) :: l -> (i,s) :: find_imp l
| [] -> []
let rec find_maybe = function
| (i,A "?") :: (_,C s2) :: l -> (i,C (Maybe s2)) :: find_maybe l
| (i,A "?") :: _ -> raise (ParseError("find_maybe 1", "unexpected '?'", i))
| (i,s) :: l -> (i,s) :: find_maybe l
| [] -> []
let rec find_mult_imp = function
| (i1,A "{") :: (i2,A "/") :: (_,C s2) :: l -> (i1,A "{") :: (i2,D (Forward,s2)) :: find_mult_imp l
| (i1,A "{") :: (i2,A "|") :: (_,C s2) :: l -> (i1,A "{") :: (i2,D (Both,s2)) :: find_mult_imp l
| (i1,A "{") :: (i2,A "\\") :: (_,C s2) :: l -> (i1,A "{") :: (i2,D (Backward,s2)) :: find_mult_imp l
| (i1,A ",") :: (i2,A "/") :: (_,C s2) :: l -> (i1,A ",") :: (i2,D (Forward,s2)) :: find_mult_imp l
| (i1,A ",") :: (i2,A "|") :: (_,C s2) :: l -> (i1,A ",") :: (i2,D (Both,s2)) :: find_mult_imp l
| (i1,A ",") :: (i2,A "\\") :: (_,C s2) :: l -> (i1,A ",") :: (i2,D (Backward,s2)) :: find_mult_imp l
| (i,A "/") :: _ -> raise (ParseError("find_mult_imp 1", "unexpected '/'", i))
| (i,A "|") :: _ -> raise (ParseError("find_mult_imp 2", "unexpected '|'", i))
| (i,A "\\") :: _ -> raise (ParseError("find_mult_imp 3", "unexpected '\\'", i))
| (i,A "(") :: _ -> raise (ParseError("find_mult_imp 4", "unexpected '('", i))
| (i,A ")") :: _ -> raise (ParseError("find_mult_imp 5", "unexpected ')'", i))
| (i,s) :: l -> (i,s) :: find_mult_imp l
| [] -> []
let rec find_mult2 rev = function
(_,D(s1,t1)) :: (_,A ",") :: (i,D(s2,t2)) :: l -> find_mult2 ((s1,t1) :: rev) ((i,D(s2,t2)) :: l)
| (_,D(s1,t1)) :: (_,A "}") :: l -> List.rev ((s1,t1) :: rev), l
| (i,t) :: l -> raise (ParseError("find_mult2", "", i))
| [] -> failwith "find_mult2"
let rec find_mult = function
(* A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "," :: D(s9,t9) :: A "," :: D(s10,t10) :: A "," :: D _ :: l -> failwith "find_mult 1: to many elements in { }"
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "," :: D(s9,t9) :: A "," :: D(s10,t10) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7;s8,t8;s9,t9;s10,t10] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "," :: D(s9,t9) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7;s8,t8;s9,t9] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7;s8,t8] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "}" :: l -> E[s1,t1;s2,t2] :: find_mult l *)
| (_,A "{") :: (i,D(s1,t1)) :: l -> let sl,l = find_mult2 [] ((i,D(s1,t1)) :: l) in (i,E sl) :: find_mult l
| (i,A "{") :: _ -> raise (ParseError("find_mult 2", "unexpected '{'", i))
| (i,A "}") :: _ -> raise (ParseError("find_mult 3", "unexpected '}'", i))
| (i,A ",") :: _ -> raise (ParseError("find_mult 4", "unexpected ','", i))
| t :: l -> t :: find_mult l
| [] -> []
let rec apply_mult i0 = function
(i1,C s) :: (i2,E t) :: l -> apply_mult i2 ((i1,C (ImpSet(s,t))) :: l)
| [i,C s] -> i,C s
| _ -> raise (ParseError("apply_mult","",i0))
let parse_syntax i0 atoms l =
(* print_endline s; *)
(* let l = Xlist.map (Str.full_split (Str.regexp "?\\|}\\|{\\|,\\|*\\|/\\|+\\|)\\|(\\||\\|\\") s) (function
Str.Text s -> s
| Str.Delim s -> s) in *)
let l = List.rev (Xlist.rev_map l (find_internal_grammar_symbols atoms)) in
let l = find_tensor l in
let l = find_plus l in
let l = find_paren l in
let l = find_maybe l in
let l = find_imp l in
let l = find_paren l in
let l = find_imp l in
let l = find_paren l in
let l = find_imp l in
let l = find_paren l in
let l = find_mult_imp l in
let l = find_mult l in
match apply_mult i0 l with
_,C s -> s
| i,_ -> raise (ParseError("parse_syntax","",i))
let check_quant_range i0 cat l =
let set = StringSet.of_list (
try SelectorMap.find selector_values cat
with Not_found -> raise (ParseError("check_quant_range", string_of_selector cat, i0))) in
if StringSet.is_empty set then () else
Xlist.iter l (fun v ->
if not (StringSet.mem set v) then
raise (ParseError("check_quant_range", string_of_selector cat ^ "=" ^ v, i0)))
let parse_quant_range i0 = function
_,[_,"0"] -> Zero
| _,[_,"T"] -> Top
| _,[_,"all_numbers"] -> ENIAM_LCGrenderer.make_quant_restriction all_numbers
| _,[_,"all_cases"] -> ENIAM_LCGrenderer.make_quant_restriction all_cases
| _,[_,"all_genders"] -> ENIAM_LCGrenderer.make_quant_restriction all_genders
| _,[_,"all_persons"] -> ENIAM_LCGrenderer.make_quant_restriction all_persons
| cat,l ->
let l = Xstring.split "&" (String.concat "" (Xlist.map l snd)) in
check_quant_range i0 cat l;
ENIAM_LCGrenderer.make_quant_restriction l
let parse_quantifiers i0 tokens =
Xlist.map (split_comma i0 [] [] tokens) (function
_,(i,cat) :: (_,"=") :: [] -> raise (ParseError("parse_quantifiers", "empty range", i))
| _,(i,cat) :: (_,"=") :: tokens ->
let cat = catch_selector_of_string i "parse_quantifiers" cat in
cat, parse_quant_range i (cat,tokens)
| _,(i,t) :: _ -> raise (ParseError("parse_quantifiers", "unexpected token '" ^ t ^ "'", i))
| i0,[] -> raise (ParseError("parse_quantifiers", "no token", i0)))
let parse_raised i0 tokens =
Xlist.map (split_comma i0 [] [] tokens) (function
_,[i,cat] -> catch_selector_of_string i "parse_raised" cat
| _,(i,t) :: _ -> raise (ParseError("parse_raised", "unexpected token '" ^ t ^ "'", i))
| i0,[] -> raise (ParseError("parse_raised", "no token", i0)))
let rec find_syntax_end rev = function
((_,"BRACKET") :: _) as tokens -> List.rev rev, tokens
| ((_,"COORD") :: _) as tokens -> List.rev rev, tokens
| ((_,"PRECOORD") :: _) as tokens -> List.rev rev, tokens
| ((_,"SINGLE-COST") :: _) as tokens -> List.rev rev, tokens
| ((_,"DOUBLE-COST") :: _) as tokens -> List.rev rev, tokens
| ((_,"QUANT") :: (_,"[") :: _) as tokens -> List.rev rev, tokens
| ((_,"RAISED") :: (_,"[") :: _) as tokens -> List.rev rev, tokens
| ((_,"SEM") :: (_,"[") :: _) as tokens -> List.rev rev, tokens
| s :: tokens -> find_syntax_end (s :: rev) tokens
| [] -> List.rev rev, []
let parse_sem_term sem_term = String.concat "" (Xlist.map sem_term snd)
let rec parse_rule atoms = function
(_,"BRACKET") :: tokens -> Bracket :: parse_rule atoms tokens
| (_,"QUANT") :: (i,"[") :: tokens ->
let quant,tokens = find_right_bracket i [] tokens in
Quant(parse_quantifiers i quant) :: parse_rule atoms tokens
| (_,"COORD") :: tokens -> Coord :: parse_rule atoms tokens
| (_,"PRECOORD") :: tokens -> PreCoord :: parse_rule atoms tokens
| (_,"SINGLE-COST") :: tokens -> Cost 1 :: parse_rule atoms tokens
| (_,"DOUBLE-COST") :: tokens -> Cost 2 :: parse_rule atoms tokens
| (_,"RAISED") :: (i,"[") :: tokens ->
let raised,tokens = find_right_bracket i [] tokens in
Raised(parse_raised i raised) :: parse_rule atoms tokens
| (_,"SEM") :: (i,"[") :: tokens ->
let sem_term,tokens = find_right_bracket i [] tokens in
Sem(parse_sem_term sem_term) :: parse_rule atoms tokens
| [] -> []
| tokens ->
let i = fst (List.hd tokens) in
let syntax,tokens = find_syntax_end [] tokens in
(* print_prefix 100 tokens; *)
Syntax(parse_syntax i atoms syntax) :: parse_rule atoms tokens
let parse_entry i0 atoms weights tokens =
let prefix,tokens = manage_lemmata tokens in
let selectors, rule, weight =
match split_colon [] [] tokens with
[selectors;rule] -> selectors, rule, 0.
| [selectors;rule;[i,weight]] -> selectors, rule,
(try StringMap.find weights weight
with Not_found -> raise (ParseError("parse_entry", "unknown weight symbol '" ^ weight ^ "'", i)))
| _ -> raise (ParseError("parse_entry", "invalid number of ':' in entry " ^ String.concat " " (Xlist.map tokens snd), i0)) in
let selectors = parse_selectors i0 (prefix @ selectors) in
let rule = parse_rule atoms rule in
{empty_entry with selectors=selectors; rule=rule; weight=weight}
let string_of_parse_error proc s i line =
Printf.sprintf "LCG lexicon error in line %d: %s\n%s: %s" i line proc s
let parse_lexicon i0 a atoms weights = function
(i,"@LEXICON") :: tokens ->
let entries = split_semic i [] [] tokens in
Xlist.fold entries ([],true) (fun (entries,is_correct) (i,entry) ->
try (parse_entry i atoms weights entry) :: entries, is_correct
with ParseError(proc,s,i) ->
print_endline (string_of_parse_error proc s i a.(i-1));
entries,false)
| (i,s) :: _ -> raise (ParseError("parse_lexicon", "'@LEXICON' expected while '" ^ s ^ "' found", i))
| [] -> raise (ParseError("parse_lexicon", "unexpexted end of input", i0))
let load_lexicon filename =
let lines = Xstring.split "\n" (File.load_file filename) in
let a = Array.of_list lines in
let lines,no_lines = Xlist.fold lines ([],1) (fun (lines,i) line -> (i,line) :: lines, i+1) in
let lines = Xlist.rev_map lines (fun (i,line) -> i, remove_comments line) in
let tokens = List.flatten (Xlist.rev_map lines (fun (i,line) ->
Xlist.rev_map (Str.full_split
(Str.regexp "\\]\\| \\|\t\\|\r\\|\\?\\|:\\|;\\|&\\|!\\|=\\|}\\|{\\|,\\|\\*\\|/\\|\\+\\|)\\|(\\||\\|\\[\\|\\") line) (function
Str.Text s -> i,s
| Str.Delim s -> i,s))) in
let tokens = Xlist.fold tokens [] (fun tokens -> function
_," " -> tokens
| _,"\t" -> tokens
| _,"\r" -> tokens
| i,t -> (i,t) :: tokens) in
try
let i,phrase_names,tokens = parse_phrase_names 1 tokens in
let atoms = make_atoms phrase_names in
let i,weights,tokens = parse_weights i tokens in
let lexicon,is_correct = parse_lexicon i a atoms weights tokens in
if is_correct then List.rev lexicon else exit 0
with ParseError(proc,s,i) ->
print_endline (string_of_parse_error proc s i a.(i-1));
exit 0