(* ENIAMcategoriesPL.ml *)
(*
* ENIAM_LCGgrammarPL is a library that provides LCG lexicon for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMlexSemanticsTypes
open Xstd
(** Gender values substituted for the "_" wildcard by [expand_genders]. *)
let all_genders =
  [ "m1"; "m2"; "m3";
    "f";
    "n1"; "n2";
    "p1"; "p2"; "p3" ]

(** Case values substituted for a wildcard by [expand_cases]. *)
let all_cases =
  [ "nom"; "gen"; "dat"; "acc"; "inst"; "loc"; "voc" ]

(** Person values assigned to forms that carry no person of their own. *)
let all_persons =
  [ "pri"; "sec"; "ter" ]

(** Number values assigned to forms that carry no number of their own. *)
let all_numbers =
  [ "sg"; "pl" ]
(* FIXME: instead of listing every possible value, Zero could be used when
   there is no agreement *)

(** [expand_numbers numbers] returns both number values when [numbers]
    contains the "_" wildcard, and [numbers] itself otherwise. *)
let expand_numbers numbers =
  match Xlist.mem numbers "_" with
  | true -> all_numbers
  | false -> numbers
(** [expand_genders genders] returns [all_genders] when [genders] contains
    the "_" wildcard, and [genders] itself otherwise. *)
let expand_genders genders =
  match Xlist.mem genders "_" with
  | true -> all_genders
  | false -> genders
(** [expand_cases cases] returns [all_cases] when [cases] contains either of
    the wildcards "_" or "$C", and [cases] itself otherwise. *)
let expand_cases cases =
  let has_wildcard = Xlist.mem cases "_" || Xlist.mem cases "$C" in
  if has_wildcard then all_cases else cases
(** [expand_akcs akcs] returns both accentability values when [akcs]
    contains the "_" wildcard, and [akcs] itself otherwise. *)
let expand_akcs akcs =
  match Xlist.mem akcs "_" with
  | true -> ["akc";"nakc"]
  | false -> akcs
(** [split_voc cases] partitions [cases] into [(others, vocatives)]:
    every "voc" element goes to the second list, everything else to the
    first. Both lists are accumulated by prepending, so the elements of
    [others] appear in the reverse of their order in [cases]. *)
let split_voc cases =
  let step (others, vocatives) case =
    if case = "voc" then others, case :: vocatives
    else case :: others, vocatives in
  Xlist.fold cases ([], []) step
(* Builds a lexeme set from the lines that [File.load_lines] returns for
   [filename]. *)
let load_lexeme_set filename =
  File.load_lines filename |> StringSet.of_list

(* Lexeme sets loaded from resource files when the module is initialised.
   They are consulted by [noun_type]. *)
let subst_uncountable_lexemes = load_lexeme_set subst_uncountable_lexemes_filename
let subst_uncountable_lexemes2 = load_lexeme_set subst_uncountable_lexemes_filename2
let subst_container_lexemes = load_lexeme_set subst_container_lexemes_filename
let subst_numeral_lexemes = load_lexeme_set subst_numeral_lexemes_filename
let subst_time_lexemes = load_lexeme_set subst_time_lexemes_filename

(* Lemmas of "subst" forms that [noun_type] classifies as pronouns. *)
let subst_pronoun_lexemes =
  StringSet.of_list
    ["co"; "kto"; "cokolwiek"; "ktokolwiek"; "nic"; "nikt"; "coś"; "ktoś"; "to"]

(* Lemmas of "adj" forms that [clarify_categories] emits as "apron". *)
let adj_pronoun_lexemes =
  StringSet.of_list
    ["czyj"; "jaki"; "który"; "jakiś"; "ten"; "taki"]

(* let adj_quant_lexemes = StringSet.of_list ["każdy"; "wszelki"; "wszystek"; "żaden"; "jakiś"; "pewien"; "niektóry"; "jedyny"; "sam"] *)
(** [noun_type proper lemma pos] computes the pair [(nsyn, nsem)] for a
    noun-like form.

    [nsyn] is a singleton list holding "proper", "pronoun" or "common".
    The checks are applied in this order: the [proper] flag, the personal
    pronoun tags, the symbol-like tags, membership of [lemma] in
    [subst_pronoun_lexemes], and finally the "common" default.

    [nsem] is ["count"] for the personal pronoun tags and ["time"] for
    lemmas in [subst_time_lexemes]. Otherwise it is ["count"], optionally
    preceded by "mass" (uncountable lexemes) and then by "measure"
    (container lexemes). *)
let noun_type proper lemma pos =
  let personal_pronoun =
    match pos with
    | "ppron12" | "ppron3" | "siebie" -> true
    | _ -> false in
  let symbol_like =
    match pos with
    | "symbol" | "date" | "date-interval"
    | "hour" | "hour-minute" | "hour-interval" | "hour-minute-interval"
    | "year" | "year-interval"
    | "day" | "day-interval" | "day-month" | "day-month-interval"
    | "match-result" | "month-interval"
    | "roman" | "roman-interval"
    | "url" | "email" | "obj-id" -> true
    | _ -> false in
  let nsyn =
    if proper then "proper"
    else if personal_pronoun then "pronoun"
    else if symbol_like then "proper"
    else if StringSet.mem subst_pronoun_lexemes lemma then "pronoun"
    else "common" in
  let nsem =
    if personal_pronoun then ["count"]
    else if StringSet.mem subst_time_lexemes lemma then ["time"]
    else
      let uncountable =
        StringSet.mem subst_uncountable_lexemes lemma
        || StringSet.mem subst_uncountable_lexemes2 lemma in
      let base = if uncountable then ["mass"; "count"] else ["count"] in
      if StringSet.mem subst_container_lexemes lemma then "measure" :: base
      else base in
  [nsyn], nsem
(** [clarify_categories proper (lemma, pos, interp)] turns one
    morphosyntactic interpretation into a list of category records, each
    built by overriding fields of [empty_cats].

    - [proper] is passed on to [noun_type] when the noun kind is computed.
    - [lemma] is the lemma and [pos] the part-of-speech tag.
    - [interp] is a list of attribute-value lists; the number and meaning
      of its elements depend on [pos].

    Wildcards in numbers, cases and genders are expanded with
    [expand_numbers], [expand_cases] and [expand_genders]. One
    interpretation may produce several records, for example separate
    non-vocative and vocative entries for nouns.

    @raise Failure when the combination of [pos] and the shape of [interp]
    is not supported, or when a "fin" interpretation contains an aspect
    other than "imperf" or "perf".
    @raise Invalid_argument for an "intnum" interpretation whose lemma is
    the empty string. *)
let clarify_categories proper =
  (* Adds the "pred" pseudo-case whenever "nom" is among the cases. *)
  let with_pred cases =
    if Xlist.mem cases "nom" then "pred" :: cases else cases in
  (* Lemmas tagged "prep" that are emitted with pos "compar" instead. *)
  let is_compar = function
    | "jak" | "jako" | "niż" | "niczym" | "niby" | "co" | "zamiast" -> true
    | _ -> false in
  (* Persons with "sec" removed; used only to decide whether an additional
     imperative entry is produced. *)
  let without_sec persons =
    Xlist.fold persons [] (fun l -> function "sec" -> l | s -> s :: l) in
  (* Entries for "subst" and "depr": the non-vocative cases get person
     "ter", the vocative gets person "sec" in a separate entry.
     NOTE(review): "depr" forms are emitted with pos "subst" - confirm that
     this is intended. *)
  let noun lemma tag numbers cases genders =
    let numbers = expand_numbers numbers in
    let genders = expand_genders genders in
    let cases,voc = split_voc (expand_cases cases) in
    let nsyn,nsem = noun_type proper lemma tag in
    let entry cases persons =
      {empty_cats with lemma=lemma; pos="subst"; pos2="noun"; numbers=numbers; cases=cases; genders=genders; persons=persons; nsyn=nsyn; nsem=nsem} in
    (if cases = [] then [] else [entry cases ["ter"]]) @
    (if voc = [] then [] else [entry voc ["sec"]]) in
  (* Entry for a symbol-like token whose output pos equals its input tag. *)
  let symbol lemma pos =
    let nsyn,nsem = noun_type proper lemma pos in
    [{empty_cats with lemma=lemma; pos=pos; pos2="symbol"; nsyn=nsyn; nsem=nsem}] in
  function
  (* Nouns *)
  | lemma,"subst",[numbers;cases;genders] -> noun lemma "subst" numbers cases genders
  | lemma,"depr",[numbers;cases;genders] -> noun lemma "depr" numbers cases genders
  (* Pronouns; a fifth attribute (accentability), when present, is ignored *)
  | lemma,"ppron12",([numbers;cases;genders;persons] | [numbers;cases;genders;persons;_]) ->
      [{empty_cats with lemma=lemma; pos="ppron12"; pos2="pron"; numbers=expand_numbers numbers; cases=expand_cases cases; genders=expand_genders genders; persons=persons}]
  | lemma,"ppron3",([numbers;cases;genders;persons] | [numbers;cases;genders;persons;_]) ->
      [{empty_cats with lemma=lemma; pos="ppron3"; pos2="pron"; numbers=expand_numbers numbers; cases=expand_cases cases; genders=expand_genders genders; persons=persons; praeps=["praep-npraep"]}]
  | lemma,"ppron3",[numbers;cases;genders;persons;_;praep] ->
      (* Both orders of the two-valued attribute collapse to one value. *)
      let praep = match praep with
        | ["praep";"npraep"] | ["npraep";"praep"] -> ["praep-npraep"]
        | _ -> praep in
      [{empty_cats with lemma=lemma; pos="ppron3"; pos2="pron"; numbers=expand_numbers numbers; cases=expand_cases cases; genders=expand_genders genders; persons=persons; praeps=praep}]
  | lemma,"siebie",[cases] -> (* FIXME: should numbers, genders and persons be specified here? *)
      [{empty_cats with lemma=lemma; pos="siebie"; pos2="pron"; numbers=all_numbers; cases=expand_cases cases; genders=all_genders; persons=["ter"]}]
  (* Prepositions; a second attribute, when present, is ignored *)
  | lemma,"prep",([cases] | [cases;_]) ->
      if is_compar lemma then
        [{empty_cats with lemma=lemma; pos="compar"; pos2="prep"}]
      else
        [{empty_cats with lemma=lemma; pos="prep"; pos2="prep"; cases=expand_cases cases}]
  (* Numerals *)
  | lemma,"num",[numbers;cases;genders;acms] ->
      [{empty_cats with lemma=lemma; pos="num"; pos2="num"; numbers=expand_numbers numbers; cases=expand_cases cases; genders=expand_genders genders; persons=["ter"]; acms=acms}]
  | lemma,"intnum",[] ->
      let numbers,acms =
        if lemma = "1" || lemma = "-1" then ["sg"],["congr"] else
        (* The last digit decides which accommodation values are allowed. *)
        let acms = match lemma.[String.length lemma - 1] with
          | '2' | '3' | '4' -> ["rec";"congr"]
          | _ -> ["rec"] in
        ["pl"],acms in
      [{empty_cats with lemma=lemma; pos="intnum"; pos2="num"; numbers=numbers; cases=all_cases; genders=all_genders; persons=["ter"]; acms=acms}]
  | lemma,"realnum",[] ->
      [{empty_cats with lemma=lemma; pos="realnum"; pos2="num"; numbers=["sg"]; cases=all_cases; genders=all_genders; persons=["ter"]; acms=["rec"]}]
  | lemma,"intnum-interval",[] ->
      [{empty_cats with lemma=lemma; pos="intnum-interval"; pos2="num"; numbers=["pl"]; cases=all_cases; genders=all_genders; persons=["ter"]; acms=["rec";"congr"]}]
  | lemma,"realnum-interval",[] ->
      [{empty_cats with lemma=lemma; pos="realnum-interval"; pos2="num"; numbers=["sg"]; cases=all_cases; genders=all_genders; persons=["ter"]; acms=["rec"]}]
  | lemma,"symbol",[] ->
      [{empty_cats with lemma=lemma; pos="symbol"; pos2="noun"; numbers=["sg"]; cases=all_cases; genders=all_genders; persons=["ter"]}]
  | lemma,"ordnum",[] ->
      [{empty_cats with lemma=lemma; pos="ordnum"; pos2="adj"; numbers=all_numbers; cases=all_cases; genders=all_genders; grads=["pos"]}] (* FIXME: should more than one degree be allowed? *)
  (* Symbol-like tokens *)
  | lemma,("date" | "date-interval"
          | "hour-minute" | "hour" | "hour-minute-interval" | "hour-interval"
          | "year" | "year-interval"
          | "day" | "day-interval" | "day-month" | "day-month-interval"
          | "month-interval" | "roman-interval" | "match-result"
          | "url" | "email" | "obj-id" as pos),[] ->
      symbol lemma pos
  | lemma,"roman",[] ->
      (* A roman numeral is emitted both as an ordinal adjective and as a symbol. *)
      {empty_cats with lemma=lemma; pos="roman-ordnum"; pos2="adj"; numbers=all_numbers; cases=all_cases; genders=all_genders; grads=["pos"]}
      :: symbol lemma "roman"
  (* Adjectives and adverbs *)
  | lemma,"adj",[numbers;cases;genders;grads] -> (* FIXME: adjsyn *)
      let cases = with_pred (expand_cases cases) in
      let pos,pos2 = if StringSet.mem adj_pronoun_lexemes lemma then "apron","pron" else "adj","adj" in
      [{empty_cats with lemma=lemma; pos=pos; pos2=pos2; numbers=expand_numbers numbers; cases=cases; genders=expand_genders genders; grads=grads}] (* FIXME: should more than one degree be allowed? *)
  | lemma,"adjc",[] ->
      [{empty_cats with lemma=lemma; pos="adjc"; pos2="adj"; numbers=["sg"]; cases=["pred"]; genders=["m1";"m2";"m3"]; grads=["pos"]}]
  | lemma,"adjp",[] ->
      [{empty_cats with lemma=lemma; pos="adjp"; pos2="adj"; numbers=all_numbers; cases=["postp"]; genders=all_genders; grads=["pos"]}]
  | lemma,"adja",[] -> [{empty_cats with lemma=lemma; pos="adja"; pos2="adja"}]
  | lemma,"adv",[grads] -> [{empty_cats with lemma=lemma; pos="adv"; pos2="adv"; grads=grads}]
  | lemma,"adv",[] -> [{empty_cats with lemma=lemma; pos="adv"; pos2="adv"; grads=["pos"]}]
  (* Gerunds and participles *)
  | lemma,"ger",[numbers;cases;genders;aspects;negations] ->
      [{empty_cats with lemma=lemma; pos="ger"; pos2="verb"; numbers=expand_numbers numbers; cases=expand_cases cases; genders=expand_genders genders; persons=["ter"]; aspects=aspects; negations=negations}] (* FIXME: the question of person with the vocative *)
  | lemma,("pact" | "ppas" as pos),[numbers;cases;genders;aspects;negations] ->
      let cases = with_pred (expand_cases cases) in
      [{empty_cats with lemma=lemma; pos=pos; pos2="verb"; numbers=expand_numbers numbers; cases=cases; genders=expand_genders genders; aspects=aspects; negations=negations}]
  (* Finite verb forms *)
  | lemma,"fin",[numbers;persons;aspects] -> (* FIXME: genders without the plurale tantum ones *)
      let base = {empty_cats with lemma=lemma; pos="fin"; pos2="verb"; numbers=expand_numbers numbers; genders=all_genders; persons=persons; negations=["aff"; "neg"]} in
      let indicative = {base with moods=["indicative"]} in
      (* Imperfective forms are read as present, perfective ones as future. *)
      (Xlist.map aspects (function
          | "imperf" -> {indicative with aspects=["imperf"]; tenses=["pres"]}
          | "perf" -> {indicative with aspects=["perf"]; tenses=["fut"]}
          | _ -> failwith "clarify_categories")) @
      (* NOTE(review): the imperative entry keeps the full [persons] list,
         including "sec" - confirm that this is intended. *)
      (if without_sec persons = [] then [] else
        [{base with aspects=aspects; moods=["imperative"]; tenses=["fut"]}])
  | lemma,"bedzie",[numbers;persons;aspects] ->
      (* NOTE(review): "bedzie" forms are emitted with pos "fin" - confirm
         that this is intended. *)
      let base = {empty_cats with lemma=lemma; pos="fin"; pos2="verb"; numbers=expand_numbers numbers; genders=all_genders; persons=persons; aspects=aspects; negations=["aff"; "neg"]; tenses=["fut"]} in
      {base with moods=["indicative"]} ::
      (if without_sec persons = [] then [] else
        [{base with moods=["imperative"]}])
  (* A fourth attribute of "praet", when present, is ignored *)
  | lemma,"praet",([numbers;genders;aspects] | [numbers;genders;aspects;_]) ->
      let base = {empty_cats with lemma=lemma; pos="praet"; pos2="verb"; numbers=expand_numbers numbers; genders=expand_genders genders; persons=all_persons; negations=["aff"; "neg"]} in
      {base with aspects=aspects; moods=["indicative";"conditional"]; tenses=["past"]} ::
      (* Imperfective forms get an additional future-tense entry. *)
      (if Xlist.mem aspects "imperf" then
        [{base with aspects=["imperf"]; moods=["indicative"]; tenses=["fut"]}]
      else [])
  | lemma,"winien",[numbers;genders;aspects] ->
      let base = {empty_cats with lemma=lemma; pos="winien"; pos2="verb"; numbers=expand_numbers numbers; genders=expand_genders genders; persons=all_persons; negations=["aff"; "neg"]} in
      [{base with aspects=aspects; moods=["indicative";"conditional"]; tenses=["pres"]};
       {base with aspects=aspects; moods=["indicative"]; tenses=["past"]}] @
      (if Xlist.mem aspects "imperf" then
        [{base with aspects=["imperf"]; moods=["indicative"]; tenses=["fut"]}]
      else [])
  | lemma,"impt",[numbers;persons;aspects] ->
      [{empty_cats with lemma=lemma; pos="impt"; pos2="verb"; numbers=expand_numbers numbers; genders=all_genders; persons=persons; aspects=aspects; negations=["aff"; "neg"]; moods=["imperative"]; tenses=["fut"]}]
  | lemma,"imps",[aspects] ->
      [{empty_cats with lemma=lemma; pos="imps"; pos2="verb"; numbers=all_numbers; genders=all_genders; persons=all_persons; aspects=aspects; negations=["aff"; "neg"]; moods=["indicative"]; tenses=["past"]}]
  | lemma,"pred",[] -> (* FIXME: is a predicative always imperfective? *)
      [{empty_cats with lemma=lemma; pos="pred"; pos2="verb"; numbers=["sg"]; genders=["n2"]; persons=["ter"]; aspects=["imperf"]; negations=["aff"; "neg"]; moods=["indicative"]; tenses=["pres";"past";"fut"]}]
  | lemma,"aglt",[numbers;persons;aspects;_] ->
      [{empty_cats with lemma=lemma; pos="aglt"; pos2="verb"; numbers=expand_numbers numbers; persons=persons; aspects=aspects}]
  (* Non-finite verb forms keep their own tag as pos *)
  | lemma,("inf" | "pcon" | "pant" as pos),[aspects] ->
      [{empty_cats with lemma=lemma; pos=pos; pos2="verb"; aspects=aspects}]
  (* Uninflected classes: pos and pos2 both equal the input tag *)
  | lemma,("qub" | "comp" | "conj" | "interj" | "burk" as pos),[] ->
      [{empty_cats with lemma=lemma; pos=pos; pos2=pos}]
  (* A comma is emitted as a conjunction; this arm must precede the general
     "interp" arm. *)
  | ",","interp",[] -> [{empty_cats with lemma=","; pos="conj"; pos2="conj"}]
  | lemma,"interp",[] -> [{empty_cats with lemma=lemma; pos="interp"; pos2="interp"}]
  | lemma,"unk",[] ->
      [{empty_cats with lemma=lemma; pos="unk"; pos2="noun"; numbers=all_numbers; cases=all_cases; genders=all_genders; persons=["ter"]}]
  | lemma,c,l -> failwith ("clarify_categories: " ^ lemma ^ ":" ^ c ^ ":" ^ (String.concat ":" (Xlist.map l (String.concat "."))))
(* FIXME: move this somewhere else *)

(** [assign token] computes the category records for [token].

    For [Lemma] and [Proper] tokens every interpretation is passed to
    [clarify_categories] (with [proper] set to [false] and [true]
    respectively) and the results are concatenated. An [Interp] token is
    clarified as an "interp" form with no attributes. Any other token
    yields the empty list. *)
let assign token =
  let clarify_all proper lemma pos interps =
    Xlist.map interps (fun interp -> clarify_categories proper (lemma,pos,interp))
    |> List.flatten in
  match token.ENIAMtokenizerTypes.token with
  | ENIAMtokenizerTypes.Lemma(lemma,pos,interps) -> clarify_all false lemma pos interps
  | ENIAMtokenizerTypes.Proper(lemma,pos,interps,_) -> clarify_all true lemma pos interps
  | ENIAMtokenizerTypes.Interp lemma -> clarify_categories false (lemma,"interp",[])
  | _ -> []
(** [match_selector cats selector] returns the list of values that [cats]
    stores for [selector]. The lemma is returned as a singleton list.

    @raise Failure for a selector that has no corresponding field here. *)
let match_selector cats selector =
  match selector with
  | Lemma -> [cats.lemma]
  (* | NewLemma -> [] *)
  | Number -> cats.numbers
  | Case -> cats.cases
  | Gender -> cats.genders
  | Person -> cats.persons
  | Grad -> cats.grads
  | Praep -> cats.praeps
  | Acm -> cats.acms
  | Aspect -> cats.aspects
  | Negation -> cats.negations
  | Mood -> cats.moods
  | Tense -> cats.tenses
  | Nsyn -> cats.nsyn
  | Nsem -> cats.nsem
  | other -> failwith ("match_selector: " ^ string_of_cat other)
(** [set_selector cats vals selector] returns a copy of [cats] in which the
    field that corresponds to [selector] is replaced by [vals].

    @raise Failure for a selector that cannot be set here; this includes
    [Lemma], which has no arm below. *)
let set_selector cats vals selector =
  match selector with
  | Number -> {cats with numbers=vals}
  | Case -> {cats with cases=vals}
  | Gender -> {cats with genders=vals}
  | Person -> {cats with persons=vals}
  | Grad -> {cats with grads=vals}
  | Praep -> {cats with praeps=vals}
  | Acm -> {cats with acms=vals}
  | Aspect -> {cats with aspects=vals}
  | Negation -> {cats with negations=vals}
  | Mood -> {cats with moods=vals}
  | Tense -> {cats with tenses=vals}
  | Nsyn -> {cats with nsyn=vals}
  | Nsem -> {cats with nsem=vals}
  | other -> failwith ("set_selector: " ^ string_of_cat other)