Commit 766cb2a4a06f1150cce4d4ed7ce917ab8fbe9598
1 parent 40942928

ordnum and sentence endings at the ends of texts

Showing 7 changed files with 140 additions and 25 deletions
subsyntax/ENIAM_MWE.ml
@@ -210,6 +210,14 @@ let get_orths paths =
     Xlist.fold l orths (fun orths t ->
       StringSet.add orths (ENIAMtokens.get_orth t.token))))
 
+let get_intnum_orths paths =
+  IntMap.fold paths StringMap.empty (fun orths _ map ->
+    IntMap.fold map orths (fun orths _ l ->
+      Xlist.fold l orths (fun orths t ->
+        match t.token with
+          Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
+        | _ -> orths)))
+
 let rec match_path_rec map found (t:token_env) rev = function
     [] -> (t :: rev) :: found
   | s :: l ->
@@ -261,10 +269,19 @@ let apply_rule paths (match_list,lemma,interp) =
       add_token paths token
     with Not_found -> paths)
 
+(* FIXME: the rules for ordnum should match the part of speech, not just the orth *)
+let add_ordnum_rules rules paths =
+  let orths = get_intnum_orths paths in
+  StringMap.fold orths rules (fun rules orth lemmas ->
+    StringSet.fold lemmas rules (fun rules lemma ->
+      (* Printf.printf "%s %s\n%!" orth lemma; *)
+      ([orth;"."],lemma,"ordnum") :: rules))
+
 let process (paths,last) =
   let paths = Xlist.fold paths IntMap.empty add_token in
   let orths = get_orths paths in
   let rules = preselect_dict orths mwe_dict in
+  let rules = add_ordnum_rules rules paths in
   let paths = Xlist.fold rules paths apply_rule in
   let paths = IntMap.fold paths [] (fun paths _ map ->
     IntMap.fold map paths (fun paths _ l ->
subsyntax/test.ml
@@ -19,21 +19,25 @@
 
 
 let test_strings = [
-  "Szpak frunie.";
+  (* "Szpak frunie.";
   "Kot np. miauczy.";
   "Ala ma kota.";
-  "Ale mają kota:"
+  "Ale mają kota:" *)
   (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
   (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *)
   (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *)
   (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *)
   (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *)
+  (* "Dyplom uzyskał w 1994.";
+  "dyplom uzyskał w 1994"; *)
   ]
 
 let test_strings2 = [
-  "Szpak frunie. Kot miauczy.";
-  "Szpak powiedział: „Frunę. Śpiewam.”";
+  (* "Szpak frunie. Kot miauczy.";
+  "Szpak powiedział: „Frunę. Śpiewam.”"; *)
   (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
+  "Dyplom uzyskał w 1994.";
+  "dyplom uzyskał w 1994";
   ]
 
 let _ =
tokenizer/ENIAMacronyms.ml
@@ -864,3 +864,6 @@ let abr_patterns = [
   [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns");
   [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns");
   ]
+
+(* let query_patterns = [
+  [I "<query>"; S "."; O "u"; S "."], (function [a;b;c;d] -> [ct [a;b] "bez" "prep:gen:nwok"; ct [c;d] "uwaga" "subst:pl:gen:f"] | _ -> failwith "abr_patterns"); *)
tokenizer/ENIAMpatterns.ml
@@ -63,7 +63,6 @@ let dig_value t =
     Dig(v,_) -> v
   | _ -> failwith "dig_value"
 
-(* FIXME: problem with ordnum - it excludes year, which is a problem at the end of a sentence *)
 let digit_patterns1 = [ (* FIXME: problem with redundant interpretations - the lack of spaces has to be handled in preprocessing - or in disambiguation *)
   [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"]));
   [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"]));
@@ -76,7 +75,7 @@ let digit_patterns1 = [
   [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
   [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
   [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
-  [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); (* FIXME: this should not exclude other interpretations *)
+  (* [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); *)
   [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2");
   [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3");
   [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3");
@@ -148,9 +147,9 @@ let digit_patterns3 = [
   [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1" | _ -> failwith "digit_patterns22");
+  [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22");
+  [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22");
+  [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22");
@@ -159,9 +158,6 @@ let digit_patterns3 = [
   [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:dat:f" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:inst:f" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:loc:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22");
@@ -169,9 +165,9 @@ let digit_patterns3 = [
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1" | _ -> failwith "digit_patterns22");
+  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22");
+  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22");
+  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22");
@@ -180,9 +176,6 @@ let digit_patterns3 = [
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22");
   [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:dat:f" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:inst:f" | _ -> failwith "digit_patterns22");
-  [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:loc:f" | _ -> failwith "digit_patterns22");
   ]
 
 let url_patterns1 = [
@@ -299,6 +292,7 @@ let match_token = function
   | CL, CapLetter _ -> true
   | CL, AllCap _ -> true
   | CL, SomeCap _ -> true
+  | I pat, Interp s -> pat = s
   | _ -> false
 
 let rec find_first_token matching pat = function
@@ -381,6 +375,92 @@ let find_abr_patterns patterns tokens =
   find_abr_pattern (Xlist.map patterns (fun (pattern,command) ->
     {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens
 
+
+exception PatternFound
+
+let query_beg_patterns = [
+  [I "<query>";I "<sentence>"];
+  [I "<query>";I "„s";I "<sentence>"];
+  [I "<query>";I "<or>";I "<sentence>"];
+  ]
+
+let query_end_patterns = [
+  [I "</sentence>";I "</query>"];
+  [I "</sentence>";I "”s";I "</query>"];
+  ]
+
+let find_beg_pattern pattern tokens =
+  try
+    let _ = find_pattern_tail [{prefix=[]; matched=[]; suffix=[];
+      pattern=pattern; command=(fun _ -> raise PatternFound);
+      command_abr=(fun _ -> [])}] tokens in false
+  with PatternFound -> true | Not_found -> false
+
+let replace_beg_pattern pattern command tokens =
+  try
+    let t,l = find_abr_pattern_tail [{prefix=[]; matched=[]; suffix=[];
+      pattern=pattern; command=(fun _ -> Symbol "");
+      command_abr=command}] tokens in
+    t :: l
+  with Not_found -> failwith "replace_beg_pattern"
+
+(* let s_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"}
+let c_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"} *)
+let s_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</sentence>"}
+let c_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</clause>"}
+
+let add_sentence_beg = function
+    [q;t] -> let next=t.next in [Token q;Token{t with len=t.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)]
+  | [q] -> let next=q.next in [Token{q with len=q.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)]
+  | _ -> failwith "add_sentence_beg"
+
+let add_sentence_end = function
+    [q;t] -> let beg=t.beg in [Token q;Token{t with len=t.len-2;beg=beg+2};s_end (beg+1);c_end beg]
+  | [q] -> let beg=q.beg in [Token{q with len=q.len-2;beg=beg+2};s_end (beg+1);c_end beg]
+  | _ -> failwith "add_sentence_end"
+
+let rec revert_tokens = function
+    Token t -> Token t
+  | Seq l -> Seq(Xlist.rev_map l revert_tokens)
+  | Variant l -> Variant(Xlist.map l revert_tokens)
+
+let manage_query_boundaries tokens =
+  let b =
+    try
+      let _ = find_pattern_tail (Xlist.map query_beg_patterns (fun pattern ->
+        {prefix=[]; matched=[]; suffix=[];
+         pattern=pattern; command=(fun _ -> raise PatternFound);
+         command_abr=(fun _ -> [])})) tokens in false
+    with PatternFound -> true | Not_found -> false in
+  (if b then print_endline "sentence beg found" else print_endline "sentence beg not found");
+  let tokens =
+    if find_beg_pattern [I "<query>";I "„s"] tokens then
+      if find_beg_pattern [I "<query>";I "„s";I "<sentence>"] tokens then tokens else
+      replace_beg_pattern [I "<query>";I "„s"] add_sentence_beg tokens else
+    if find_beg_pattern [I "<query>";I "<or>"] tokens then
+      if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else
+      replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else
+    if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else
+    replace_beg_pattern [I "<query>"] add_sentence_beg tokens in
+  let b =
+    try
+      let _ = find_pattern (Xlist.map query_end_patterns (fun pattern ->
+        {prefix=[]; matched=[]; suffix=[];
+         pattern=pattern; command=(fun _ -> raise PatternFound);
+         command_abr=(fun _ -> [])})) [] tokens in false
+    with PatternFound -> true in
+  (if b then print_endline "sentence end found" else print_endline "sentence end not found");
+  let tokens = Xlist.rev_map tokens revert_tokens in
+  let tokens =
+    if find_beg_pattern [I "</query>";I "”s"] tokens then
+      if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
+      replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
+    if find_beg_pattern [I "</query>";I "</sentence>"] tokens then tokens else
+    replace_beg_pattern [I "</query>"] add_sentence_end tokens in
+  let tokens = Xlist.rev_map tokens revert_tokens in
+  tokens
+
+
 let find_replacement_patterns tokens =
   let tokens = find_patterns digit_patterns1 tokens in
   let tokens = normalize_tokens [] tokens in
tokenizer/ENIAMtokenizer.ml
@@ -29,6 +29,8 @@ let parse query =
   let l = ENIAMpatterns.normalize_tokens [] l in
   let l = ENIAMpatterns.find_replacement_patterns l in
   let l = ENIAMpatterns.remove_spaces [] l in
+  let l = ENIAMpatterns.manage_query_boundaries l in
   let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in
+  (* let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns l in *)
   let l = ENIAMpatterns.normalize_tokens [] l in
   l
tokenizer/ENIAMtokenizerTypes.ml
@@ -60,7 +60,7 @@ type tokens =
   | Variant of tokens list
   | Seq of tokens list
 
-type pat = L | CL | D of string | C of string | S of string | RD of string | O of string
+type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | I of string
 
 let empty_token_env = {
   orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
tokenizer/test.ml
@@ -36,21 +36,30 @@ let test_strings = [
   "Ala 22-25 .";
   "Ala 22.5.2000-25.5.2001 .";
   "Szpak frunie.";*)
-  "Kot miauczy.";
+  (* "Kot miauczy."; *)
   (* "Np. Ala.";*)
-  "w. dom.";
+  (* "w. dom.";
   "tzn.";
-  "c.d.n.";
+  "c.d.n."; *)
   (* "Arabia Saudyjska biegnie.";
   "Cauchy'ego ONZ-owska biegnie.";*)
-  "TE-cie E-e.";
+  (* "TE-cie E-e.";
   "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE.";
-  "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi";
+  "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *)
   (* "Tom idzie.";*)
-  "Miałem miał.";
+  (* "Miałem miał."; *)
   (* "Szpak śpiewa.";
   "Ala ma kota.";
   "Ale mają kota:"*)
+  "Matura.";
+  "matura";
+  "„Matura.”";
+  "„Matura”.";
+  "„matura”";
+  "- matura";
+  "- Matura";
+  "2 jabłka";
+  "- 2 jabłka";
   ]
 
 let _ =