Commit 766cb2a4a06f1150cce4d4ed7ce917ab8fbe9598
1 parent: 40942928
ordnum i końce zdań na końcach tekstów
Showing 7 changed files with 140 additions and 25 deletions
subsyntax/ENIAM_MWE.ml
@@ -210,6 +210,14 @@ let get_orths paths = | @@ -210,6 +210,14 @@ let get_orths paths = | ||
210 | Xlist.fold l orths (fun orths t -> | 210 | Xlist.fold l orths (fun orths t -> |
211 | StringSet.add orths (ENIAMtokens.get_orth t.token)))) | 211 | StringSet.add orths (ENIAMtokens.get_orth t.token)))) |
212 | 212 | ||
213 | +let get_intnum_orths paths = | ||
214 | + IntMap.fold paths StringMap.empty (fun orths _ map -> | ||
215 | + IntMap.fold map orths (fun orths _ l -> | ||
216 | + Xlist.fold l orths (fun orths t -> | ||
217 | + match t.token with | ||
218 | + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) | ||
219 | + | _ -> orths))) | ||
220 | + | ||
213 | let rec match_path_rec map found (t:token_env) rev = function | 221 | let rec match_path_rec map found (t:token_env) rev = function |
214 | [] -> (t :: rev) :: found | 222 | [] -> (t :: rev) :: found |
215 | | s :: l -> | 223 | | s :: l -> |
@@ -261,10 +269,19 @@ let apply_rule paths (match_list,lemma,interp) = | @@ -261,10 +269,19 @@ let apply_rule paths (match_list,lemma,interp) = | ||
261 | add_token paths token | 269 | add_token paths token |
262 | with Not_found -> paths) | 270 | with Not_found -> paths) |
263 | 271 | ||
272 | +(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *) | ||
273 | +let add_ordnum_rules rules paths = | ||
274 | + let orths = get_intnum_orths paths in | ||
275 | + StringMap.fold orths rules (fun rules orth lemmas -> | ||
276 | + StringSet.fold lemmas rules (fun rules lemma -> | ||
277 | + (* Printf.printf "%s %s\n%!" orth lemma; *) | ||
278 | + ([orth;"."],lemma,"ordnum") :: rules)) | ||
279 | + | ||
264 | let process (paths,last) = | 280 | let process (paths,last) = |
265 | let paths = Xlist.fold paths IntMap.empty add_token in | 281 | let paths = Xlist.fold paths IntMap.empty add_token in |
266 | let orths = get_orths paths in | 282 | let orths = get_orths paths in |
267 | let rules = preselect_dict orths mwe_dict in | 283 | let rules = preselect_dict orths mwe_dict in |
284 | + let rules = add_ordnum_rules rules paths in | ||
268 | let paths = Xlist.fold rules paths apply_rule in | 285 | let paths = Xlist.fold rules paths apply_rule in |
269 | let paths = IntMap.fold paths [] (fun paths _ map -> | 286 | let paths = IntMap.fold paths [] (fun paths _ map -> |
270 | IntMap.fold map paths (fun paths _ l -> | 287 | IntMap.fold map paths (fun paths _ l -> |
subsyntax/test.ml
@@ -19,21 +19,25 @@ | @@ -19,21 +19,25 @@ | ||
19 | 19 | ||
20 | 20 | ||
21 | let test_strings = [ | 21 | let test_strings = [ |
22 | - "Szpak frunie."; | 22 | + (* "Szpak frunie."; |
23 | "Kot np. miauczy."; | 23 | "Kot np. miauczy."; |
24 | "Ala ma kota."; | 24 | "Ala ma kota."; |
25 | - "Ale mają kota:" | 25 | + "Ale mają kota:" *) |
26 | (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) | 26 | (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) |
27 | (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *) | 27 | (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *) |
28 | (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *) | 28 | (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *) |
29 | (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *) | 29 | (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *) |
30 | (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *) | 30 | (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *) |
31 | + (* "Dyplom uzyskał w 1994."; | ||
32 | + "dyplom uzyskał w 1994"; *) | ||
31 | ] | 33 | ] |
32 | 34 | ||
33 | let test_strings2 = [ | 35 | let test_strings2 = [ |
34 | - "Szpak frunie. Kot miauczy."; | ||
35 | - "Szpak powiedział: „Frunę. Śpiewam.”"; | 36 | + (* "Szpak frunie. Kot miauczy."; |
37 | + "Szpak powiedział: „Frunę. Śpiewam.”"; *) | ||
36 | (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *) | 38 | (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *) |
39 | + "Dyplom uzyskał w 1994."; | ||
40 | + "dyplom uzyskał w 1994"; | ||
37 | ] | 41 | ] |
38 | 42 | ||
39 | let _ = | 43 | let _ = |
tokenizer/ENIAMacronyms.ml
@@ -864,3 +864,6 @@ let abr_patterns = [ | @@ -864,3 +864,6 @@ let abr_patterns = [ | ||
864 | [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns"); | 864 | [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns"); |
865 | [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns"); | 865 | [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns"); |
866 | ] | 866 | ] |
867 | + | ||
868 | +(* let query_patterns = [ | ||
869 | + [I "<query>"; S "."; O "u"; S "."], (function [a;b;c;d] -> [ct [a;b] "bez" "prep:gen:nwok"; ct [c;d] "uwaga" "subst:pl:gen:f"] | _ -> failwith "abr_patterns"); *) |
tokenizer/ENIAMpatterns.ml
@@ -63,7 +63,6 @@ let dig_value t = | @@ -63,7 +63,6 @@ let dig_value t = | ||
63 | Dig(v,_) -> v | 63 | Dig(v,_) -> v |
64 | | _ -> failwith "dig_value" | 64 | | _ -> failwith "dig_value" |
65 | 65 | ||
66 | -(* FIXME: problem z ordnum - wyklucza year co stanowi problem na końcu zdania *) | ||
67 | let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeba uwzględnić w preprocesingu brak spacji - albo w dezambiguacji *) | 66 | let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeba uwzględnić w preprocesingu brak spacji - albo w dezambiguacji *) |
68 | [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); | 67 | [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); |
69 | [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); | 68 | [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); |
@@ -76,7 +75,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb | @@ -76,7 +75,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb | ||
76 | [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); | 75 | [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); |
77 | [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); | 76 | [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); |
78 | [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); | 77 | [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); |
79 | - [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); (* FIXME: to nie powinno wykluczać innych interpretacji *) | 78 | + (* [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); *) |
80 | [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2"); | 79 | [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2"); |
81 | [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); | 80 | [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); |
82 | [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); | 81 | [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); |
@@ -148,9 +147,9 @@ let digit_patterns3 = [ | @@ -148,9 +147,9 @@ let digit_patterns3 = [ | ||
148 | [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); | 147 | [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); |
149 | [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); | 148 | [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); |
150 | [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); | 149 | [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); |
151 | - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1" | _ -> failwith "digit_patterns22"); | ||
152 | - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1" | _ -> failwith "digit_patterns22"); | ||
153 | - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1" | _ -> failwith "digit_patterns22"); | 150 | + [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22"); |
151 | + [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22"); | ||
152 | + [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22"); | ||
154 | [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); | 153 | [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); |
155 | [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); | 154 | [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); |
156 | [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); | 155 | [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); |
@@ -159,9 +158,6 @@ let digit_patterns3 = [ | @@ -159,9 +158,6 @@ let digit_patterns3 = [ | ||
159 | [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); | 158 | [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); |
160 | [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); | 159 | [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); |
161 | [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); | 160 | [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); |
162 | - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:dat:f" | _ -> failwith "digit_patterns22"); | ||
163 | - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:inst:f" | _ -> failwith "digit_patterns22"); | ||
164 | - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:loc:f" | _ -> failwith "digit_patterns22"); | ||
165 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22"); | 161 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22"); |
166 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22"); | 162 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22"); |
167 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22"); | 163 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22"); |
@@ -169,9 +165,9 @@ let digit_patterns3 = [ | @@ -169,9 +165,9 @@ let digit_patterns3 = [ | ||
169 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); | 165 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); |
170 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); | 166 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); |
171 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); | 167 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); |
172 | - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1" | _ -> failwith "digit_patterns22"); | ||
173 | - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1" | _ -> failwith "digit_patterns22"); | ||
174 | - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1" | _ -> failwith "digit_patterns22"); | 168 | + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22"); |
169 | + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22"); | ||
170 | + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22"); | ||
175 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); | 171 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); |
176 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); | 172 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); |
177 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); | 173 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); |
@@ -180,9 +176,6 @@ let digit_patterns3 = [ | @@ -180,9 +176,6 @@ let digit_patterns3 = [ | ||
180 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); | 176 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); |
181 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); | 177 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); |
182 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); | 178 | [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); |
183 | - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:dat:f" | _ -> failwith "digit_patterns22"); | ||
184 | - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:inst:f" | _ -> failwith "digit_patterns22"); | ||
185 | - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:loc:f" | _ -> failwith "digit_patterns22"); | ||
186 | ] | 179 | ] |
187 | 180 | ||
188 | let url_patterns1 = [ | 181 | let url_patterns1 = [ |
@@ -299,6 +292,7 @@ let match_token = function | @@ -299,6 +292,7 @@ let match_token = function | ||
299 | | CL, CapLetter _ -> true | 292 | | CL, CapLetter _ -> true |
300 | | CL, AllCap _ -> true | 293 | | CL, AllCap _ -> true |
301 | | CL, SomeCap _ -> true | 294 | | CL, SomeCap _ -> true |
295 | + | I pat, Interp s -> pat = s | ||
302 | | _ -> false | 296 | | _ -> false |
303 | 297 | ||
304 | let rec find_first_token matching pat = function | 298 | let rec find_first_token matching pat = function |
@@ -381,6 +375,92 @@ let find_abr_patterns patterns tokens = | @@ -381,6 +375,92 @@ let find_abr_patterns patterns tokens = | ||
381 | find_abr_pattern (Xlist.map patterns (fun (pattern,command) -> | 375 | find_abr_pattern (Xlist.map patterns (fun (pattern,command) -> |
382 | {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens | 376 | {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens |
383 | 377 | ||
378 | + | ||
379 | +exception PatternFound | ||
380 | + | ||
381 | +let query_beg_patterns = [ | ||
382 | + [I "<query>";I "<sentence>"]; | ||
383 | + [I "<query>";I "„s";I "<sentence>"]; | ||
384 | + [I "<query>";I "<or>";I "<sentence>"]; | ||
385 | + ] | ||
386 | + | ||
387 | +let query_end_patterns = [ | ||
388 | + [I "</sentence>";I "</query>"]; | ||
389 | + [I "</sentence>";I "”s";I "</query>"]; | ||
390 | + ] | ||
391 | + | ||
392 | +let find_beg_pattern pattern tokens = | ||
393 | + try | ||
394 | + let _ = find_pattern_tail [{prefix=[]; matched=[]; suffix=[]; | ||
395 | + pattern=pattern; command=(fun _ -> raise PatternFound); | ||
396 | + command_abr=(fun _ -> [])}] tokens in false | ||
397 | + with PatternFound -> true | Not_found -> false | ||
398 | + | ||
399 | +let replace_beg_pattern pattern command tokens = | ||
400 | + try | ||
401 | + let t,l = find_abr_pattern_tail [{prefix=[]; matched=[]; suffix=[]; | ||
402 | + pattern=pattern; command=(fun _ -> Symbol ""); | ||
403 | + command_abr=command}] tokens in | ||
404 | + t :: l | ||
405 | + with Not_found -> failwith "replace_beg_pattern" | ||
406 | + | ||
407 | +(* let s_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"} | ||
408 | +let c_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"} *) | ||
409 | +let s_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</sentence>"} | ||
410 | +let c_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</clause>"} | ||
411 | + | ||
412 | +let add_sentence_beg = function | ||
413 | + [q;t] -> let next=t.next in [Token q;Token{t with len=t.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)] | ||
414 | + | [q] -> let next=q.next in [Token{q with len=q.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)] | ||
415 | + | _ -> failwith "add_sentence_beg" | ||
416 | + | ||
417 | +let add_sentence_end = function | ||
418 | + [q;t] -> let beg=t.beg in [Token q;Token{t with len=t.len-2;beg=beg+2};s_end (beg+1);c_end beg] | ||
419 | + | [q] -> let beg=q.beg in [Token{q with len=q.len-2;beg=beg+2};s_end (beg+1);c_end beg] | ||
420 | + | _ -> failwith "add_sentence_end" | ||
421 | + | ||
422 | +let rec revert_tokens = function | ||
423 | + Token t -> Token t | ||
424 | + | Seq l -> Seq(Xlist.rev_map l revert_tokens) | ||
425 | + | Variant l -> Variant(Xlist.map l revert_tokens) | ||
426 | + | ||
427 | +let manage_query_boundaries tokens = | ||
428 | + let b = | ||
429 | + try | ||
430 | + let _ = find_pattern_tail (Xlist.map query_beg_patterns (fun pattern -> | ||
431 | + {prefix=[]; matched=[]; suffix=[]; | ||
432 | + pattern=pattern; command=(fun _ -> raise PatternFound); | ||
433 | + command_abr=(fun _ -> [])})) tokens in false | ||
434 | + with PatternFound -> true | Not_found -> false in | ||
435 | + (if b then print_endline "sentence beg found" else print_endline "sentence beg not found"); | ||
436 | + let tokens = | ||
437 | + if find_beg_pattern [I "<query>";I "„s"] tokens then | ||
438 | + if find_beg_pattern [I "<query>";I "„s";I "<sentence>"] tokens then tokens else | ||
439 | + replace_beg_pattern [I "<query>";I "„s"] add_sentence_beg tokens else | ||
440 | + if find_beg_pattern [I "<query>";I "<or>"] tokens then | ||
441 | + if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else | ||
442 | + replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else | ||
443 | + if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else | ||
444 | + replace_beg_pattern [I "<query>"] add_sentence_beg tokens in | ||
445 | + let b = | ||
446 | + try | ||
447 | + let _ = find_pattern (Xlist.map query_end_patterns (fun pattern -> | ||
448 | + {prefix=[]; matched=[]; suffix=[]; | ||
449 | + pattern=pattern; command=(fun _ -> raise PatternFound); | ||
450 | + command_abr=(fun _ -> [])})) [] tokens in false | ||
451 | + with PatternFound -> true in | ||
452 | + (if b then print_endline "sentence end found" else print_endline "sentence end not found"); | ||
453 | + let tokens = Xlist.rev_map tokens revert_tokens in | ||
454 | + let tokens = | ||
455 | + if find_beg_pattern [I "</query>";I "”s"] tokens then | ||
456 | + if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else | ||
457 | + replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else | ||
458 | + if find_beg_pattern [I "</query>";I "</sentence>"] tokens then tokens else | ||
459 | + replace_beg_pattern [I "</query>"] add_sentence_end tokens in | ||
460 | + let tokens = Xlist.rev_map tokens revert_tokens in | ||
461 | + tokens | ||
462 | + | ||
463 | + | ||
384 | let find_replacement_patterns tokens = | 464 | let find_replacement_patterns tokens = |
385 | let tokens = find_patterns digit_patterns1 tokens in | 465 | let tokens = find_patterns digit_patterns1 tokens in |
386 | let tokens = normalize_tokens [] tokens in | 466 | let tokens = normalize_tokens [] tokens in |
tokenizer/ENIAMtokenizer.ml
@@ -29,6 +29,8 @@ let parse query = | @@ -29,6 +29,8 @@ let parse query = | ||
29 | let l = ENIAMpatterns.normalize_tokens [] l in | 29 | let l = ENIAMpatterns.normalize_tokens [] l in |
30 | let l = ENIAMpatterns.find_replacement_patterns l in | 30 | let l = ENIAMpatterns.find_replacement_patterns l in |
31 | let l = ENIAMpatterns.remove_spaces [] l in | 31 | let l = ENIAMpatterns.remove_spaces [] l in |
32 | + let l = ENIAMpatterns.manage_query_boundaries l in | ||
32 | let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in | 33 | let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in |
34 | + (* let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns l in *) | ||
33 | let l = ENIAMpatterns.normalize_tokens [] l in | 35 | let l = ENIAMpatterns.normalize_tokens [] l in |
34 | l | 36 | l |
tokenizer/ENIAMtokenizerTypes.ml
@@ -60,7 +60,7 @@ type tokens = | @@ -60,7 +60,7 @@ type tokens = | ||
60 | | Variant of tokens list | 60 | | Variant of tokens list |
61 | | Seq of tokens list | 61 | | Seq of tokens list |
62 | 62 | ||
63 | -type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | 63 | +type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | I of string |
64 | 64 | ||
65 | let empty_token_env = { | 65 | let empty_token_env = { |
66 | orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} | 66 | orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} |
tokenizer/test.ml
@@ -36,21 +36,30 @@ let test_strings = [ | @@ -36,21 +36,30 @@ let test_strings = [ | ||
36 | "Ala 22-25 ."; | 36 | "Ala 22-25 ."; |
37 | "Ala 22.5.2000-25.5.2001 ."; | 37 | "Ala 22.5.2000-25.5.2001 ."; |
38 | "Szpak frunie.";*) | 38 | "Szpak frunie.";*) |
39 | - "Kot miauczy."; | 39 | + (* "Kot miauczy."; *) |
40 | (* "Np. Ala.";*) | 40 | (* "Np. Ala.";*) |
41 | - "w. dom."; | 41 | + (* "w. dom."; |
42 | "tzn."; | 42 | "tzn."; |
43 | - "c.d.n."; | 43 | + "c.d.n."; *) |
44 | (* "Arabia Saudyjska biegnie."; | 44 | (* "Arabia Saudyjska biegnie."; |
45 | "Cauchy'ego ONZ-owska biegnie.";*) | 45 | "Cauchy'ego ONZ-owska biegnie.";*) |
46 | - "TE-cie E-e."; | 46 | + (* "TE-cie E-e."; |
47 | "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE."; | 47 | "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE."; |
48 | - "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; | 48 | + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *) |
49 | (* "Tom idzie.";*) | 49 | (* "Tom idzie.";*) |
50 | - "Miałem miał."; | 50 | + (* "Miałem miał."; *) |
51 | (* "Szpak śpiewa."; | 51 | (* "Szpak śpiewa."; |
52 | "Ala ma kota."; | 52 | "Ala ma kota."; |
53 | "Ale mają kota:"*) | 53 | "Ale mają kota:"*) |
54 | + "Matura."; | ||
55 | + "matura"; | ||
56 | + "„Matura.”"; | ||
57 | + "„Matura”."; | ||
58 | + "„matura”"; | ||
59 | + "- matura"; | ||
60 | + "- Matura"; | ||
61 | + "2 jabłka"; | ||
62 | + "- 2 jabłka"; | ||
54 | ] | 63 | ] |
55 | 64 | ||
56 | let _ = | 65 | let _ = |