Commit 766cb2a4a06f1150cce4d4ed7ce917ab8fbe9598

Authored by Wojciech Jaworski
1 parent 40942928

ordnum i końce zdań na końcach tekstów

subsyntax/ENIAM_MWE.ml
@@ -210,6 +210,14 @@ let get_orths paths =
210 Xlist.fold l orths (fun orths t -> 210 Xlist.fold l orths (fun orths t ->
211 StringSet.add orths (ENIAMtokens.get_orth t.token)))) 211 StringSet.add orths (ENIAMtokens.get_orth t.token))))
212 212
  213 +let get_intnum_orths paths =
  214 + IntMap.fold paths StringMap.empty (fun orths _ map ->
  215 + IntMap.fold map orths (fun orths _ l ->
  216 + Xlist.fold l orths (fun orths t ->
  217 + match t.token with
  218 + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
  219 + | _ -> orths)))
  220 +
213 let rec match_path_rec map found (t:token_env) rev = function 221 let rec match_path_rec map found (t:token_env) rev = function
214 [] -> (t :: rev) :: found 222 [] -> (t :: rev) :: found
215 | s :: l -> 223 | s :: l ->
@@ -261,10 +269,19 @@ let apply_rule paths (match_list,lemma,interp) =
261 add_token paths token 269 add_token paths token
262 with Not_found -> paths) 270 with Not_found -> paths)
263 271
  272 +(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *)
  273 +let add_ordnum_rules rules paths =
  274 + let orths = get_intnum_orths paths in
  275 + StringMap.fold orths rules (fun rules orth lemmas ->
  276 + StringSet.fold lemmas rules (fun rules lemma ->
  277 + (* Printf.printf "%s %s\n%!" orth lemma; *)
  278 + ([orth;"."],lemma,"ordnum") :: rules))
  279 +
264 let process (paths,last) = 280 let process (paths,last) =
265 let paths = Xlist.fold paths IntMap.empty add_token in 281 let paths = Xlist.fold paths IntMap.empty add_token in
266 let orths = get_orths paths in 282 let orths = get_orths paths in
267 let rules = preselect_dict orths mwe_dict in 283 let rules = preselect_dict orths mwe_dict in
  284 + let rules = add_ordnum_rules rules paths in
268 let paths = Xlist.fold rules paths apply_rule in 285 let paths = Xlist.fold rules paths apply_rule in
269 let paths = IntMap.fold paths [] (fun paths _ map -> 286 let paths = IntMap.fold paths [] (fun paths _ map ->
270 IntMap.fold map paths (fun paths _ l -> 287 IntMap.fold map paths (fun paths _ l ->
subsyntax/test.ml
@@ -19,21 +19,25 @@
19 19
20 20
21 let test_strings = [ 21 let test_strings = [
22 - "Szpak frunie."; 22 + (* "Szpak frunie.";
23 "Kot np. miauczy."; 23 "Kot np. miauczy.";
24 "Ala ma kota."; 24 "Ala ma kota.";
25 - "Ale mają kota:" 25 + "Ale mają kota:" *)
26 (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) 26 (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
27 (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *) 27 (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *)
28 (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *) 28 (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *)
29 (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *) 29 (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *)
30 (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *) 30 (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *)
  31 + (* "Dyplom uzyskał w 1994.";
  32 + "dyplom uzyskał w 1994"; *)
31 ] 33 ]
32 34
33 let test_strings2 = [ 35 let test_strings2 = [
34 - "Szpak frunie. Kot miauczy.";  
35 - "Szpak powiedział: „Frunę. Śpiewam.”"; 36 + (* "Szpak frunie. Kot miauczy.";
  37 + "Szpak powiedział: „Frunę. Śpiewam.”"; *)
36 (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *) 38 (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
  39 + "Dyplom uzyskał w 1994.";
  40 + "dyplom uzyskał w 1994";
37 ] 41 ]
38 42
39 let _ = 43 let _ =
tokenizer/ENIAMacronyms.ml
@@ -864,3 +864,6 @@ let abr_patterns = [
864 [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns"); 864 [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns");
865 [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns"); 865 [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns");
866 ] 866 ]
  867 +
  868 +(* let query_patterns = [
  869 + [I "<query>"; S "."; O "u"; S "."], (function [a;b;c;d] -> [ct [a;b] "bez" "prep:gen:nwok"; ct [c;d] "uwaga" "subst:pl:gen:f"] | _ -> failwith "abr_patterns"); *)
tokenizer/ENIAMpatterns.ml
@@ -63,7 +63,6 @@ let dig_value t =
63 Dig(v,_) -> v 63 Dig(v,_) -> v
64 | _ -> failwith "dig_value" 64 | _ -> failwith "dig_value"
65 65
66 -(* FIXME: problem z ordnum - wyklucza year co stanowi problem na końcu zdania *)  
67 let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeba uwzględnić w preprocesingu brak spacji - albo w dezambiguacji *) 66 let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeba uwzględnić w preprocesingu brak spacji - albo w dezambiguacji *)
68 [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); 67 [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"]));
69 [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); 68 [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"]));
@@ -76,7 +75,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
76 [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); 75 [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
77 [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); 76 [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
78 [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); 77 [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
79 - [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); (* FIXME: to nie powinno wykluczać innych interpretacji *) 78 + (* [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); *)
80 [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2"); 79 [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2");
81 [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); 80 [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3");
82 [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); 81 [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3");
@@ -148,9 +147,9 @@ let digit_patterns3 = [
148 [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); 147 [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22");
149 [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); 148 [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22");
150 [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); 149 [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22");
151 - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1" | _ -> failwith "digit_patterns22");  
152 - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1" | _ -> failwith "digit_patterns22");  
153 - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1" | _ -> failwith "digit_patterns22"); 150 + [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22");
  151 + [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22");
  152 + [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22");
154 [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); 153 [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22");
155 [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); 154 [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22");
156 [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); 155 [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22");
@@ -159,9 +158,6 @@ let digit_patterns3 = [
159 [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); 158 [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22");
160 [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); 159 [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22");
161 [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); 160 [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22");
162 - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:dat:f" | _ -> failwith "digit_patterns22");  
163 - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:inst:f" | _ -> failwith "digit_patterns22");  
164 - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:loc:f" | _ -> failwith "digit_patterns22");  
165 [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22"); 161 [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22");
166 [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22"); 162 [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22");
167 [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22"); 163 [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22");
@@ -169,9 +165,9 @@ let digit_patterns3 = [
169 [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); 165 [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22");
170 [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); 166 [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22");
171 [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); 167 [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22");
172 - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1" | _ -> failwith "digit_patterns22");  
173 - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1" | _ -> failwith "digit_patterns22");  
174 - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1" | _ -> failwith "digit_patterns22"); 168 + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22");
  169 + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22");
  170 + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22");
175 [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); 171 [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22");
176 [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); 172 [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22");
177 [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); 173 [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22");
@@ -180,9 +176,6 @@ let digit_patterns3 = [
180 [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); 176 [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22");
181 [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); 177 [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22");
182 [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); 178 [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22");
183 - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:dat:f" | _ -> failwith "digit_patterns22");  
184 - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:inst:f" | _ -> failwith "digit_patterns22");  
185 - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:loc:f" | _ -> failwith "digit_patterns22");  
186 ] 179 ]
187 180
188 let url_patterns1 = [ 181 let url_patterns1 = [
@@ -299,6 +292,7 @@ let match_token = function
299 | CL, CapLetter _ -> true 292 | CL, CapLetter _ -> true
300 | CL, AllCap _ -> true 293 | CL, AllCap _ -> true
301 | CL, SomeCap _ -> true 294 | CL, SomeCap _ -> true
  295 + | I pat, Interp s -> pat = s
302 | _ -> false 296 | _ -> false
303 297
304 let rec find_first_token matching pat = function 298 let rec find_first_token matching pat = function
@@ -381,6 +375,92 @@ let find_abr_patterns patterns tokens =
381 find_abr_pattern (Xlist.map patterns (fun (pattern,command) -> 375 find_abr_pattern (Xlist.map patterns (fun (pattern,command) ->
382 {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens 376 {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens
383 377
  378 +
  379 +exception PatternFound
  380 +
  381 +let query_beg_patterns = [
  382 + [I "<query>";I "<sentence>"];
  383 + [I "<query>";I "„s";I "<sentence>"];
  384 + [I "<query>";I "<or>";I "<sentence>"];
  385 + ]
  386 +
  387 +let query_end_patterns = [
  388 + [I "</sentence>";I "</query>"];
  389 + [I "</sentence>";I "”s";I "</query>"];
  390 + ]
  391 +
  392 +let find_beg_pattern pattern tokens =
  393 + try
  394 + let _ = find_pattern_tail [{prefix=[]; matched=[]; suffix=[];
  395 + pattern=pattern; command=(fun _ -> raise PatternFound);
  396 + command_abr=(fun _ -> [])}] tokens in false
  397 + with PatternFound -> true | Not_found -> false
  398 +
  399 +let replace_beg_pattern pattern command tokens =
  400 + try
  401 + let t,l = find_abr_pattern_tail [{prefix=[]; matched=[]; suffix=[];
  402 + pattern=pattern; command=(fun _ -> Symbol "");
  403 + command_abr=command}] tokens in
  404 + t :: l
  405 + with Not_found -> failwith "replace_beg_pattern"
  406 +
  407 +(* let s_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"}
  408 +let c_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"} *)
  409 +let s_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</sentence>"}
  410 +let c_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</clause>"}
  411 +
  412 +let add_sentence_beg = function
  413 + [q;t] -> let next=t.next in [Token q;Token{t with len=t.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)]
  414 + | [q] -> let next=q.next in [Token{q with len=q.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)]
  415 + | _ -> failwith "add_sentence_beg"
  416 +
  417 +let add_sentence_end = function
  418 + [q;t] -> let beg=t.beg in [Token q;Token{t with len=t.len-2;beg=beg+2};s_end (beg+1);c_end beg]
  419 + | [q] -> let beg=q.beg in [Token{q with len=q.len-2;beg=beg+2};s_end (beg+1);c_end beg]
  420 + | _ -> failwith "add_sentence_end"
  421 +
  422 +let rec revert_tokens = function
  423 + Token t -> Token t
  424 + | Seq l -> Seq(Xlist.rev_map l revert_tokens)
  425 + | Variant l -> Variant(Xlist.map l revert_tokens)
  426 +
  427 +let manage_query_boundaries tokens =
  428 + let b =
  429 + try
  430 + let _ = find_pattern_tail (Xlist.map query_beg_patterns (fun pattern ->
  431 + {prefix=[]; matched=[]; suffix=[];
  432 + pattern=pattern; command=(fun _ -> raise PatternFound);
  433 + command_abr=(fun _ -> [])})) tokens in false
  434 + with PatternFound -> true | Not_found -> false in
  435 + (if b then print_endline "sentence beg found" else print_endline "sentence beg not found");
  436 + let tokens =
  437 + if find_beg_pattern [I "<query>";I "„s"] tokens then
  438 + if find_beg_pattern [I "<query>";I "„s";I "<sentence>"] tokens then tokens else
  439 + replace_beg_pattern [I "<query>";I "„s"] add_sentence_beg tokens else
  440 + if find_beg_pattern [I "<query>";I "<or>"] tokens then
  441 + if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else
  442 + replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else
  443 + if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else
  444 + replace_beg_pattern [I "<query>"] add_sentence_beg tokens in
  445 + let b =
  446 + try
  447 + let _ = find_pattern (Xlist.map query_end_patterns (fun pattern ->
  448 + {prefix=[]; matched=[]; suffix=[];
  449 + pattern=pattern; command=(fun _ -> raise PatternFound);
  450 + command_abr=(fun _ -> [])})) [] tokens in false
  451 + with PatternFound -> true in
  452 + (if b then print_endline "sentence end found" else print_endline "sentence end not found");
  453 + let tokens = Xlist.rev_map tokens revert_tokens in
  454 + let tokens =
  455 + if find_beg_pattern [I "</query>";I "”s"] tokens then
  456 + if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
  457 + replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
  458 + if find_beg_pattern [I "</query>";I "</sentence>"] tokens then tokens else
  459 + replace_beg_pattern [I "</query>"] add_sentence_end tokens in
  460 + let tokens = Xlist.rev_map tokens revert_tokens in
  461 + tokens
  462 +
  463 +
384 let find_replacement_patterns tokens = 464 let find_replacement_patterns tokens =
385 let tokens = find_patterns digit_patterns1 tokens in 465 let tokens = find_patterns digit_patterns1 tokens in
386 let tokens = normalize_tokens [] tokens in 466 let tokens = normalize_tokens [] tokens in
tokenizer/ENIAMtokenizer.ml
@@ -29,6 +29,8 @@ let parse query =
29 let l = ENIAMpatterns.normalize_tokens [] l in 29 let l = ENIAMpatterns.normalize_tokens [] l in
30 let l = ENIAMpatterns.find_replacement_patterns l in 30 let l = ENIAMpatterns.find_replacement_patterns l in
31 let l = ENIAMpatterns.remove_spaces [] l in 31 let l = ENIAMpatterns.remove_spaces [] l in
  32 + let l = ENIAMpatterns.manage_query_boundaries l in
32 let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in 33 let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in
  34 + (* let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns l in *)
33 let l = ENIAMpatterns.normalize_tokens [] l in 35 let l = ENIAMpatterns.normalize_tokens [] l in
34 l 36 l
tokenizer/ENIAMtokenizerTypes.ml
@@ -60,7 +60,7 @@ type tokens =
60 | Variant of tokens list 60 | Variant of tokens list
61 | Seq of tokens list 61 | Seq of tokens list
62 62
63 -type pat = L | CL | D of string | C of string | S of string | RD of string | O of string 63 +type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | I of string
64 64
65 let empty_token_env = { 65 let empty_token_env = {
66 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} 66 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
tokenizer/test.ml
@@ -36,21 +36,30 @@ let test_strings = [
36 "Ala 22-25 ."; 36 "Ala 22-25 .";
37 "Ala 22.5.2000-25.5.2001 ."; 37 "Ala 22.5.2000-25.5.2001 .";
38 "Szpak frunie.";*) 38 "Szpak frunie.";*)
39 - "Kot miauczy."; 39 + (* "Kot miauczy."; *)
40 (* "Np. Ala.";*) 40 (* "Np. Ala.";*)
41 - "w. dom."; 41 + (* "w. dom.";
42 "tzn."; 42 "tzn.";
43 - "c.d.n."; 43 + "c.d.n."; *)
44 (* "Arabia Saudyjska biegnie."; 44 (* "Arabia Saudyjska biegnie.";
45 "Cauchy'ego ONZ-owska biegnie.";*) 45 "Cauchy'ego ONZ-owska biegnie.";*)
46 - "TE-cie E-e."; 46 + (* "TE-cie E-e.";
47 "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE."; 47 "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE.";
48 - "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; 48 + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *)
49 (* "Tom idzie.";*) 49 (* "Tom idzie.";*)
50 - "Miałem miał."; 50 + (* "Miałem miał."; *)
51 (* "Szpak śpiewa."; 51 (* "Szpak śpiewa.";
52 "Ala ma kota."; 52 "Ala ma kota.";
53 "Ale mają kota:"*) 53 "Ale mają kota:"*)
  54 + "Matura.";
  55 + "matura";
  56 + "„Matura.”";
  57 + "„Matura”.";
  58 + "„matura”";
  59 + "- matura";
  60 + "- Matura";
  61 + "2 jabłka";
  62 + "- 2 jabłka";
54 ] 63 ]
55 64
56 let _ = 65 let _ =