Commit 766cb2a4a06f1150cce4d4ed7ce917ab8fbe9598

Authored by Wojciech Jaworski
1 parent 40942928

ordnum i końce zdań na końcach tekstów

subsyntax/ENIAM_MWE.ml
... ... @@ -210,6 +210,14 @@ let get_orths paths =
210 210 Xlist.fold l orths (fun orths t ->
211 211 StringSet.add orths (ENIAMtokens.get_orth t.token))))
212 212  
  (* [get_intnum_orths paths] walks the nested maps in [paths] (an IntMap
     whose values are IntMaps of token lists) and collects every token whose
     [token] field has the shape [Dig(lemma,"intnum")].
     Returns a StringMap keyed by [ENIAMtokens.get_orth t.token]; each value
     is a StringSet of the lemmas found for that orth.  Tokens of any other
     shape leave the accumulator unchanged. *)
  213 +let get_intnum_orths paths =
  214 + IntMap.fold paths StringMap.empty (fun orths _ map ->
  215 + IntMap.fold map orths (fun orths _ l ->
  216 + Xlist.fold l orths (fun orths t ->
  217 + match t.token with
  (* presumably [add_inc] stores the singleton set when the orth is new and
     otherwise applies the update function to the existing set -- TODO
     confirm against the StringMap implementation *)
  218 + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma)
  219 + | _ -> orths)))
  220 +
213 221 let rec match_path_rec map found (t:token_env) rev = function
214 222 [] -> (t :: rev) :: found
215 223 | s :: l ->
... ... @@ -261,10 +269,19 @@ let apply_rule paths (match_list,lemma,interp) =
261 269 add_token paths token
262 270 with Not_found -> paths)
263 271  
  272 +(* FIXME: the ordnum rules should match on the part of speech, not only on the orth *)
  (* [add_ordnum_rules rules paths] extends [rules] with one rule per
     (orth, lemma) pair reported by [get_intnum_orths paths].
     Each added rule is the triple ([orth; "."], lemma, "ordnum"), i.e. the
     orth followed by a full stop is to be read as an ordinal number.
     New rules are prepended; the incoming [rules] are kept as the tail. *)
  273 +let add_ordnum_rules rules paths =
  274 + let orths = get_intnum_orths paths in
  275 + StringMap.fold orths rules (fun rules orth lemmas ->
  276 + StringSet.fold lemmas rules (fun rules lemma ->
  277 + (* Printf.printf "%s %s\n%!" orth lemma; *)
  278 + ([orth;"."],lemma,"ordnum") :: rules))
  279 +
264 280 let process (paths,last) =
265 281 let paths = Xlist.fold paths IntMap.empty add_token in
266 282 let orths = get_orths paths in
267 283 let rules = preselect_dict orths mwe_dict in
  284 + let rules = add_ordnum_rules rules paths in
268 285 let paths = Xlist.fold rules paths apply_rule in
269 286 let paths = IntMap.fold paths [] (fun paths _ map ->
270 287 IntMap.fold map paths (fun paths _ l ->
... ...
subsyntax/test.ml
... ... @@ -19,21 +19,25 @@
19 19  
20 20  
21 21 let test_strings = [
22   - "Szpak frunie.";
  22 + (* "Szpak frunie.";
23 23 "Kot np. miauczy.";
24 24 "Ala ma kota.";
25   - "Ale mają kota:"
  25 + "Ale mają kota:" *)
26 26 (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
27 27 (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *)
28 28 (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *)
29 29 (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *)
30 30 (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *)
  31 + (* "Dyplom uzyskał w 1994.";
  32 + "dyplom uzyskał w 1994"; *)
31 33 ]
32 34  
33 35 let test_strings2 = [
34   - "Szpak frunie. Kot miauczy.";
35   - "Szpak powiedział: „Frunę. Śpiewam.”";
  36 + (* "Szpak frunie. Kot miauczy.";
  37 + "Szpak powiedział: „Frunę. Śpiewam.”"; *)
36 38 (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. Mazowsze było wtedy samodzielnym księstwem."; *)
  39 + "Dyplom uzyskał w 1994.";
  40 + "dyplom uzyskał w 1994";
37 41 ]
38 42  
39 43 let _ =
... ...
tokenizer/ENIAMacronyms.ml
... ... @@ -864,3 +864,6 @@ let abr_patterns = [
864 864 [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns");
865 865 [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns");
866 866 ]
  867 +
  868 +(* let query_patterns = [
  869 + [I "<query>"; S "."; O "u"; S "."], (function [a;b;c;d] -> [ct [a;b] "bez" "prep:gen:nwok"; ct [c;d] "uwaga" "subst:pl:gen:f"] | _ -> failwith "abr_patterns"); *)
... ...
tokenizer/ENIAMpatterns.ml
... ... @@ -63,7 +63,6 @@ let dig_value t =
63 63 Dig(v,_) -> v
64 64 | _ -> failwith "dig_value"
65 65  
66   -(* FIXME: problem z ordnum - wyklucza year co stanowi problem na końcu zdania *)
67 66 let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeba uwzględnić w preprocesingu brak spacji - albo w dezambiguacji *)
68 67 [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"]));
69 68 [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"]));
... ... @@ -76,7 +75,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb
76 75 [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
77 76 [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
78 77 [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum"));
79   - [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); (* FIXME: to nie powinno wykluczać innych interpretacji *)
  78 + (* [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); *)
80 79 [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2");
81 80 [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3");
82 81 [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3");
... ... @@ -148,9 +147,9 @@ let digit_patterns3 = [
148 147 [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22");
149 148 [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22");
150 149 [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22");
151   - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1" | _ -> failwith "digit_patterns22");
152   - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1" | _ -> failwith "digit_patterns22");
153   - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1" | _ -> failwith "digit_patterns22");
  150 + [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22");
  151 + [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22");
  152 + [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22");
154 153 [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22");
155 154 [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22");
156 155 [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22");
... ... @@ -159,9 +158,6 @@ let digit_patterns3 = [
159 158 [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22");
160 159 [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22");
161 160 [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22");
162   - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:dat:f" | _ -> failwith "digit_patterns22");
163   - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:inst:f" | _ -> failwith "digit_patterns22");
164   - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:loc:f" | _ -> failwith "digit_patterns22");
165 161 [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22");
166 162 [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22");
167 163 [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22");
... ... @@ -169,9 +165,9 @@ let digit_patterns3 = [
169 165 [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22");
170 166 [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22");
171 167 [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22");
172   - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1" | _ -> failwith "digit_patterns22");
173   - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1" | _ -> failwith "digit_patterns22");
174   - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1" | _ -> failwith "digit_patterns22");
  168 + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22");
  169 + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22");
  170 + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22");
175 171 [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22");
176 172 [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22");
177 173 [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22");
... ... @@ -180,9 +176,6 @@ let digit_patterns3 = [
180 176 [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22");
181 177 [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22");
182 178 [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22");
183   - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:dat:f" | _ -> failwith "digit_patterns22");
184   - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:inst:f" | _ -> failwith "digit_patterns22");
185   - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:loc:f" | _ -> failwith "digit_patterns22");
186 179 ]
187 180  
188 181 let url_patterns1 = [
... ... @@ -299,6 +292,7 @@ let match_token = function
299 292 | CL, CapLetter _ -> true
300 293 | CL, AllCap _ -> true
301 294 | CL, SomeCap _ -> true
  295 + | I pat, Interp s -> pat = s
302 296 | _ -> false
303 297  
304 298 let rec find_first_token matching pat = function
... ... @@ -381,6 +375,92 @@ let find_abr_patterns patterns tokens =
381 375 find_abr_pattern (Xlist.map patterns (fun (pattern,command) ->
382 376 {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens
383 377  
  378 +
  (* Raised from the [command] callback of a probing pattern record to signal
     that the pattern matched; caught by [find_beg_pattern] and
     [manage_query_boundaries], which turn it into a boolean. *)
  379 +exception PatternFound
  380 +
  (* Interp-token sequences in which a sentence-opening marker already
     follows the query-opening marker, optionally after an opening quote
     marker or an <or> marker.  In the visible code this list is only used
     by the diagnostic probe at the top of [manage_query_boundaries]. *)
  381 +let query_beg_patterns = [
  382 + [I "<query>";I "<sentence>"];
  383 + [I "<query>";I "„s";I "<sentence>"];
  384 + [I "<query>";I "<or>";I "<sentence>"];
  385 + ]
  386 +
  (* Interp-token sequences in which a sentence-closing marker already
     precedes the query-closing marker, optionally separated by a closing
     quote marker.  In the visible code this list is only used by the
     diagnostic probe in the middle of [manage_query_boundaries]. *)
  387 +let query_end_patterns = [
  388 + [I "</sentence>";I "</query>"];
  389 + [I "</sentence>";I "”s";I "</query>"];
  390 + ]
  391 +
  (* [find_beg_pattern pattern tokens] tests whether [pattern] matches
     [tokens] according to [find_pattern_tail].
     The probe record carries a [command] that raises [PatternFound], so a
     successful match surfaces as that exception and is reported as [true].
     Returns [false] when [find_pattern_tail] returns normally or raises
     [Not_found].
     NOTE(review): presumably [find_pattern_tail] anchors the match at the
     head of [tokens] -- its definition is not visible here, confirm. *)
  392 +let find_beg_pattern pattern tokens =
  393 + try
  394 + let _ = find_pattern_tail [{prefix=[]; matched=[]; suffix=[];
  395 + pattern=pattern; command=(fun _ -> raise PatternFound);
  396 + command_abr=(fun _ -> [])}] tokens in false
  397 + with PatternFound -> true | Not_found -> false
  398 +
  (* [replace_beg_pattern pattern command tokens] runs
     [find_abr_pattern_tail] with a single pattern record whose
     [command_abr] is [command], and returns the resulting head and tail
     joined back into one list.
     The [command] field of the record is a dummy returning [Symbol ""].
     Raises [Failure "replace_beg_pattern"] when [find_abr_pattern_tail]
     raises [Not_found]. *)
  399 +let replace_beg_pattern pattern command tokens =
  400 + try
  401 + let t,l = find_abr_pattern_tail [{prefix=[]; matched=[]; suffix=[];
  402 + pattern=pattern; command=(fun _ -> Symbol "");
  403 + command_abr=command}] tokens in
  404 + t :: l
  405 + with Not_found -> failwith "replace_beg_pattern"
  406 +
  407 +(* let s_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"}
  408 +let c_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"} *)
  (* [s_end i] and [c_end i] build a [Token] wrapping a copy of
     [empty_token_env] placed at position [i] with length 1 (so [next] is
     [i+1]) whose token is the sentence-closing, respectively
     clause-closing, Interp marker. *)
  409 +let s_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</sentence>"}
  410 +let c_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</clause>"}
  411 +
  (* [add_sentence_beg matched] takes the one or two token records matched
     at the start of the query and returns them followed by sentence-opening
     and clause-opening markers.
     The last matched record is shortened by two positions ([len] and [next]
     both reduced by 2); the freed positions [next-2] and [next-1] are given
     to [ENIAMtokens.s_beg] and [ENIAMtokens.c_beg] respectively.
     Raises [Failure "add_sentence_beg"] for any other list length.
     NOTE(review): nothing here checks that the shortened record has
     [len >= 2] -- confirm that callers guarantee it. *)
  412 +let add_sentence_beg = function
  413 + [q;t] -> let next=t.next in [Token q;Token{t with len=t.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)]
  414 + | [q] -> let next=q.next in [Token{q with len=q.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)]
  415 + | _ -> failwith "add_sentence_beg"
  416 +
  (* [add_sentence_end matched] is the counterpart of [add_sentence_beg] for
     the end of the query.  [manage_query_boundaries] calls it on the
     reversed token list, so the output is also in reversed order.
     The last matched record is shortened by two positions at its start
     ([len] reduced by 2, [beg] advanced by 2); position [beg] goes to
     [c_end] and position [beg+1] to [s_end].
     Raises [Failure "add_sentence_end"] for any other list length.
     NOTE(review): nothing here checks that the shortened record has
     [len >= 2] -- confirm that callers guarantee it. *)
  417 +let add_sentence_end = function
  418 + [q;t] -> let beg=t.beg in [Token q;Token{t with len=t.len-2;beg=beg+2};s_end (beg+1);c_end beg]
  419 + | [q] -> let beg=q.beg in [Token{q with len=q.len-2;beg=beg+2};s_end (beg+1);c_end beg]
  420 + | _ -> failwith "add_sentence_end"
  421 +
  (* [revert_tokens t] mirrors a [tokens] tree: a [Token] is returned as is,
     the elements of a [Seq] are processed recursively and put in reverse
     order, and the alternatives of a [Variant] are processed recursively
     while keeping their order. *)
  422 +let rec revert_tokens = function
  423 + Token t -> Token t
  424 + | Seq l -> Seq(Xlist.rev_map l revert_tokens)
  425 + | Variant l -> Variant(Xlist.map l revert_tokens)
  426 +
  (* [manage_query_boundaries tokens] makes sure that the token list has a
     sentence-opening marker right after the query start and a
     sentence-closing marker right before the query end, inserting the
     markers (via [add_sentence_beg] / [add_sentence_end]) when they are
     missing.  Returns the possibly extended token list.
     Side effect: prints two diagnostic lines to standard output.
     May raise [Failure "replace_beg_pattern"] when the fallback replacement
     cannot match (see [replace_beg_pattern]). *)
  427 +let manage_query_boundaries tokens =
  (* NOTE(review): this first [b] is only used for the diagnostic print
     below; it looks like leftover debugging output -- confirm whether it
     should stay. *)
  428 + let b =
  429 + try
  430 + let _ = find_pattern_tail (Xlist.map query_beg_patterns (fun pattern ->
  431 + {prefix=[]; matched=[]; suffix=[];
  432 + pattern=pattern; command=(fun _ -> raise PatternFound);
  433 + command_abr=(fun _ -> [])})) tokens in false
  434 + with PatternFound -> true | Not_found -> false in
  435 + (if b then print_endline "sentence beg found" else print_endline "sentence beg not found");
  (* Beginning of the query: try the quote-prefixed form, then the
     <or>-prefixed form, then the bare form.  In each case the tokens are
     left alone when the sentence marker already follows, otherwise the
     matched prefix is rewritten by [add_sentence_beg]. *)
  436 + let tokens =
  437 + if find_beg_pattern [I "<query>";I "„s"] tokens then
  438 + if find_beg_pattern [I "<query>";I "„s";I "<sentence>"] tokens then tokens else
  439 + replace_beg_pattern [I "<query>";I "„s"] add_sentence_beg tokens else
  440 + if find_beg_pattern [I "<query>";I "<or>"] tokens then
  441 + if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else
  442 + replace_beg_pattern [I "<query>";I "<or>"] add_sentence_beg tokens else
  443 + if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else
  444 + replace_beg_pattern [I "<query>"] add_sentence_beg tokens in
  (* NOTE(review): second diagnostic probe.  Unlike the first one it handles
     only [PatternFound]; if [find_pattern] can raise [Not_found] the
     exception escapes from here -- confirm against its definition. *)
  445 + let b =
  446 + try
  447 + let _ = find_pattern (Xlist.map query_end_patterns (fun pattern ->
  448 + {prefix=[]; matched=[]; suffix=[];
  449 + pattern=pattern; command=(fun _ -> raise PatternFound);
  450 + command_abr=(fun _ -> [])})) [] tokens in false
  451 + with PatternFound -> true in
  452 + (if b then print_endline "sentence end found" else print_endline "sentence end not found");
  (* End of the query: reverse the whole token structure so the closing
     markers come first, apply the same check-then-rewrite scheme with
     [add_sentence_end], then reverse back. *)
  453 + let tokens = Xlist.rev_map tokens revert_tokens in
  454 + let tokens =
  455 + if find_beg_pattern [I "</query>";I "”s"] tokens then
  456 + if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else
  457 + replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else
  458 + if find_beg_pattern [I "</query>";I "</sentence>"] tokens then tokens else
  459 + replace_beg_pattern [I "</query>"] add_sentence_end tokens in
  460 + let tokens = Xlist.rev_map tokens revert_tokens in
  461 + tokens
  462 +
  463 +
384 464 let find_replacement_patterns tokens =
385 465 let tokens = find_patterns digit_patterns1 tokens in
386 466 let tokens = normalize_tokens [] tokens in
... ...
tokenizer/ENIAMtokenizer.ml
... ... @@ -29,6 +29,8 @@ let parse query =
29 29 let l = ENIAMpatterns.normalize_tokens [] l in
30 30 let l = ENIAMpatterns.find_replacement_patterns l in
31 31 let l = ENIAMpatterns.remove_spaces [] l in
  32 + let l = ENIAMpatterns.manage_query_boundaries l in
32 33 let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in
  34 + (* let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns l in *)
33 35 let l = ENIAMpatterns.normalize_tokens [] l in
34 36 l
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -60,7 +60,7 @@ type tokens =
60 60 | Variant of tokens list
61 61 | Seq of tokens list
62 62  
63   -type pat = L | CL | D of string | C of string | S of string | RD of string | O of string
  63 +type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | I of string
64 64  
65 65 let empty_token_env = {
66 66 orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.}
... ...
tokenizer/test.ml
... ... @@ -36,21 +36,30 @@ let test_strings = [
36 36 "Ala 22-25 .";
37 37 "Ala 22.5.2000-25.5.2001 .";
38 38 "Szpak frunie.";*)
39   - "Kot miauczy.";
  39 + (* "Kot miauczy."; *)
40 40 (* "Np. Ala.";*)
41   - "w. dom.";
  41 + (* "w. dom.";
42 42 "tzn.";
43   - "c.d.n.";
  43 + "c.d.n."; *)
44 44 (* "Arabia Saudyjska biegnie.";
45 45 "Cauchy'ego ONZ-owska biegnie.";*)
46   - "TE-cie E-e.";
  46 + (* "TE-cie E-e.";
47 47 "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE.";
48   - "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi";
  48 + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *)
49 49 (* "Tom idzie.";*)
50   - "Miałem miał.";
  50 + (* "Miałem miał."; *)
51 51 (* "Szpak śpiewa.";
52 52 "Ala ma kota.";
53 53 "Ale mają kota:"*)
  54 + "Matura.";
  55 + "matura";
  56 + "„Matura.”";
  57 + "„Matura”.";
  58 + "„matura”";
  59 + "- matura";
  60 + "- Matura";
  61 + "2 jabłka";
  62 + "- 2 jabłka";
54 63 ]
55 64  
56 65 let _ =
... ...