From 766cb2a4a06f1150cce4d4ed7ce917ab8fbe9598 Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Thu, 13 Apr 2017 23:47:17 +0200 Subject: [PATCH] ordnum i końce zdań na końcach tekstów --- subsyntax/ENIAM_MWE.ml | 17 +++++++++++++++++ subsyntax/test.ml | 12 ++++++++---- tokenizer/ENIAMacronyms.ml | 3 +++ tokenizer/ENIAMpatterns.ml | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- tokenizer/ENIAMtokenizer.ml | 2 ++ tokenizer/ENIAMtokenizerTypes.ml | 2 +- tokenizer/test.ml | 21 +++++++++++++++------ 7 files changed, 140 insertions(+), 25 deletions(-) diff --git a/subsyntax/ENIAM_MWE.ml b/subsyntax/ENIAM_MWE.ml index e111b15..a0128f6 100644 --- a/subsyntax/ENIAM_MWE.ml +++ b/subsyntax/ENIAM_MWE.ml @@ -210,6 +210,14 @@ let get_orths paths = Xlist.fold l orths (fun orths t -> StringSet.add orths (ENIAMtokens.get_orth t.token)))) +(** [get_intnum_orths paths] walks the two-level IntMap [paths] and, for every token of the form Dig(lemma, intnum), records [lemma] in the StringSet stored under the key returned by [ENIAMtokens.get_orth] for that token. All other tokens are ignored. The result is a StringMap from orth to the set of intnum lemmas seen with it. *) let get_intnum_orths paths = + IntMap.fold paths StringMap.empty (fun orths _ map -> + IntMap.fold map orths (fun orths _ l -> + Xlist.fold l orths (fun orths t -> + match t.token with + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) + | _ -> orths))) + let rec match_path_rec map found (t:token_env) rev = function [] -> (t :: rev) :: found | s :: l -> @@ -261,10 +269,19 @@ let apply_rule paths (match_list,lemma,interp) = add_token paths token with Not_found -> paths) +(* FIXME: rules for ordnum should match the part of speech, not only the orth *) +(** [add_ordnum_rules rules paths] prepends to [rules] one rule per (orth, lemma) pair obtained from [get_intnum_orths paths]. Each new rule has a two-element match list (the orth followed by a full stop), the intnum lemma, and the interpretation ordnum. *) let add_ordnum_rules rules paths = + let orths = get_intnum_orths paths in + StringMap.fold orths rules (fun rules orth lemmas -> + StringSet.fold lemmas rules (fun rules lemma -> + (* Printf.printf "%s %s\n%!" 
orth lemma; *) + ([orth;"."],lemma,"ordnum") :: rules)) + let process (paths,last) = let paths = Xlist.fold paths IntMap.empty add_token in let orths = get_orths paths in let rules = preselect_dict orths mwe_dict in + (* Extend the preselected dictionary rules with the ordnum rules derived from the intnum tokens present in paths. *) let rules = add_ordnum_rules rules paths in let paths = Xlist.fold rules paths apply_rule in let paths = IntMap.fold paths [] (fun paths _ map -> IntMap.fold map paths (fun paths _ l -> diff --git a/subsyntax/test.ml b/subsyntax/test.ml index 1f01781..ad6e46c 100644 --- a/subsyntax/test.ml +++ b/subsyntax/test.ml @@ -19,21 +19,25 @@ let test_strings = [ - "Szpak frunie."; + (* "Szpak frunie."; "Kot np. miauczy."; "Ala ma kota."; - "Ale mają kota:" + "Ale mają kota:" *) (* "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *) (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP."; *) (* "Trzy lata później założył pracownię architektoniczną Atelier Bizio + Ligierko, zajmującą się adaptacjami budynków historycznych."; *) (* "Festiwalu Polskich Sztuk Współczesnych R@Port"; *) (* "Przeglądu Teatrów Małych Form „Kontrapunkt”"; *) + (* "Dyplom uzyskał w 1994."; + "dyplom uzyskał w 1994"; *) ] let test_strings2 = [ - "Szpak frunie. Kot miauczy."; - "Szpak powiedział: „Frunę. Śpiewam.”"; + (* "Szpak frunie. Kot miauczy."; + "Szpak powiedział: „Frunę. Śpiewam.”"; *) (* "Istniejący od XI w. Czersk uzyskał prawa miejskie w 1350 r. 
Mazowsze było wtedy samodzielnym księstwem."; *) + "Dyplom uzyskał w 1994."; + "dyplom uzyskał w 1994"; ] let _ = diff --git a/tokenizer/ENIAMacronyms.ml b/tokenizer/ENIAMacronyms.ml index 1c113fe..754bcc0 100644 --- a/tokenizer/ENIAMacronyms.ml +++ b/tokenizer/ENIAMacronyms.ml @@ -864,3 +864,6 @@ let abr_patterns = [ [O "ws"; S "."], (function [a;b] -> std a b [1,"w","prep:loc:nwok";1,"sprawa","subst:sg:loc:f"] | _ -> failwith "abr_patterns"); [O "ww"; S "."], (function [a;b] -> std a b [1,"wysoko","adv:com";1,"wymieniony","ppas:_:_:_:perf:aff"] | _ -> failwith "abr_patterns"); ] + +(* let query_patterns = [ + [I "<query>"; S "."; O "u"; S "."], (function [a;b;c;d] -> [ct [a;b] "bez" "prep:gen:nwok"; ct [c;d] "uwaga" "subst:pl:gen:f"] | _ -> failwith "abr_patterns"); *) diff --git a/tokenizer/ENIAMpatterns.ml b/tokenizer/ENIAMpatterns.ml index e43c6b5..a5fca45 100644 --- a/tokenizer/ENIAMpatterns.ml +++ b/tokenizer/ENIAMpatterns.ml @@ -63,7 +63,6 @@ let dig_value t = Dig(v,_) -> v | _ -> failwith "dig_value" -(* FIXME: problem z ordnum - wyklucza year co stanowi problem na końcu zdania *) let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeba uwzględnić w preprocesingu brak spacji - albo w dezambiguacji *) [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); [D "dig"; S "."; D "dig"; S "."; D "dig"; S "."; D "dig"], (fun tokens -> Proper(concat_orths tokens,"obj-id",[[]],["obj-id"])); @@ -76,7 +75,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); [D "pref3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); [D "pref3dig"; S " "; D "3dig"], (fun tokens -> Dig(concat_intnum tokens,"intnum")); - [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum 
[token],"ordnum") | _ -> failwith "digit_patterns1"); (* FIXME: to nie powinno wykluczać innych interpretacji *) + (* NOTE(review): the tokenizer-level ordnum pattern below is disabled by this patch; the same patch adds add_ordnum_rules in subsyntax/ENIAM_MWE.ml, which builds ordnum rules from intnum tokens followed by a full stop. *) (* [D "intnum"; S "."], (function [token;_] -> Dig(concat_intnum [token],"ordnum") | _ -> failwith "digit_patterns1"); *) [D "day"; S "."; D "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns2"); [D "day"; S "."; RD "month"; S "."; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); [D "day"; S " "; RD "month"; S " "; D "year"], (function [day;_;month;_;year] -> Compound("date",[day.token;month.token;year.token]) | _ -> failwith "digit_patterns3"); @@ -148,9 +147,9 @@ let digit_patterns3 = [ [D "intnum"; S "-"; O "latku"], (function [x;_;_] -> compose_latek_lemma x "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; O "latkowie"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; O "latków"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1" | _ -> failwith "digit_patterns22"); + (* NOTE(review): the dative, instrumental and locative plural forms (latkom, latkami, latkach) are now tagged m1.f and always built with compose_latek_lemma; the separate feminine entries that used compose_latka_lemma for the same orths are removed by a later hunk of this patch. Confirm that the latek-based lemma is intended for the feminine reading. *) [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22"); + [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22"); + [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latek_lemma x "subst:pl:loc:m1.f" | _ -> failwith 
"digit_patterns22"); [D "intnum"; S "-"; O "latka"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; O "latce"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); @@ -159,9 +158,6 @@ let digit_patterns3 = [ [D "intnum"; S "-"; O "latko"], (function [x;_;_] -> compose_latka_lemma x "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; O "latki"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; O "latek"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; O "latkom"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:dat:f" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; O "latkami"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:inst:f" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; O "latkach"], (function [x;_;_] -> compose_latka_lemma x "subst:pl:loc:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:nom:m1" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:gen.acc:m1" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowi"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:dat:m1" | _ -> failwith "digit_patterns22"); @@ -169,9 +165,9 @@ let digit_patterns3 = [ [D "intnum"; S "-"; D "intnum"; S "-"; O "latku"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:sg:loc.voc:m1" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latkowie"], 
(function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:nom.voc:m1" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latków"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:gen.acc:m1" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1" | _ -> failwith "digit_patterns22"); + (* Same m1.f retagging for the intnum-intnum range variants, built with compose_latek_int_lemma. *) [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:dat:m1.f" | _ -> failwith "digit_patterns22"); + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:inst:m1.f" | _ -> failwith "digit_patterns22"); + [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latek_int_lemma x y "subst:pl:loc:m1.f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latka"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:nom:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:gen:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latce"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:dat.loc:f" | _ -> failwith "digit_patterns22"); @@ -180,9 +176,6 @@ let digit_patterns3 = [ [D "intnum"; S "-"; D "intnum"; S "-"; O "latko"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:sg:voc:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latki"], 
(function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:nom.acc.voc:f" | _ -> failwith "digit_patterns22"); [D "intnum"; S "-"; D "intnum"; S "-"; O "latek"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:gen:f" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkom"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:dat:f" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkami"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:inst:f" | _ -> failwith "digit_patterns22"); - [D "intnum"; S "-"; D "intnum"; S "-"; O "latkach"], (function [x;_;y;_;_] -> compose_latka_int_lemma x y "subst:pl:loc:f" | _ -> failwith "digit_patterns22"); ] let url_patterns1 = [ @@ -299,6 +292,7 @@ let match_token = function | CL, CapLetter _ -> true | CL, AllCap _ -> true | CL, SomeCap _ -> true + (* An I pattern matches an Interp token whose string equals the pattern string. *) | I pat, Interp s -> pat = s | _ -> false let rec find_first_token matching pat = function @@ -381,6 +375,92 @@ let find_abr_patterns patterns tokens = find_abr_pattern (Xlist.map patterns (fun (pattern,command) -> {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); command_abr=command})) [] tokens + +(* Raised from the command of a probe pattern to signal that the pattern matched; caught in find_beg_pattern and manage_query_boundaries. *) exception PatternFound + +(* Interp sequences in which a sentence opening directly follows the query opening. In this patch the list is used only by the diagnostic check at the top of manage_query_boundaries. *) let query_beg_patterns = [ + [I "<query>";I "<sentence>"]; + [I "<query>";I "„s";I "<sentence>"]; + [I "<query>";I "<or>";I "<sentence>"]; + ] + +(* Interp sequences in which a sentence closing directly precedes the query closing. In this patch the list is used only by the diagnostic check in manage_query_boundaries. *) let query_end_patterns = [ + [I "</sentence>";I "</query>"]; + [I "</sentence>";I "”s";I "</query>"]; + ] + +(** [find_beg_pattern pattern tokens] returns true when find_pattern_tail runs the command attached to [pattern] (that command raises PatternFound), and false when the call returns normally or raises Not_found. NOTE(review): find_pattern_tail is defined outside this patch; presumably it tries the pattern only at the head of [tokens] - confirm. *) let find_beg_pattern pattern tokens = + try + let _ = find_pattern_tail [{prefix=[]; matched=[]; suffix=[]; + pattern=pattern; command=(fun _ -> raise PatternFound); + command_abr=(fun _ -> [])}] tokens in false + with PatternFound -> true | Not_found -> false + +(** [replace_beg_pattern pattern command tokens] passes [pattern] with [command] as its command_abr to find_abr_pattern_tail and returns the resulting head consed onto the resulting rest. Fails with the message replace_beg_pattern when find_abr_pattern_tail raises Not_found. *) let replace_beg_pattern pattern command tokens = + try + let t,l = find_abr_pattern_tail [{prefix=[]; matched=[]; suffix=[]; + pattern=pattern; command=(fun _ -> Symbol ""); + command_abr=command}] tokens in + t :: l 
+ with Not_found -> failwith "replace_beg_pattern" + +(* let s_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<sentence>"} +let c_beg i = {empty_token_env with beg=i;len=1;next=i+1; token=Interp "<clause>"} *) +(* [s_end i] and [c_end i] build a Token of length 1 at position i (next = i+1) holding the sentence-closing and the clause-closing Interp respectively. *) let s_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</sentence>"} +let c_end i = Token{empty_token_env with beg=i;len=1;next=i+1; token=Interp "</clause>"} + +(** Command for replace_beg_pattern at the query start. Takes the one or two matched token_envs, shortens the last of them by two positions (len and next both reduced by 2) and appends ENIAMtokens.s_beg at next-2 and ENIAMtokens.c_beg at next-1. Fails with the message add_sentence_beg for any other number of matched tokens. *) let add_sentence_beg = function + [q;t] -> let next=t.next in [Token q;Token{t with len=t.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)] + | [q] -> let next=q.next in [Token{q with len=q.len-2;next=next-2};ENIAMtokens.s_beg (next-2);ENIAMtokens.c_beg (next-1)] + | _ -> failwith "add_sentence_beg" + +(** Counterpart of add_sentence_beg, applied by manage_query_boundaries to the reversed token list. The last matched token_env gives up its first two positions (beg advanced by 2, len reduced by 2) and is followed by s_end at beg+1 and c_end at beg, so that after the list is reversed back the clause end comes before the sentence end. Fails with the message add_sentence_end for any other number of matched tokens. *) let add_sentence_end = function + [q;t] -> let beg=t.beg in [Token q;Token{t with len=t.len-2;beg=beg+2};s_end (beg+1);c_end beg] + | [q] -> let beg=q.beg in [Token{q with len=q.len-2;beg=beg+2};s_end (beg+1);c_end beg] + | _ -> failwith "add_sentence_end" + +(** Recursively reverses the element order inside every Seq. A Token is returned unchanged; a Variant keeps the order of its alternatives and reverts each of them. *) let rec revert_tokens = function + Token t -> Token t + | Seq l -> Seq(Xlist.rev_map l revert_tokens) + | Variant l -> Variant(Xlist.map l revert_tokens) + +(** [manage_query_boundaries tokens] makes sure the token list carries a sentence opening after the query opening (optionally after the opening-quote or the or Interp token) and a sentence closing before the query closing (optionally before the closing-quote Interp token). Missing markers are inserted with add_sentence_beg and add_sentence_end; lists that already carry them are returned unchanged on that side. NOTE(review): both values named b are used only to print a diagnostic line to stdout with print_endline on every call - consider removing them before this is used outside of debugging. *) let manage_query_boundaries tokens = + let b = + try + let _ = find_pattern_tail (Xlist.map query_beg_patterns (fun pattern -> + {prefix=[]; matched=[]; suffix=[]; + pattern=pattern; command=(fun _ -> raise PatternFound); + command_abr=(fun _ -> [])})) tokens in false + with PatternFound -> true | Not_found -> false in + (if b then print_endline "sentence beg found" else print_endline "sentence beg not found"); + let tokens = + if find_beg_pattern [I "<query>";I "„s"] tokens then + if find_beg_pattern [I "<query>";I "„s";I "<sentence>"] tokens then tokens else + replace_beg_pattern [I "<query>";I "„s"] add_sentence_beg tokens else + if find_beg_pattern [I "<query>";I "<or>"] tokens then + if find_beg_pattern [I "<query>";I "<or>";I "<sentence>"] tokens then tokens else + replace_beg_pattern [I "<query>";I "<or>"] 
add_sentence_beg tokens else + if find_beg_pattern [I "<query>";I "<sentence>"] tokens then tokens else + replace_beg_pattern [I "<query>"] add_sentence_beg tokens in + (* NOTE(review): unlike the first diagnostic check, this one calls find_pattern with an empty accumulator and handles only PatternFound, not Not_found - confirm find_pattern cannot raise Not_found here. *) let b = + try + let _ = find_pattern (Xlist.map query_end_patterns (fun pattern -> + {prefix=[]; matched=[]; suffix=[]; + pattern=pattern; command=(fun _ -> raise PatternFound); + command_abr=(fun _ -> [])})) [] tokens in false + with PatternFound -> true in + (if b then print_endline "sentence end found" else print_endline "sentence end not found"); + (* Reverse the token list, including the contents of nested Seq nodes, so that the query closing comes first and find_beg_pattern / replace_beg_pattern can be reused for the end of the query; the patterns below are therefore written in reversed order. *) let tokens = Xlist.rev_map tokens revert_tokens in + let tokens = + if find_beg_pattern [I "</query>";I "”s"] tokens then + if find_beg_pattern [I "</query>";I "”s";I "</sentence>"] tokens then tokens else + replace_beg_pattern [I "</query>";I "”s"] add_sentence_end tokens else + if find_beg_pattern [I "</query>";I "</sentence>"] tokens then tokens else + replace_beg_pattern [I "</query>"] add_sentence_end tokens in + let tokens = Xlist.rev_map tokens revert_tokens in + tokens + + let find_replacement_patterns tokens = let tokens = find_patterns digit_patterns1 tokens in let tokens = normalize_tokens [] tokens in diff --git a/tokenizer/ENIAMtokenizer.ml b/tokenizer/ENIAMtokenizer.ml index fc8d5bc..0e5d962 100644 --- a/tokenizer/ENIAMtokenizer.ml +++ b/tokenizer/ENIAMtokenizer.ml @@ -29,6 +29,8 @@ let parse query = let l = ENIAMpatterns.normalize_tokens [] l in let l = ENIAMpatterns.find_replacement_patterns l in let l = ENIAMpatterns.remove_spaces [] l in + (* Insert any missing sentence boundary markers at both ends of the query; this runs after remove_spaces and before the abbreviation patterns are applied. *) let l = ENIAMpatterns.manage_query_boundaries l in let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.abr_patterns l in + (* let l = ENIAMpatterns.find_abr_patterns ENIAMacronyms.query_patterns l in *) let l = ENIAMpatterns.normalize_tokens [] l in l diff --git a/tokenizer/ENIAMtokenizerTypes.ml b/tokenizer/ENIAMtokenizerTypes.ml index 72c45cc..12aca0c 100644 --- a/tokenizer/ENIAMtokenizerTypes.ml +++ b/tokenizer/ENIAMtokenizerTypes.ml @@ -60,7 +60,7 @@ type tokens = | Variant of tokens list | 
Seq of tokens list -type pat = L | CL | D of string | C of string | S of string | RD of string | O of string +(* Pattern elements used by the tokenizer pattern matcher. The I constructor added here carries the string that ENIAMpatterns.match_token compares for equality with the string of an Interp token. *) type pat = L | CL | D of string | C of string | S of string | RD of string | O of string | I of string let empty_token_env = { orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} diff --git a/tokenizer/test.ml b/tokenizer/test.ml index 7b6136f..62ab28d 100644 --- a/tokenizer/test.ml +++ b/tokenizer/test.ml @@ -36,21 +36,30 @@ let test_strings = [ "Ala 22-25 ."; "Ala 22.5.2000-25.5.2001 ."; "Szpak frunie.";*) - "Kot miauczy."; + (* "Kot miauczy."; *) (* "Np. Ala.";*) - "w. dom."; + (* "w. dom."; "tzn."; - "c.d.n."; + "c.d.n."; *) (* "Arabia Saudyjska biegnie."; "Cauchy'ego ONZ-owska biegnie.";*) - "TE-cie E-e."; + (* "TE-cie E-e."; "MS-DOS-owska CI-cie KRRi-cie UJ-ocie UJ-OCIE."; - "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; + "rock'n'rollowy d’Alembertowi staro-cerkiewno-słowiańskimi"; *) (* "Tom idzie.";*) - "Miałem miał."; + (* "Miałem miał."; *) (* "Szpak śpiewa."; "Ala ma kota."; "Ale mają kota:"*) + "Matura."; + "matura"; + "„Matura.”"; + "„Matura”."; + "„matura”"; + "- matura"; + "- Matura"; + "2 jabłka"; + "- 2 jabłka"; ] let _ = -- libgit2 0.22.2