From 95c86f112d51360e2f641c04f3c359afbb38feb8 Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Sun, 19 Nov 2017 10:30:16 +0100 Subject: [PATCH] Poprawki w subsyntax --- exec/semparser.ml | 11 ++++++++++- morphology/resources/alt_supplement.tab | 1 + subsyntax/ENIAM_MWE.ml | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ subsyntax/ENIAMsubsyntax.ml | 5 +++-- subsyntax/test.ml | 3 ++- tokenizer/ENIAMpatterns.ml | 15 +++++++++++---- tokenizer/ENIAMtokenizerTypes.ml | 2 +- tokenizer/ENIAMtokens.ml | 4 ++++ 8 files changed, 80 insertions(+), 9 deletions(-) diff --git a/exec/semparser.ml b/exec/semparser.ml index 38b63f8..902feff 100644 --- a/exec/semparser.ml +++ b/exec/semparser.ml @@ -40,6 +40,7 @@ let img = ref 1 let timeout = ref 30. let select_sentence_modes_flag = ref false let select_sentences_flag = ref true +let semantic_processing_flag = ref true let output_dir = ref "results/" let spec_list = [ @@ -67,6 +68,8 @@ let spec_list = [ "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sencence modes (default)"; "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)"; "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences"; + "--sem", Arg.Unit (fun () -> semantic_processing_flag:=true), "Perform semantic processing (default)"; + "--no_sem", Arg.Unit (fun () -> semantic_processing_flag:=false), "Do not perforf semantic processing"; ] let usage_msg = @@ -103,8 +106,13 @@ let assign_lex_sems proj_map cats_map tokens = let lex_sems = ExtArray.make (ExtArray.size tokens) ENIAMlexSemanticsTypes.empty_lex_sem in let _ = ExtArray.add lex_sems ENIAMlexSemanticsTypes.empty_lex_sem in Int.iter 1 (ExtArray.size tokens - 1) (fun i -> + let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens i).token in + let pos = ENIAMtokens.get_pos (ExtArray.get tokens i).token in let cats = expand_projections proj_map (get_cats cats_map (ExtArray.get tokens i).token) in - let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats} in + let frames = + Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) -> + {ENIAMlexSemanticsTypes.empty_frame with ENIAMlexSemanticsTypes.selectors=sel; ENIAMlexSemanticsTypes.arole=arole; ENIAMlexSemanticsTypes.arole_attr=arole_attr; ENIAMlexSemanticsTypes.arev=arev}) in + let lex_sem = {ENIAMlexSemanticsTypes.empty_lex_sem with ENIAMlexSemanticsTypes.cats=cats; ENIAMlexSemanticsTypes.frames=frames} in let _ = ExtArray.add lex_sems lex_sem in ()); lex_sems @@ -123,6 +131,7 @@ let rec main_loop sub_in sub_out = let text = ENIAMexec.parse !timeout !verbosity rules dep_rules tokens lex_sems text in let text = if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text text else text in let text = if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text else text in + let text = if !semantic_processing_flag then ENIAMexec.semantic_processing !verbosity tokens lex_sems text else text in ENIAMvisualization.print_html_text !output_dir "parsed_text" text !img !verbosity tokens); prerr_endline "Done!"; main_loop sub_in sub_out) diff --git a/morphology/resources/alt_supplement.tab b/morphology/resources/alt_supplement.tab index f75e5ad..6a8c9c4 100644 --- a/morphology/resources/alt_supplement.tab +++ b/morphology/resources/alt_supplement.tab @@ -3,4 +3,5 @@ siebie siebie siebie:acc.gen sobie siebie siebie:dat.loc sobą siebie siebie:inst to to pred +yay yay interj diff --git a/subsyntax/ENIAM_MWE.ml b/subsyntax/ENIAM_MWE.ml index ebf1cd9..a9602ee 100644 --- a/subsyntax/ENIAM_MWE.ml +++ b/subsyntax/ENIAM_MWE.ml @@ -143,6 +143,31 @@ let get_intnum_orths paths = Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) | _ -> orths))) +let get_intnum_orths paths = + IntMap.fold paths StringMap.empty (fun orths _ map -> + IntMap.fold map orths (fun orths _ l -> + TokenEnvSet.fold l orths (fun orths t -> + match t.token with + Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) + | _ -> orths))) + +let get_year_orths paths = + IntMap.fold paths StringSet.empty (fun orths _ map -> + IntMap.fold map orths (fun orths _ l -> + TokenEnvSet.fold l orths (fun orths t -> + match t.token with + Dig(lemma,"year") -> StringSet.add orths lemma + | _ -> orths))) + +let get_single_letter_orths paths = + IntMap.fold paths StringSet.empty (fun orths _ map -> + IntMap.fold map orths (fun orths _ l -> + TokenEnvSet.fold l orths (fun orths t -> + match t.token with + SmallLetter lemma -> StringSet.add orths lemma + | CapLetter(lemma,_) -> StringSet.add orths lemma + | _ -> orths))) + let preselect orths lemmas rules l = Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) -> let b = Xlist.fold match_list true (fun b -> function @@ -172,14 +197,33 @@ let add_ordnum_rules orths rules = let add_quot_rule rules = (false,[I "„x";I "<sentence>"; I "<clause>"],"„","interp",[]) :: rules +let add_building_number_rules dig_orths letter_orths rules = + StringSet.fold dig_orths rules (fun rules dig1 -> + let rules = StringSet.fold letter_orths rules (fun rules letter1 -> + (true,[D(dig1,"year");O letter1],dig1^letter1,"building-number",[]) :: rules) in + StringSet.fold dig_orths rules (fun rules dig2 -> + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year")],dig1^"/"^dig2,"building-number",[]) :: rules in + let rules = StringSet.fold letter_orths rules (fun rules letter1 -> + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year")],dig1^letter1^"/"^dig2,"building-number",[]) :: + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1],dig1^"/"^dig2^letter1,"building-number",[]) :: rules) in + StringSet.fold dig_orths rules (fun rules dig3 -> + let rules = (true,[D(dig1,"year");O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^"/"^dig2^"/"^dig3,"building-number",[]) :: rules in + let rules = StringSet.fold letter_orths rules (fun rules letter1 -> + (true,[D(dig1,"year");O letter1;O "/";D(dig2,"year");O "/";D(dig3,"year")],dig1^letter1^"/"^dig2^"/"^dig3,"building-number",[]) :: + (true,[D(dig1,"year");O "/";D(dig2,"year");O letter1;O "/";D(dig3,"year")],dig1^"/"^dig2^letter1^"/"^dig3,"building-number",[]) :: rules) in + rules))) + let select_rules paths mwe_dict mwe_dict2 = let orths = get_orths paths in let lemmas = get_lemmas paths in let intnum_orths = get_intnum_orths paths in + let year_orths = get_year_orths paths in + let letter_orths = get_single_letter_orths paths in let rules = preselect_dict orths lemmas mwe_dict [] in let rules = preselect_dict2 orths lemmas mwe_dict2 rules in let rules = add_ordnum_rules intnum_orths rules in let rules = add_quot_rule rules in + let rules = add_building_number_rules year_orths letter_orths rules in rules let rec check_interp sels = function @@ -223,6 +267,8 @@ let rec match_path_rec map found (t:token_env) sels rev = function (new_t,get_sels sels (interp,interp2)) :: found2 else found2) | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2 | I s, Interp s2 -> if s=s2 then (new_t,sels) :: found2 else found2 + (* | SL, SmallLetter _ -> (new_t,sels) :: found + | SL, CapLetter _ -> (new_t,sels) :: found *) | _ -> found2)) in Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l) @@ -240,6 +286,8 @@ let match_path map = function (t,get_sels [] (interp,interp2)) :: found else found) | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found | I s, Interp s2 -> if s=s2 then (t,[]) :: found else found + (* | SL, SmallLetter _ -> (t,[]) :: found + | SL, CapLetter _ -> (t,[]) :: found *) | _ -> found))) in Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l) diff --git a/subsyntax/ENIAMsubsyntax.ml b/subsyntax/ENIAMsubsyntax.ml index 0a723fb..8fc5544 100644 --- a/subsyntax/ENIAMsubsyntax.ml +++ b/subsyntax/ENIAMsubsyntax.ml @@ -92,8 +92,9 @@ let translate_digs paths = | Dig(lemma,"url") -> {t with token=Proper(lemma,"url",[[]],["url"])} | Dig(lemma,"email") -> {t with token=Proper(lemma,"email",[[]],["email"])} | Dig(lemma,"html-tag") -> {t with token=Lemma(lemma,"html-tag",[[]])} - | Dig(cat,_) -> failwith ("translate_digs: Dig " ^ cat) - | RomanDig(cat,_) -> failwith ("translate_digs: Romandig " ^ cat) + | Dig(lemma,"list-item") -> {t with token=Lemma(lemma,"list-item",[[]])} + | Dig(lemma,cat) -> failwith ("translate_digs: Dig " ^ cat) + | RomanDig(lemma,cat) -> failwith ("translate_digs: Romandig " ^ cat) | Compound(cat,_) as t -> failwith ("translate_digs: " ^ ENIAMtokens.string_of_token t) | _ -> t) diff --git a/subsyntax/test.ml b/subsyntax/test.ml index a04f6b9..26ea17b 100644 --- a/subsyntax/test.ml +++ b/subsyntax/test.ml @@ -37,6 +37,7 @@ let test_strings = [ "Chłopcy mają ulicę kwiatami."; *) (* "„Dialog”"; *) (* "( Głosujmy !)"; *) + "Jakie są ceny w obu firmach za a) wymianę płyty głównej; b) wymianę portu HDMI" ] let test_strings2 = [ @@ -51,7 +52,7 @@ let test_strings2 = [ "„Dialog”:"; *) (* "- Votare! ( Głosujmy !)"; "( Głosujmy !)"; *) - "À propos"; + (* "À propos"; *) ] let _ = diff --git a/tokenizer/ENIAMpatterns.ml b/tokenizer/ENIAMpatterns.ml index e424fac..a2c1837 100644 --- a/tokenizer/ENIAMpatterns.ml +++ b/tokenizer/ENIAMpatterns.ml @@ -102,7 +102,6 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb [D "hour"; S "."; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns5"); [D "hour"; S ":"; D "minute"], (function [hour;_;minute] -> Compound("hour-minute",[hour.token;minute.token]) | _ -> failwith "digit_patterns6"); [D "intnum"; S ":"; D "intnum"], (function [x;_;y] -> Compound("match-result",[x.token;y.token]) | _ -> failwith "digit_patterns7"); - [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"])); [D "3dig"; S "-"; D "3dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); [D "3dig"; S " "; D "3dig"; S " "; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); [D "3dig"; S "-"; D "2dig"; S "-"; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); @@ -123,7 +122,7 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb [O "0"; S "-"; D "3dig"; S "-"; D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); [D "3dig"; S " "; D "3dig"; S " "; D "2dig"; S " "; D "2dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); [D "3dig"; S " "; D "3dig"; S " "; D "4dig"], (fun tokens -> Proper(concat_orths tokens,"phone-number",[[]],["phone-number"])); - [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) +(* [D "year"; SL], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) [D "year"; S " "; SL2], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) [D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) [D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) @@ -132,8 +131,9 @@ let digit_patterns1 = [ (* FIXME: problem z nadmiarowymi interpretacjami - trzeb [D "year"; SL; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) [D "year"; S "/"; D "year"; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) [D "year"; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) - [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *) + [D "year"; SL; S "/"; D "year"; SL; S "/"; D "year"], (fun tokens -> Proper(concat_orths tokens,"building-number",[[]],["building-number"])); (* year - bo jest to dodatnia liczba całkowita *)*) [SL; S ")"], (fun tokens -> Dig(concat_orths tokens,"list-item")); + [D "intnum"; S "."; D "dig"], (function [x;_;y] -> Dig(dig_value x ^ "," ^ dig_value y,"realnum") | _ -> failwith "digit_patterns8"); ] (* bez 1 i *2 *3 *4 mamy rec *) (* w morfeuszu zawsze num:pl?*) let digit_patterns2 = [ @@ -165,6 +165,7 @@ let compose_ordnum_lemma t interp = let digit_patterns3 = [ [S "-"; D "intnum"], (function [_;x] -> Dig("-" ^ dig_value x,"intnum") | _ -> failwith "digit_patterns10"); [S "-"; D "realnum"], (function [_;x] -> Dig("-" ^ dig_value x,"realnum") | _ -> failwith "digit_patterns10"); + [D "2dig"; S "-"; D "3dig"], (fun tokens -> Proper(concat_orths tokens,"postal-code",[[]],["postal-code"])); [D "intnum"; S "-"; D "intnum"], (function [x;_;y] -> Compound("intnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns11"); [D "realnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *) [D "intnum"; S "-"; D "realnum"], (function [x;_;y] -> Compound("realnum-interval",[x.token;y.token]) | _ -> failwith "digit_patterns12"); (* FIXME: konflikt z liczbami ujemnymi *) @@ -526,7 +527,7 @@ let match_token = function | CL, AllCap _ -> true | CL, SomeCap _ -> true | SL, SmallLetter _ -> true - | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) + (* | SL2, SmallLetter x -> x <> "o" && x <> "w" (* FIXME !!! *) *) | SL, CapLetter _ -> true | I pat, Interp s -> pat = s | _ -> false @@ -745,5 +746,11 @@ let rec set_next_id n = function let rec remove_spaces rev = function [] -> List.rev rev | x :: Token{token=Symbol " "; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) + | x :: Token{token=Symbol "\t"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) + | x :: Token{token=Symbol "\n"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) + | x :: Token{token=Symbol "\r"; next=n} :: l -> remove_spaces rev ((set_next_id n x) :: l) | Token{token=Symbol " "} :: l -> remove_spaces rev l + | Token{token=Symbol "\t"} :: l -> remove_spaces rev l + | Token{token=Symbol "\n"} :: l -> remove_spaces rev l + | Token{token=Symbol "\r"} :: l -> remove_spaces rev l | x :: l -> remove_spaces (x :: rev) l diff --git a/tokenizer/ENIAMtokenizerTypes.ml b/tokenizer/ENIAMtokenizerTypes.ml index 4e1b9f0..28f18fb 100644 --- a/tokenizer/ENIAMtokenizerTypes.ml +++ b/tokenizer/ENIAMtokenizerTypes.ml @@ -66,7 +66,7 @@ type tokens = | Variant of tokens list | Seq of tokens list -type pat = L | CL | SL | SL2 | D of string | C of string | S of string | RD of string | O of string | I of string +type pat = L | CL | SL | (*SL2 |*) D of string | C of string | S of string | RD of string | O of string | I of string let empty_token_env = { orth="";corr_orth="";beg=0;len=0;next=0; token=Symbol ""; attrs=[]; weight=0.} diff --git a/tokenizer/ENIAMtokens.ml b/tokenizer/ENIAMtokens.ml index 447d6d8..35e7de3 100644 --- a/tokenizer/ENIAMtokens.ml +++ b/tokenizer/ENIAMtokens.ml @@ -958,6 +958,7 @@ let rec recognize_sign_group poss_s_beg i = function let t,i = create_empty_sign_token i [Sign "»"] in Variant[Token{t with token=Interp "»"};Token{t with token=Interp "»s"}],i,l,poss_s_beg | (Sign "<") :: (Sign "<") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "<"] l (Interp "«") (* prawy cudzysłów *) + | (Sign "<") :: (Digit "3") :: l -> create_sign_token poss_s_beg i [Sign "<";Sign "3"] l (make_lemma ("<3","sinterj")) | (Sign "<") :: l -> (* prawy cudzysłów i element wzoru matematycznego *) let t,i = create_empty_sign_token i [Sign "<"] in Variant[Token{t with token=Interp "«"};Token{t with token=Symbol "<"}],i,l,poss_s_beg @@ -1014,6 +1015,9 @@ let rec recognize_sign_group poss_s_beg i = function | (Sign "²") :: l -> create_sign_token poss_s_beg i [Sign "²"] l (Symbol "²") | (Sign "°") :: l -> create_sign_token poss_s_beg i [Sign "°"] l (make_lemma ("stopień","subst:_:_:m3")) | (Sign "§") :: l -> create_sign_token false i [Sign "§"] l (make_lemma ("paragraf","subst:_:_:m3")) + | (Sign "\t") :: l -> create_sign_token poss_s_beg i [Sign "\t"] l (Symbol "\t") + | (Sign "\r") :: l -> create_sign_token poss_s_beg i [Sign "\r"] l (Symbol "\r") + | (Sign "\n") :: l -> create_sign_token poss_s_beg i [Sign "\n"] l (Symbol "\n") | (Sign s) :: l -> print_endline ("recognize_sign_group: " ^ s); create_sign_token poss_s_beg i [Sign s] l (Symbol s) | l -> failwith "recognize_sign_group" -- libgit2 0.22.2