From 59cdc551498fb155f90b97a52cde992cc90eb84c Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Wed, 29 Mar 2017 22:28:07 +0200 Subject: [PATCH] poprawki w LCGlexicon i walidacja tokenizacji --- LCGlexicon/ENIAM_LCGlexicon.ml | 4 ++-- LCGlexicon/ENIAM_LCGlexiconParser.ml | 24 ++++++++++++------------ LCGlexicon/ENIAMcategoriesPL.ml | 4 ++-- NKJP2/ENIAM_NKJP.ml | 14 ++++++++++++++ NKJP2/validateTokenizer.ml | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------ tokenizer/ENIAMtokens.ml | 4 ++-- tokenizer/TODO | 4 ++-- 7 files changed, 90 insertions(+), 32 deletions(-) diff --git a/LCGlexicon/ENIAM_LCGlexicon.ml b/LCGlexicon/ENIAM_LCGlexicon.ml index eb101f6..58cd955 100644 --- a/LCGlexicon/ENIAM_LCGlexicon.ml +++ b/LCGlexicon/ENIAM_LCGlexicon.ml @@ -58,7 +58,7 @@ let assign_quantifiers (selectors,rule,weight) = let pos = find_selector Pos selectors in let categories = try StringMap.find pos_categories pos - with Not_found -> failwith ("assign_quantifiers: " ^ pos) in + with Not_found -> failwith ("assign_quantifiers: unknown part of speech " ^ pos) in let categories = Xlist.map categories (fun s -> s,Top) in let syntax,rule = get_syntax [] rule in let quant,rule = get_quant [] rule in @@ -111,7 +111,7 @@ let make_rules x_flag filename = dict_of_grammar lexicon let find_rules rules cats = - let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith "find_rules 1" in + let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith ("find_rules: unable to find rules for category " ^ cats.pos) in (* Printf.printf "find_rules: %s %s |rules|=%d\n" cats.lemma cats.pos (Xlist.size rules); *) let rules = try StringMap.find lex_rules cats.lemma @ rules with Not_found -> rules in Xlist.fold rules [] (fun rules (selectors,syntax,semantics) -> diff --git a/LCGlexicon/ENIAM_LCGlexiconParser.ml b/LCGlexicon/ENIAM_LCGlexiconParser.ml index f342809..85ccaf2 100644 --- a/LCGlexicon/ENIAM_LCGlexiconParser.ml +++ b/LCGlexicon/ENIAM_LCGlexiconParser.ml @@ -152,7 +152,7 @@ let find_internal_grammar_symbols atoms = function | s -> if StringSet.mem selector_names s then B (AVar s) else if StringSet.mem atoms s then B (Atom s) else if StringSet.mem operators s then A s else - failwith ("find_internal_grammar_symbols: " ^ s) + failwith ("find_internal_grammar_symbols: unknown symbol " ^ s) let rec find_tensor = function B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: A "*" :: B s8 :: l -> failwith "find_tensor 1" @@ -163,7 +163,7 @@ let rec find_tensor = function | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: l -> C (Tensor[s1;s2;s3]) :: find_tensor l | B s1 :: A "*" :: B s2 :: l -> C (Tensor[s1;s2]) :: find_tensor l | B s1 :: l -> C (Tensor[s1]) :: find_tensor l - | A "*" :: _ -> failwith "find_tensor 2" + | A "*" :: _ -> failwith "find_tensor 2: unexpected '*'" | t :: l -> t :: find_tensor l | [] -> [] @@ -174,7 +174,7 @@ let rec find_plus = function | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: l -> C (Plus[s1;s2;s3;s4]) :: find_plus l | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: l -> C (Plus[s1;s2;s3]) :: find_plus l | C s1 :: A "+" :: C s2 :: l -> C (Plus[s1;s2]) :: find_plus l - | A "+" :: _ -> failwith "find_plus 2" + | A "+" :: _ -> failwith "find_plus 2: unexpected '+'" | t :: l -> t :: find_plus l | [] -> [] @@ -192,7 +192,7 @@ let rec find_imp = function let rec find_maybe = function | A "?" :: C s2 :: l -> C (Maybe s2) :: find_maybe l - | A "?" :: _ -> failwith "find_maybe 1" + | A "?" :: _ -> failwith "find_maybe 1: unexpected '?'" | s :: l -> s :: find_maybe l | [] -> [] @@ -203,11 +203,11 @@ let rec find_mult_imp = function | A "," :: A "/" :: C s2 :: l -> A "," :: D (Forward,s2) :: find_mult_imp l | A "," :: A "|" :: C s2 :: l -> A "," :: D (Both,s2) :: find_mult_imp l | A "," :: A "\\" :: C s2 :: l -> A "," :: D (Backward,s2) :: find_mult_imp l - | A "/" :: _ -> failwith "find_mult_imp 1" - | A "|" :: _ -> failwith "find_mult_imp 2" - | A "\\" :: _ -> failwith "find_mult_imp 3" - | A "(" :: _ -> failwith "find_mult_imp 4" - | A ")" :: _ -> failwith "find_mult_imp 5" + | A "/" :: _ -> failwith "find_mult_imp 1: unexpected '/'" + | A "|" :: _ -> failwith "find_mult_imp 2: unexpected '|'" + | A "\\" :: _ -> failwith "find_mult_imp 3: unexpected '\\'" + | A "(" :: _ -> failwith "find_mult_imp 4: unexpected '('" + | A ")" :: _ -> failwith "find_mult_imp 5: unexpected ')'" | s :: l -> s :: find_mult_imp l | [] -> [] @@ -220,9 +220,9 @@ let rec find_mult = function | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3] :: find_mult l | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "}" :: l -> E[s1,t1;s2,t2] :: find_mult l | A "{" :: D(s1,t1) :: A "}" :: l -> E[s1,t1] :: find_mult l - | A "{" :: _ -> failwith "find_mult 2" - | A "}" :: _ -> failwith "find_mult 3" - | A "," :: _ -> failwith "find_mult 4" + | A "{" :: _ -> failwith "find_mult 2: unexpected '{'" + | A "}" :: _ -> failwith "find_mult 3: unexpected '}'" + | A "," :: _ -> failwith "find_mult 4: unexpected ','" | t :: l -> t :: find_mult l | [] -> [] diff --git a/LCGlexicon/ENIAMcategoriesPL.ml b/LCGlexicon/ENIAMcategoriesPL.ml index 7f4555b..afa17a7 100644 --- a/LCGlexicon/ENIAMcategoriesPL.ml +++ b/LCGlexicon/ENIAMcategoriesPL.ml @@ -488,8 +488,8 @@ let pos_categories = Xlist.fold [ "ppron12",[Lemma;Number;Case;Gender;Person;]; "ppron3",[Lemma;Number;Case;Gender;Person;Praep;]; "siebie",[Lemma;Number;Case;Gender;Person;]; - "prep",[Lemma;Case;]; - "compar",[Lemma;Case;]; + "prep",[Lemma;Cat;Case;]; + "compar",[Lemma;Cat;Case;]; "num",[Lemma;Number;Case;Gender;Person;Acm;]; "intnum",[Lemma;Number;Case;Gender;Person;Acm;]; "realnum",[Lemma;Number;Case;Gender;Person;Acm;]; diff --git a/NKJP2/ENIAM_NKJP.ml b/NKJP2/ENIAM_NKJP.ml index 5882f85..e135ac0 100644 --- a/NKJP2/ENIAM_NKJP.ml +++ b/NKJP2/ENIAM_NKJP.ml @@ -401,6 +401,20 @@ let fold path s f = let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in f s (name,typ,channel,entries)) +let fold_selected path selection s f = + let names = get_folders path in + Xlist.fold names s (fun s name -> + if not (StringSet.mem selection name) then s else + (* print_endline name; *) + if name = "030-2-000000012" then s else + let typ,channel = load_header path name in + let text = load_text path name in + let segmentation = load_segmentation path name in + let morphosyntax = load_morphosyntax path name in + let named = load_named path name in + let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in + f s (name,typ,channel,entries)) + let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" let calculate_statistics stats typ channel entries = diff --git a/NKJP2/validateTokenizer.ml b/NKJP2/validateTokenizer.ml index 6b541f9..d78ad7d 100644 --- a/NKJP2/validateTokenizer.ml +++ b/NKJP2/validateTokenizer.ml @@ -28,13 +28,20 @@ let sencence_end = {empty_token_env with token=Interp "</sentence>"} let clause_beg = {empty_token_env with token=Interp "<clause>"} let clause_end = {empty_token_env with token=Interp "</clause>"} -type sent = SentBeg | SentEnd | Inside +type sent = SentBeg | SentEnd | Inside | SentBegEnd let set_sent_end = function - (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ -> + (Inside,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ -> (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l + | (SentBeg,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ -> + (SentBegEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l | _ -> failwith "set_sent_end" +let set_beg_as_zero = function + (sent,_,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l -> + (sent,0,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l + | [] -> failwith "set_beg_as_zero" + let flatten_sentences sentences = List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) -> set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) -> @@ -72,11 +79,17 @@ let is_space_required prev_orth prev_cat orth cat = let rec allign prev_orth prev_cat rev = function (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l -> allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l - | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign" + | (SentBegEnd,0,_,_,_,orth,lemma,cat,interp) :: l -> + allign orth cat (List.rev [query_beg;sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) l + | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith ("allign 1: " ^ orth) | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l -> let rev = if no_spaces > 0 then space :: rev else if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in + if sent = SentBegEnd then + let rev = (List.rev [sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) @ rev in + allign orth cat rev l + else let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in let rev = (make_token orth lemma cat interp) :: rev in let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in @@ -115,6 +128,13 @@ let rec get_next = function | Variant [] -> failwith "get_next" | Variant l -> get_next (List.hd l) +let rec get_beg = function + Token t -> t.beg + | Seq [] -> failwith "get_beg" + | Seq l -> get_beg (List.hd l) + | Variant [] -> failwith "get_beg" + | Variant l -> get_beg (List.hd l) + let make_seq = function [] -> failwith "make_seq" | [t] -> t @@ -130,7 +150,10 @@ let rec match_token_sequence erev nrev rev = function match_token_sequence (et :: erev) nrev rev (ets, nt :: nts) else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts) | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l) - | _ -> failwith "match_token_sequence" + | ets,nts -> + let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq ets)) in + let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq nts)) in + (*failwith*)print_endline (Printf.sprintf "match_token_sequence: %s\n\n%s\n" s t); [] let rec compare_tokens = function Token et, Token nt -> @@ -146,28 +169,49 @@ let rec compare_tokens = function | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false | _ -> false +let rec shift_token_rec beg = function + Token t -> Token{t with beg=t.beg-beg; next=t.next-beg} + | Seq l -> Seq(Xlist.map l (shift_token_rec beg)) + | Variant l -> Variant(Xlist.map l (shift_token_rec beg)) + +let shift_token t = + let beg = get_beg t in + shift_token_rec beg t + let validate stats name typ channel entries = - (* if name = "120-2-900066" then ( *) print_endline name; Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> + (* if id_div = 3 then *) Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> + (* Printf.printf "%d\t%s\n" id_div paragraph; *) let tokens = flatten_sentences sentences in - let tokens = allign "" "" [] tokens in + let tokens = allign "" "" [] (set_beg_as_zero tokens) in let paragraph = render_paragraph tokens in + (* Printf.printf "rend:\t%s\n" paragraph; *) let tokens = set_lengths 0 [] tokens in let tokens = set_special_tokens_lengths [] tokens in let tokens = ENIAMpatterns.remove_spaces [] tokens in let eniam_tokens = ENIAMtokenizer.parse paragraph in + (* Printf.printf "eniam_tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq eniam_tokens)); + Printf.printf "tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq tokens)); *) let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in Xlist.fold l stats (fun stats (eniam_token,nkjp_token) -> if compare_tokens (eniam_token,nkjp_token) then stats else ( - let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in - let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in - Printf.printf "%s\n%s\n\n%!" s t; - StringQMap.add stats (s ^ "\n" ^ t))))) + let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token eniam_token)) in + let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token nkjp_token)) in + (* Printf.printf "%s\n%s\n\n%!" s t; *) + StringQMap.add stats (s ^ "\n" ^ t)))) (*else stats*)) +let selection = StringSet.of_list ["040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; +"620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; +"620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; +"711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";] -(*let _ = +let _ = let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> validate stats name typ channel entries) in - ()*) + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection StringQMap.empty (fun stats (name,typ,channel,entries) -> + validate stats name typ channel entries) in *) + (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in + Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\n%s\n" v k); *) + () diff --git a/tokenizer/ENIAMtokens.ml b/tokenizer/ENIAMtokens.ml index 368708e..642b645 100644 --- a/tokenizer/ENIAMtokens.ml +++ b/tokenizer/ENIAMtokens.ml @@ -768,12 +768,12 @@ let rec recognize_sign_group poss_s_beg i = function Token{empty_token_env with beg=i+20;len=factor-20;next=i+factor;token=Interp "<clause>"}],i+factor,l,false*) | (Sign "'") :: (Sign "'") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "'";Sign "'"] l | (Sign "'") :: (Sign "'") :: l -> - let t,i = create_empty_sign_token i [Sign "”"] in + let t,i = create_empty_sign_token i [Sign "'";Sign "'"] in Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg | (Sign "'") :: l -> create_sign_token poss_s_beg i [Sign "'"] l (Symbol "’") | (Sign "’") :: (Sign "’") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "’";Sign "’"] l | (Sign "’") :: (Sign "’") :: l -> - let t,i = create_empty_sign_token i [Sign "”"] in + let t,i = create_empty_sign_token i [Sign "’";Sign "’"] in Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg | (Sign "’") :: l -> create_sign_token poss_s_beg i [Sign "’"] l (Symbol "’") | (Sign ";") :: (Sign "*") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "*") :: []) l (make_lemma (";*","sinterj")) diff --git a/tokenizer/TODO b/tokenizer/TODO index 1fc9fe0..da12c37 100644 --- a/tokenizer/TODO +++ b/tokenizer/TODO @@ -1,4 +1,4 @@ -- liczba przed kropką kończącą zdanie jest interpretowana jako ordnum +- liczba przed kropką kończącą zdanie jest interpretowana jako ordnum - to WAŻNE !!! - przenieść ustalanie weight do następnego modułu @@ -7,4 +7,4 @@ Odkryłem gdzie jest problem z kodowaniem utf. Mianowicie dla zdania "Fan: Niech nie straszą, Że to bomba jest kalorii." preprocesor zwraca segmentacje w której jeden z segmentów zawiera pole lemma z niepoprawnym znakiem. Pole wygląda następująco: "Ż\BCe". -- przecinek "," nie jest traktowany jako Symbol a jedynie jako Interp, co może stwarzać problemy przy parsowaniu MWE uwzględniającym fleksję. +- przecinek "," nie jest traktowany jako Symbol a jedynie jako Interp, co może stwarzać problemy przy parsowaniu MWE uwzględniającym fleksję. -- libgit2 0.22.2