From d06dc00b8867239a75db6a4d8e428f2a36f0117b Mon Sep 17 00:00:00 2001 From: Wojciech Jaworski <wjaworski@mimuw.edu.pl> Date: Wed, 29 Mar 2017 10:58:45 +0200 Subject: [PATCH] generowanie korpusu literówek --- NKJP2/.gitignore | 2 ++ NKJP2/makefile | 6 +++++- NKJP2/spelling.ml | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ NKJP2/validateTokenizer.ml | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------------------------------------------------------- resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 | Bin 0 -> 2898551 bytes 5 files changed, 136 insertions(+), 74 deletions(-) create mode 100644 NKJP2/spelling.ml create mode 100644 resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 diff --git a/NKJP2/.gitignore b/NKJP2/.gitignore index 9daeafb..b4b7473 100644 --- a/NKJP2/.gitignore +++ b/NKJP2/.gitignore @@ -1 +1,3 @@ test +NKJP1M_spelling_errors/* +spelling diff --git a/NKJP2/makefile b/NKJP2/makefile index 79cb7a6..196bfc1 100755 --- a/NKJP2/makefile +++ b/NKJP2/makefile @@ -11,6 +11,10 @@ SOURCES=ENIAM_NKJP.ml validateTokenizer.ml all: $(SOURCES) $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ +spelling: $(SOURCES) spelling.ml + mkdir -p NKJP1M_spelling_errors + $(OCAMLOPT) -o spelling $(OCAMLOPTFLAGS) $^ + # install: # mkdir -p /usr/share/eniam/Walenty # cp resources/* /usr/share/eniam/Walenty @@ -40,4 +44,4 @@ all: $(SOURCES) $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< clean: - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test spelling diff --git a/NKJP2/spelling.ml b/NKJP2/spelling.ml new file mode 100644 index 0000000..c039e9e --- /dev/null +++ b/NKJP2/spelling.ml @@ -0,0 +1,53 @@ +(* + * ENIAM_NKJP, an interface for National Corpus of Polish (NKJP). + * Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> + * Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences + * + * This library is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + *) + +let xml_space = Xml.PCData " " +let xml_err_space = Xml.Element("sp",[],[]) + +let make_xml_token real_orth orth = + if real_orth = orth then Xml.PCData orth else + Xml.Element("err",["cor",orth],[Xml.PCData real_orth]) + +let rec merge_pcdata = function + Xml.PCData a :: Xml.PCData b :: l -> merge_pcdata (Xml.PCData(a ^ b) :: l) + | x :: l -> x :: (merge_pcdata l) + | [] -> [] + +let generate_error_sentences sentences = + let sentences,_,_ = Xlist.fold sentences ([],"","") (fun (sentences,prev_orth,prev_cat) (id_s,tokens,named_tokens) -> + let no_tokens = Xlist.size tokens in + let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) -> + let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in + let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space:: tokens else tokens in + (make_xml_token real_orth orth) :: tokens, orth, cat) in + Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in + Xml.Element("p",[],List.rev sentences) + +let generate_error_corpus path out_path = + ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> + (* print_endline name; *) + let entries = List.rev (Xlist.rev_map entries (fun (id_div,has_ne,paragraphs) -> + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun (paragraph,sentences) -> + generate_error_sentences sentences)) in + Xml.Element("div",["id",string_of_int id_div],paragraphs))) in + let xml = Xml.Element("source",["id",name;"type",typ;"channel",channel],entries) in + File.file_out (out_path ^ name ^ ".xml") (fun file -> + output_string file (Xml.to_string_fmt xml))) + +let _ = generate_error_corpus ENIAM_NKJP.nkjp_path "NKJP1M_spelling_errors/" diff --git a/NKJP2/validateTokenizer.ml b/NKJP2/validateTokenizer.ml index 4832da7..6b541f9 100644 --- a/NKJP2/validateTokenizer.ml +++ b/NKJP2/validateTokenizer.ml @@ -45,11 +45,29 @@ let make_token orth lemma cat interp = orth=orth; token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])} -let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] +let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] (* let prefixes = StringSet.of_list [ (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'"; "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *) +let is_space_required prev_orth prev_cat orth cat = + if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else ( + let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in + let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in + match prev_char,cur_char with + Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true + | _,Xunicode.Sign _ -> false + | Xunicode.Sign _,_ -> false + | Xunicode.Digit _,Xunicode.Digit _ -> true + | Xunicode.Digit _,_ -> false + | _,Xunicode.Digit _ -> false + | Xunicode.Small _,Xunicode.Small _ -> true + | Xunicode.ForeignSmall _,Xunicode.Small _ -> true + | Xunicode.Capital _,Xunicode.Capital _ -> true + | Xunicode.Small _,Xunicode.Capital _ -> true + | Xunicode.Capital _,Xunicode.Small _ -> true + | Xunicode.ForeignCapital _,Xunicode.Small _ -> true + | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b)) let rec allign prev_orth prev_cat rev = function (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l -> @@ -57,11 +75,8 @@ let rec allign prev_orth prev_cat rev = function | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign" | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l -> let rev = - if no_spaces > 0 then space :: rev - else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev - else ( - (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *) - space :: rev) in + if no_spaces > 0 then space :: rev else + if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in let rev = (make_token orth lemma cat interp) :: rev in let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in @@ -76,6 +91,20 @@ let rec set_lengths n rev = function set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l | [] -> List.rev rev +(* FIXME: poprawić interpretacje przecinka i innych znaków interpunkcyjnych *) +let rec set_special_tokens_lengths rev = function + ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l -> + let sent = {sent with len=1; next=sent.beg+1} in + let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in + let t = {t with beg=t.beg+2; len=t.len-2} in + set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l + | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l -> + let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in + let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in + set_special_tokens_lengths (Token dot :: Token cl :: rev) l + | t :: l -> set_special_tokens_lengths (Token t :: rev) l + | [] -> List.rev rev + let render_paragraph tokens = String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth))) @@ -86,68 +115,36 @@ let rec get_next = function | Variant [] -> failwith "get_next" | Variant l -> get_next (List.hd l) -let rec match_tokens erev nrev rev = function +let make_seq = function + [] -> failwith "make_seq" + | [t] -> t + | l -> Seq l + +let rec match_token_sequence erev nrev rev = function et :: ets, nt :: nts -> - let next = get_next et in - if next = nt.next then - match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts) - else if next < nt.next then - match_tokens (et :: erev) nrev rev (ets, nt :: nts) - else match_tokens erev (nt :: nrev) rev (et :: ets, nts) - | [],[] -> List.rev rev - | _ -> failwith "match_tokens" - -(* let compare_token et t = - et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token - -let get_beg = function - Token t -> t.beg - | Seq [] -> failwith "get_beg" - | Seq l -> get_beg (List.hd l) - | Variant [] -> failwith "get_next" - | Variant l -> get_beg (List.hd l) - -let rec compare_tokens stats = function - Token et :: ets, t :: ts -> - if compare_token et t then compare_tokens stats (ets,ts) else ( - Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t); - stats) - | Variant l :: ets, ts -> failwith "compare_tokens 4" - | Seq l :: ets, ts -> failwith "compare_tokens 3" - | [], ts -> failwith "compare_tokens 2" - | _, [] -> failwith "compare_tokens 1" - -let rec get_subsequence_rec next rev = function - t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens - | [] -> failwith "get_subsequence_rec" - -let get_subsequence beg next = function - t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2" - | [] -> failwith "get_subsequence 1" - -let compare_token stats tokens = function - Token et :: ets, t :: ts -> - if compare_token et t then compare_tokens stats (ets,ts) else ( - Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t); - stats) - | Variant l :: ets, ts -> failwith "compare_tokens 4" - | Seq l :: ets, ts -> failwith "compare_tokens 3" - | [], ts -> failwith "compare_tokens 2" - | _, [] -> failwith "compare_tokens 1" - -let rec compare_tokens stats tokens = function - et :: ets -> - let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in - compare_token stats ts et - | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*) - -let rec compare_tokens stats = function - (ets,nts) :: l -> - Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et)); - Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt)); - print_endline ""; - compare_tokens stats l - | [] -> stats + let enext = get_next et in + let nnext = get_next nt in + if enext = nnext then + match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts) + else if enext < nnext then + match_token_sequence (et :: erev) nrev rev (ets, nt :: nts) + else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts) + | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l) + | _ -> failwith "match_token_sequence" + +let rec compare_tokens = function + Token et, Token nt -> + et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next + | et,Variant l -> + Xlist.fold l true (fun b nt -> + compare_tokens (et,nt) && b) + | Variant l,nt -> + Xlist.fold l false (fun b et -> + compare_tokens (et,nt) || b) + | Seq[et], nt -> compare_tokens (et,nt) + | et, Seq[nt] -> compare_tokens (et,nt) + | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false + | _ -> false let validate stats name typ channel entries = (* if name = "120-2-900066" then ( *) @@ -156,15 +153,21 @@ let validate stats name typ channel entries = Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> let tokens = flatten_sentences sentences in let tokens = allign "" "" [] tokens in - let tokens = set_lengths 0 [] tokens in let paragraph = render_paragraph tokens in - let tokens = remove_spaces [] tokens in + let tokens = set_lengths 0 [] tokens in + let tokens = set_special_tokens_lengths [] tokens in + let tokens = ENIAMpatterns.remove_spaces [] tokens in let eniam_tokens = ENIAMtokenizer.parse paragraph in - let l = match_tokens [] [] [] (eniam_tokens,tokens) in - compare_tokens stats l)) + let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in + Xlist.fold l stats (fun stats (eniam_token,nkjp_token) -> + if compare_tokens (eniam_token,nkjp_token) then stats else ( + let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in + let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in + Printf.printf "%s\n%s\n\n%!" s t; + StringQMap.add stats (s ^ "\n" ^ t))))) -let _ = +(*let _ = let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> validate stats name typ channel entries) in - () + ()*) diff --git a/resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 b/resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 new file mode 100644 index 0000000..f8529e3 Binary files /dev/null and b/resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 differ -- libgit2 0.22.2