From d06dc00b8867239a75db6a4d8e428f2a36f0117b Mon Sep 17 00:00:00 2001
From: Wojciech Jaworski <wjaworski@mimuw.edu.pl>
Date: Wed, 29 Mar 2017 10:58:45 +0200
Subject: [PATCH] generating the spelling-error corpus
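
Add NKJP2/spelling.ml, a tool that walks the NKJP corpus
(ENIAM_NKJP.fold) and writes one XML file per source into
NKJP1M_spelling_errors/; a pre-generated archive is checked in as
resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 and a new make target,
spelling, builds the tool. Each paragraph becomes a <p> element whose
sentences keep the original orthography: tokens whose text differs from
the corrected form are wrapped in <err cor="..."> elements, and places
where a required space is missing are marked with <sp/>.

The space heuristic from validateTokenizer.ml is factored out into
is_space_required so that both tools share it. The validator now also
assigns positions to the sentence/clause boundary markers, aligns the
ENIAM and NKJP token sequences with match_token_sequence, and counts
mismatches in a StringQMap; its standalone entry point is commented out
for now.

As a rough illustration (an invented sentence; attribute values are
placeholders), a generated file has the following shape, with the
length attribute holding the sentence's token count:

  <source id="..." type="..." channel="...">
    <div id="1">
      <p>
        <s id="..." length="6">Ala ma <err cor="kota">kta</err> i<sp/>psa.</s>
      </p>
    </div>
  </source>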

---
 NKJP2/.gitignore                                |   2 ++
 NKJP2/makefile                                  |   6 +++++-
 NKJP2/spelling.ml                               |  53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 NKJP2/validateTokenizer.ml                      | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------
 resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 | Bin 0 -> 2898551 bytes
 5 files changed, 136 insertions(+), 74 deletions(-)
 create mode 100644 NKJP2/spelling.ml
 create mode 100644 resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2

diff --git a/NKJP2/.gitignore b/NKJP2/.gitignore
index 9daeafb..b4b7473 100644
--- a/NKJP2/.gitignore
+++ b/NKJP2/.gitignore
@@ -1 +1,3 @@
 test
+NKJP1M_spelling_errors/*
+spelling
diff --git a/NKJP2/makefile b/NKJP2/makefile
index 79cb7a6..196bfc1 100755
--- a/NKJP2/makefile
+++ b/NKJP2/makefile
@@ -11,6 +11,10 @@ SOURCES=ENIAM_NKJP.ml validateTokenizer.ml
 all: $(SOURCES)
 	$(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
 
+spelling: $(SOURCES) spelling.ml
+	mkdir -p NKJP1M_spelling_errors
+	$(OCAMLOPT) -o spelling $(OCAMLOPTFLAGS) $^
+
 # install:
 # 	mkdir -p /usr/share/eniam/Walenty
 # 	cp resources/*  /usr/share/eniam/Walenty
@@ -40,4 +44,4 @@ all: $(SOURCES)
 	$(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
 
 clean:
-	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
+	rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test spelling
diff --git a/NKJP2/spelling.ml b/NKJP2/spelling.ml
new file mode 100644
index 0000000..c039e9e
--- /dev/null
+++ b/NKJP2/spelling.ml
@@ -0,0 +1,53 @@
+(*
+ *  ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
+ *  Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
+ *  Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
+ *
+ *  This library is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Lesser General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *)
+
+let xml_space = Xml.PCData " "
+let xml_err_space = Xml.Element("sp",[],[])
+
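+(* Emit the token text as it appears in the source; when it differs from the corrected orthography, wrap it in an <err> element whose cor attribute carries the correction. *)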
+let make_xml_token real_orth orth =
+  if real_orth = orth then Xml.PCData orth else
+  Xml.Element("err",["cor",orth],[Xml.PCData real_orth])
+
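+(* Concatenate adjacent PCData nodes so that plain text between error elements forms a single text node. *)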
+let rec merge_pcdata = function
+    Xml.PCData a :: Xml.PCData b :: l -> merge_pcdata (Xml.PCData(a ^ b) :: l)
+  | x :: l -> x :: (merge_pcdata l)
+  | [] -> []
+
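+(* Rebuild a paragraph as a <p> element: each sentence becomes an <s> element (with id and token count) holding the original text, with <err> marking misspelled tokens and <sp/> marking places where a required space is missing. *)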
+let generate_error_sentences sentences =
+  let sentences,_,_ = Xlist.fold sentences ([],"","") (fun (sentences,prev_orth,prev_cat) (id_s,tokens,named_tokens) ->
+    let no_tokens = Xlist.size tokens in
+    let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) ->
+      let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in
+      let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space :: tokens else tokens in
+      (make_xml_token real_orth orth) :: tokens, orth, cat) in
+    Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in
+  Xml.Element("p",[],List.rev sentences)
+
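+(* Generate the error corpus: one XML file per NKJP source, written to out_path. *)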
+let generate_error_corpus path out_path =
+  ENIAM_NKJP.fold path () (fun () (name,typ,channel,entries) ->
+    (* print_endline name; *)
+    let entries = List.rev (Xlist.rev_map entries (fun (id_div,has_ne,paragraphs) ->
+      let paragraphs = List.rev (Xlist.rev_map paragraphs (fun (paragraph,sentences) ->
+        generate_error_sentences sentences)) in
+      Xml.Element("div",["id",string_of_int id_div],paragraphs))) in
+    let xml = Xml.Element("source",["id",name;"type",typ;"channel",channel],entries) in
+    File.file_out (out_path ^ name ^ ".xml") (fun file ->
+      output_string file (Xml.to_string_fmt xml)))
+
+let _ = generate_error_corpus ENIAM_NKJP.nkjp_path "NKJP1M_spelling_errors/"
diff --git a/NKJP2/validateTokenizer.ml b/NKJP2/validateTokenizer.ml
index 4832da7..6b541f9 100644
--- a/NKJP2/validateTokenizer.ml
+++ b/NKJP2/validateTokenizer.ml
@@ -45,11 +45,29 @@ let make_token orth lemma cat interp =
          orth=orth;
          token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])}
 
-let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
+let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
 (* let prefixes = StringSet.of_list [
   (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
   "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)
 
+(* Decide whether a space would normally be required between two adjacent tokens that the corpus records without one, based on the token categories and the classes of the neighbouring characters. *)
+let is_space_required prev_orth prev_cat orth cat =
+  if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else (
+  let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in
+  let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in
+  match prev_char,cur_char with
+    Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true
+  | _,Xunicode.Sign _ -> false
+  | Xunicode.Sign _,_ -> false
+  | Xunicode.Digit _,Xunicode.Digit _ -> true
+  | Xunicode.Digit _,_ -> false
+  | _,Xunicode.Digit _ -> false
+  | Xunicode.Small _,Xunicode.Small _ -> true
+  | Xunicode.ForeignSmall _,Xunicode.Small _ -> true
+  | Xunicode.Capital _,Xunicode.Capital _ -> true
+  | Xunicode.Small _,Xunicode.Capital _ -> true
+  | Xunicode.Capital _,Xunicode.Small _ -> true
+  | Xunicode.ForeignCapital _,Xunicode.Small _ -> true
+  | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b))
 
 let rec allign prev_orth prev_cat rev = function
     (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
@@ -57,11 +75,8 @@ let rec allign prev_orth prev_cat rev = function
   | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
   | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
        let rev =
-         if no_spaces > 0 then space :: rev
-         else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev
-         else (
-         (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *)
-         space :: rev) in
+         if no_spaces > 0 then space :: rev else
+         if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in
        let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
        let rev = (make_token orth lemma cat interp) :: rev in
        let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
@@ -76,6 +91,20 @@ let rec set_lengths n rev = function
        set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l
   | [] -> List.rev rev
 
+(* FIXME: fix the interpretation of the comma and other punctuation marks *)
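+(* Give the sentence/clause boundary markers a positional extent: at a sentence beginning they take over the first positions of the following token; at a sentence end the final dot is merged with the closing sentence marker and its extent is shared with the closing clause marker. *)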
+let rec set_special_tokens_lengths rev = function
+    ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l ->
+       let sent = {sent with len=1; next=sent.beg+1} in
+       let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in
+       let t = {t with beg=t.beg+2; len=t.len-2} in
+       set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l
+  | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l ->
+       let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in
+       let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in
+       set_special_tokens_lengths (Token dot :: Token cl :: rev) l
+  | t :: l -> set_special_tokens_lengths (Token t :: rev) l
+  | [] -> List.rev rev
+
 let render_paragraph tokens =
   String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth)))
 
@@ -86,68 +115,36 @@ let rec get_next = function
   | Variant [] -> failwith "get_next"
   | Variant l -> get_next (List.hd l)
 
-let rec match_tokens erev nrev rev = function
+let make_seq = function
+    [] -> failwith "make_seq"
+  | [t] -> t
+  | l -> Seq l
+
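+(* Align the ENIAM token sequence with the NKJP one: consecutive tokens are grouped until both sides reach the same end position, and each group is collapsed with make_seq, yielding a list of paired token bundles. *)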
+let rec match_token_sequence erev nrev rev = function
     et :: ets, nt :: nts ->
-      let next = get_next et in
-      if next = nt.next then
-        match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
-      else if next < nt.next then
-        match_tokens (et :: erev) nrev rev (ets, nt :: nts)
-      else match_tokens erev (nt :: nrev) rev (et :: ets, nts)
-  | [],[] -> List.rev rev
-  | _ -> failwith "match_tokens"
-
-(*  let compare_token et t =
-  et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token
-
-let get_beg = function
-    Token t -> t.beg
-  | Seq [] -> failwith "get_beg"
-  | Seq l -> get_beg (List.hd l)
-  | Variant [] -> failwith "get_next"
-  | Variant l -> get_beg (List.hd l)
-
-let rec compare_tokens stats = function
-    Token et :: ets, t :: ts ->
-       if compare_token et t then compare_tokens stats (ets,ts) else (
-       Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
-       stats)
-  | Variant l :: ets, ts -> failwith "compare_tokens 4"
-  | Seq l :: ets, ts -> failwith "compare_tokens 3"
-  | [], ts -> failwith "compare_tokens 2"
-  | _, [] -> failwith "compare_tokens 1"
-
-let rec get_subsequence_rec next rev = function
-    t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens
-  | [] -> failwith "get_subsequence_rec"
-
-let get_subsequence beg next = function
-    t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2"
-  | [] -> failwith "get_subsequence 1"
-
-let compare_token stats tokens = function
-    Token et :: ets, t :: ts ->
-       if compare_token et t then compare_tokens stats (ets,ts) else (
-       Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
-       stats)
-  | Variant l :: ets, ts -> failwith "compare_tokens 4"
-  | Seq l :: ets, ts -> failwith "compare_tokens 3"
-  | [], ts -> failwith "compare_tokens 2"
-  | _, [] -> failwith "compare_tokens 1"
-
-let rec compare_tokens stats tokens = function
-    et :: ets ->
-      let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in
-      compare_token stats ts et
-  | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*)
-
-let rec compare_tokens stats = function
-    (ets,nts) :: l ->
-       Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et));
-       Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt));
-       print_endline "";
-       compare_tokens stats l
-  | [] -> stats
+      let enext = get_next et in
+      let nnext = get_next nt in
+      if enext = nnext then
+        match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
+      else if enext < nnext then
+        match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
+      else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
+  | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
+  | _ -> failwith "match_token_sequence"
+
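+(* Structural comparison of an ENIAM token with an NKJP token: orth and positions must agree; a Variant on the NKJP side must be matched by every alternative, a Variant on the ENIAM side by at least one. *)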
+let rec compare_tokens = function
+    Token et, Token nt ->
+       et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next
+  | et,Variant l ->
+       Xlist.fold l true (fun b nt ->
+         compare_tokens (et,nt) && b)
+  | Variant l,nt ->
+       Xlist.fold l false (fun b et ->
+         compare_tokens (et,nt) || b)
+  | Seq[et], nt -> compare_tokens (et,nt)
+  | et, Seq[nt] -> compare_tokens (et,nt)
+  | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
+  | _ -> false
 
 let validate stats name typ channel entries =
   (* if name = "120-2-900066" then ( *)
@@ -156,15 +153,21 @@ let validate stats name typ channel entries =
     Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
       let tokens = flatten_sentences sentences in
       let tokens = allign "" "" [] tokens in
-      let tokens = set_lengths 0 [] tokens in
       let paragraph = render_paragraph tokens in
-      let tokens = remove_spaces [] tokens in
+      let tokens = set_lengths 0 [] tokens in
+      let tokens = set_special_tokens_lengths [] tokens in
+      let tokens = ENIAMpatterns.remove_spaces [] tokens in
       let eniam_tokens = ENIAMtokenizer.parse paragraph in
-      let l = match_tokens [] [] [] (eniam_tokens,tokens) in
-      compare_tokens stats l))
+      let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in
+      Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
+        if compare_tokens (eniam_token,nkjp_token) then stats else (
+          let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in
+          let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in
+          Printf.printf "%s\n%s\n\n%!"  s t;
+          StringQMap.add stats (s ^ "\n" ^ t)))))
 
 
-let _ =
+(*let _ =
   let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
     validate stats name typ channel entries) in
-  ()
+  ()*)
diff --git a/resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 b/resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2
new file mode 100644
index 0000000..f8529e3
Binary files /dev/null and b/resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 differ
--
libgit2 0.22.2