Commit d06dc00b8867239a75db6a4d8e428f2a36f0117b

Authored by Wojciech Jaworski
1 parent f86c717e

generowanie korpusu literówek

NKJP2/.gitignore
1 1 test
  2 +NKJP1M_spelling_errors/*
  3 +spelling
... ...
NKJP2/makefile
... ... @@ -11,6 +11,10 @@ SOURCES=ENIAM_NKJP.ml validateTokenizer.ml
11 11 all: $(SOURCES)
12 12 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
13 13  
  14 +spelling: $(SOURCES) spelling.ml
  15 + mkdir -p NKJP1M_spelling_errors
  16 + $(OCAMLOPT) -o spelling $(OCAMLOPTFLAGS) $^
  17 +
14 18 # install:
15 19 # mkdir -p /usr/share/eniam/Walenty
16 20 # cp resources/* /usr/share/eniam/Walenty
... ... @@ -40,4 +44,4 @@ all: $(SOURCES)
40 44 $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $<
41 45  
42 46 clean:
43   - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test
  47 + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test spelling
... ...
NKJP2/spelling.ml 0 → 100644
  1 +(*
  2 + * ENIAM_NKJP, an interface for National Corpus of Polish (NKJP).
  3 + * Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
  4 + * Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences
  5 + *
  6 + * This library is free software: you can redistribute it and/or modify
  7 + * it under the terms of the GNU Lesser General Public License as published by
  8 + * the Free Software Foundation, either version 3 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This library is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU Lesser General Public License for more details.
  15 + *
  16 + * You should have received a copy of the GNU Lesser General Public License
  17 + * along with this program. If not, see <http://www.gnu.org/licenses/>.
  18 + *)
  19 +
  20 +let xml_space = Xml.PCData " "
  21 +let xml_err_space = Xml.Element("sp",[],[])
  22 +
  23 +let make_xml_token real_orth orth =
  24 + if real_orth = orth then Xml.PCData orth else
  25 + Xml.Element("err",["cor",orth],[Xml.PCData real_orth])
  26 +
  27 +let rec merge_pcdata = function
  28 + Xml.PCData a :: Xml.PCData b :: l -> merge_pcdata (Xml.PCData(a ^ b) :: l)
  29 + | x :: l -> x :: (merge_pcdata l)
  30 + | [] -> []
  31 +
  32 +let generate_error_sentences sentences =
  33 + let sentences,_,_ = Xlist.fold sentences ([],"","") (fun (sentences,prev_orth,prev_cat) (id_s,tokens,named_tokens) ->
  34 + let no_tokens = Xlist.size tokens in
  35 + let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) ->
  36 + let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in
  37 + let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space:: tokens else tokens in
  38 + (make_xml_token real_orth orth) :: tokens, orth, cat) in
  39 + Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in
  40 + Xml.Element("p",[],List.rev sentences)
  41 +
  42 +let generate_error_corpus path out_path =
  43 + ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) ->
  44 + (* print_endline name; *)
  45 + let entries = List.rev (Xlist.rev_map entries (fun (id_div,has_ne,paragraphs) ->
  46 + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun (paragraph,sentences) ->
  47 + generate_error_sentences sentences)) in
  48 + Xml.Element("div",["id",string_of_int id_div],paragraphs))) in
  49 + let xml = Xml.Element("source",["id",name;"type",typ;"channel",channel],entries) in
  50 + File.file_out (out_path ^ name ^ ".xml") (fun file ->
  51 + output_string file (Xml.to_string_fmt xml)))
  52 +
  53 +let _ = generate_error_corpus ENIAM_NKJP.nkjp_path "NKJP1M_spelling_errors/"
... ...
NKJP2/validateTokenizer.ml
... ... @@ -45,11 +45,29 @@ let make_token orth lemma cat interp =
45 45 orth=orth;
46 46 token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])}
47 47  
48   -let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
  48 +let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ]
49 49 (* let prefixes = StringSet.of_list [
50 50 (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'";
51 51 "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *)
52 52  
  53 +let is_space_required prev_orth prev_cat orth cat =
  54 + if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else (
  55 + let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in
  56 + let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in
  57 + match prev_char,cur_char with
  58 + Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true
  59 + | _,Xunicode.Sign _ -> false
  60 + | Xunicode.Sign _,_ -> false
  61 + | Xunicode.Digit _,Xunicode.Digit _ -> true
  62 + | Xunicode.Digit _,_ -> false
  63 + | _,Xunicode.Digit _ -> false
  64 + | Xunicode.Small _,Xunicode.Small _ -> true
  65 + | Xunicode.ForeignSmall _,Xunicode.Small _ -> true
  66 + | Xunicode.Capital _,Xunicode.Capital _ -> true
  67 + | Xunicode.Small _,Xunicode.Capital _ -> true
  68 + | Xunicode.Capital _,Xunicode.Small _ -> true
  69 + | Xunicode.ForeignCapital _,Xunicode.Small _ -> true
  70 + | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b))
53 71  
54 72 let rec allign prev_orth prev_cat rev = function
55 73 (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
... ... @@ -57,11 +75,8 @@ let rec allign prev_orth prev_cat rev = function
57 75 | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
58 76 | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
59 77 let rev =
60   - if no_spaces > 0 then space :: rev
61   - else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev
62   - else (
63   - (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *)
64   - space :: rev) in
  78 + if no_spaces > 0 then space :: rev else
  79 + if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in
65 80 let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
66 81 let rev = (make_token orth lemma cat interp) :: rev in
67 82 let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
... ... @@ -76,6 +91,20 @@ let rec set_lengths n rev = function
76 91 set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l
77 92 | [] -> List.rev rev
78 93  
  94 +(* FIXME: poprawić interpretacje przecinka i innych znaków interpunkcyjnych *)
  95 +let rec set_special_tokens_lengths rev = function
  96 + ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l ->
  97 + let sent = {sent with len=1; next=sent.beg+1} in
  98 + let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in
  99 + let t = {t with beg=t.beg+2; len=t.len-2} in
  100 + set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l
  101 + | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l ->
  102 + let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in
  103 + let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in
  104 + set_special_tokens_lengths (Token dot :: Token cl :: rev) l
  105 + | t :: l -> set_special_tokens_lengths (Token t :: rev) l
  106 + | [] -> List.rev rev
  107 +
79 108 let render_paragraph tokens =
80 109 String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth)))
81 110  
... ... @@ -86,68 +115,36 @@ let rec get_next = function
86 115 | Variant [] -> failwith "get_next"
87 116 | Variant l -> get_next (List.hd l)
88 117  
89   -let rec match_tokens erev nrev rev = function
  118 +let make_seq = function
  119 + [] -> failwith "make_seq"
  120 + | [t] -> t
  121 + | l -> Seq l
  122 +
  123 +let rec match_token_sequence erev nrev rev = function
90 124 et :: ets, nt :: nts ->
91   - let next = get_next et in
92   - if next = nt.next then
93   - match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
94   - else if next < nt.next then
95   - match_tokens (et :: erev) nrev rev (ets, nt :: nts)
96   - else match_tokens erev (nt :: nrev) rev (et :: ets, nts)
97   - | [],[] -> List.rev rev
98   - | _ -> failwith "match_tokens"
99   -
100   -(* let compare_token et t =
101   - et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token
102   -
103   -let get_beg = function
104   - Token t -> t.beg
105   - | Seq [] -> failwith "get_beg"
106   - | Seq l -> get_beg (List.hd l)
107   - | Variant [] -> failwith "get_next"
108   - | Variant l -> get_beg (List.hd l)
109   -
110   -let rec compare_tokens stats = function
111   - Token et :: ets, t :: ts ->
112   - if compare_token et t then compare_tokens stats (ets,ts) else (
113   - Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
114   - stats)
115   - | Variant l :: ets, ts -> failwith "compare_tokens 4"
116   - | Seq l :: ets, ts -> failwith "compare_tokens 3"
117   - | [], ts -> failwith "compare_tokens 2"
118   - | _, [] -> failwith "compare_tokens 1"
119   -
120   -let rec get_subsequence_rec next rev = function
121   - t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens
122   - | [] -> failwith "get_subsequence_rec"
123   -
124   -let get_subsequence beg next = function
125   - t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2"
126   - | [] -> failwith "get_subsequence 1"
127   -
128   -let compare_token stats tokens = function
129   - Token et :: ets, t :: ts ->
130   - if compare_token et t then compare_tokens stats (ets,ts) else (
131   - Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t);
132   - stats)
133   - | Variant l :: ets, ts -> failwith "compare_tokens 4"
134   - | Seq l :: ets, ts -> failwith "compare_tokens 3"
135   - | [], ts -> failwith "compare_tokens 2"
136   - | _, [] -> failwith "compare_tokens 1"
137   -
138   -let rec compare_tokens stats tokens = function
139   - et :: ets ->
140   - let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in
141   - compare_token stats ts et
142   - | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*)
143   -
144   -let rec compare_tokens stats = function
145   - (ets,nts) :: l ->
146   - Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et));
147   - Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt));
148   - print_endline "";
149   - compare_tokens stats l
150   - | [] -> stats
  125 + let enext = get_next et in
  126 + let nnext = get_next nt in
  127 + if enext = nnext then
  128 + match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
  129 + else if enext < nnext then
  130 + match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
  131 + else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
  132 + | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
  133 + | _ -> failwith "match_token_sequence"
  134 +
  135 +let rec compare_tokens = function
  136 + Token et, Token nt ->
  137 + et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next
  138 + | et,Variant l ->
  139 + Xlist.fold l true (fun b nt ->
  140 + compare_tokens (et,nt) && b)
  141 + | Variant l,nt ->
  142 + Xlist.fold l false (fun b et ->
  143 + compare_tokens (et,nt) || b)
  144 + | Seq[et], nt -> compare_tokens (et,nt)
  145 + | et, Seq[nt] -> compare_tokens (et,nt)
  146 + | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
  147 + | _ -> false
151 148  
152 149 let validate stats name typ channel entries =
153 150 (* if name = "120-2-900066" then ( *)
... ... @@ -156,15 +153,21 @@ let validate stats name typ channel entries =
156 153 Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
157 154 let tokens = flatten_sentences sentences in
158 155 let tokens = allign "" "" [] tokens in
159   - let tokens = set_lengths 0 [] tokens in
160 156 let paragraph = render_paragraph tokens in
161   - let tokens = remove_spaces [] tokens in
  157 + let tokens = set_lengths 0 [] tokens in
  158 + let tokens = set_special_tokens_lengths [] tokens in
  159 + let tokens = ENIAMpatterns.remove_spaces [] tokens in
162 160 let eniam_tokens = ENIAMtokenizer.parse paragraph in
163   - let l = match_tokens [] [] [] (eniam_tokens,tokens) in
164   - compare_tokens stats l))
  161 + let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in
  162 + Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
  163 + if compare_tokens (eniam_token,nkjp_token) then stats else (
  164 + let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in
  165 + let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in
  166 + Printf.printf "%s\n%s\n\n%!" s t;
  167 + StringQMap.add stats (s ^ "\n" ^ t)))))
165 168  
166 169  
167   -let _ =
  170 +(*let _ =
168 171 let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
169 172 validate stats name typ channel entries) in
170   - ()
  173 + ()*)
... ...
resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2 0 → 100644
No preview for this file type