Commit d06dc00b8867239a75db6a4d8e428f2a36f0117b
1 parent: f86c717e
generowanie korpusu literówek (generating the typo corpus)
Showing 5 changed files with 136 additions and 74 deletions
NKJP2/.gitignore
NKJP2/makefile
... | ... | @@ -11,6 +11,10 @@ SOURCES=ENIAM_NKJP.ml validateTokenizer.ml |
11 | 11 | all: $(SOURCES) |
12 | 12 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ |
13 | 13 | |
14 | +spelling: $(SOURCES) spelling.ml | |
15 | + mkdir -p NKJP1M_spelling_errors | |
16 | + $(OCAMLOPT) -o spelling $(OCAMLOPTFLAGS) $^ | |
17 | + | |
14 | 18 | # install: |
15 | 19 | # mkdir -p /usr/share/eniam/Walenty |
16 | 20 | # cp resources/* /usr/share/eniam/Walenty |
... | ... | @@ -40,4 +44,4 @@ all: $(SOURCES) |
40 | 44 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
41 | 45 | |
42 | 46 | clean: |
43 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test | |
47 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test spelling | |
... | ... |
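The new `spelling` target builds the typo-corpus generator: `make spelling` compiles spelling.ml together with the existing sources into a separate `spelling` binary and pre-creates the `NKJP1M_spelling_errors/` directory into which the generator writes one XML file per NKJP source (see spelling.ml below); `clean` now removes this binary as well.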
NKJP2/spelling.ml
0 → 100644
1 | +(* | |
2 | + * ENIAM_NKJP, an interface for National Corpus of Polish (NKJP). | |
3 | + * Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +let xml_space = Xml.PCData " " | |
21 | +let xml_err_space = Xml.Element("sp",[],[]) | |
22 | + | |
23 | +let make_xml_token real_orth orth = | |
24 | + if real_orth = orth then Xml.PCData orth else | |
25 | + Xml.Element("err",["cor",orth],[Xml.PCData real_orth]) | |
26 | + | |
27 | +let rec merge_pcdata = function | |
28 | + Xml.PCData a :: Xml.PCData b :: l -> merge_pcdata (Xml.PCData(a ^ b) :: l) | |
29 | + | x :: l -> x :: (merge_pcdata l) | |
30 | + | [] -> [] | |
31 | + | |
32 | +let generate_error_sentences sentences = | |
33 | + let sentences,_,_ = Xlist.fold sentences ([],"","") (fun (sentences,prev_orth,prev_cat) (id_s,tokens,named_tokens) -> | |
34 | + let no_tokens = Xlist.size tokens in | |
35 | + let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) -> | |
36 | + let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in | |
37 | + let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space:: tokens else tokens in | |
38 | + (make_xml_token real_orth orth) :: tokens, orth, cat) in | |
39 | + Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in | |
40 | + Xml.Element("p",[],List.rev sentences) | |
41 | + | |
42 | +let generate_error_corpus path out_path = | |
43 | + ENIAM_NKJP.fold path () (fun () (name,typ,channel,entries) -> | |
44 | + (* print_endline name; *) | |
45 | + let entries = List.rev (Xlist.rev_map entries (fun (id_div,has_ne,paragraphs) -> | |
46 | + let paragraphs = List.rev (Xlist.rev_map paragraphs (fun (paragraph,sentences) -> | |
47 | + generate_error_sentences sentences)) in | |
48 | + Xml.Element("div",["id",string_of_int id_div],paragraphs))) in | |
49 | + let xml = Xml.Element("source",["id",name;"type",typ;"channel",channel],entries) in | |
50 | + File.file_out (out_path ^ name ^ ".xml") (fun file -> | |
51 | + output_string file (Xml.to_string_fmt xml))) | |
52 | + | |
53 | +let _ = generate_error_corpus ENIAM_NKJP.nkjp_path "NKJP1M_spelling_errors/" | |
... | ... |
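For reference, the markup produced by make_xml_token and xml_err_space has roughly the shape sketched below. This is a hypothetical, hand-written example (the sentence id, length and Polish words are invented), using the same xml-light style constructors as spelling.ml: a token whose corpus spelling differs from the corrected orth is wrapped in <err cor="...">, and a position where is_space_required detects a missing obligatory space is marked with an empty <sp/> element.

    let example_sentence =
      Xml.Element ("s", ["id", "example-1"; "length", "6"],
        [ Xml.PCData "Ala ma ";
          (* "kta" in the corpus is a typo for the corrected form "kota" *)
          Xml.Element ("err", ["cor", "kota"], [Xml.PCData "kta"]);
          (* the corpus text is missing an obligatory space before the next word *)
          Xml.Element ("sp", [], []);
          Xml.PCData "w domu." ])

    let () = print_endline (Xml.to_string_fmt example_sentence)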
NKJP2/validateTokenizer.ml
... | ... | @@ -45,11 +45,29 @@ let make_token orth lemma cat interp = |
45 | 45 | orth=orth; |
46 | 46 | token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])} |
47 | 47 | |
48 | -let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] | |
48 | +let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] | |
49 | 49 | (* let prefixes = StringSet.of_list [ |
50 | 50 | (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'"; |
51 | 51 | "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *) |
52 | 52 | |
53 | +let is_space_required prev_orth prev_cat orth cat = | |
54 | + if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else ( | |
55 | + let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in | |
56 | + let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in | |
57 | + match prev_char,cur_char with | |
58 | + Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true | |
59 | + | _,Xunicode.Sign _ -> false | |
60 | + | Xunicode.Sign _,_ -> false | |
61 | + | Xunicode.Digit _,Xunicode.Digit _ -> true | |
62 | + | Xunicode.Digit _,_ -> false | |
63 | + | _,Xunicode.Digit _ -> false | |
64 | + | Xunicode.Small _,Xunicode.Small _ -> true | |
65 | + | Xunicode.ForeignSmall _,Xunicode.Small _ -> true | |
66 | + | Xunicode.Capital _,Xunicode.Capital _ -> true | |
67 | + | Xunicode.Small _,Xunicode.Capital _ -> true | |
68 | + | Xunicode.Capital _,Xunicode.Small _ -> true | |
69 | + | Xunicode.ForeignCapital _,Xunicode.Small _ -> true | |
70 | + | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b)) | |
53 | 71 | |
54 | 72 | let rec allign prev_orth prev_cat rev = function |
55 | 73 | (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l -> |
... | ... | @@ -57,11 +75,8 @@ let rec allign prev_orth prev_cat rev = function |
57 | 75 | | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign" |
58 | 76 | | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l -> |
59 | 77 | let rev = |
60 | - if no_spaces > 0 then space :: rev | |
61 | - else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev | |
62 | - else ( | |
63 | - (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *) | |
64 | - space :: rev) in | |
78 | + if no_spaces > 0 then space :: rev else | |
79 | + if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in | |
65 | 80 | let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in |
66 | 81 | let rev = (make_token orth lemma cat interp) :: rev in |
67 | 82 | let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in |
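The calls below illustrate the intended behaviour of the extracted is_space_required helper, which allign and the typo-corpus generator in spelling.ml now share. This is a usage sketch only, not part of the commit; the orth/category pairs are invented examples in the NKJP tagset.

    let () =
      (* two alphabetic tokens written next to each other need a separating space,
         so spelling.ml inserts an <sp/> error marker between them *)
      assert (ValidateTokenizer.is_space_required "Ala" "subst" "ma" "fin");
      (* punctuation ("interp") never forces a space *)
      assert (not (ValidateTokenizer.is_space_required "domu" "subst" "." "interp"));
      (* agglutinants and the listed suffixes, e.g. "by", attach directly *)
      assert (not (ValidateTokenizer.is_space_required "można" "pred" "by" "qub"))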
... | ... | @@ -76,6 +91,20 @@ let rec set_lengths n rev = function |
76 | 91 | set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l |
77 | 92 | | [] -> List.rev rev |
78 | 93 | |
94 | +(* FIXME: fix the interpretation of the comma and other punctuation marks *) | |
95 | +let rec set_special_tokens_lengths rev = function | |
96 | + ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l -> | |
97 | + let sent = {sent with len=1; next=sent.beg+1} in | |
98 | + let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in | |
99 | + let t = {t with beg=t.beg+2; len=t.len-2} in | |
100 | + set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l | |
101 | + | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l -> | |
102 | + let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in | |
103 | + let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in | |
104 | + set_special_tokens_lengths (Token dot :: Token cl :: rev) l | |
105 | + | t :: l -> set_special_tokens_lengths (Token t :: rev) l | |
106 | + | [] -> List.rev rev | |
107 | + | |
79 | 108 | let render_paragraph tokens = |
80 | 109 | String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth))) |
81 | 110 | |
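The 20/80 split in set_special_tokens_lengths redistributes the span of a sentence-final dot between the closing </clause> and </sentence> markers, while the opening <sentence> and <clause> markers each take one unit from the token that follows them. The stand-alone sketch below only mirrors that arithmetic; it uses a stripped-down record instead of ENIAM's token_env, and the assumption that one character spans 100 positional units is inferred from the 20 + 80 split rather than stated in the diff.

    (* simplified stand-in for token_env: begin offset, length, offset of the next token *)
    type span = { beg : int; len : int; next : int }

    (* split a final dot, assumed to span 100 units, into </clause> and </sentence> *)
    let split_final_dot (dot : span) : span * span =
      let cl = { beg = dot.beg; len = 20; next = dot.beg + 20 } in
      let snt = { beg = cl.next; len = 80; next = cl.next + 80 } in
      cl, snt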
... | ... | @@ -86,68 +115,36 @@ let rec get_next = function |
86 | 115 | | Variant [] -> failwith "get_next" |
87 | 116 | | Variant l -> get_next (List.hd l) |
88 | 117 | |
89 | -let rec match_tokens erev nrev rev = function | |
118 | +let make_seq = function | |
119 | + [] -> failwith "make_seq" | |
120 | + | [t] -> t | |
121 | + | l -> Seq l | |
122 | + | |
123 | +let rec match_token_sequence erev nrev rev = function | |
90 | 124 | et :: ets, nt :: nts -> |
91 | - let next = get_next et in | |
92 | - if next = nt.next then | |
93 | - match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts) | |
94 | - else if next < nt.next then | |
95 | - match_tokens (et :: erev) nrev rev (ets, nt :: nts) | |
96 | - else match_tokens erev (nt :: nrev) rev (et :: ets, nts) | |
97 | - | [],[] -> List.rev rev | |
98 | - | _ -> failwith "match_tokens" | |
99 | - | |
100 | -(* let compare_token et t = | |
101 | - et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token | |
102 | - | |
103 | -let get_beg = function | |
104 | - Token t -> t.beg | |
105 | - | Seq [] -> failwith "get_beg" | |
106 | - | Seq l -> get_beg (List.hd l) | |
107 | - | Variant [] -> failwith "get_next" | |
108 | - | Variant l -> get_beg (List.hd l) | |
109 | - | |
110 | -let rec compare_tokens stats = function | |
111 | - Token et :: ets, t :: ts -> | |
112 | - if compare_token et t then compare_tokens stats (ets,ts) else ( | |
113 | - Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t); | |
114 | - stats) | |
115 | - | Variant l :: ets, ts -> failwith "compare_tokens 4" | |
116 | - | Seq l :: ets, ts -> failwith "compare_tokens 3" | |
117 | - | [], ts -> failwith "compare_tokens 2" | |
118 | - | _, [] -> failwith "compare_tokens 1" | |
119 | - | |
120 | -let rec get_subsequence_rec next rev = function | |
121 | - t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens | |
122 | - | [] -> failwith "get_subsequence_rec" | |
123 | - | |
124 | -let get_subsequence beg next = function | |
125 | - t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2" | |
126 | - | [] -> failwith "get_subsequence 1" | |
127 | - | |
128 | -let compare_token stats tokens = function | |
129 | - Token et :: ets, t :: ts -> | |
130 | - if compare_token et t then compare_tokens stats (ets,ts) else ( | |
131 | - Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t); | |
132 | - stats) | |
133 | - | Variant l :: ets, ts -> failwith "compare_tokens 4" | |
134 | - | Seq l :: ets, ts -> failwith "compare_tokens 3" | |
135 | - | [], ts -> failwith "compare_tokens 2" | |
136 | - | _, [] -> failwith "compare_tokens 1" | |
137 | - | |
138 | -let rec compare_tokens stats tokens = function | |
139 | - et :: ets -> | |
140 | - let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in | |
141 | - compare_token stats ts et | |
142 | - | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*) | |
143 | - | |
144 | -let rec compare_tokens stats = function | |
145 | - (ets,nts) :: l -> | |
146 | - Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et)); | |
147 | - Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt)); | |
148 | - print_endline ""; | |
149 | - compare_tokens stats l | |
150 | - | [] -> stats | |
125 | + let enext = get_next et in | |
126 | + let nnext = get_next nt in | |
127 | + if enext = nnext then | |
128 | + match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts) | |
129 | + else if enext < nnext then | |
130 | + match_token_sequence (et :: erev) nrev rev (ets, nt :: nts) | |
131 | + else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts) | |
132 | + | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l) | |
133 | + | _ -> failwith "match_token_sequence" | |
134 | + | |
135 | +let rec compare_tokens = function | |
136 | + Token et, Token nt -> | |
137 | + et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next | |
138 | + | et,Variant l -> | |
139 | + Xlist.fold l true (fun b nt -> | |
140 | + compare_tokens (et,nt) && b) | |
141 | + | Variant l,nt -> | |
142 | + Xlist.fold l false (fun b et -> | |
143 | + compare_tokens (et,nt) || b) | |
144 | + | Seq[et], nt -> compare_tokens (et,nt) | |
145 | + | et, Seq[nt] -> compare_tokens (et,nt) | |
146 | + | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false | |
147 | + | _ -> false | |
151 | 148 | |
152 | 149 | let validate stats name typ channel entries = |
153 | 150 | (* if name = "120-2-900066" then ( *) |
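match_token_sequence groups the two token streams into pairs of subsequences that end at the same character offset (equal next values), and compare_tokens is then applied to each pair. The rewritten compare_tokens treats Variant nodes asymmetrically: on the ENIAM side (the left element of the pair) it is enough that one alternative of a Variant matches, whereas a Variant on the NKJP side (the right element) must be matched by every alternative. The self-contained sketch below reproduces this policy with a simplified, invented token type that compares orth only; it is not the ENIAMtokenizerTypes definition, but it makes the matching rules explicit.

    type tok = Tok of string | Seq of tok list | Variant of tok list

    let rec toks_match = function
        Tok a, Tok b -> a = b
      | t, Variant l -> List.for_all (fun v -> toks_match (t, v)) l  (* right-hand Variant: all alternatives must match *)
      | Variant l, t -> List.exists (fun v -> toks_match (v, t)) l   (* left-hand Variant: one matching alternative suffices *)
      | Seq [a], b -> toks_match (a, b)
      | a, Seq [b] -> toks_match (a, b)
      | Seq (a :: az), Seq (b :: bz) -> toks_match (a, b) && toks_match (Seq az, Seq bz)
      | _ -> false

    let () =
      (* an ambiguous ENIAM tokenisation matches if one of its readings agrees with NKJP *)
      assert (toks_match (Variant [Seq [Tok "10"; Tok "."]; Tok "10."], Tok "10."))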
... | ... | @@ -156,15 +153,21 @@ let validate stats name typ channel entries = |
156 | 153 | Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> |
157 | 154 | let tokens = flatten_sentences sentences in |
158 | 155 | let tokens = allign "" "" [] tokens in |
159 | - let tokens = set_lengths 0 [] tokens in | |
160 | 156 | let paragraph = render_paragraph tokens in |
161 | - let tokens = remove_spaces [] tokens in | |
157 | + let tokens = set_lengths 0 [] tokens in | |
158 | + let tokens = set_special_tokens_lengths [] tokens in | |
159 | + let tokens = ENIAMpatterns.remove_spaces [] tokens in | |
162 | 160 | let eniam_tokens = ENIAMtokenizer.parse paragraph in |
163 | - let l = match_tokens [] [] [] (eniam_tokens,tokens) in | |
164 | - compare_tokens stats l)) | |
161 | + let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in | |
162 | + Xlist.fold l stats (fun stats (eniam_token,nkjp_token) -> | |
163 | + if compare_tokens (eniam_token,nkjp_token) then stats else ( | |
164 | + let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in | |
165 | + let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in | |
166 | + Printf.printf "%s\n%s\n\n%!" s t; | |
167 | + StringQMap.add stats (s ^ "\n" ^ t))))) | |
165 | 168 | |
166 | 169 | |
167 | -let _ = | |
170 | +(*let _ = | |
168 | 171 | let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> |
169 | 172 | validate stats name typ channel entries) in |
170 | - () | |
173 | + ()*) | |
... | ... |
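After these changes, validate compares the aligned ENIAM and NKJP token sequences pairwise, prints each mismatching pair, and counts it in the stats map under a key built from the two printed forms; the driver that ran the validation over the whole corpus is commented out at the end of the file. Re-enabling the validation is a matter of uncommenting that entry point, i.e. restoring:

    let _ =
      let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
        validate stats name typ channel entries) in
      ()

Note that stats is computed but never reported here, so mismatches are only visible through the pairs printed by validate.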
resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2
0 → 100644
No preview for this file type