Commit d06dc00b8867239a75db6a4d8e428f2a36f0117b
1 parent: f86c717e
generowanie korpusu literówek (generation of a typo corpus)
Showing 5 changed files with 136 additions and 74 deletions.
NKJP2/.gitignore
NKJP2/makefile
@@ -11,6 +11,10 @@ SOURCES=ENIAM_NKJP.ml validateTokenizer.ml | @@ -11,6 +11,10 @@ SOURCES=ENIAM_NKJP.ml validateTokenizer.ml | ||
11 | all: $(SOURCES) | 11 | all: $(SOURCES) |
12 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ | 12 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ |
13 | 13 | ||
14 | +spelling: $(SOURCES) spelling.ml | ||
15 | + mkdir -p NKJP1M_spelling_errors | ||
16 | + $(OCAMLOPT) -o spelling $(OCAMLOPTFLAGS) $^ | ||
17 | + | ||
14 | # install: | 18 | # install: |
15 | # mkdir -p /usr/share/eniam/Walenty | 19 | # mkdir -p /usr/share/eniam/Walenty |
16 | # cp resources/* /usr/share/eniam/Walenty | 20 | # cp resources/* /usr/share/eniam/Walenty |
@@ -40,4 +44,4 @@ all: $(SOURCES) | @@ -40,4 +44,4 @@ all: $(SOURCES) | ||
40 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< | 44 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
41 | 45 | ||
42 | clean: | 46 | clean: |
43 | - rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test | 47 | + rm -f *~ *.cm[aoix] *.o *.so *.cmxa *.a test spelling |
NKJP2/spelling.ml
0 → 100644
1 | +(* | ||
2 | + * ENIAM_NKJP, an interface for National Corpus of Polish (NKJP). | ||
3 | + * Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | ||
4 | + * Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences | ||
5 | + * | ||
6 | + * This library is free software: you can redistribute it and/or modify | ||
7 | + * it under the terms of the GNU Lesser General Public License as published by | ||
8 | + * the Free Software Foundation, either version 3 of the License, or | ||
9 | + * (at your option) any later version. | ||
10 | + * | ||
11 | + * This library is distributed in the hope that it will be useful, | ||
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | + * GNU Lesser General Public License for more details. | ||
15 | + * | ||
16 | + * You should have received a copy of the GNU Lesser General Public License | ||
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
18 | + *) | ||
19 | + | ||
(* Literal single space inserted between tokens in the generated XML. *)
let xml_space = Xml.PCData " "
(* Marker element <sp/> emitted where a space was required but missing in
   the source text, i.e. a recorded "missing space" spelling error. *)
let xml_err_space = Xml.Element("sp",[],[])
22 | + | ||
(* [make_xml_token real_orth orth] renders one token: plain character data
   when the surface form equals the corrected form, otherwise an
   <err cor="..."> element wrapping the erroneous surface form. *)
let make_xml_token real_orth orth =
  match real_orth = orth with
  | true -> Xml.PCData orth
  | false -> Xml.Element ("err", [("cor", orth)], [Xml.PCData real_orth])
26 | + | ||
(* Collapse runs of adjacent PCData nodes into single nodes, left to right,
   so the resulting list alternates between text and element nodes. *)
let rec merge_pcdata nodes =
  match nodes with
  | Xml.PCData a :: Xml.PCData b :: rest ->
      merge_pcdata (Xml.PCData (a ^ b) :: rest)
  | node :: rest -> node :: merge_pcdata rest
  | [] -> []
31 | + | ||
(* Build a <p> element containing one <s> element per sentence.  Each token
   is rendered via [make_xml_token]; tokens are separated by as many spaces
   as the source had.  Where the source had no space but one is
   linguistically required (per ValidateTokenizer.is_space_required), an
   <sp/> error marker is inserted instead.  The previous token's orth and
   category are threaded across sentence boundaries so the space heuristic
   also applies to the first token of each sentence.
   NOTE(review): [named_tokens] is accepted but unused here — presumably the
   named-entity layer; confirm against ENIAM_NKJP.fold's entry shape. *)
let generate_error_sentences sentences =
  let sentences,_,_ = Xlist.fold sentences ([],"","") (fun (sentences,prev_orth,prev_cat) (id_s,tokens,named_tokens) ->
    (* Sentence length attribute counts the original tokens. *)
    let no_tokens = Xlist.size tokens in
    let tokens,prev_orth,prev_cat = Xlist.fold tokens ([],prev_orth,prev_cat) (fun (tokens,prev_orth,prev_cat) (_,_,no_spaces,real_orth,orth,_,cat,_) ->
      (* One xml_space per space that preceded the token in the source. *)
      let tokens = Int.fold 1 no_spaces tokens (fun tokens _ -> xml_space :: tokens) in
      (* No source space, but one is required: record a missing-space error. *)
      let tokens = if no_spaces = 0 && ValidateTokenizer.is_space_required prev_orth prev_cat orth cat then xml_err_space:: tokens else tokens in
      (make_xml_token real_orth orth) :: tokens, orth, cat) in
    (* Tokens were accumulated in reverse; merge adjacent text nodes after
       restoring order. *)
    Xml.Element("s",["id",id_s;"length",string_of_int no_tokens],merge_pcdata (List.rev tokens)) :: sentences,prev_orth,prev_cat) in
  Xml.Element("p",[],List.rev sentences)
41 | + | ||
(* [generate_error_corpus path out_path] walks the NKJP corpus rooted at
   [path] and, for every source, writes [out_path ^ name ^ ".xml"] containing
   its paragraphs/sentences with spelling-error annotations
   (see generate_error_sentences).
   Fix: the original ignored [path] and always folded over
   ENIAM_NKJP.nkjp_path; the fold now honours the argument.  The only caller
   below passes nkjp_path, so observable behaviour is unchanged. *)
let generate_error_corpus path out_path =
  ENIAM_NKJP.fold path () (fun () (name,typ,channel,entries) ->
    (* print_endline name; *)
    let entries = List.rev (Xlist.rev_map entries (fun (id_div,has_ne,paragraphs) ->
      let paragraphs = List.rev (Xlist.rev_map paragraphs (fun (paragraph,sentences) ->
        generate_error_sentences sentences)) in
      Xml.Element("div",["id",string_of_int id_div],paragraphs))) in
    let xml = Xml.Element("source",["id",name;"type",typ;"channel",channel],entries) in
    File.file_out (out_path ^ name ^ ".xml") (fun file ->
      output_string file (Xml.to_string_fmt xml)))

(* Entry point: generate the corpus into NKJP1M_spelling_errors/ (the
   directory is created by the makefile's `spelling` target).
   [let () =] asserts the unit result instead of discarding any value. *)
let () = generate_error_corpus ENIAM_NKJP.nkjp_path "NKJP1M_spelling_errors/"
NKJP2/validateTokenizer.ml
@@ -45,11 +45,29 @@ let make_token orth lemma cat interp = | @@ -45,11 +45,29 @@ let make_token orth lemma cat interp = | ||
45 | orth=orth; | 45 | orth=orth; |
46 | token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])} | 46 | token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])} |
47 | 47 | ||
48 | -let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] | 48 | +let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; "li"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] |
49 | (* let prefixes = StringSet.of_list [ | 49 | (* let prefixes = StringSet.of_list [ |
50 | (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'"; | 50 | (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'"; |
51 | "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *) | 51 | "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *) |
52 | 52 | ||
(* [is_space_required prev_orth prev_cat orth cat] decides whether a space
   must separate the previous token from the current one.  Never required
   around interpunction, after an agglutinate, at paragraph start
   (prev_cat = ""), or before a known clitic suffix.  Otherwise the decision
   is made from the Unicode classes of the last character of [prev_orth] and
   the first character of [orth]: two letters, two digits, or two signs in a
   row need a space; a letter/digit adjacent to a sign or to a digit does
   not.  The match is order-sensitive (earlier rows shadow later ones).
   Fails on any character-class pair not listed, to surface unhandled cases.
   NOTE(review): assumes both orths are non-empty when the guard falls
   through — List.hd would raise otherwise; confirm callers guarantee it. *)
let is_space_required prev_orth prev_cat orth cat =
  if cat = "interp" || cat = "aglt" || prev_cat = "interp" || prev_cat = "" || StringSet.mem suffixes orth then false else (
    let prev_char = List.hd (List.rev (Xunicode.classified_chars_of_utf8_string prev_orth)) in
    let cur_char = List.hd (Xunicode.classified_chars_of_utf8_string orth) in
    match prev_char,cur_char with
      Xunicode.Sign a,Xunicode.Sign b -> (*print_endline ("is_space_required 1: " ^ prev_orth ^ " " ^ orth ^ " " ^ a ^ " " ^ b);*) true
    | _,Xunicode.Sign _ -> false
    | Xunicode.Sign _,_ -> false
    | Xunicode.Digit _,Xunicode.Digit _ -> true
    | Xunicode.Digit _,_ -> false
    | _,Xunicode.Digit _ -> false
    | Xunicode.Small _,Xunicode.Small _ -> true
    | Xunicode.ForeignSmall _,Xunicode.Small _ -> true
    | Xunicode.Capital _,Xunicode.Capital _ -> true
    | Xunicode.Small _,Xunicode.Capital _ -> true
    | Xunicode.Capital _,Xunicode.Small _ -> true
    | Xunicode.ForeignCapital _,Xunicode.Small _ -> true
    | a,b -> failwith ("is_space_required: " ^ prev_orth ^ " " ^ orth ^ " " ^ Xunicode.to_string a ^ " " ^ Xunicode.to_string b))
53 | 71 | ||
54 | let rec allign prev_orth prev_cat rev = function | 72 | let rec allign prev_orth prev_cat rev = function |
55 | (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l -> | 73 | (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l -> |
@@ -57,11 +75,8 @@ let rec allign prev_orth prev_cat rev = function | @@ -57,11 +75,8 @@ let rec allign prev_orth prev_cat rev = function | ||
57 | | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign" | 75 | | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign" |
58 | | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l -> | 76 | | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l -> |
59 | let rev = | 77 | let rev = |
60 | - if no_spaces > 0 then space :: rev | ||
61 | - else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev | ||
62 | - else ( | ||
63 | - (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *) | ||
64 | - space :: rev) in | 78 | + if no_spaces > 0 then space :: rev else |
79 | + if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in | ||
65 | let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in | 80 | let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in |
66 | let rev = (make_token orth lemma cat interp) :: rev in | 81 | let rev = (make_token orth lemma cat interp) :: rev in |
67 | let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in | 82 | let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in |
@@ -76,6 +91,20 @@ let rec set_lengths n rev = function | @@ -76,6 +91,20 @@ let rec set_lengths n rev = function | ||
76 | set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l | 91 | set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l |
77 | | [] -> List.rev rev | 92 | | [] -> List.rev rev |
78 | 93 | ||
(* FIXME: fix the interpretation of the comma and other punctuation marks *)
(* Assign positions/lengths to the synthetic boundary tokens and wrap every
   token as [Token].  At a sentence opening, <sentence> and <clause> each get
   length 1 carved out of the front of the following token (whose beg is
   shifted by 2 and len shrunk by 2).  At a sentence closing, a final "."
   followed by </clause></sentence> is collapsed: the clause end takes 20
   units from the dot's position and the dot itself is re-labelled as the
   </sentence> interp with length 80 — the magic 20/80 split is what the
   existing FIXME refers to; the original </sentence> token is dropped.
   NOTE(review): the re-labelled dot's [next] is left unchanged — presumably
   already correct from set_lengths; confirm. *)
let rec set_special_tokens_lengths rev = function
    ({token=Interp "<sentence>"} as sent) :: ({token=Interp "<clause>"} as cl) :: t :: l ->
      let sent = {sent with len=1; next=sent.beg+1} in
      let cl = {cl with beg=sent.next; len=1; next=sent.next+1} in
      let t = {t with beg=t.beg+2; len=t.len-2} in
      set_special_tokens_lengths (Token t :: Token cl :: Token sent :: rev) l
  | ({orth="."; token=Lemma(".","interp",[[]])} as dot) :: ({token=Interp "</clause>"} as cl) :: {token=Interp "</sentence>"} :: l ->
      let cl = {cl with beg=dot.beg; len=20; next=dot.beg+20} in
      let dot = {dot with beg=cl.next; len=80; token= Interp "</sentence>"} in
      set_special_tokens_lengths (Token dot :: Token cl :: rev) l
  | t :: l -> set_special_tokens_lengths (Token t :: rev) l
  | [] -> List.rev rev
107 | + | ||
(* Reassemble the paragraph text by concatenating the surface form of every
   token, in order.  Uses rev_map + rev to stay tail-recursive. *)
let render_paragraph tokens =
  let orths = List.rev (Xlist.rev_map tokens (fun t -> t.orth)) in
  String.concat "" orths
81 | 110 | ||
@@ -86,68 +115,36 @@ let rec get_next = function | @@ -86,68 +115,36 @@ let rec get_next = function | ||
86 | | Variant [] -> failwith "get_next" | 115 | | Variant [] -> failwith "get_next" |
87 | | Variant l -> get_next (List.hd l) | 116 | | Variant l -> get_next (List.hd l) |
88 | 117 | ||
(* Pack a token list into a single tokens value: a singleton is returned
   bare, a longer list becomes a Seq, and an empty list is a logic error. *)
let make_seq tokens =
  match tokens with
  | [] -> failwith "make_seq"
  | [only] -> only
  | many -> Seq many
122 | + | ||
(* Align two token streams — ENIAM tokenizer output vs. NKJP gold tokens —
   by consuming from whichever stream currently ends earlier (compared via
   get_next, the token's end offset).  Whenever both sides reach the same
   offset, the accumulated prefixes are closed off as one aligned segment
   pair.  Returns the segment pairs in original order, each side packed with
   make_seq.  Fails if the streams are exhausted at different offsets
   (i.e. they do not cover the same span of text). *)
let rec match_token_sequence erev nrev rev = function
    et :: ets, nt :: nts ->
      let enext = get_next et in
      let nnext = get_next nt in
      if enext = nnext then
        (* Segment boundary: both streams end at the same offset. *)
        match_token_sequence [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts)
      else if enext < nnext then
        (* ENIAM side is behind: keep consuming ENIAM tokens. *)
        match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
      else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
  | [],[] ->
      (* rev holds segments newest-first; the fold also restores order. *)
      Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
  | _ -> failwith "match_token_sequence"
134 | + | ||
(* Structural comparison of two aligned token graphs (ENIAM side first,
   NKJP side second).  Plain tokens agree when orth and the beg/len/next
   offsets all match — note the token payload itself is NOT compared.
   Asymmetric Variant handling: a Variant on the NKJP (right) side must be
   matched by EVERY alternative (fold over && starting from true), while a
   Variant on the ENIAM (left) side needs only SOME alternative (fold over
   || starting from false).  Singleton Seqs are unwrapped before comparison;
   longer Seqs are compared element-wise.  Any other combination (including
   Seqs of different lengths, which eventually pair Seq [] with a non-empty
   Seq) falls through to false. *)
let rec compare_tokens = function
    Token et, Token nt ->
      et.orth = nt.orth && et.beg = nt.beg && et.len = nt.len && et.next = nt.next
  | et,Variant l ->
      Xlist.fold l true (fun b nt ->
        compare_tokens (et,nt) && b)
  | Variant l,nt ->
      Xlist.fold l false (fun b et ->
        compare_tokens (et,nt) || b)
  | Seq[et], nt -> compare_tokens (et,nt)
  | et, Seq[nt] -> compare_tokens (et,nt)
  | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
  | _ -> false
151 | 148 | ||
152 | let validate stats name typ channel entries = | 149 | let validate stats name typ channel entries = |
153 | (* if name = "120-2-900066" then ( *) | 150 | (* if name = "120-2-900066" then ( *) |
@@ -156,15 +153,21 @@ let validate stats name typ channel entries = | @@ -156,15 +153,21 @@ let validate stats name typ channel entries = | ||
156 | Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | 153 | Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> |
157 | let tokens = flatten_sentences sentences in | 154 | let tokens = flatten_sentences sentences in |
158 | let tokens = allign "" "" [] tokens in | 155 | let tokens = allign "" "" [] tokens in |
159 | - let tokens = set_lengths 0 [] tokens in | ||
160 | let paragraph = render_paragraph tokens in | 156 | let paragraph = render_paragraph tokens in |
161 | - let tokens = remove_spaces [] tokens in | 157 | + let tokens = set_lengths 0 [] tokens in |
158 | + let tokens = set_special_tokens_lengths [] tokens in | ||
159 | + let tokens = ENIAMpatterns.remove_spaces [] tokens in | ||
162 | let eniam_tokens = ENIAMtokenizer.parse paragraph in | 160 | let eniam_tokens = ENIAMtokenizer.parse paragraph in |
163 | - let l = match_tokens [] [] [] (eniam_tokens,tokens) in | ||
164 | - compare_tokens stats l)) | 161 | + let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in |
162 | + Xlist.fold l stats (fun stats (eniam_token,nkjp_token) -> | ||
163 | + if compare_tokens (eniam_token,nkjp_token) then stats else ( | ||
164 | + let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in | ||
165 | + let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in | ||
166 | + Printf.printf "%s\n%s\n\n%!" s t; | ||
167 | + StringQMap.add stats (s ^ "\n" ^ t))))) | ||
165 | 168 | ||
166 | 169 | ||
167 | -let _ = | 170 | +(*let _ = |
168 | let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | 171 | let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> |
169 | validate stats name typ channel entries) in | 172 | validate stats name typ channel entries) in |
170 | - () | 173 | + ()*) |
resources/NKJP1M/NKJP1M_spelling_errors.tar.bz2
0 → 100644
No preview for this file type