Commit f86c717ecaa3eff28984864dc3251eb26b0fb2fc
1 parent
aedafaff
porównywanie sekwencji tokenów
Showing
1 changed file
with
170 additions
and
0 deletions
NKJP2/validateTokenizer.ml
0 → 100644
1 | +(* | |
2 | + * ENIAM_NKJP, an interface for National Corpus of Polish (NKJP). | |
3 | + * Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This library is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU Lesser General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This library is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU Lesser General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU Lesser General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open ENIAMtokenizerTypes | |
21 | +open Xstd | |
22 | + | |
23 | +let space = {empty_token_env with orth=" "; token=Symbol " "} | |
24 | +let query_beg = {empty_token_env with token=Interp "<query>"} | |
25 | +let query_end = {empty_token_env with token=Interp "</query>"} | |
26 | +let sencence_beg = {empty_token_env with token=Interp "<sentence>"} | |
27 | +let sencence_end = {empty_token_env with token=Interp "</sentence>"} | |
28 | +let clause_beg = {empty_token_env with token=Interp "<clause>"} | |
29 | +let clause_end = {empty_token_env with token=Interp "</clause>"} | |
30 | + | |
31 | +type sent = SentBeg | SentEnd | Inside | |
32 | + | |
33 | +let set_sent_end = function | |
34 | + (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ -> | |
35 | + (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l | |
36 | + | _ -> failwith "set_sent_end" | |
37 | + | |
38 | +let flatten_sentences sentences = | |
39 | + List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) -> | |
40 | + set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) -> | |
41 | + (sent,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l, Inside)))) | |
42 | + | |
43 | +let make_token orth lemma cat interp = | |
44 | + {empty_token_env with | |
45 | + orth=orth; | |
46 | + token=Lemma(lemma,cat,[Xlist.map interp (fun s -> [s])])} | |
47 | + | |
48 | +let suffixes = StringSet.of_list ["by"; "ż"; "ń"; "że"; "%"; "BY"; "ś"; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; ] | |
49 | +(* let prefixes = StringSet.of_list [ | |
50 | + (*"\""; "-"; "("; "„"; "/"; "."; "+"; "«"; "''"; "»"; "["; "–"; "'"; | |
51 | + "’"; ":"; "“"; ","; ")";*) ""; ""; ""; ""; ""; ""; ] *) | |
52 | + | |
53 | + | |
54 | +let rec allign prev_orth prev_cat rev = function | |
55 | + (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l -> | |
56 | + allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l | |
57 | + | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign" | |
58 | + | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l -> | |
59 | + let rev = | |
60 | + if no_spaces > 0 then space :: rev | |
61 | + else if cat = "interp" || cat = "aglt" || prev_cat = "interp" || StringSet.mem suffixes orth (*|| StringSet.mem prefixes prev_orth*) then rev | |
62 | + else ( | |
63 | + (* print_endline ("allign: " ^ prev_orth ^ " " ^ orth); *) | |
64 | + space :: rev) in | |
65 | + let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in | |
66 | + let rev = (make_token orth lemma cat interp) :: rev in | |
67 | + let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in | |
68 | + allign orth cat rev l | |
69 | + | [] -> List.rev (query_end :: rev) | |
70 | + | |
71 | +let rec set_lengths n rev = function | |
72 | + t :: l -> | |
73 | + let len = | |
74 | + if t.token = Interp "<query>" || t.token = Interp "</query>" then factor else | |
75 | + Xlist.size (Xunicode.utf8_chars_of_utf8_string t.orth) * factor in | |
76 | + set_lengths (n+len) ({t with beg=n; len=len; next=n+len} :: rev) l | |
77 | + | [] -> List.rev rev | |
78 | + | |
79 | +let render_paragraph tokens = | |
80 | + String.concat "" (List.rev (Xlist.rev_map tokens (fun t -> t.orth))) | |
81 | + | |
82 | +let rec get_next = function | |
83 | + Token t -> t.next | |
84 | + | Seq [] -> failwith "get_next" | |
85 | + | Seq l -> get_next (List.hd (List.rev l)) | |
86 | + | Variant [] -> failwith "get_next" | |
87 | + | Variant l -> get_next (List.hd l) | |
88 | + | |
89 | +let rec match_tokens erev nrev rev = function | |
90 | + et :: ets, nt :: nts -> | |
91 | + let next = get_next et in | |
92 | + if next = nt.next then | |
93 | + match_tokens [] [] ((List.rev (et :: erev), List.rev (nt :: nrev)) :: rev) (ets,nts) | |
94 | + else if next < nt.next then | |
95 | + match_tokens (et :: erev) nrev rev (ets, nt :: nts) | |
96 | + else match_tokens erev (nt :: nrev) rev (et :: ets, nts) | |
97 | + | [],[] -> List.rev rev | |
98 | + | _ -> failwith "match_tokens" | |
99 | + | |
100 | +(* let compare_token et t = | |
101 | + et.orth=t.orth && et.beg=t.beg && et.len=t.len && et.next=t.next && et.token=t.token | |
102 | + | |
103 | +let get_beg = function | |
104 | + Token t -> t.beg | |
105 | + | Seq [] -> failwith "get_beg" | |
106 | + | Seq l -> get_beg (List.hd l) | |
107 | + | Variant [] -> failwith "get_next" | |
108 | + | Variant l -> get_beg (List.hd l) | |
109 | + | |
110 | +let rec compare_tokens stats = function | |
111 | + Token et :: ets, t :: ts -> | |
112 | + if compare_token et t then compare_tokens stats (ets,ts) else ( | |
113 | + Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t); | |
114 | + stats) | |
115 | + | Variant l :: ets, ts -> failwith "compare_tokens 4" | |
116 | + | Seq l :: ets, ts -> failwith "compare_tokens 3" | |
117 | + | [], ts -> failwith "compare_tokens 2" | |
118 | + | _, [] -> failwith "compare_tokens 1" | |
119 | + | |
120 | +let rec get_subsequence_rec next rev = function | |
121 | + t :: tokens -> if t.next = next then List.rev (t :: rev) else get_subsequence_rec next (t :: rev) tokens | |
122 | + | [] -> failwith "get_subsequence_rec" | |
123 | + | |
124 | +let get_subsequence beg next = function | |
125 | + t :: tokens -> if t.beg = beg then get_subsequence_rec next [] (t :: tokens) else failwith "get_subsequence 2" | |
126 | + | [] -> failwith "get_subsequence 1" | |
127 | + | |
128 | +let compare_token stats tokens = function | |
129 | + Token et :: ets, t :: ts -> | |
130 | + if compare_token et t then compare_tokens stats (ets,ts) else ( | |
131 | + Printf.printf "%s\n%s\n\n" (ENIAMtokens.string_of_token_env et) (ENIAMtokens.string_of_token_env t); | |
132 | + stats) | |
133 | + | Variant l :: ets, ts -> failwith "compare_tokens 4" | |
134 | + | Seq l :: ets, ts -> failwith "compare_tokens 3" | |
135 | + | [], ts -> failwith "compare_tokens 2" | |
136 | + | _, [] -> failwith "compare_tokens 1" | |
137 | + | |
138 | +let rec compare_tokens stats tokens = function | |
139 | + et :: ets -> | |
140 | + let ts,tokens = get_subsequence (get_beg et) (get_next et) tokens in | |
141 | + compare_token stats ts et | |
142 | + | [] -> if tokens = [] then stats else failwith "compare_tokens 1"*) | |
143 | + | |
144 | +let rec compare_tokens stats = function | |
145 | + (ets,nts) :: l -> | |
146 | + Xlist.iter ets (fun et -> Printf.printf "%s\n" (ENIAMtokens.string_of_tokens 0 et)); | |
147 | + Xlist.iter nts (fun nt -> Printf.printf "%s\n" (ENIAMtokens.string_of_token_env nt)); | |
148 | + print_endline ""; | |
149 | + compare_tokens stats l | |
150 | + | [] -> stats | |
151 | + | |
152 | +let validate stats name typ channel entries = | |
153 | + (* if name = "120-2-900066" then ( *) | |
154 | + print_endline name; | |
155 | + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> | |
156 | + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | |
157 | + let tokens = flatten_sentences sentences in | |
158 | + let tokens = allign "" "" [] tokens in | |
159 | + let tokens = set_lengths 0 [] tokens in | |
160 | + let paragraph = render_paragraph tokens in | |
161 | + let tokens = remove_spaces [] tokens in | |
162 | + let eniam_tokens = ENIAMtokenizer.parse paragraph in | |
163 | + let l = match_tokens [] [] [] (eniam_tokens,tokens) in | |
164 | + compare_tokens stats l)) | |
165 | + | |
166 | + | |
167 | +let _ = | |
168 | + let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
169 | + validate stats name typ channel entries) in | |
170 | + () | |
... | ... |