Commit e8d2c65cd63bb36a671eb2bbd0effd7a39eec931
1 parent be209dd2
Analiza różnic w lematyzacji
Showing 4 changed files with 421 additions and 50 deletions
NKJP2/validateMorphology.ml
... | ... | @@ -26,10 +26,10 @@ let rec has_brev = function |
26 | 26 | | _ :: l -> has_brev l |
27 | 27 | | [] -> false |
28 | 28 | |
29 | -let rec get_ntoken = function | |
30 | - (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp | |
31 | - | _ :: l -> get_ntoken l | |
32 | - | [] -> raise Not_found | |
29 | +let rec get_brev = function | |
30 | + BrevLemma s :: _ -> s | |
31 | + | _ :: l -> get_brev l | |
32 | + | [] -> failwith "get_brev" | |
33 | 33 | |
34 | 34 | let rec add_ntoken stats = function |
35 | 35 | Token t -> |
... | ... | @@ -91,7 +91,7 @@ let lemmatize_string s = |
91 | 91 | | Token ({token=FirstCap _} as t) -> t :: l |
92 | 92 | | Token ({token=AllCap _} as t) -> t :: l |
93 | 93 | | Token ({token=CapLetter _} as t) -> t :: l |
94 | - | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l | |
94 | + | Token ({token=RomanDig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l | |
95 | 95 | | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l |
96 | 96 | | Token ({token=Proper _} as t) -> t :: l |
97 | 97 | | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l |
... | ... | @@ -125,34 +125,52 @@ let lemmatize_string s = |
125 | 125 | |
126 | 126 | let get_cat_interp = function |
127 | 127 | "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] |
128 | + | "subst","subst",[n;c;["m1"]],[_;_;["m1"];col] -> "subst",[n;c;["m1"];col] | |
128 | 129 | | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]] |
129 | 130 | | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]] |
130 | - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] | |
131 | - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] | |
131 | + | "subst","subst",[n;c;["n"]],[_;_;["n"];col] -> "subst",[n;c;["n"];col] | |
132 | 132 | | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]] |
133 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] | |
134 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] | |
135 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]] | |
136 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]] | |
137 | - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] | |
138 | - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]] | |
133 | + | "subst","subst",[n;c;g],[_;_;_] -> "subst",[n;c;g] | |
134 | + | "subst","subst",[n;c;g],[_;_;_;_] -> "subst",[n;c;g] | |
135 | + | "subst","adj",[n;c;g],_ -> "subst",[n;c;g] | |
139 | 136 | | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]] |
137 | + | "depr","subst",[["pl"];["acc"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["acc"];["m2"]] | |
140 | 138 | | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp |
141 | - | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron3",ninterp | |
142 | - | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) | |
143 | - | "num","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) | |
139 | + | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron12",ninterp | |
140 | + | "numcol","num",ninterp,_ -> "num",ninterp | |
141 | + | "num","num",ninterp,_ -> "num",ninterp (* na tym etapie nie da się skorygować błędów *) | |
142 | + (* | "num","num",[["pl"];c;g;["rec"]],[["sg";"pl"];["nom";"gen";"acc"];["m1";"m2";"m3";"f";"n"];["rec"]] -> "num",[["pl"];c;g;["rec"]] | |
143 | + | "num","num",[["pl"];c;["m2"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m2"];["rec"]] | |
144 | + | "num","num",[["pl"];c;["m3"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m3"];["rec"]] | |
145 | + | "num","num",[["pl"];c;["f"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["f"];["rec"]] | |
146 | + | "num","num",[["pl"];c;["n"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["n"];["rec"];col] | |
147 | + | "num","num",[["pl"];c;["m1"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m1"];["congr"]] | |
148 | + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]] | |
149 | + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]] | |
150 | + | "num","num",[["pl"];c;["f"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["f"];["congr"]] | |
151 | + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] | |
152 | + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]] | |
153 | + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]] | |
154 | + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] *) | |
144 | 155 | | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]] |
145 | 156 | | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp |
157 | + | "adj","adj",ninterp,[["sg";"pl"];["nom";"gen";"dat";"acc";"inst";"loc";"voc"];["m1";"m2";"m3";"f";"n"];["pos"]] -> "adj",ninterp | |
146 | 158 | | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp |
147 | 159 | | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp |
148 | 160 | | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp |
161 | + | "adj","adj",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp | |
149 | 162 | | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]] |
150 | - | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found | |
163 | + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]] | |
164 | + | "adv",_,ninterp,_ -> "adv",ninterp | |
165 | + | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found | |
166 | + | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found | |
167 | + | "conj",_,ninterp,_ -> "conj",ninterp | |
168 | + | "prep","prep",[c1;w],[c2;_] -> if c1 = c2 then "prep",[c1;w] else raise Not_found | |
151 | 169 | | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found |
152 | 170 | | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found |
153 | - | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found | |
154 | - | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found | |
171 | + | "qub",_,ninterp,_ -> "qub",ninterp | |
155 | 172 | | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found |
173 | + | "interj",_,ninterp,_ -> "interj",ninterp | |
156 | 174 | | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found |
157 | 175 | | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found |
158 | 176 | | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]] |
... | ... | @@ -163,6 +181,8 @@ let get_cat_interp = function |
163 | 181 | | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]] |
164 | 182 | | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]] |
165 | 183 | | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]] |
184 | + | "bedzie","inf",[n;p;["imperf"]],[["imperf"]] -> "bedzie",[n;p;["imperf"]] | |
185 | + | "aglt","inf",[n;p;["imperf"];w],[["imperf"]] -> "aglt",[n;p;["imperf"];w] | |
166 | 186 | | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]] |
167 | 187 | | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]] |
168 | 188 | | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]] |
... | ... | @@ -175,7 +195,7 @@ let get_cat_interp = function |
175 | 195 | | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a] |
176 | 196 | | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a] |
177 | 197 | | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a] |
178 | - | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]] | |
198 | + | "winien","winien",[n;g;["imperf"]],[_;_;["imperf"]] -> "winien",[n;g;["imperf"]] | |
179 | 199 | | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a] |
180 | 200 | | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a] |
181 | 201 | | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a] |
... | ... | @@ -202,6 +222,16 @@ let get_cat_interp = function |
202 | 222 | | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]] |
203 | 223 | | _ -> raise Not_found |
204 | 224 | |
225 | +let get_lemma_cat_interp = function | |
226 | + nlemma,lemma,"adj","ppas",[n;c;g;["pos"]],[["sg"];["nom";"voc"];["m1";"m2";"m3"];a;aff] -> lemma,"ppas",[n;c;g;a;aff] | |
227 | + | nlemma,lemma,"adja","adja",[],[] -> lemma,"adja",[] | |
228 | + | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g] | |
229 | + (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp | |
230 | + | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *) | |
231 | + (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *) | |
232 | + | _ -> raise Not_found | |
233 | + | |
234 | + | |
205 | 235 | let correct_nlemma = function |
206 | 236 | "letnia " -> "letnia" |
207 | 237 | | "10minutowy" -> "minutowy" |
... | ... | @@ -233,7 +263,7 @@ let correct_nlemma = function |
233 | 263 | | "16-latek" -> raise Not_found |
234 | 264 | | s -> s |
235 | 265 | |
236 | -let process_ntoken stats nlemma ncat ninterp = | |
266 | +let process_ntoken stats q nlemma ncat ninterp = | |
237 | 267 | try |
238 | 268 | let nlemma = correct_nlemma nlemma in |
239 | 269 | let nl = lemmatize_string nlemma in |
... | ... | @@ -241,44 +271,291 @@ let process_ntoken stats nlemma ncat ninterp = |
241 | 271 | {token=Lemma(lemma,cat,interp)} -> |
242 | 272 | Xlist.fold interp nl (fun nl interp -> |
243 | 273 | try |
244 | - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
245 | - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl | |
274 | + if lemma = nlemma then | |
275 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
276 | + (Lemma(lemma,cat,[interp])) :: nl else | |
277 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | |
278 | + (Lemma(lemma,cat,[interp])) :: nl | |
279 | + with Not_found -> nl) | |
280 | + | {token=Dig(_,"dig")} -> nl (* FIXME: todo *) | |
281 | + (* (try | |
282 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | |
283 | + (Lemma(lemma,cat,[interp])) :: nl | |
284 | + with Not_found -> nl) *) | |
285 | + | {token=RomanDig(_,"roman")} -> | |
286 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | |
287 | + | {token=Proper(lemma,cat,interp,_)} -> (*print_endline ("P " ^ nlemma);*) nl (* FIXME: todo *) | |
288 | + | _ -> nl) in | |
289 | + if nl2 = [] then StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q | |
290 | + else StringQMap.add_val stats "lemmatized" q | |
291 | + with Not_found -> StringQMap.add_val stats "incorrect" q | |
292 | + | |
293 | +let process_ntoken2 stats q name id_div orth beg paragraph nlemma ncat ninterp = | |
294 | + try | |
295 | + let nlemma = correct_nlemma nlemma in | |
296 | + let nl = lemmatize_string nlemma in | |
297 | + let nl2 = Xlist.fold nl [] (fun nl -> function | |
298 | + {token=Lemma(lemma,cat,interp)} -> | |
299 | + Xlist.fold interp nl (fun nl interp -> | |
300 | + try | |
301 | + if lemma = nlemma then | |
302 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
303 | + (Lemma(lemma,cat,[interp])) :: nl else | |
304 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | |
305 | + (Lemma(lemma,cat,[interp])) :: nl | |
246 | 306 | with Not_found -> nl) |
247 | 307 | | {token=Dig _} -> nl (* FIXME: todo *) |
308 | + | {token=RomanDig(_,"roman")} -> | |
309 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | |
248 | 310 | | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) |
249 | 311 | | _ -> nl) in |
250 | - if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) | |
251 | - else StringQMap.add stats "lemmatized" | |
252 | - with Not_found -> StringQMap.add stats "incorrect" | |
312 | + if nl2 = [] then | |
313 | + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)) | |
314 | + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int beg ^ " " ^ orth ^ "\n" ^ paragraph) q | |
315 | + else StringQMap.add_val stats "lemmatized" q | |
316 | + with Not_found -> StringQMap.add_val stats "incorrect" q | |
317 | + | |
318 | +let validate_ntoken stats q (nlemma,ncat,ninterp) = | |
319 | + process_ntoken stats q nlemma ncat ninterp | |
253 | 320 | |
254 | -let validate_ntoken stats (nlemma,ncat,ninterp) = | |
255 | - process_ntoken stats nlemma ncat ninterp | |
321 | +let rec validate_ntoken_token name id_div paragraph stats = function | |
322 | + Token t -> | |
323 | + (try | |
324 | + let nlemma,ncat,ninterp = get_ntoken t.attrs in | |
325 | + process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp | |
326 | + (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]); | |
327 | + Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg; | |
328 | + print_endline paragraph; | |
329 | + stats *) | |
330 | + with Not_found -> stats) | |
331 | + | Seq l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph) | |
332 | + | Variant l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph) | |
256 | 333 | |
257 | -let match_lemmatize stats t = | |
258 | - if has_brev t.attrs then StringQMap.add stats "brev" else | |
334 | +let validate_ntoken_entry stats name typ channel entries = | |
335 | + prerr_endline name; | |
336 | + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> | |
337 | + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | |
338 | + let paragraph,tokens = annotate name sentences in | |
339 | + Xlist.fold tokens stats (validate_ntoken_token name id_div paragraph))) | |
340 | + | |
341 | +let rec subset_list = function | |
342 | + [],[] -> true | |
343 | + | [x] :: l1, y :: l2 -> if Xlist.mem y x then subset_list (l1,l2) else false | |
344 | + | _ -> false | |
345 | + | |
346 | +let match_cat_interp = function | |
347 | + | "subst","subst",[nn;nc;ng],[n;c;g;col] -> if subset_list ([nn;nc;ng],[n;c;g]) then "subst",[nn;nc;ng;col] else raise Not_found | |
348 | +(* | "numcol","num",ninterp,_ -> "num",ninterp*) | |
349 | + | "num","num",[nn;nc;["n"];na],[n;c;g;a;col] -> if subset_list ([nn;nc;["n"];na],[n;c;g;a]) then "num",[nn;nc;["n"];na;col] else raise Not_found | |
350 | + | "num","num",[nn;nc;ng;na],[n;c;g;a;col] -> if subset_list ([nn;nc;ng;na],[n;c;g;a]) then "num",[nn;nc;ng;na] else raise Not_found | |
351 | + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]] | |
352 | + | _ -> raise Not_found | |
353 | + | |
354 | +let match_cat_interp_substgender = function | |
355 | + "subst","subst",[nn;nc;ng],[n;c;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found | |
356 | + | "subst","subst",[nn;nc;ng],[n;c;_;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found | |
357 | + | _ -> raise Not_found | |
358 | + | |
359 | +exception HasBrev | |
360 | +exception NoNtoken | |
361 | +exception LemmaNotMatched of string * string * string list list * token_env list | |
362 | +exception MultipleLemmaMatched of string * string * string list list * token list | |
363 | + | |
364 | +let rec sort_uniq_rec rev = function | |
365 | + [] -> rev | |
366 | + | x :: y :: l -> if x = y then sort_uniq_rec rev (y :: l) else sort_uniq_rec (x :: rev) (y :: l) | |
367 | + | [x] -> x :: rev | |
368 | + | |
369 | +let sort_uniq l = | |
370 | + match sort_uniq_rec [] (Xlist.sort l compare) with | |
371 | + [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol"]]]);Lemma(lemma2,"subst",[[n2;c2;["n"];["col"]]])] as l -> | |
372 | + if lemma1 = lemma2 && n1 = n2 && c1 = c2 then [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol";"col"]]])] else l | |
373 | + | [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("kląsknięcie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> | |
374 | + if c1 = c2 then [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]])] else l | |
375 | + | [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("wybrażenie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> | |
376 | + if c1 = c2 then [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]])] else l | |
377 | + | [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]]);Lemma(lemma2,"subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> | |
378 | + (* print_endline lemma1; *) | |
379 | + if lemma1 = lemma2 && c1 = c2 then [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]])] else l | |
380 | + | l -> (*print_endline (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)));*) l | |
381 | + | |
382 | +type t = TokenMatched | TokenLowercase | TokenBrev | TokenSubstGender | TokenDeviated | |
383 | + | |
384 | +let match_lemmatize_simple t nlemma ncat ninterp = | |
385 | + let l1 = ENIAMpaths.lemmatize_token t in | |
386 | + let l2 = Xlist.fold l1 [] (fun l -> function | |
387 | + {token=Lemma(lemma,cat,interp)} -> | |
388 | + Xlist.fold interp l (fun l interp -> | |
389 | + try | |
390 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | |
391 | + if lemma = nlemma then | |
392 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | |
393 | + (Lemma(lemma,cat,[interp])) :: l else l | |
394 | + with Not_found -> l) | |
395 | + | {token=Dig _} -> l (* FIXME: todo *) | |
396 | + | {token=RomanDig(_,"roman")} -> | |
397 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | |
398 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | |
399 | + | _ -> l) in | |
400 | + match sort_uniq l2 with | |
401 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) | |
402 | + | [t] -> t, TokenMatched | |
403 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | |
404 | + | |
405 | +let match_lemmatize_lowercase t nlemma ncat ninterp = | |
406 | + let t = match t.token with | |
407 | + | FirstCap(s,lower,cl,ll) -> {t with token=AllSmall lower} | |
408 | + | CapLetter(s,lower) -> {t with token=SmallLetter lower} | |
409 | + | AllCap(_,a,b) -> {t with token=FirstCap(a,b,"","")} (* FIXME: to powinno być zdezambiguowane *) | |
410 | + | _ -> t in | |
259 | 411 | let l = ENIAMpaths.lemmatize_token t in |
260 | - try | |
261 | - let nlemma,ncat,ninterp = get_ntoken t.attrs in | |
412 | + let l2 = Xlist.fold l [] (fun l -> function | |
413 | + {token=Lemma(lemma,cat,interp)} -> | |
414 | + Xlist.fold interp l (fun l interp -> | |
415 | + try | |
416 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | |
417 | + if lemma = nlemma then | |
418 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | |
419 | + (Lemma(lemma,cat,[interp])) :: l else l | |
420 | + with Not_found -> l) | |
421 | + | {token=Dig _} -> l (* FIXME: todo *) | |
422 | + | {token=RomanDig(_,"roman")} -> | |
423 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | |
424 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | |
425 | + | _ -> l) in | |
426 | + match sort_uniq l2 with | |
427 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l)) | |
428 | + | [t] -> t, TokenLowercase | |
429 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | |
430 | + | |
431 | +let match_lemmatize_substgender t nlemma ncat ninterp = | |
432 | + let l1 = ENIAMpaths.lemmatize_token t in | |
433 | + let l2 = Xlist.fold l1 [] (fun l -> function | |
434 | + {token=Lemma(lemma,cat,interp)} -> | |
435 | + Xlist.fold interp l (fun l interp -> | |
436 | + try | |
437 | + if lemma = nlemma then | |
438 | + let cat,interp = match_cat_interp_substgender (ncat,cat,ninterp,interp) in | |
439 | + (Lemma(lemma,cat,[interp])) :: l else l | |
440 | + with Not_found -> l) | |
441 | + | {token=Dig _} -> l (* FIXME: todo *) | |
442 | + | {token=RomanDig(_,"roman")} -> | |
443 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | |
444 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | |
445 | + | _ -> l) in | |
446 | + match sort_uniq l2 with | |
447 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) | |
448 | + | [t] -> t, TokenSubstGender | |
449 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | |
450 | + | |
451 | +let match_lemmatize_deviated t nlemma ncat ninterp = | |
452 | + let l1 = ENIAMpaths.lemmatize_token t in | |
453 | + let nlemma = try correct_nlemma nlemma with Not_found -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) in | |
454 | + let nl = lemmatize_string nlemma in | |
455 | + let nl2 = Xlist.fold nl [] (fun nl -> function | |
456 | + {token=Lemma(lemma,cat,interp)} -> | |
457 | + Xlist.fold interp nl (fun nl interp -> | |
458 | + try | |
459 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | |
460 | + (lemma,cat,interp) :: nl | |
461 | + with Not_found -> nl) | |
462 | + | _ -> nl) in | |
463 | + let l2 = Xlist.fold nl2 [] (fun l (nlemma,ncat,ninterp) -> | |
464 | + Xlist.fold l1 l (fun l -> function | |
465 | + {token=Lemma(lemma,cat,interp)} -> | |
466 | + Xlist.fold interp l (fun l interp -> | |
467 | + try | |
468 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | |
469 | + if lemma = nlemma then | |
470 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | |
471 | + (Lemma(lemma,cat,[interp])) :: l else l | |
472 | + with Not_found -> l) | |
473 | + | _ -> l)) in | |
474 | + match sort_uniq l2 with | |
475 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) | |
476 | + | [t] -> t, TokenDeviated | |
477 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | |
478 | + | |
479 | +let rec match_lemmatize_rec t nlemma ncat ninterp f0 = function | |
480 | + f :: l -> | |
481 | + (try f t nlemma ncat ninterp | |
482 | + with LemmaNotMatched _ -> match_lemmatize_rec t nlemma ncat ninterp f0 l) | |
483 | + | [] -> f0 t nlemma ncat ninterp | |
484 | + | |
485 | +let match_lemmatize (*stats q name id_div paragraph*) t = | |
486 | + if has_brev t.attrs then raise HasBrev (*StringQMap.add_val stats "brev" q*) | |
487 | +(* let nlemma = get_brev t.attrs in | |
488 | + (let l = ENIAMpaths.lemmatize_token t in | |
489 | + let l2 = Xlist.fold l [] (fun l -> function | |
490 | + {token=Lemma(lemma,cat,interp)} -> | |
491 | + Xlist.fold interp l (fun l interp -> | |
492 | + try | |
493 | + if lemma = nlemma then (Lemma(nlemma,cat,[interp])) :: l else l | |
494 | + with Not_found -> l) | |
495 | + (* | {token=Dig _} -> l (* FIXME: todo *) | |
496 | + | {token=RomanDig(_,"roman")} -> | |
497 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | |
498 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) *) | |
499 | + | _ -> l) in | |
500 | + match sort_uniq l2 with | |
501 | + [] -> raise (LemmaNotMatched(nlemma,"BREV",[],l)) | |
502 | + | [t] -> t, TokenBrev | |
503 | + | _ -> raise (MultipleLemmaMatched(nlemma,"BREV",[],l2)))*) | |
504 | + else | |
505 | + let nlemma,ncat,ninterp = try get_ntoken t.attrs with Not_found -> raise NoNtoken in | |
506 | + match_lemmatize_rec t nlemma ncat ninterp match_lemmatize_simple | |
507 | + [match_lemmatize_simple; match_lemmatize_lowercase; match_lemmatize_substgender; match_lemmatize_deviated] | |
508 | + (* let ninterp = if ncat = "adv" && ninterp = [] then [["pos"]] else ninterp in *) | |
509 | +(* let l1 = ENIAMpaths.lemmatize_token t in | |
510 | + let l2 = Xlist.fold l1 [] (fun l -> function | |
511 | + {token=Lemma(lemma,cat,interp)} -> | |
512 | + Xlist.fold interp l (fun l interp -> | |
513 | + try | |
514 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | |
515 | + if lemma = nlemma then | |
516 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | |
517 | + (Lemma(lemma,cat,[interp])) :: l else l | |
518 | + with Not_found -> l) | |
519 | + | {token=Dig _} -> l (* FIXME: todo *) | |
520 | + | {token=RomanDig(_,"roman")} -> | |
521 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | |
522 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | |
523 | + | _ -> l) in | |
524 | + match sort_uniq l2 with | |
525 | + [] -> (*raise (LemmaNotMatched(nlemma,ncat,ninterp,l))*) | |
526 | +lowercase | |
527 | + | [t] -> t, TokenMatched | |
528 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))*) | |
529 | + | |
530 | +(* try | |
262 | 531 | let nlemma = correct_nlemma nlemma in |
263 | 532 | let nl = lemmatize_string nlemma in |
264 | 533 | let nl2 = Xlist.fold nl [] (fun nl -> function |
265 | 534 | {token=Lemma(lemma,cat,interp)} -> |
266 | 535 | Xlist.fold interp nl (fun nl interp -> |
267 | 536 | try |
268 | - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
269 | - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl | |
537 | + if lemma = nlemma then | |
538 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
539 | + (Lemma(lemma,cat,[interp])) :: nl else | |
540 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | |
541 | + (Lemma(lemma,cat,[interp])) :: nl | |
270 | 542 | with Not_found -> nl) |
271 | 543 | | {token=Dig _} -> nl (* FIXME: todo *) |
544 | + | {token=RomanDig(_,"roman")} -> | |
545 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | |
272 | 546 | | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) |
273 | 547 | | _ -> nl) in |
274 | - if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) | |
548 | + if nl2 = [] then | |
549 | + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)) | |
550 | + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int t.beg ^ " " ^ t.orth ^ "\n" ^ paragraph) q | |
551 | + (* if nl2 = [] then StringQMap.add_val stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q *) | |
275 | 552 | (* let l2 = Xlist.fold l [] (fun l2 t2 -> |
276 | 553 | match t2.token with |
277 | 554 | Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2 |
278 | 555 | (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *) |
279 | 556 | | _ -> l2) in |
280 | 557 | if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *) |
281 | - else StringQMap.add stats "lemmatized" | |
558 | + else StringQMap.add_val stats "lemmatized" q | |
282 | 559 | (* let l3 = Xlist.fold l2 [] (fun l3 t -> |
283 | 560 | match t.token with |
284 | 561 | Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3 |
... | ... | @@ -308,12 +585,30 @@ let match_lemmatize stats t = |
308 | 585 | | [{token=Lemma _};{token=SmallLetter _}] -> stats |
309 | 586 | | [{token=Lemma _};{token=FirstCap _}] -> stats |
310 | 587 | | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*) |
311 | - with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) | |
588 | + with Not_found -> StringQMap.add_val stats "no ntoken/incorrect" q | |
589 | + (* with Not_found -> StringQMap.add_val stats "no ntoken" q (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) *)*) | |
312 | 590 | |
313 | -let rec validate_token stats = function | |
314 | - Token t -> match_lemmatize stats t | |
315 | - | Seq l -> Xlist.fold l stats validate_token | |
316 | - | Variant l -> Xlist.fold l stats validate_token | |
591 | +let rec validate_token name id_div paragraph stats = function | |
592 | + Token t -> | |
593 | + (* if t.orth = "POWIŚLE" then Printf.printf "%s %d %s\n%s\n" name id_div paragraph (ENIAMtokens.string_of_token_env t); *) | |
594 | + (try let _,f = match_lemmatize (*stats 1 name id_div paragraph*) t in | |
595 | + match f with | |
596 | + TokenMatched -> StringQMap.add stats "validated" | |
597 | + | TokenLowercase -> StringQMap.add stats "validated as lowercase" | |
598 | + | TokenBrev -> StringQMap.add stats "validated abbreviation" | |
599 | + | TokenSubstGender -> StringQMap.add stats "validated substgender" | |
600 | + | TokenDeviated -> StringQMap.add stats "validated deviated" | |
601 | + with | |
602 | + HasBrev -> StringQMap.add stats ("has brev: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) | |
603 | + (* | NoNtoken -> StringQMap.add stats ("no ntoken: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) *) | |
604 | + | NoNtoken -> StringQMap.add stats "no ntoken" | |
605 | + | LemmaNotMatched(nlemma,ncat,ninterp,l) -> | |
606 | + (* StringQMap.add stats (Printf.sprintf "lemma not matched: %s %s : %s \n%s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) *) | |
607 | + StringQMap.add stats (Printf.sprintf "%s %s %s %d %s\n#%s\n#%s" ncat t.orth name id_div (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) | |
608 | + (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) | |
609 | + | MultipleLemmaMatched(nlemma,ncat,ninterp,l) -> StringQMap.add stats (Printf.sprintf "multiple lemma matched: %s %s : %s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t))))) | |
610 | + | Seq l -> Xlist.fold l stats (validate_token name id_div paragraph) | |
611 | + | Variant l -> Xlist.fold l stats (validate_token name id_div paragraph) | |
317 | 612 | |
318 | 613 | let validate_morphology stats name typ channel entries = |
319 | 614 | prerr_endline name; |
... | ... | @@ -323,7 +618,7 @@ let validate_morphology stats name typ channel entries = |
323 | 618 | (* print_endline paragraph; *) |
324 | 619 | (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in |
325 | 620 | if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*) |
326 | - Xlist.fold tokens stats validate_token | |
621 | + Xlist.fold tokens stats (validate_token name id_div paragraph) | |
327 | 622 | (*else stats*))) |
328 | 623 | |
329 | 624 | let ntokens_filename = "results/ntokens.tab" |
... | ... | @@ -333,7 +628,7 @@ let parse_ninterp s = |
333 | 628 | |
334 | 629 | let fold_ntokens ntokens_filename s f = |
335 | 630 | File.fold_tab ntokens_filename s (fun s -> function |
336 | - [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp) | |
631 | + [q;nlemma;ncat;ninterp] -> f s (int_of_string q) (nlemma,ncat,parse_ninterp ninterp) | |
337 | 632 | | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l)) |
338 | 633 | |
339 | 634 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; |
... | ... | @@ -371,12 +666,15 @@ let _ = |
371 | 666 | create_ntoken_list stats name typ channel entries) in *) |
372 | 667 | (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> |
373 | 668 | create_ntoken_list stats name typ channel entries) in *) |
374 | - let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in | |
669 | + (* let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in *) | |
670 | + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
671 | + validate_ntoken_entry stats name typ channel entries) in *) | |
375 | 672 | (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> |
376 | 673 | validate_morphology stats name typ channel entries) in *) |
377 | - (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
378 | - validate_morphology stats name typ channel entries) in *) | |
674 | + let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
675 | + validate_morphology stats name typ channel entries) in | |
379 | 676 | let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in |
380 | 677 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); |
678 | + flush stdout; | |
381 | 679 | ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); |
382 | 680 | () |
... | ... |
NKJP2/validateTokenizer.ml
... | ... | @@ -609,7 +609,7 @@ let rec annotate_paragraph name paragraph l = function |
609 | 609 | with Not_found -> (try |
610 | 610 | let m,ets,l = annotate_apply_rules (et :: ets) l rules in |
611 | 611 | m :: annotate_paragraph name paragraph l ets |
612 | - with Not_found -> failwith "annotate_paragraph 1"))) | |
612 | + with Not_found -> (*print_endline ("annotate_paragraph 1: " ^ (string_of_vtoken et));*)failwith "annotate_paragraph 1"))) | |
613 | 613 | | [] -> if l = [] then [] else failwith "annotate_paragraph 2" |
614 | 614 | |
615 | 615 | let validate_segmentation stats name typ channel entries = |
... | ... | @@ -713,12 +713,15 @@ let transform_nkjp_interp cat interp1 = |
713 | 713 | | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp |
714 | 714 | | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp |
715 | 715 | |
716 | +let transform_nkjp_interp_simple cat interp1 = | |
717 | + Xlist.map interp1 (fun s -> [s]) | |
718 | + | |
716 | 719 | let merge_token = function |
717 | 720 | t,[] -> Token t |
718 | 721 | | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs} |
719 | 722 | | t,[n] -> |
720 | 723 | if n.nlemma = "+/-" then set_sent n.nsent t else |
721 | - if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: t.attrs} | |
724 | + if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: t.attrs} | |
722 | 725 | else set_sent n.nsent t |
723 | 726 | | _ -> failwith "merge_token" |
724 | 727 | |
... | ... | @@ -732,7 +735,7 @@ let merge_letni l seq = |
732 | 735 | match List.rev seq with |
733 | 736 | last :: l -> |
734 | 737 | let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in |
735 | - Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) | |
738 | + Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) | |
736 | 739 | | _ -> failwith "merge_letni" |
737 | 740 | |
738 | 741 | let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"] |
... | ... | @@ -751,7 +754,7 @@ let merge_paragraph name = function |
751 | 754 | | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) |
752 | 755 | | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) |
753 | 756 | | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) |
754 | - | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) | |
757 | + | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp_simple n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) | |
755 | 758 | | t -> failwith ("merge_paragraph: " ^ string_of_atoken t) |
756 | 759 | |
757 | 760 | let test_annotate name typ channel entries = |
... | ... | @@ -783,6 +786,60 @@ let test_annotate name typ channel entries = |
783 | 786 | (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *) |
784 | 787 | ())) |
785 | 788 | |
789 | +type cap = Capital | Small | Sign | |
790 | + | |
791 | +let classify_cap s = | |
792 | + match Xunicode.classified_chars_of_utf8_string s with | |
793 | + Xunicode.Capital _ :: _ -> Capital | |
794 | + | Xunicode.ForeignCapital _ :: _ -> Capital | |
795 | + | Xunicode.Small _ :: _ -> Small | |
796 | + | Xunicode.ForeignSmall _ :: _ -> Small | |
797 | + | _ -> Sign | |
798 | + | |
799 | +let rec get_ntoken = function | |
800 | + (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp | |
801 | + | _ :: l -> get_ntoken l | |
802 | + | [] -> raise Not_found | |
803 | + | |
804 | +let rec disambiguate_capitalics = function | |
805 | + Token t -> | |
806 | + (try | |
807 | + let nlemma,ncat,ninterp = get_ntoken t.attrs in | |
808 | + let c = match t.token, classify_cap nlemma with | |
809 | + ENIAMtokenizerTypes.SmallLetter _, Small -> true | |
810 | + | ENIAMtokenizerTypes.CapLetter _, Capital -> true | |
811 | + | ENIAMtokenizerTypes.AllSmall _ , Small-> true | |
812 | + | ENIAMtokenizerTypes.AllCap _, Capital -> true | |
813 | + (* | ENIAMtokenizerTypes.AllCap _, Small -> true *) | |
814 | + | ENIAMtokenizerTypes.FirstCap _, Capital -> true | |
815 | + | ENIAMtokenizerTypes.SomeCap _, Capital -> true | |
816 | + | ENIAMtokenizerTypes.SomeCap _, Small -> true | |
817 | + | ENIAMtokenizerTypes.RomanDig _, Capital -> true | |
818 | + | ENIAMtokenizerTypes.Interp _, _ -> true | |
819 | + | ENIAMtokenizerTypes.Symbol _, _ -> true | |
820 | + | ENIAMtokenizerTypes.Dig _, _ -> true | |
821 | + | ENIAMtokenizerTypes.Other _, _ -> true | |
822 | + | ENIAMtokenizerTypes.Lemma _, _ -> true | |
823 | + | ENIAMtokenizerTypes.Proper _, _ -> true | |
824 | + | ENIAMtokenizerTypes.Compound _, _ -> true | |
825 | + | ENIAMtokenizerTypes.Tokens _, _ -> true | |
826 | + | _ -> false in | |
827 | + Token t, c | |
828 | + (* let nc = classify_cap nlemma in | |
829 | + let no = classify_cap t.orth in | |
830 | + if no = nc then Token t,true else Token t,false *) | |
831 | + with Not_found -> Token t,true) | |
832 | + | Seq l -> | |
833 | + let l,c = Xlist.fold l ([],true) (fun (l,c) t -> | |
834 | + let t,d = disambiguate_capitalics t in | |
835 | + t :: l, c && d) in | |
836 | + Seq(List.rev l), c | |
837 | + | Variant l -> | |
838 | + let l2 = Xlist.fold l [] (fun l t -> | |
839 | + let t,d = disambiguate_capitalics t in | |
840 | + if d then t :: l else l) in | |
841 | + if l2 = [] then Variant l,false else Variant l2,true | |
842 | + | |
786 | 843 | let annotate name sentences = |
787 | 844 | let tokens = flatten_sentences sentences in |
788 | 845 | let tokens = simple_allign "" "" [] tokens in |
... | ... | @@ -793,8 +850,21 @@ let annotate name sentences = |
793 | 850 | let eniam_tokens = annotate_variants_par eniam_tokens in |
794 | 851 | let m = annotate_paragraph name paragraph tokens eniam_tokens in |
795 | 852 | let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in |
853 | + let m = List.rev (Xlist.fold m [] (fun m t -> | |
854 | + let t,_ = disambiguate_capitalics t in | |
855 | + t :: m)) in | |
796 | 856 | paragraph, m |
797 | 857 | |
858 | +let test_disambiguate_capitalics stats name typ channel entries = | |
859 | + prerr_endline name; | |
860 | + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> | |
861 | + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | |
862 | + let paragraph,tokens = annotate name sentences in | |
863 | + Xlist.fold tokens stats (fun stats t -> | |
864 | + let _,c = disambiguate_capitalics t in | |
865 | + if c then stats else StringQMap.add stats (Printf.sprintf "%s %s" (ENIAMtokens.string_of_tokens 0 t) paragraph)))) | |
866 | + | |
867 | + | |
798 | 868 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; |
799 | 869 | "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; |
800 | 870 | "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; |
... | ... | @@ -839,6 +909,8 @@ let _ = |
839 | 909 | test_annotate name typ channel entries); *) |
840 | 910 | (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> |
841 | 911 | test_annotate name typ channel entries); *) |
912 | + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
913 | + test_disambiguate_capitalics stats name typ channel entries) in *) | |
842 | 914 | (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in |
843 | 915 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *) |
844 | 916 | (* ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); *) |
... | ... |
morphology/doc/model2.pdf
No preview for this file type