Commit e8d2c65cd63bb36a671eb2bbd0effd7a39eec931
1 parent
be209dd2
Analiza różnic w lematyzacji
Showing
4 changed files
with
421 additions
and
50 deletions
NKJP2/validateMorphology.ml
@@ -26,10 +26,10 @@ let rec has_brev = function | @@ -26,10 +26,10 @@ let rec has_brev = function | ||
26 | | _ :: l -> has_brev l | 26 | | _ :: l -> has_brev l |
27 | | [] -> false | 27 | | [] -> false |
28 | 28 | ||
29 | -let rec get_ntoken = function | ||
30 | - (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp | ||
31 | - | _ :: l -> get_ntoken l | ||
32 | - | [] -> raise Not_found | 29 | +let rec get_brev = function |
30 | + BrevLemma s :: _ -> s | ||
31 | + | _ :: l -> get_brev l | ||
32 | + | [] -> failwith "get_brev" | ||
33 | 33 | ||
34 | let rec add_ntoken stats = function | 34 | let rec add_ntoken stats = function |
35 | Token t -> | 35 | Token t -> |
@@ -91,7 +91,7 @@ let lemmatize_string s = | @@ -91,7 +91,7 @@ let lemmatize_string s = | ||
91 | | Token ({token=FirstCap _} as t) -> t :: l | 91 | | Token ({token=FirstCap _} as t) -> t :: l |
92 | | Token ({token=AllCap _} as t) -> t :: l | 92 | | Token ({token=AllCap _} as t) -> t :: l |
93 | | Token ({token=CapLetter _} as t) -> t :: l | 93 | | Token ({token=CapLetter _} as t) -> t :: l |
94 | - | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l | 94 | + | Token ({token=RomanDig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l |
95 | | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l | 95 | | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l |
96 | | Token ({token=Proper _} as t) -> t :: l | 96 | | Token ({token=Proper _} as t) -> t :: l |
97 | | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l | 97 | | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l |
@@ -125,34 +125,52 @@ let lemmatize_string s = | @@ -125,34 +125,52 @@ let lemmatize_string s = | ||
125 | 125 | ||
126 | let get_cat_interp = function | 126 | let get_cat_interp = function |
127 | "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] | 127 | "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] |
128 | + | "subst","subst",[n;c;["m1"]],[_;_;["m1"];col] -> "subst",[n;c;["m1"];col] | ||
128 | | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]] | 129 | | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]] |
129 | | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]] | 130 | | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]] |
130 | - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] | ||
131 | - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] | 131 | + | "subst","subst",[n;c;["n"]],[_;_;["n"];col] -> "subst",[n;c;["n"];col] |
132 | | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]] | 132 | | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]] |
133 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] | ||
134 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] | ||
135 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]] | ||
136 | - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]] | ||
137 | - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] | ||
138 | - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]] | 133 | + | "subst","subst",[n;c;g],[_;_;_] -> "subst",[n;c;g] |
134 | + | "subst","subst",[n;c;g],[_;_;_;_] -> "subst",[n;c;g] | ||
135 | + | "subst","adj",[n;c;g],_ -> "subst",[n;c;g] | ||
139 | | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]] | 136 | | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]] |
137 | + | "depr","subst",[["pl"];["acc"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["acc"];["m2"]] | ||
140 | | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp | 138 | | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp |
141 | - | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron3",ninterp | ||
142 | - | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) | ||
143 | - | "num","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) | 139 | + | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron12",ninterp |
140 | + | "numcol","num",ninterp,_ -> "num",ninterp | ||
141 | + | "num","num",ninterp,_ -> "num",ninterp (* na tym etapie nie da się skorygować błędów *) | ||
142 | + (* | "num","num",[["pl"];c;g;["rec"]],[["sg";"pl"];["nom";"gen";"acc"];["m1";"m2";"m3";"f";"n"];["rec"]] -> "num",[["pl"];c;g;["rec"]] | ||
143 | + | "num","num",[["pl"];c;["m2"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m2"];["rec"]] | ||
144 | + | "num","num",[["pl"];c;["m3"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m3"];["rec"]] | ||
145 | + | "num","num",[["pl"];c;["f"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["f"];["rec"]] | ||
146 | + | "num","num",[["pl"];c;["n"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["n"];["rec"];col] | ||
147 | + | "num","num",[["pl"];c;["m1"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m1"];["congr"]] | ||
148 | + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]] | ||
149 | + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]] | ||
150 | + | "num","num",[["pl"];c;["f"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["f"];["congr"]] | ||
151 | + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] | ||
152 | + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]] | ||
153 | + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]] | ||
154 | + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] *) | ||
144 | | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]] | 155 | | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]] |
145 | | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp | 156 | | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp |
157 | + | "adj","adj",ninterp,[["sg";"pl"];["nom";"gen";"dat";"acc";"inst";"loc";"voc"];["m1";"m2";"m3";"f";"n"];["pos"]] -> "adj",ninterp | ||
146 | | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp | 158 | | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp |
147 | | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp | 159 | | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp |
148 | | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp | 160 | | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp |
161 | + | "adj","adj",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp | ||
149 | | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]] | 162 | | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]] |
150 | - | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found | 163 | + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]] |
164 | + | "adv",_,ninterp,_ -> "adv",ninterp | ||
165 | + | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found | ||
166 | + | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found | ||
167 | + | "conj",_,ninterp,_ -> "conj",ninterp | ||
168 | + | "prep","prep",[c1;w],[c2;_] -> if c1 = c2 then "prep",[c1;w] else raise Not_found | ||
151 | | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found | 169 | | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found |
152 | | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found | 170 | | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found |
153 | - | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found | ||
154 | - | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found | 171 | + | "qub",_,ninterp,_ -> "qub",ninterp |
155 | | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found | 172 | | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found |
173 | + | "interj",_,ninterp,_ -> "interj",ninterp | ||
156 | | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found | 174 | | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found |
157 | | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found | 175 | | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found |
158 | | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]] | 176 | | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]] |
@@ -163,6 +181,8 @@ let get_cat_interp = function | @@ -163,6 +181,8 @@ let get_cat_interp = function | ||
163 | | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]] | 181 | | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]] |
164 | | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]] | 182 | | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]] |
165 | | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]] | 183 | | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]] |
184 | + | "bedzie","inf",[n;p;["imperf"]],[["imperf"]] -> "bedzie",[n;p;["imperf"]] | ||
185 | + | "aglt","inf",[n;p;["imperf"];w],[["imperf"]] -> "aglt",[n;p;["imperf"];w] | ||
166 | | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]] | 186 | | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]] |
167 | | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]] | 187 | | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]] |
168 | | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]] | 188 | | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]] |
@@ -175,7 +195,7 @@ let get_cat_interp = function | @@ -175,7 +195,7 @@ let get_cat_interp = function | ||
175 | | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a] | 195 | | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a] |
176 | | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a] | 196 | | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a] |
177 | | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a] | 197 | | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a] |
178 | - | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]] | 198 | + | "winien","winien",[n;g;["imperf"]],[_;_;["imperf"]] -> "winien",[n;g;["imperf"]] |
179 | | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a] | 199 | | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a] |
180 | | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a] | 200 | | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a] |
181 | | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a] | 201 | | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a] |
@@ -202,6 +222,16 @@ let get_cat_interp = function | @@ -202,6 +222,16 @@ let get_cat_interp = function | ||
202 | | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]] | 222 | | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]] |
203 | | _ -> raise Not_found | 223 | | _ -> raise Not_found |
204 | 224 | ||
225 | +let get_lemma_cat_interp = function | ||
226 | + nlemma,lemma,"adj","ppas",[n;c;g;["pos"]],[["sg"];["nom";"voc"];["m1";"m2";"m3"];a;aff] -> lemma,"ppas",[n;c;g;a;aff] | ||
227 | + | nlemma,lemma,"adja","adja",[],[] -> lemma,"adja",[] | ||
228 | + | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g] | ||
229 | + (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp | ||
230 | + | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *) | ||
231 | + (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *) | ||
232 | + | _ -> raise Not_found | ||
233 | + | ||
234 | + | ||
205 | let correct_nlemma = function | 235 | let correct_nlemma = function |
206 | "letnia " -> "letnia" | 236 | "letnia " -> "letnia" |
207 | | "10minutowy" -> "minutowy" | 237 | | "10minutowy" -> "minutowy" |
@@ -233,7 +263,7 @@ let correct_nlemma = function | @@ -233,7 +263,7 @@ let correct_nlemma = function | ||
233 | | "16-latek" -> raise Not_found | 263 | | "16-latek" -> raise Not_found |
234 | | s -> s | 264 | | s -> s |
235 | 265 | ||
236 | -let process_ntoken stats nlemma ncat ninterp = | 266 | +let process_ntoken stats q nlemma ncat ninterp = |
237 | try | 267 | try |
238 | let nlemma = correct_nlemma nlemma in | 268 | let nlemma = correct_nlemma nlemma in |
239 | let nl = lemmatize_string nlemma in | 269 | let nl = lemmatize_string nlemma in |
@@ -241,44 +271,291 @@ let process_ntoken stats nlemma ncat ninterp = | @@ -241,44 +271,291 @@ let process_ntoken stats nlemma ncat ninterp = | ||
241 | {token=Lemma(lemma,cat,interp)} -> | 271 | {token=Lemma(lemma,cat,interp)} -> |
242 | Xlist.fold interp nl (fun nl interp -> | 272 | Xlist.fold interp nl (fun nl interp -> |
243 | try | 273 | try |
244 | - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | ||
245 | - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl | 274 | + if lemma = nlemma then |
275 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | ||
276 | + (Lemma(lemma,cat,[interp])) :: nl else | ||
277 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | ||
278 | + (Lemma(lemma,cat,[interp])) :: nl | ||
279 | + with Not_found -> nl) | ||
280 | + | {token=Dig(_,"dig")} -> nl (* FIXME: todo *) | ||
281 | + (* (try | ||
282 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | ||
283 | + (Lemma(lemma,cat,[interp])) :: nl | ||
284 | + with Not_found -> nl) *) | ||
285 | + | {token=RomanDig(_,"roman")} -> | ||
286 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | ||
287 | + | {token=Proper(lemma,cat,interp,_)} -> (*print_endline ("P " ^ nlemma);*) nl (* FIXME: todo *) | ||
288 | + | _ -> nl) in | ||
289 | + if nl2 = [] then StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q | ||
290 | + else StringQMap.add_val stats "lemmatized" q | ||
291 | + with Not_found -> StringQMap.add_val stats "incorrect" q | ||
292 | + | ||
293 | +let process_ntoken2 stats q name id_div orth beg paragraph nlemma ncat ninterp = | ||
294 | + try | ||
295 | + let nlemma = correct_nlemma nlemma in | ||
296 | + let nl = lemmatize_string nlemma in | ||
297 | + let nl2 = Xlist.fold nl [] (fun nl -> function | ||
298 | + {token=Lemma(lemma,cat,interp)} -> | ||
299 | + Xlist.fold interp nl (fun nl interp -> | ||
300 | + try | ||
301 | + if lemma = nlemma then | ||
302 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | ||
303 | + (Lemma(lemma,cat,[interp])) :: nl else | ||
304 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | ||
305 | + (Lemma(lemma,cat,[interp])) :: nl | ||
246 | with Not_found -> nl) | 306 | with Not_found -> nl) |
247 | | {token=Dig _} -> nl (* FIXME: todo *) | 307 | | {token=Dig _} -> nl (* FIXME: todo *) |
308 | + | {token=RomanDig(_,"roman")} -> | ||
309 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | ||
248 | | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) | 310 | | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) |
249 | | _ -> nl) in | 311 | | _ -> nl) in |
250 | - if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) | ||
251 | - else StringQMap.add stats "lemmatized" | ||
252 | - with Not_found -> StringQMap.add stats "incorrect" | 312 | + if nl2 = [] then |
313 | + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)) | ||
314 | + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int beg ^ " " ^ orth ^ "\n" ^ paragraph) q | ||
315 | + else StringQMap.add_val stats "lemmatized" q | ||
316 | + with Not_found -> StringQMap.add_val stats "incorrect" q | ||
317 | + | ||
318 | +let validate_ntoken stats q (nlemma,ncat,ninterp) = | ||
319 | + process_ntoken stats q nlemma ncat ninterp | ||
253 | 320 | ||
254 | -let validate_ntoken stats (nlemma,ncat,ninterp) = | ||
255 | - process_ntoken stats nlemma ncat ninterp | 321 | +let rec validate_ntoken_token name id_div paragraph stats = function |
322 | + Token t -> | ||
323 | + (try | ||
324 | + let nlemma,ncat,ninterp = get_ntoken t.attrs in | ||
325 | + process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp | ||
326 | + (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]); | ||
327 | + Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg; | ||
328 | + print_endline paragraph; | ||
329 | + stats *) | ||
330 | + with Not_found -> stats) | ||
331 | + | Seq l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph) | ||
332 | + | Variant l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph) | ||
256 | 333 | ||
257 | -let match_lemmatize stats t = | ||
258 | - if has_brev t.attrs then StringQMap.add stats "brev" else | 334 | +let validate_ntoken_entry stats name typ channel entries = |
335 | + prerr_endline name; | ||
336 | + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> | ||
337 | + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | ||
338 | + let paragraph,tokens = annotate name sentences in | ||
339 | + Xlist.fold tokens stats (validate_ntoken_token name id_div paragraph))) | ||
340 | + | ||
341 | +let rec subset_list = function | ||
342 | + [],[] -> true | ||
343 | + | [x] :: l1, y :: l2 -> if Xlist.mem y x then subset_list (l1,l2) else false | ||
344 | + | _ -> false | ||
345 | + | ||
346 | +let match_cat_interp = function | ||
347 | + | "subst","subst",[nn;nc;ng],[n;c;g;col] -> if subset_list ([nn;nc;ng],[n;c;g]) then "subst",[nn;nc;ng;col] else raise Not_found | ||
348 | +(* | "numcol","num",ninterp,_ -> "num",ninterp*) | ||
349 | + | "num","num",[nn;nc;["n"];na],[n;c;g;a;col] -> if subset_list ([nn;nc;["n"];na],[n;c;g;a]) then "num",[nn;nc;["n"];na;col] else raise Not_found | ||
350 | + | "num","num",[nn;nc;ng;na],[n;c;g;a;col] -> if subset_list ([nn;nc;ng;na],[n;c;g;a]) then "num",[nn;nc;ng;na] else raise Not_found | ||
351 | + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]] | ||
352 | + | _ -> raise Not_found | ||
353 | + | ||
354 | +let match_cat_interp_substgender = function | ||
355 | + "subst","subst",[nn;nc;ng],[n;c;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found | ||
356 | + | "subst","subst",[nn;nc;ng],[n;c;_;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found | ||
357 | + | _ -> raise Not_found | ||
358 | + | ||
359 | +exception HasBrev | ||
360 | +exception NoNtoken | ||
361 | +exception LemmaNotMatched of string * string * string list list * token_env list | ||
362 | +exception MultipleLemmaMatched of string * string * string list list * token list | ||
363 | + | ||
364 | +let rec sort_uniq_rec rev = function | ||
365 | + [] -> rev | ||
366 | + | x :: y :: l -> if x = y then sort_uniq_rec rev (y :: l) else sort_uniq_rec (x :: rev) (y :: l) | ||
367 | + | [x] -> x :: rev | ||
368 | + | ||
369 | +let sort_uniq l = | ||
370 | + match sort_uniq_rec [] (Xlist.sort l compare) with | ||
371 | + [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol"]]]);Lemma(lemma2,"subst",[[n2;c2;["n"];["col"]]])] as l -> | ||
372 | + if lemma1 = lemma2 && n1 = n2 && c1 = c2 then [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol";"col"]]])] else l | ||
373 | + | [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("kląsknięcie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> | ||
374 | + if c1 = c2 then [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]])] else l | ||
375 | + | [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("wybrażenie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> | ||
376 | + if c1 = c2 then [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]])] else l | ||
377 | + | [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]]);Lemma(lemma2,"subst",[[["pl"];c2;["n"];["ncol"]]])] as l -> | ||
378 | + (* print_endline lemma1; *) | ||
379 | + if lemma1 = lemma2 && c1 = c2 then [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]])] else l | ||
380 | + | l -> (*print_endline (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)));*) l | ||
381 | + | ||
382 | +type t = TokenMatched | TokenLowercase | TokenBrev | TokenSubstGender | TokenDeviated | ||
383 | + | ||
384 | +let match_lemmatize_simple t nlemma ncat ninterp = | ||
385 | + let l1 = ENIAMpaths.lemmatize_token t in | ||
386 | + let l2 = Xlist.fold l1 [] (fun l -> function | ||
387 | + {token=Lemma(lemma,cat,interp)} -> | ||
388 | + Xlist.fold interp l (fun l interp -> | ||
389 | + try | ||
390 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | ||
391 | + if lemma = nlemma then | ||
392 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | ||
393 | + (Lemma(lemma,cat,[interp])) :: l else l | ||
394 | + with Not_found -> l) | ||
395 | + | {token=Dig _} -> l (* FIXME: todo *) | ||
396 | + | {token=RomanDig(_,"roman")} -> | ||
397 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | ||
398 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | ||
399 | + | _ -> l) in | ||
400 | + match sort_uniq l2 with | ||
401 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) | ||
402 | + | [t] -> t, TokenMatched | ||
403 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | ||
404 | + | ||
405 | +let match_lemmatize_lowercase t nlemma ncat ninterp = | ||
406 | + let t = match t.token with | ||
407 | + | FirstCap(s,lower,cl,ll) -> {t with token=AllSmall lower} | ||
408 | + | CapLetter(s,lower) -> {t with token=SmallLetter lower} | ||
409 | + | AllCap(_,a,b) -> {t with token=FirstCap(a,b,"","")} (* FIXME: to powinno być zdezambiguowane *) | ||
410 | + | _ -> t in | ||
259 | let l = ENIAMpaths.lemmatize_token t in | 411 | let l = ENIAMpaths.lemmatize_token t in |
260 | - try | ||
261 | - let nlemma,ncat,ninterp = get_ntoken t.attrs in | 412 | + let l2 = Xlist.fold l [] (fun l -> function |
413 | + {token=Lemma(lemma,cat,interp)} -> | ||
414 | + Xlist.fold interp l (fun l interp -> | ||
415 | + try | ||
416 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | ||
417 | + if lemma = nlemma then | ||
418 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | ||
419 | + (Lemma(lemma,cat,[interp])) :: l else l | ||
420 | + with Not_found -> l) | ||
421 | + | {token=Dig _} -> l (* FIXME: todo *) | ||
422 | + | {token=RomanDig(_,"roman")} -> | ||
423 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | ||
424 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | ||
425 | + | _ -> l) in | ||
426 | + match sort_uniq l2 with | ||
427 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l)) | ||
428 | + | [t] -> t, TokenLowercase | ||
429 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | ||
430 | + | ||
431 | +let match_lemmatize_substgender t nlemma ncat ninterp = | ||
432 | + let l1 = ENIAMpaths.lemmatize_token t in | ||
433 | + let l2 = Xlist.fold l1 [] (fun l -> function | ||
434 | + {token=Lemma(lemma,cat,interp)} -> | ||
435 | + Xlist.fold interp l (fun l interp -> | ||
436 | + try | ||
437 | + if lemma = nlemma then | ||
438 | + let cat,interp = match_cat_interp_substgender (ncat,cat,ninterp,interp) in | ||
439 | + (Lemma(lemma,cat,[interp])) :: l else l | ||
440 | + with Not_found -> l) | ||
441 | + | {token=Dig _} -> l (* FIXME: todo *) | ||
442 | + | {token=RomanDig(_,"roman")} -> | ||
443 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | ||
444 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | ||
445 | + | _ -> l) in | ||
446 | + match sort_uniq l2 with | ||
447 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) | ||
448 | + | [t] -> t, TokenSubstGender | ||
449 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | ||
450 | + | ||
451 | +let match_lemmatize_deviated t nlemma ncat ninterp = | ||
452 | + let l1 = ENIAMpaths.lemmatize_token t in | ||
453 | + let nlemma = try correct_nlemma nlemma with Not_found -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) in | ||
454 | + let nl = lemmatize_string nlemma in | ||
455 | + let nl2 = Xlist.fold nl [] (fun nl -> function | ||
456 | + {token=Lemma(lemma,cat,interp)} -> | ||
457 | + Xlist.fold interp nl (fun nl interp -> | ||
458 | + try | ||
459 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | ||
460 | + (lemma,cat,interp) :: nl | ||
461 | + with Not_found -> nl) | ||
462 | + | _ -> nl) in | ||
463 | + let l2 = Xlist.fold nl2 [] (fun l (nlemma,ncat,ninterp) -> | ||
464 | + Xlist.fold l1 l (fun l -> function | ||
465 | + {token=Lemma(lemma,cat,interp)} -> | ||
466 | + Xlist.fold interp l (fun l interp -> | ||
467 | + try | ||
468 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | ||
469 | + if lemma = nlemma then | ||
470 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | ||
471 | + (Lemma(lemma,cat,[interp])) :: l else l | ||
472 | + with Not_found -> l) | ||
473 | + | _ -> l)) in | ||
474 | + match sort_uniq l2 with | ||
475 | + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) | ||
476 | + | [t] -> t, TokenDeviated | ||
477 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2)) | ||
478 | + | ||
479 | +let rec match_lemmatize_rec t nlemma ncat ninterp f0 = function | ||
480 | + f :: l -> | ||
481 | + (try f t nlemma ncat ninterp | ||
482 | + with LemmaNotMatched _ -> match_lemmatize_rec t nlemma ncat ninterp f0 l) | ||
483 | + | [] -> f0 t nlemma ncat ninterp | ||
484 | + | ||
485 | +let match_lemmatize (*stats q name id_div paragraph*) t = | ||
486 | + if has_brev t.attrs then raise HasBrev (*StringQMap.add_val stats "brev" q*) | ||
487 | +(* let nlemma = get_brev t.attrs in | ||
488 | + (let l = ENIAMpaths.lemmatize_token t in | ||
489 | + let l2 = Xlist.fold l [] (fun l -> function | ||
490 | + {token=Lemma(lemma,cat,interp)} -> | ||
491 | + Xlist.fold interp l (fun l interp -> | ||
492 | + try | ||
493 | + if lemma = nlemma then (Lemma(nlemma,cat,[interp])) :: l else l | ||
494 | + with Not_found -> l) | ||
495 | + (* | {token=Dig _} -> l (* FIXME: todo *) | ||
496 | + | {token=RomanDig(_,"roman")} -> | ||
497 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | ||
498 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) *) | ||
499 | + | _ -> l) in | ||
500 | + match sort_uniq l2 with | ||
501 | + [] -> raise (LemmaNotMatched(nlemma,"BREV",[],l)) | ||
502 | + | [t] -> t, TokenBrev | ||
503 | + | _ -> raise (MultipleLemmaMatched(nlemma,"BREV",[],l2)))*) | ||
504 | + else | ||
505 | + let nlemma,ncat,ninterp = try get_ntoken t.attrs with Not_found -> raise NoNtoken in | ||
506 | + match_lemmatize_rec t nlemma ncat ninterp match_lemmatize_simple | ||
507 | + [match_lemmatize_simple; match_lemmatize_lowercase; match_lemmatize_substgender; match_lemmatize_deviated] | ||
508 | + (* let ninterp = if ncat = "adv" && ninterp = [] then [["pos"]] else ninterp in *) | ||
509 | +(* let l1 = ENIAMpaths.lemmatize_token t in | ||
510 | + let l2 = Xlist.fold l1 [] (fun l -> function | ||
511 | + {token=Lemma(lemma,cat,interp)} -> | ||
512 | + Xlist.fold interp l (fun l interp -> | ||
513 | + try | ||
514 | + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else | ||
515 | + if lemma = nlemma then | ||
516 | + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in | ||
517 | + (Lemma(lemma,cat,[interp])) :: l else l | ||
518 | + with Not_found -> l) | ||
519 | + | {token=Dig _} -> l (* FIXME: todo *) | ||
520 | + | {token=RomanDig(_,"roman")} -> | ||
521 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l | ||
522 | + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) | ||
523 | + | _ -> l) in | ||
524 | + match sort_uniq l2 with | ||
525 | + [] -> (*raise (LemmaNotMatched(nlemma,ncat,ninterp,l))*) | ||
526 | +lowercase | ||
527 | + | [t] -> t, TokenMatched | ||
528 | + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))*) | ||
529 | + | ||
530 | +(* try | ||
262 | let nlemma = correct_nlemma nlemma in | 531 | let nlemma = correct_nlemma nlemma in |
263 | let nl = lemmatize_string nlemma in | 532 | let nl = lemmatize_string nlemma in |
264 | let nl2 = Xlist.fold nl [] (fun nl -> function | 533 | let nl2 = Xlist.fold nl [] (fun nl -> function |
265 | {token=Lemma(lemma,cat,interp)} -> | 534 | {token=Lemma(lemma,cat,interp)} -> |
266 | Xlist.fold interp nl (fun nl interp -> | 535 | Xlist.fold interp nl (fun nl interp -> |
267 | try | 536 | try |
268 | - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | ||
269 | - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl | 537 | + if lemma = nlemma then |
538 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | ||
539 | + (Lemma(lemma,cat,[interp])) :: nl else | ||
540 | + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in | ||
541 | + (Lemma(lemma,cat,[interp])) :: nl | ||
270 | with Not_found -> nl) | 542 | with Not_found -> nl) |
271 | | {token=Dig _} -> nl (* FIXME: todo *) | 543 | | {token=Dig _} -> nl (* FIXME: todo *) |
544 | + | {token=RomanDig(_,"roman")} -> | ||
545 | + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl | ||
272 | | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) | 546 | | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) |
273 | | _ -> nl) in | 547 | | _ -> nl) in |
274 | - if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) | 548 | + if nl2 = [] then |
549 | + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)) | ||
550 | + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int t.beg ^ " " ^ t.orth ^ "\n" ^ paragraph) q | ||
551 | + (* if nl2 = [] then StringQMap.add_val stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q *) | ||
275 | (* let l2 = Xlist.fold l [] (fun l2 t2 -> | 552 | (* let l2 = Xlist.fold l [] (fun l2 t2 -> |
276 | match t2.token with | 553 | match t2.token with |
277 | Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2 | 554 | Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2 |
278 | (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *) | 555 | (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *) |
279 | | _ -> l2) in | 556 | | _ -> l2) in |
280 | if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *) | 557 | if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *) |
281 | - else StringQMap.add stats "lemmatized" | 558 | + else StringQMap.add_val stats "lemmatized" q |
282 | (* let l3 = Xlist.fold l2 [] (fun l3 t -> | 559 | (* let l3 = Xlist.fold l2 [] (fun l3 t -> |
283 | match t.token with | 560 | match t.token with |
284 | Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3 | 561 | Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3 |
@@ -308,12 +585,30 @@ let match_lemmatize stats t = | @@ -308,12 +585,30 @@ let match_lemmatize stats t = | ||
308 | | [{token=Lemma _};{token=SmallLetter _}] -> stats | 585 | | [{token=Lemma _};{token=SmallLetter _}] -> stats |
309 | | [{token=Lemma _};{token=FirstCap _}] -> stats | 586 | | [{token=Lemma _};{token=FirstCap _}] -> stats |
310 | | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*) | 587 | | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*) |
311 | - with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) | 588 | + with Not_found -> StringQMap.add_val stats "no ntoken/incorrect" q |
589 | + (* with Not_found -> StringQMap.add_val stats "no ntoken" q (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) *)*) | ||
312 | 590 | ||
313 | -let rec validate_token stats = function | ||
314 | - Token t -> match_lemmatize stats t | ||
315 | - | Seq l -> Xlist.fold l stats validate_token | ||
316 | - | Variant l -> Xlist.fold l stats validate_token | 591 | +let rec validate_token name id_div paragraph stats = function |
592 | + Token t -> | ||
593 | + (* if t.orth = "POWIŚLE" then Printf.printf "%s %d %s\n%s\n" name id_div paragraph (ENIAMtokens.string_of_token_env t); *) | ||
594 | + (try let _,f = match_lemmatize (*stats 1 name id_div paragraph*) t in | ||
595 | + match f with | ||
596 | + TokenMatched -> StringQMap.add stats "validated" | ||
597 | + | TokenLowercase -> StringQMap.add stats "validated as lowercase" | ||
598 | + | TokenBrev -> StringQMap.add stats "validated abbreviation" | ||
599 | + | TokenSubstGender -> StringQMap.add stats "validated substgender" | ||
600 | + | TokenDeviated -> StringQMap.add stats "validated deviated" | ||
601 | + with | ||
602 | + HasBrev -> StringQMap.add stats ("has brev: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) | ||
603 | + (* | NoNtoken -> StringQMap.add stats ("no ntoken: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) *) | ||
604 | + | NoNtoken -> StringQMap.add stats "no ntoken" | ||
605 | + | LemmaNotMatched(nlemma,ncat,ninterp,l) -> | ||
606 | + (* StringQMap.add stats (Printf.sprintf "lemma not matched: %s %s : %s \n%s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) *) | ||
607 | + StringQMap.add stats (Printf.sprintf "%s %s %s %d %s\n#%s\n#%s" ncat t.orth name id_div (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) | ||
608 | + (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) | ||
609 | + | MultipleLemmaMatched(nlemma,ncat,ninterp,l) -> StringQMap.add stats (Printf.sprintf "multiple lemma matched: %s %s : %s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t))))) | ||
610 | + | Seq l -> Xlist.fold l stats (validate_token name id_div paragraph) | ||
611 | + | Variant l -> Xlist.fold l stats (validate_token name id_div paragraph) | ||
317 | 612 | ||
318 | let validate_morphology stats name typ channel entries = | 613 | let validate_morphology stats name typ channel entries = |
319 | prerr_endline name; | 614 | prerr_endline name; |
@@ -323,7 +618,7 @@ let validate_morphology stats name typ channel entries = | @@ -323,7 +618,7 @@ let validate_morphology stats name typ channel entries = | ||
323 | (* print_endline paragraph; *) | 618 | (* print_endline paragraph; *) |
324 | (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in | 619 | (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in |
325 | if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*) | 620 | if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*) |
326 | - Xlist.fold tokens stats validate_token | 621 | + Xlist.fold tokens stats (validate_token name id_div paragraph) |
327 | (*else stats*))) | 622 | (*else stats*))) |
328 | 623 | ||
329 | let ntokens_filename = "results/ntokens.tab" | 624 | let ntokens_filename = "results/ntokens.tab" |
@@ -333,7 +628,7 @@ let parse_ninterp s = | @@ -333,7 +628,7 @@ let parse_ninterp s = | ||
333 | 628 | ||
334 | let fold_ntokens ntokens_filename s f = | 629 | let fold_ntokens ntokens_filename s f = |
335 | File.fold_tab ntokens_filename s (fun s -> function | 630 | File.fold_tab ntokens_filename s (fun s -> function |
336 | - [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp) | 631 | + [q;nlemma;ncat;ninterp] -> f s (int_of_string q) (nlemma,ncat,parse_ninterp ninterp) |
337 | | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l)) | 632 | | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l)) |
338 | 633 | ||
339 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; | 634 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; |
@@ -371,12 +666,15 @@ let _ = | @@ -371,12 +666,15 @@ let _ = | ||
371 | create_ntoken_list stats name typ channel entries) in *) | 666 | create_ntoken_list stats name typ channel entries) in *) |
372 | (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | 667 | (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> |
373 | create_ntoken_list stats name typ channel entries) in *) | 668 | create_ntoken_list stats name typ channel entries) in *) |
374 | - let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in | 669 | + (* let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in *) |
670 | + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | ||
671 | + validate_ntoken_entry stats name typ channel entries) in *) | ||
375 | (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | 672 | (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> |
376 | validate_morphology stats name typ channel entries) in *) | 673 | validate_morphology stats name typ channel entries) in *) |
377 | - (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | ||
378 | - validate_morphology stats name typ channel entries) in *) | 674 | + let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> |
675 | + validate_morphology stats name typ channel entries) in | ||
379 | let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in | 676 | let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in |
380 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); | 677 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); |
678 | + flush stdout; | ||
381 | ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); | 679 | ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); |
382 | () | 680 | () |
NKJP2/validateTokenizer.ml
@@ -609,7 +609,7 @@ let rec annotate_paragraph name paragraph l = function | @@ -609,7 +609,7 @@ let rec annotate_paragraph name paragraph l = function | ||
609 | with Not_found -> (try | 609 | with Not_found -> (try |
610 | let m,ets,l = annotate_apply_rules (et :: ets) l rules in | 610 | let m,ets,l = annotate_apply_rules (et :: ets) l rules in |
611 | m :: annotate_paragraph name paragraph l ets | 611 | m :: annotate_paragraph name paragraph l ets |
612 | - with Not_found -> failwith "annotate_paragraph 1"))) | 612 | + with Not_found -> (*print_endline ("annotate_paragraph 1: " ^ (string_of_vtoken et));*)failwith "annotate_paragraph 1"))) |
613 | | [] -> if l = [] then [] else failwith "annotate_paragraph 2" | 613 | | [] -> if l = [] then [] else failwith "annotate_paragraph 2" |
614 | 614 | ||
615 | let validate_segmentation stats name typ channel entries = | 615 | let validate_segmentation stats name typ channel entries = |
@@ -713,12 +713,15 @@ let transform_nkjp_interp cat interp1 = | @@ -713,12 +713,15 @@ let transform_nkjp_interp cat interp1 = | ||
713 | | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp | 713 | | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp |
714 | | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp | 714 | | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp |
715 | 715 | ||
716 | +let transform_nkjp_interp_simple cat interp1 = | ||
717 | + Xlist.map interp1 (fun s -> [s]) | ||
718 | + | ||
716 | let merge_token = function | 719 | let merge_token = function |
717 | t,[] -> Token t | 720 | t,[] -> Token t |
718 | | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs} | 721 | | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs} |
719 | | t,[n] -> | 722 | | t,[n] -> |
720 | if n.nlemma = "+/-" then set_sent n.nsent t else | 723 | if n.nlemma = "+/-" then set_sent n.nsent t else |
721 | - if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: t.attrs} | 724 | + if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: t.attrs} |
722 | else set_sent n.nsent t | 725 | else set_sent n.nsent t |
723 | | _ -> failwith "merge_token" | 726 | | _ -> failwith "merge_token" |
724 | 727 | ||
@@ -732,7 +735,7 @@ let merge_letni l seq = | @@ -732,7 +735,7 @@ let merge_letni l seq = | ||
732 | match List.rev seq with | 735 | match List.rev seq with |
733 | last :: l -> | 736 | last :: l -> |
734 | let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in | 737 | let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in |
735 | - Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) | 738 | + Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) |
736 | | _ -> failwith "merge_letni" | 739 | | _ -> failwith "merge_letni" |
737 | 740 | ||
738 | let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"] | 741 | let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"] |
@@ -751,7 +754,7 @@ let merge_paragraph name = function | @@ -751,7 +754,7 @@ let merge_paragraph name = function | ||
751 | | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | 754 | | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) |
752 | | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | 755 | | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) |
753 | | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | 756 | | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) |
754 | - | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) | 757 | + | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp_simple n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) |
755 | | t -> failwith ("merge_paragraph: " ^ string_of_atoken t) | 758 | | t -> failwith ("merge_paragraph: " ^ string_of_atoken t) |
756 | 759 | ||
757 | let test_annotate name typ channel entries = | 760 | let test_annotate name typ channel entries = |
@@ -783,6 +786,60 @@ let test_annotate name typ channel entries = | @@ -783,6 +786,60 @@ let test_annotate name typ channel entries = | ||
783 | (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *) | 786 | (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *) |
784 | ())) | 787 | ())) |
785 | 788 | ||
789 | +type cap = Capital | Small | Sign | ||
790 | + | ||
791 | +let classify_cap s = | ||
792 | + match Xunicode.classified_chars_of_utf8_string s with | ||
793 | + Xunicode.Capital _ :: _ -> Capital | ||
794 | + | Xunicode.ForeignCapital _ :: _ -> Capital | ||
795 | + | Xunicode.Small _ :: _ -> Small | ||
796 | + | Xunicode.ForeignSmall _ :: _ -> Small | ||
797 | + | _ -> Sign | ||
798 | + | ||
799 | +let rec get_ntoken = function | ||
800 | + (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp | ||
801 | + | _ :: l -> get_ntoken l | ||
802 | + | [] -> raise Not_found | ||
803 | + | ||
804 | +let rec disambiguate_capitalics = function | ||
805 | + Token t -> | ||
806 | + (try | ||
807 | + let nlemma,ncat,ninterp = get_ntoken t.attrs in | ||
808 | + let c = match t.token, classify_cap nlemma with | ||
809 | + ENIAMtokenizerTypes.SmallLetter _, Small -> true | ||
810 | + | ENIAMtokenizerTypes.CapLetter _, Capital -> true | ||
811 | + | ENIAMtokenizerTypes.AllSmall _ , Small-> true | ||
812 | + | ENIAMtokenizerTypes.AllCap _, Capital -> true | ||
813 | + (* | ENIAMtokenizerTypes.AllCap _, Small -> true *) | ||
814 | + | ENIAMtokenizerTypes.FirstCap _, Capital -> true | ||
815 | + | ENIAMtokenizerTypes.SomeCap _, Capital -> true | ||
816 | + | ENIAMtokenizerTypes.SomeCap _, Small -> true | ||
817 | + | ENIAMtokenizerTypes.RomanDig _, Capital -> true | ||
818 | + | ENIAMtokenizerTypes.Interp _, _ -> true | ||
819 | + | ENIAMtokenizerTypes.Symbol _, _ -> true | ||
820 | + | ENIAMtokenizerTypes.Dig _, _ -> true | ||
821 | + | ENIAMtokenizerTypes.Other _, _ -> true | ||
822 | + | ENIAMtokenizerTypes.Lemma _, _ -> true | ||
823 | + | ENIAMtokenizerTypes.Proper _, _ -> true | ||
824 | + | ENIAMtokenizerTypes.Compound _, _ -> true | ||
825 | + | ENIAMtokenizerTypes.Tokens _, _ -> true | ||
826 | + | _ -> false in | ||
827 | + Token t, c | ||
828 | + (* let nc = classify_cap nlemma in | ||
829 | + let no = classify_cap t.orth in | ||
830 | + if no = nc then Token t,true else Token t,false *) | ||
831 | + with Not_found -> Token t,true) | ||
832 | + | Seq l -> | ||
833 | + let l,c = Xlist.fold l ([],true) (fun (l,c) t -> | ||
834 | + let t,d = disambiguate_capitalics t in | ||
835 | + t :: l, c && d) in | ||
836 | + Seq(List.rev l), c | ||
837 | + | Variant l -> | ||
838 | + let l2 = Xlist.fold l [] (fun l t -> | ||
839 | + let t,d = disambiguate_capitalics t in | ||
840 | + if d then t :: l else l) in | ||
841 | + if l2 = [] then Variant l,false else Variant l2,true | ||
842 | + | ||
786 | let annotate name sentences = | 843 | let annotate name sentences = |
787 | let tokens = flatten_sentences sentences in | 844 | let tokens = flatten_sentences sentences in |
788 | let tokens = simple_allign "" "" [] tokens in | 845 | let tokens = simple_allign "" "" [] tokens in |
@@ -793,8 +850,21 @@ let annotate name sentences = | @@ -793,8 +850,21 @@ let annotate name sentences = | ||
793 | let eniam_tokens = annotate_variants_par eniam_tokens in | 850 | let eniam_tokens = annotate_variants_par eniam_tokens in |
794 | let m = annotate_paragraph name paragraph tokens eniam_tokens in | 851 | let m = annotate_paragraph name paragraph tokens eniam_tokens in |
795 | let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in | 852 | let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in |
853 | + let m = List.rev (Xlist.fold m [] (fun m t -> | ||
854 | + let t,_ = disambiguate_capitalics t in | ||
855 | + t :: m)) in | ||
796 | paragraph, m | 856 | paragraph, m |
797 | 857 | ||
858 | +let test_disambiguate_capitalics stats name typ channel entries = | ||
859 | + prerr_endline name; | ||
860 | + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> | ||
861 | + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | ||
862 | + let paragraph,tokens = annotate name sentences in | ||
863 | + Xlist.fold tokens stats (fun stats t -> | ||
864 | + let _,c = disambiguate_capitalics t in | ||
865 | + if c then stats else StringQMap.add stats (Printf.sprintf "%s %s" (ENIAMtokens.string_of_tokens 0 t) paragraph)))) | ||
866 | + | ||
867 | + | ||
798 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; | 868 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; |
799 | "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; | 869 | "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; |
800 | "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; | 870 | "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; |
@@ -839,6 +909,8 @@ let _ = | @@ -839,6 +909,8 @@ let _ = | ||
839 | test_annotate name typ channel entries); *) | 909 | test_annotate name typ channel entries); *) |
840 | (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> | 910 | (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> |
841 | test_annotate name typ channel entries); *) | 911 | test_annotate name typ channel entries); *) |
912 | + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | ||
913 | + test_disambiguate_capitalics stats name typ channel entries) in *) | ||
842 | (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in | 914 | (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in |
843 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *) | 915 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *) |
844 | (* ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); *) | 916 | (* ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); *) |
morphology/doc/model2.pdf
No preview for this file type
morphology/resources/alt_supplement.tab