Commit e8d2c65cd63bb36a671eb2bbd0effd7a39eec931

Authored by Wojciech Jaworski
1 parent be209dd2

Analiza różnic w lematyzacji

NKJP2/validateMorphology.ml
... ... @@ -26,10 +26,10 @@ let rec has_brev = function
26 26 | _ :: l -> has_brev l
27 27 | [] -> false
28 28  
29   -let rec get_ntoken = function
30   - (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp
31   - | _ :: l -> get_ntoken l
32   - | [] -> raise Not_found
  (* get_brev attrs: walk an attribute list and return the string carried by
     the first BrevLemma element.
     Raises Failure with the message "get_brev" when the list holds no
     BrevLemma. *)
  29 +let rec get_brev = function
  30 + BrevLemma s :: _ -> s
  31 + | _ :: l -> get_brev l
  32 + | [] -> failwith "get_brev"
33 33  
34 34 let rec add_ntoken stats = function
35 35 Token t ->
... ... @@ -91,7 +91,7 @@ let lemmatize_string s =
91 91 | Token ({token=FirstCap _} as t) -> t :: l
92 92 | Token ({token=AllCap _} as t) -> t :: l
93 93 | Token ({token=CapLetter _} as t) -> t :: l
94   - | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l
  94 + | Token ({token=RomanDig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l
95 95 | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l
96 96 | Token ({token=Proper _} as t) -> t :: l
97 97 | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l
... ... @@ -125,34 +125,52 @@ let lemmatize_string s =
125 125  
126 126 let get_cat_interp = function
127 127 "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]]
  128 + | "subst","subst",[n;c;["m1"]],[_;_;["m1"];col] -> "subst",[n;c;["m1"];col]
128 129 | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]]
129 130 | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]]
130   - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]]
131   - | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]]
  131 + | "subst","subst",[n;c;["n"]],[_;_;["n"];col] -> "subst",[n;c;["n"];col]
132 132 | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]]
133   - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]]
134   - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]]
135   - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]]
136   - | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]]
137   - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]]
138   - | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]]
  133 + | "subst","subst",[n;c;g],[_;_;_] -> "subst",[n;c;g]
  134 + | "subst","subst",[n;c;g],[_;_;_;_] -> "subst",[n;c;g]
  135 + | "subst","adj",[n;c;g],_ -> "subst",[n;c;g]
139 136 | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]]
  137 + | "depr","subst",[["pl"];["acc"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["acc"];["m2"]]
140 138 | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp
141   - | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron3",ninterp
142   - | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *)
143   - | "num","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *)
  139 + | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron12",ninterp
  140 + | "numcol","num",ninterp,_ -> "num",ninterp
  141 + | "num","num",ninterp,_ -> "num",ninterp (* na tym etapie nie da się skorygować błędów *)
  142 + (* | "num","num",[["pl"];c;g;["rec"]],[["sg";"pl"];["nom";"gen";"acc"];["m1";"m2";"m3";"f";"n"];["rec"]] -> "num",[["pl"];c;g;["rec"]]
  143 + | "num","num",[["pl"];c;["m2"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m2"];["rec"]]
  144 + | "num","num",[["pl"];c;["m3"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m3"];["rec"]]
  145 + | "num","num",[["pl"];c;["f"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["f"];["rec"]]
  146 + | "num","num",[["pl"];c;["n"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["n"];["rec"];col]
  147 + | "num","num",[["pl"];c;["m1"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m1"];["congr"]]
  148 + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]]
  149 + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]]
  150 + | "num","num",[["pl"];c;["f"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["f"];["congr"]]
  151 + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col]
  152 + | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]]
  153 + | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]]
  154 + | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] *)
144 155 | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]]
145 156 | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp
  157 + | "adj","adj",ninterp,[["sg";"pl"];["nom";"gen";"dat";"acc";"inst";"loc";"voc"];["m1";"m2";"m3";"f";"n"];["pos"]] -> "adj",ninterp
146 158 | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp
147 159 | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp
148 160 | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp
  161 + | "adj","adj",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp
149 162 | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]]
150   - | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found
  163 + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]]
  164 + | "adv",_,ninterp,_ -> "adv",ninterp
  165 + | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found
  166 + | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found
  167 + | "conj",_,ninterp,_ -> "conj",ninterp
  168 + | "prep","prep",[c1;w],[c2;_] -> if c1 = c2 then "prep",[c1;w] else raise Not_found
151 169 | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found
152 170 | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found
153   - | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found
154   - | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found
  171 + | "qub",_,ninterp,_ -> "qub",ninterp
155 172 | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found
  173 + | "interj",_,ninterp,_ -> "interj",ninterp
156 174 | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found
157 175 | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found
158 176 | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]]
... ... @@ -163,6 +181,8 @@ let get_cat_interp = function
163 181 | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]]
164 182 | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]]
165 183 | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]]
  184 + | "bedzie","inf",[n;p;["imperf"]],[["imperf"]] -> "bedzie",[n;p;["imperf"]]
  185 + | "aglt","inf",[n;p;["imperf"];w],[["imperf"]] -> "aglt",[n;p;["imperf"];w]
166 186 | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]]
167 187 | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]]
168 188 | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]]
... ... @@ -175,7 +195,7 @@ let get_cat_interp = function
175 195 | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a]
176 196 | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a]
177 197 | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a]
178   - | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]]
  198 + | "winien","winien",[n;g;["imperf"]],[_;_;["imperf"]] -> "winien",[n;g;["imperf"]]
179 199 | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a]
180 200 | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a]
181 201 | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a]
... ... @@ -202,6 +222,16 @@ let get_cat_interp = function
202 222 | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]]
203 223 | _ -> raise Not_found
204 224  
  (* get_lemma_cat_interp (nlemma, lemma, ncat, cat, ninterp, interp):
     fallback mapping used by callers in this file when the two lemmas differ.
     The six-tuple layout follows the call sites, which pass
     (nlemma, lemma, ncat, cat, ninterp, interp); the n-prefixed values are
     presumably the corpus annotation and the others the analyser output -
     TODO confirm.
     Returns a (lemma, cat, interp) triple built from the SECOND tuple
     component (lemma); the first component is bound but never used in the
     visible arms.
     Accepted shapes:
       - "adj" against "ppas" with a positive-degree 4-element ninterp and a
         5-element interp whose first three fields are sg / nom,voc / m1,m2,m3:
         yields "ppas" with number, case, gender from ninterp and the last two
         fields from interp;
       - "adja" against "adja" with both interpretations empty;
       - "subst" against "subst", plural on both sides, interp case nom,voc.
     Raises Not_found for every other input. *)
  225 +let get_lemma_cat_interp = function
  226 + nlemma,lemma,"adj","ppas",[n;c;g;["pos"]],[["sg"];["nom";"voc"];["m1";"m2";"m3"];a;aff] -> lemma,"ppas",[n;c;g;a;aff]
  227 + | nlemma,lemma,"adja","adja",[],[] -> lemma,"adja",[]
  228 + | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g]
  229 + (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp
  230 + | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *)
  231 + (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *)
  232 + | _ -> raise Not_found
  233 +
  234 +
205 235 let correct_nlemma = function
206 236 "letnia " -> "letnia"
207 237 | "10minutowy" -> "minutowy"
... ... @@ -233,7 +263,7 @@ let correct_nlemma = function
233 263 | "16-latek" -> raise Not_found
234 264 | s -> s
235 265  
236   -let process_ntoken stats nlemma ncat ninterp =
  266 +let process_ntoken stats q nlemma ncat ninterp =
237 267 try
238 268 let nlemma = correct_nlemma nlemma in
239 269 let nl = lemmatize_string nlemma in
... ... @@ -241,44 +271,291 @@ let process_ntoken stats nlemma ncat ninterp =
241 271 {token=Lemma(lemma,cat,interp)} ->
242 272 Xlist.fold interp nl (fun nl interp ->
243 273 try
244   - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
245   - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl
  274 + if lemma = nlemma then
  275 + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
  276 + (Lemma(lemma,cat,[interp])) :: nl else
  277 + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
  278 + (Lemma(lemma,cat,[interp])) :: nl
  279 + with Not_found -> nl)
  280 + | {token=Dig(_,"dig")} -> nl (* FIXME: todo *)
  281 + (* (try
  282 + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
  283 + (Lemma(lemma,cat,[interp])) :: nl
  284 + with Not_found -> nl) *)
  285 + | {token=RomanDig(_,"roman")} ->
  286 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl
  287 + | {token=Proper(lemma,cat,interp,_)} -> (*print_endline ("P " ^ nlemma);*) nl (* FIXME: todo *)
  288 + | _ -> nl) in
  289 + if nl2 = [] then StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q
  290 + else StringQMap.add_val stats "lemmatized" q
  291 + with Not_found -> StringQMap.add_val stats "incorrect" q
  292 +
  (* process_ntoken2 stats q name id_div orth beg paragraph nlemma ncat ninterp:
     variant of process_ntoken whose failure key also records where the token
     came from.
     Steps, as written below:
       1. nlemma is passed through correct_nlemma, then lemmatize_string;
       2. every Lemma candidate is checked interpretation by interpretation:
          get_cat_interp when the candidate lemma equals nlemma, otherwise
          get_lemma_cat_interp; a Not_found from either just skips that
          interpretation;
       3. a RomanDig(_,"roman") candidate is accepted as the annotated lemma
          when ncat = "adj"; Dig, Proper and all other tokens contribute nothing;
       4. if nothing was collected, stats gets a key made of ncat, the annotated
          lemma, the candidate tokens, and then name, id_div, beg, orth and the
          paragraph text; otherwise the key is "lemmatized".
     Every update goes through StringQMap.add_val with q as the value argument
     (presumably a count or weight - TODO confirm).
     A Not_found escaping the outer try (correct_nlemma raises it for some
     inputs) is recorded under the key "incorrect".
     NOTE(review): the three lines marked with `-` inside this span are
     deletion records of the earlier StringQMap.add based tail; they are kept
     verbatim as part of the diff. *)
  293 +let process_ntoken2 stats q name id_div orth beg paragraph nlemma ncat ninterp =
  294 + try
  295 + let nlemma = correct_nlemma nlemma in
  296 + let nl = lemmatize_string nlemma in
  297 + let nl2 = Xlist.fold nl [] (fun nl -> function
  298 + {token=Lemma(lemma,cat,interp)} ->
  299 + Xlist.fold interp nl (fun nl interp ->
  300 + try
  301 + if lemma = nlemma then
  302 + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
  303 + (Lemma(lemma,cat,[interp])) :: nl else
  304 + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
  305 + (Lemma(lemma,cat,[interp])) :: nl
246 306 with Not_found -> nl)
247 307 | {token=Dig _} -> nl (* FIXME: todo *)
  308 + | {token=RomanDig(_,"roman")} ->
  309 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl
248 310 | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *)
249 311 | _ -> nl) in
250   - if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)))
251   - else StringQMap.add stats "lemmatized"
252   - with Not_found -> StringQMap.add stats "incorrect"
  312 + if nl2 = [] then
  313 + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))
  314 + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int beg ^ " " ^ orth ^ "\n" ^ paragraph) q
  315 + else StringQMap.add_val stats "lemmatized" q
  316 + with Not_found -> StringQMap.add_val stats "incorrect" q
  317 +
  (* validate_ntoken stats q (nlemma, ncat, ninterp): unpack the triple and
     forward everything to process_ntoken, returning its result. The argument
     order (stats, then q, then the triple) matches how fold_ntokens calls its
     callback in this file. *)
  318 +let validate_ntoken stats q (nlemma,ncat,ninterp) =
  319 + process_ntoken stats q nlemma ncat ninterp
253 320  
254   -let validate_ntoken stats (nlemma,ncat,ninterp) =
255   - process_ntoken stats nlemma ncat ninterp
  (* validate_ntoken_token name id_div paragraph stats tok: recursive walk over
     a Token / Seq / Variant tree that updates stats for every leaf token.
     For a Token, the annotated (nlemma, ncat, ninterp) triple is read from
     t.attrs with get_ntoken and checked by process_ntoken2 with a fixed q of 1,
     passing the token orth and beg together with name, id_div and paragraph.
     A Not_found raised inside the try leaves stats unchanged.
     Seq and Variant nodes fold this same function over their children.
     NOTE(review): this diff deletes the local definition of get_ntoken near the
     top of the file, yet it is still called here; presumably it is now provided
     by another module in scope - confirm the file still compiles. *)
  321 +let rec validate_ntoken_token name id_div paragraph stats = function
  322 + Token t ->
  323 + (try
  324 + let nlemma,ncat,ninterp = get_ntoken t.attrs in
  325 + process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp
  326 + (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]);
  327 + Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg;
  328 + print_endline paragraph;
  329 + stats *)
  330 + with Not_found -> stats)
  331 + | Seq l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph)
  332 + | Variant l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph)
256 333  
257   -let match_lemmatize stats t =
258   - if has_brev t.attrs then StringQMap.add stats "brev" else
  (* validate_ntoken_entry stats name typ channel entries: per-document driver
     for validate_ntoken_token.
     Prints name on stderr, then folds over entries, each an
     (id_div, has_ne, paragraphs) triple, and over each (paragraph, sentences)
     pair inside it. For every pair, annotate name sentences yields a paragraph
     string and a token list; the paragraph returned by annotate shadows the one
     from the pair, and every token is folded through validate_ntoken_token.
     typ, channel and has_ne are not used in this body. *)
  334 +let validate_ntoken_entry stats name typ channel entries =
  335 + prerr_endline name;
  336 + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
  337 + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
  338 + let paragraph,tokens = annotate name sentences in
  339 + Xlist.fold tokens stats (validate_ntoken_token name id_div paragraph)))
  340 +
  (* subset_list (l1, l2): position-wise compatibility test between two lists
     of lists. Returns true only when both lists have the same length and, at
     every position, the element of l1 is a singleton [x] for which
     Xlist.mem y x holds against the matching element y of l2 (presumably
     membership of x in y - TODO confirm the Xlist.mem argument order).
     Any length mismatch, or any l1 element that is not a singleton, gives
     false. *)
  341 +let rec subset_list = function
  342 + [],[] -> true
  343 + | [x] :: l1, y :: l2 -> if Xlist.mem y x then subset_list (l1,l2) else false
  344 + | _ -> false
  345 +
  (* match_cat_interp (ncat, cat, ninterp, interp): reconcile an annotated
     category and interpretation with a candidate one when their shapes differ.
     Returns a (cat, interp) pair or raises Not_found.
     Arms, in order:
       - subst against subst, 3 annotated fields against 4 candidate fields:
         when the first three pass subset_list, the annotated fields are
         returned with the candidate fourth field appended;
       - num against num with annotated gender ["n"], 4 against 5 fields: on a
         subset_list match the candidate fifth field is appended;
       - num against num, any other gender, 4 against 5 fields: on a
         subset_list match the annotated four fields are returned unchanged;
       - adv against adv with an empty annotation and candidate [["pos"]]:
         returns [["pos"]].
     Everything else raises Not_found. *)
  346 +let match_cat_interp = function
  347 + | "subst","subst",[nn;nc;ng],[n;c;g;col] -> if subset_list ([nn;nc;ng],[n;c;g]) then "subst",[nn;nc;ng;col] else raise Not_found
  348 +(* | "numcol","num",ninterp,_ -> "num",ninterp*)
  349 + | "num","num",[nn;nc;["n"];na],[n;c;g;a;col] -> if subset_list ([nn;nc;["n"];na],[n;c;g;a]) then "num",[nn;nc;["n"];na;col] else raise Not_found
  350 + | "num","num",[nn;nc;ng;na],[n;c;g;a;col] -> if subset_list ([nn;nc;ng;na],[n;c;g;a]) then "num",[nn;nc;ng;na] else raise Not_found
  351 + | "adv","adv",[],[["pos"]] -> "adv",[["pos"]]
  352 + | _ -> raise Not_found
  353 +
  (* match_cat_interp_substgender (ncat, cat, ninterp, interp): relaxed subst
     against subst comparison that ignores the candidate gender.
     Only the first two fields (bound as nn, nc against n, c) are checked with
     subset_list; the candidate third field, and a fourth one when present, are
     discarded. On success the three annotated fields are returned unchanged
     under the category "subst". Raises Not_found otherwise. *)
  354 +let match_cat_interp_substgender = function
  355 + "subst","subst",[nn;nc;ng],[n;c;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found
  356 + | "subst","subst",[nn;nc;ng],[n;c;_;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found
  357 + | _ -> raise Not_found
  358 +
  (* Outcomes signalled by the match_lemmatize family in this file:
       HasBrev              - raised by match_lemmatize when has_brev holds for
                              the token attributes;
       NoNtoken             - raised by match_lemmatize when get_ntoken raises
                              Not_found;
       LemmaNotMatched      - carries (nlemma, ncat, ninterp, candidates) where
                              candidates is the token_env list that was
                              searched without success;
       MultipleLemmaMatched - carries (nlemma, ncat, ninterp, matches) where
                              matches is the token list that matched. *)
  359 +exception HasBrev
  360 +exception NoNtoken
  361 +exception LemmaNotMatched of string * string * string list list * token_env list
  362 +exception MultipleLemmaMatched of string * string * string list list * token list
  363 +
  (* sort_uniq_rec rev l: drop adjacent duplicates from l, pushing the kept
     elements onto the accumulator rev. Because elements are consed onto rev,
     the result lists the surviving elements in the REVERSE of their input
     order, followed by whatever rev held initially. Only neighbouring equal
     elements are merged, so the input must already be sorted for a full
     deduplication. *)
  364 +let rec sort_uniq_rec rev = function
  365 + [] -> rev
  366 + | x :: y :: l -> if x = y then sort_uniq_rec rev (y :: l) else sort_uniq_rec (x :: rev) (y :: l)
  367 + | [x] -> x :: rev
  368 +
  (* sort_uniq l: sort l with the polymorphic compare, remove duplicates via
     sort_uniq_rec, then collapse a few specific two-element results into one:
       - two subst lemmas, gender ["n"], differing only in ["ncol"] against
         ["col"], become one lemma with ["ncol";"col"];
       - two plural subst lemmas, gender ["n"], differing only in ["pt"]
         against ["ncol"], become the ["pt"] variant.
     Any other list is returned as produced by sort_uniq_rec.
     The element order inside the patterns relies on sort_uniq_rec reversing
     the sorted list (this assumes Xlist.sort sorts ascending - TODO confirm).
     NOTE(review): the two arms hard-coding the lemmas "kląsknięcie" and
     "wybrażenie" look subsumed by the general pt / ncol arm that follows them,
     which returns the same value under the same condition. *)
  369 +let sort_uniq l =
  370 + match sort_uniq_rec [] (Xlist.sort l compare) with
  371 + [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol"]]]);Lemma(lemma2,"subst",[[n2;c2;["n"];["col"]]])] as l ->
  372 + if lemma1 = lemma2 && n1 = n2 && c1 = c2 then [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol";"col"]]])] else l
  373 + | [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("kląsknięcie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l ->
  374 + if c1 = c2 then [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]])] else l
  375 + | [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("wybrażenie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l ->
  376 + if c1 = c2 then [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]])] else l
  377 + | [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]]);Lemma(lemma2,"subst",[[["pl"];c2;["n"];["ncol"]]])] as l ->
  378 + (* print_endline lemma1; *)
  379 + if lemma1 = lemma2 && c1 = c2 then [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]])] else l
  380 + | l -> (*print_endline (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)));*) l
  381 +
  (* Tag telling which matching strategy accepted a token: TokenMatched,
     TokenLowercase, TokenSubstGender and TokenDeviated are returned by the
     match_lemmatize_simple / _lowercase / _substgender / _deviated functions
     respectively. TokenBrev is produced only by code that is commented out in
     the part of the file visible here. *)
  382 +type t = TokenMatched | TokenLowercase | TokenBrev | TokenSubstGender | TokenDeviated
  383 +
  (* match_lemmatize_simple t nlemma ncat ninterp: first and strictest matching
     strategy.
     The token t is lemmatized with ENIAMpaths.lemmatize_token and every Lemma
     candidate is checked interpretation by interpretation:
       - same lemma, same category and subset_list (ninterp, interp): the
         annotated Lemma(nlemma, ncat, [ninterp]) is kept;
       - same lemma only: match_cat_interp decides, and its Not_found skips the
         interpretation.
     A RomanDig(_,"roman") candidate is accepted as the annotation when
     ncat = "adj"; Dig, Proper and other tokens contribute nothing.
     Result after sort_uniq:
       - empty: raises LemmaNotMatched carrying the full candidate list;
       - one element: returns it paired with TokenMatched;
       - more: raises MultipleLemmaMatched carrying the matches collected
         before sort_uniq was applied. *)
  384 +let match_lemmatize_simple t nlemma ncat ninterp =
  385 + let l1 = ENIAMpaths.lemmatize_token t in
  386 + let l2 = Xlist.fold l1 [] (fun l -> function
  387 + {token=Lemma(lemma,cat,interp)} ->
  388 + Xlist.fold interp l (fun l interp ->
  389 + try
  390 + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
  391 + if lemma = nlemma then
  392 + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
  393 + (Lemma(lemma,cat,[interp])) :: l else l
  394 + with Not_found -> l)
  395 + | {token=Dig _} -> l (* FIXME: todo *)
  396 + | {token=RomanDig(_,"roman")} ->
  397 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
  398 + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
  399 + | _ -> l) in
  400 + match sort_uniq l2 with
  401 + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1))
  402 + | [t] -> t, TokenMatched
  403 + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
  404 +
  (* match_lemmatize_lowercase t nlemma ncat ninterp: same matching procedure
     as match_lemmatize_simple, applied after rewriting the capitalisation
     class of the token:
       - FirstCap(s, lower, cl, ll) becomes AllSmall lower;
       - CapLetter(s, lower) becomes SmallLetter lower;
       - AllCap(_, a, b) becomes FirstCap(a, b, "", "");
       - any other token is left as it is.
     The rewritten token is lemmatized and matched exactly as in
     match_lemmatize_simple; a unique match is returned paired with
     TokenLowercase. LemmaNotMatched carries the candidates obtained from the
     rewritten token; MultipleLemmaMatched carries the collected matches.
     NOTE(review): the two lines marked with `-` inside this span are deletion
     records from the previous body of match_lemmatize; they are kept verbatim
     as part of the diff. *)
  405 +let match_lemmatize_lowercase t nlemma ncat ninterp =
  406 + let t = match t.token with
  407 + | FirstCap(s,lower,cl,ll) -> {t with token=AllSmall lower}
  408 + | CapLetter(s,lower) -> {t with token=SmallLetter lower}
  409 + | AllCap(_,a,b) -> {t with token=FirstCap(a,b,"","")} (* FIXME: this should be disambiguated *)
  410 + | _ -> t in
259 411 let l = ENIAMpaths.lemmatize_token t in
260   - try
261   - let nlemma,ncat,ninterp = get_ntoken t.attrs in
  412 + let l2 = Xlist.fold l [] (fun l -> function
  413 + {token=Lemma(lemma,cat,interp)} ->
  414 + Xlist.fold interp l (fun l interp ->
  415 + try
  416 + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
  417 + if lemma = nlemma then
  418 + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
  419 + (Lemma(lemma,cat,[interp])) :: l else l
  420 + with Not_found -> l)
  421 + | {token=Dig _} -> l (* FIXME: todo *)
  422 + | {token=RomanDig(_,"roman")} ->
  423 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
  424 + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
  425 + | _ -> l) in
  426 + match sort_uniq l2 with
  427 + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l))
  428 + | [t] -> t, TokenLowercase
  429 + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
  430 +
  (* match_lemmatize_substgender t nlemma ncat ninterp: matching strategy that
     tolerates a gender difference on nouns.
     The token is lemmatized with ENIAMpaths.lemmatize_token; a Lemma candidate
     interpretation is accepted only when its lemma equals nlemma and
     match_cat_interp_substgender succeeds (its Not_found skips the
     interpretation). There is no exact-match shortcut here, unlike in
     match_lemmatize_simple.
     RomanDig(_,"roman") is accepted as the annotation when ncat = "adj";
     Dig, Proper and other tokens contribute nothing.
     A unique match after sort_uniq is returned paired with TokenSubstGender;
     no match raises LemmaNotMatched and several raise MultipleLemmaMatched. *)
  431 +let match_lemmatize_substgender t nlemma ncat ninterp =
  432 + let l1 = ENIAMpaths.lemmatize_token t in
  433 + let l2 = Xlist.fold l1 [] (fun l -> function
  434 + {token=Lemma(lemma,cat,interp)} ->
  435 + Xlist.fold interp l (fun l interp ->
  436 + try
  437 + if lemma = nlemma then
  438 + let cat,interp = match_cat_interp_substgender (ncat,cat,ninterp,interp) in
  439 + (Lemma(lemma,cat,[interp])) :: l else l
  440 + with Not_found -> l)
  441 + | {token=Dig _} -> l (* FIXME: todo *)
  442 + | {token=RomanDig(_,"roman")} ->
  443 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
  444 + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
  445 + | _ -> l) in
  446 + match sort_uniq l2 with
  447 + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1))
  448 + | [t] -> t, TokenSubstGender
  449 + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
  450 +
  (* match_lemmatize_deviated t nlemma ncat ninterp: last-resort strategy that
     first rewrites the annotation itself and then matches the rewritten
     variants against the token.
     Steps, as written below:
       1. l1 is the candidate list from ENIAMpaths.lemmatize_token t;
       2. nlemma is passed through correct_nlemma; if that raises Not_found,
          LemmaNotMatched is raised with the uncorrected nlemma;
       3. the corrected nlemma is analysed with lemmatize_string and every
          Lemma interpretation is offered to get_lemma_cat_interp, giving a list
          of alternative (lemma, cat, interp) triples (Not_found skips one);
       4. each alternative triple is matched against the Lemma candidates in l1
          with the same two tests as match_lemmatize_simple (exact match with
          subset_list, else match_cat_interp); non-Lemma candidates are ignored.
     A unique match after sort_uniq is returned paired with TokenDeviated.
     The exceptions raised at the end carry the corrected nlemma together with
     the original ncat and ninterp, because the triple bound inside the fold
     shadows those names only within the fold. *)
  451 +let match_lemmatize_deviated t nlemma ncat ninterp =
  452 + let l1 = ENIAMpaths.lemmatize_token t in
  453 + let nlemma = try correct_nlemma nlemma with Not_found -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) in
  454 + let nl = lemmatize_string nlemma in
  455 + let nl2 = Xlist.fold nl [] (fun nl -> function
  456 + {token=Lemma(lemma,cat,interp)} ->
  457 + Xlist.fold interp nl (fun nl interp ->
  458 + try
  459 + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
  460 + (lemma,cat,interp) :: nl
  461 + with Not_found -> nl)
  462 + | _ -> nl) in
  463 + let l2 = Xlist.fold nl2 [] (fun l (nlemma,ncat,ninterp) ->
  464 + Xlist.fold l1 l (fun l -> function
  465 + {token=Lemma(lemma,cat,interp)} ->
  466 + Xlist.fold interp l (fun l interp ->
  467 + try
  468 + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
  469 + if lemma = nlemma then
  470 + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
  471 + (Lemma(lemma,cat,[interp])) :: l else l
  472 + with Not_found -> l)
  473 + | _ -> l)) in
  474 + match sort_uniq l2 with
  475 + [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1))
  476 + | [t] -> t, TokenDeviated
  477 + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
  478 +
  (* match_lemmatize_rec t nlemma ncat ninterp f0 strategies: try each strategy
     in order, each called as f t nlemma ncat ninterp.
     A LemmaNotMatched from one strategy moves on to the next; any other
     exception (MultipleLemmaMatched included) propagates immediately.
     When the list is exhausted, f0 is called and its result or exception
     becomes the final outcome, so f0 decides which failure the caller sees. *)
  479 +let rec match_lemmatize_rec t nlemma ncat ninterp f0 = function
  480 + f :: l ->
  481 + (try f t nlemma ncat ninterp
  482 + with LemmaNotMatched _ -> match_lemmatize_rec t nlemma ncat ninterp f0 l)
  483 + | [] -> f0 t nlemma ncat ninterp
  484 +
  (* match_lemmatize t: entry point of token validation. Returns a pair of the
     matched token and a tag of type t naming the strategy that succeeded.
     Raises:
       - HasBrev when has_brev holds for t.attrs (the abbreviation matching
         below is commented out);
       - NoNtoken when get_ntoken raises Not_found for t.attrs;
       - LemmaNotMatched / MultipleLemmaMatched as raised by the strategies.
     Strategies are tried through match_lemmatize_rec in the order simple,
     lowercase, substgender, deviated. match_lemmatize_simple is also passed as
     the final fallback, so when every strategy fails it runs a second time and
     its LemmaNotMatched is the one reported (presumably so the error carries
     the unmodified annotation and candidates - TODO confirm intent).
     NOTE(review): get_ntoken is deleted from this file earlier in this diff;
     presumably it now comes from another module in scope - confirm. *)
  485 +let match_lemmatize (*stats q name id_div paragraph*) t =
  486 + if has_brev t.attrs then raise HasBrev (*StringQMap.add_val stats "brev" q*)
  487 +(* let nlemma = get_brev t.attrs in
  488 + (let l = ENIAMpaths.lemmatize_token t in
  489 + let l2 = Xlist.fold l [] (fun l -> function
  490 + {token=Lemma(lemma,cat,interp)} ->
  491 + Xlist.fold interp l (fun l interp ->
  492 + try
  493 + if lemma = nlemma then (Lemma(nlemma,cat,[interp])) :: l else l
  494 + with Not_found -> l)
  495 + (* | {token=Dig _} -> l (* FIXME: todo *)
  496 + | {token=RomanDig(_,"roman")} ->
  497 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
  498 + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) *)
  499 + | _ -> l) in
  500 + match sort_uniq l2 with
  501 + [] -> raise (LemmaNotMatched(nlemma,"BREV",[],l))
  502 + | [t] -> t, TokenBrev
  503 + | _ -> raise (MultipleLemmaMatched(nlemma,"BREV",[],l2)))*)
  504 + else
  505 + let nlemma,ncat,ninterp = try get_ntoken t.attrs with Not_found -> raise NoNtoken in
  506 + match_lemmatize_rec t nlemma ncat ninterp match_lemmatize_simple
  507 + [match_lemmatize_simple; match_lemmatize_lowercase; match_lemmatize_substgender; match_lemmatize_deviated]
  508 + (* let ninterp = if ncat = "adv" && ninterp = [] then [["pos"]] else ninterp in *)
  509 +(* let l1 = ENIAMpaths.lemmatize_token t in
  510 + let l2 = Xlist.fold l1 [] (fun l -> function
  511 + {token=Lemma(lemma,cat,interp)} ->
  512 + Xlist.fold interp l (fun l interp ->
  513 + try
  514 + if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
  515 + if lemma = nlemma then
  516 + let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
  517 + (Lemma(lemma,cat,[interp])) :: l else l
  518 + with Not_found -> l)
  519 + | {token=Dig _} -> l (* FIXME: todo *)
  520 + | {token=RomanDig(_,"roman")} ->
  521 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
  522 + | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
  523 + | _ -> l) in
  524 + match sort_uniq l2 with
  525 + [] -> (*raise (LemmaNotMatched(nlemma,ncat,ninterp,l))*)
  526 +lowercase
  527 + | [t] -> t, TokenMatched
  528 + | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))*)
  529 +
  530 +(* try
262 531 let nlemma = correct_nlemma nlemma in
263 532 let nl = lemmatize_string nlemma in
264 533 let nl2 = Xlist.fold nl [] (fun nl -> function
265 534 {token=Lemma(lemma,cat,interp)} ->
266 535 Xlist.fold interp nl (fun nl interp ->
267 536 try
268   - let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
269   - if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl
  537 + if lemma = nlemma then
  538 + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
  539 + (Lemma(lemma,cat,[interp])) :: nl else
  540 + let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
  541 + (Lemma(lemma,cat,[interp])) :: nl
270 542 with Not_found -> nl)
271 543 | {token=Dig _} -> nl (* FIXME: todo *)
  544 + | {token=RomanDig(_,"roman")} ->
  545 + if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl
272 546 | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *)
273 547 | _ -> nl) in
274   - if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)))
  548 + if nl2 = [] then
  549 + StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))
  550 + ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int t.beg ^ " " ^ t.orth ^ "\n" ^ paragraph) q
  551 + (* if nl2 = [] then StringQMap.add_val stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q *)
275 552 (* let l2 = Xlist.fold l [] (fun l2 t2 ->
276 553 match t2.token with
277 554 Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2
278 555 (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *)
279 556 | _ -> l2) in
280 557 if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *)
281   - else StringQMap.add stats "lemmatized"
  558 + else StringQMap.add_val stats "lemmatized" q
282 559 (* let l3 = Xlist.fold l2 [] (fun l3 t ->
283 560 match t.token with
284 561 Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3
... ... @@ -308,12 +585,30 @@ let match_lemmatize stats t =
308 585 | [{token=Lemma _};{token=SmallLetter _}] -> stats
309 586 | [{token=Lemma _};{token=FirstCap _}] -> stats
310 587 | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*)
311   - with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*)
  588 + with Not_found -> StringQMap.add_val stats "no ntoken/incorrect" q
  589 + (* with Not_found -> StringQMap.add_val stats "no ntoken" q (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) *)*)
312 590  
313   -let rec validate_token stats = function
314   - Token t -> match_lemmatize stats t
315   - | Seq l -> Xlist.fold l stats validate_token
316   - | Variant l -> Xlist.fold l stats validate_token
  (* validate_token name id_div paragraph stats tok: recursive walk over a
     Token / Seq / Variant tree that records the outcome of match_lemmatize
     for every leaf token in stats via StringQMap.add.
     Keys written for a Token:
       - on success, one fixed label per strategy tag: "validated",
         "validated as lowercase", "validated abbreviation",
         "validated substgender" or "validated deviated";
       - HasBrev: "has brev: " followed by the token orth;
       - NoNtoken: "no ntoken";
       - LemmaNotMatched: a multi-line key holding ncat, orth, name, id_div and
         the annotated lemma, then the candidate tokens and the paragraph, each
         on a line starting with #;
       - MultipleLemmaMatched: "multiple lemma matched: " with orth, the
         annotated lemma and the matching tokens.
     The matched token returned by match_lemmatize is discarded; only the tag
     is used. Seq and Variant nodes fold this same function over their
     children. *)
  591 +let rec validate_token name id_div paragraph stats = function
  592 + Token t ->
  593 + (* if t.orth = "POWIŚLE" then Printf.printf "%s %d %s\n%s\n" name id_div paragraph (ENIAMtokens.string_of_token_env t); *)
  594 + (try let _,f = match_lemmatize (*stats 1 name id_div paragraph*) t in
  595 + match f with
  596 + TokenMatched -> StringQMap.add stats "validated"
  597 + | TokenLowercase -> StringQMap.add stats "validated as lowercase"
  598 + | TokenBrev -> StringQMap.add stats "validated abbreviation"
  599 + | TokenSubstGender -> StringQMap.add stats "validated substgender"
  600 + | TokenDeviated -> StringQMap.add stats "validated deviated"
  601 + with
  602 + HasBrev -> StringQMap.add stats ("has brev: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*))
  603 + (* | NoNtoken -> StringQMap.add stats ("no ntoken: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) *)
  604 + | NoNtoken -> StringQMap.add stats "no ntoken"
  605 + | LemmaNotMatched(nlemma,ncat,ninterp,l) ->
  606 + (* StringQMap.add stats (Printf.sprintf "lemma not matched: %s %s : %s \n%s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) *)
  607 + StringQMap.add stats (Printf.sprintf "%s %s %s %d %s\n#%s\n#%s" ncat t.orth name id_div (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])))
  608 + (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph)
  609 + | MultipleLemmaMatched(nlemma,ncat,ninterp,l) -> StringQMap.add stats (Printf.sprintf "multiple lemma matched: %s %s : %s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)))))
  610 + | Seq l -> Xlist.fold l stats (validate_token name id_div paragraph)
  611 + | Variant l -> Xlist.fold l stats (validate_token name id_div paragraph)
317 612  
318 613 let validate_morphology stats name typ channel entries =
319 614 prerr_endline name;
... ... @@ -323,7 +618,7 @@ let validate_morphology stats name typ channel entries =
323 618 (* print_endline paragraph; *)
324 619 (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in
325 620 if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*)
326   - Xlist.fold tokens stats validate_token
  621 + Xlist.fold tokens stats (validate_token name id_div paragraph)
327 622 (*else stats*)))
328 623  
329 624 let ntokens_filename = "results/ntokens.tab"
... ... @@ -333,7 +628,7 @@ let parse_ninterp s =
333 628  
(* fold_ntokens ntokens_filename s f: fold f over the rows that File.fold_tab
   yields for ntokens_filename, starting from the accumulator s.
   A row must have exactly four fields [q; nlemma; ncat; ninterp]; f is then
   called as f acc (int_of_string q) (nlemma, ncat, parse_ninterp ninterp).
   Any other row shape raises Failure with the message "fold_ntokens: "
   followed by the row fields joined with tabs.
   NOTE(review): a non-numeric q makes int_of_string raise its own Failure,
   which does not mention the offending row.
   The line marked with `-` is the deletion record of the previous arm, which
   ignored the first field; it is kept verbatim as part of the diff. *)
334 629 let fold_ntokens ntokens_filename s f =
335 630 File.fold_tab ntokens_filename s (fun s -> function
336   - [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp)
  631 + [q;nlemma;ncat;ninterp] -> f s (int_of_string q) (nlemma,ncat,parse_ninterp ninterp)
337 632 | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l))
338 633  
339 634 let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
... ... @@ -371,12 +666,15 @@ let _ =
371 666 create_ntoken_list stats name typ channel entries) in *)
372 667 (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
373 668 create_ntoken_list stats name typ channel entries) in *)
374   - let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in
  669 + (* let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in *)
  670 + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
  671 + validate_ntoken_entry stats name typ channel entries) in *)
375 672 (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
376 673 validate_morphology stats name typ channel entries) in *)
377   - (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
378   - validate_morphology stats name typ channel entries) in *)
  674 + let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
  675 + validate_morphology stats name typ channel entries) in
379 676 let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
380 677 Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k);
  678 + flush stdout;
381 679 ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\"");
382 680 ()
... ...
NKJP2/validateTokenizer.ml
... ... @@ -609,7 +609,7 @@ let rec annotate_paragraph name paragraph l = function
609 609 with Not_found -> (try
610 610 let m,ets,l = annotate_apply_rules (et :: ets) l rules in
611 611 m :: annotate_paragraph name paragraph l ets
612   - with Not_found -> failwith "annotate_paragraph 1")))
  612 + with Not_found -> (*print_endline ("annotate_paragraph 1: " ^ (string_of_vtoken et));*)failwith "annotate_paragraph 1")))
613 613 | [] -> if l = [] then [] else failwith "annotate_paragraph 2"
614 614  
615 615 let validate_segmentation stats name typ channel entries =
... ... @@ -713,12 +713,15 @@ let transform_nkjp_interp cat interp1 =
713 713 | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp
714 714 | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp
715 715  
(* Simplified counterpart of transform_nkjp_interp: every element of [interp1]
   is wrapped in its own one-element list, keeping the original order.
   The category [cat] is accepted only for signature compatibility and is not
   inspected. *)
let transform_nkjp_interp_simple cat interp1 =
  let singleton tag = [tag] in
  Xlist.map interp1 singleton
  718 +
(* Merges a token environment [t] with the list of NKJP tokens aligned to it.

   - no NKJP token: [t] is returned unchanged as [Token t];
   - a single NKJP token of category "brev": a [BrevLemma] attribute carrying
     its lemma is prepended to [t.attrs];
   - any other single NKJP token: a [Disamb] attribute (lemma, category and the
     interpretation converted by transform_nkjp_interp_simple) is prepended,
     unless the lemma is "+/-" or [t.token] is not lemmatizable, in which case
     [t] is left as it is.
   In the single-token cases the result is passed through set_sent together
   with the sentence marker [n.nsent] of the NKJP token.

   Raises [Failure "merge_token"] when more than one NKJP token is supplied. *)
let merge_token = function
  | t, [] -> Token t
  | t, [{ncat="brev"} as n] ->
      set_sent n.nsent {t with attrs = BrevLemma n.nlemma :: t.attrs}
  | t, [n] ->
      let wants_disamb = n.nlemma <> "+/-" && is_lemmatizable t.token in
      let merged =
        if wants_disamb then
          {t with attrs =
             Disamb(n.nlemma, n.ncat, transform_nkjp_interp_simple n.ncat n.ninterp)
             :: t.attrs}
        else t in
      set_sent n.nsent merged
  | _ -> failwith "merge_token"
724 727  
... ... @@ -732,7 +735,7 @@ let merge_letni l seq =
732 735 match List.rev seq with
733 736 last :: l ->
734 737 let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in
735   - Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t))
  738 + Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t))
736 739 | _ -> failwith "merge_letni"
737 740  
(* String set holding the four orthographic forms "8.12", "9.11", "1.1" and
   "1.2". NOTE(review): its consumer is not visible in this excerpt; confirm
   the intended use before extending the list. *)
738 741 let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"]
... ... @@ -751,7 +754,7 @@ let merge_paragraph name = function
751 754 | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
752 755 | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
753 756 | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
754   - | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *)
  757 + | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp_simple n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *)
755 758 | t -> failwith ("merge_paragraph: " ^ string_of_atoken t)
756 759  
757 760 let test_annotate name typ channel entries =
... ... @@ -783,6 +786,60 @@ let test_annotate name typ channel entries =
783 786 (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *)
784 787 ()))
785 788  
(* Capitalisation class of a string, decided by its first character. *)
type cap = Capital | Small | Sign

(* Classifies [s] by the first element returned by
   Xunicode.classified_chars_of_utf8_string: native or foreign capital letters
   give [Capital], native or foreign small letters give [Small], and everything
   else (including an empty classification) gives [Sign]. *)
let classify_cap s =
  match Xunicode.classified_chars_of_utf8_string s with
  | (Xunicode.Capital _ | Xunicode.ForeignCapital _) :: _ -> Capital
  | (Xunicode.Small _ | Xunicode.ForeignSmall _) :: _ -> Small
  | _ -> Sign
  798 +
(* Returns the (lemma, category, interpretation) triple stored in the first
   [Disamb] attribute of the given attribute list.
   Raises [Not_found] when the list contains no [Disamb] attribute. *)
let rec get_ntoken attrs =
  match attrs with
  | [] -> raise Not_found
  | (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> (nlemma, ncat, ninterp)
  | _ :: rest -> get_ntoken rest
  803 +
(* Checks whether the capitalisation of tokens agrees with the capitalisation
   of the lemma stored in their [Disamb] attribute, and prunes variants that
   do not agree. Returns the resulting token structure paired with a flag.

   - [Token t]: the token is returned unchanged. The flag is [true] when the
     token kind is compatible with classify_cap of the lemma, and also when no
     [Disamb] attribute is present ([Not_found]).
   - [Seq l]: every element is processed recursively; the flag is the
     conjunction of the element flags.
   - [Variant l]: only variants whose flag is [true] are kept (they end up in
     reverse order because they are accumulated by consing). When no variant
     survives, the original, unprocessed [Variant l] is returned with
     [false]. *)
let rec disambiguate_capitalics = function
  | Token t ->
      let agrees =
        try
          let nlemma, _, _ = get_ntoken t.attrs in
          (match t.token, classify_cap nlemma with
           (* small-letter tokens need a small-letter lemma *)
           | (ENIAMtokenizerTypes.SmallLetter _
             | ENIAMtokenizerTypes.AllSmall _), Small -> true
           (* capitalised tokens need a capitalised lemma; AllCap with a
              small-letter lemma is intentionally not accepted *)
           | (ENIAMtokenizerTypes.CapLetter _
             | ENIAMtokenizerTypes.AllCap _
             | ENIAMtokenizerTypes.FirstCap _
             | ENIAMtokenizerTypes.RomanDig _), Capital -> true
           (* mixed-case tokens accept either letter class *)
           | ENIAMtokenizerTypes.SomeCap _, (Capital | Small) -> true
           (* token kinds for which capitalisation is not checked at all *)
           | (ENIAMtokenizerTypes.Interp _
             | ENIAMtokenizerTypes.Symbol _
             | ENIAMtokenizerTypes.Dig _
             | ENIAMtokenizerTypes.Other _
             | ENIAMtokenizerTypes.Lemma _
             | ENIAMtokenizerTypes.Proper _
             | ENIAMtokenizerTypes.Compound _
             | ENIAMtokenizerTypes.Tokens _), _ -> true
           | _ -> false)
        with Not_found -> true in
      Token t, agrees
  | Seq l ->
      let rev_items, all_agree =
        Xlist.fold l ([], true) (fun (acc, ok) item ->
          let item, item_ok = disambiguate_capitalics item in
          item :: acc, ok && item_ok) in
      Seq (List.rev rev_items), all_agree
  | Variant l ->
      let kept =
        Xlist.fold l [] (fun acc variant ->
          match disambiguate_capitalics variant with
          | variant, true -> variant :: acc
          | _, false -> acc) in
      (match kept with
       | [] -> Variant l, false
       | _ -> Variant kept, true)
  842 +
786 843 let annotate name sentences =
787 844 let tokens = flatten_sentences sentences in
788 845 let tokens = simple_allign "" "" [] tokens in
... ... @@ -793,8 +850,21 @@ let annotate name sentences =
793 850 let eniam_tokens = annotate_variants_par eniam_tokens in
794 851 let m = annotate_paragraph name paragraph tokens eniam_tokens in
795 852 let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in
  853 + let m = List.rev (Xlist.fold m [] (fun m t ->
  854 + let t,_ = disambiguate_capitalics t in
  855 + t :: m)) in
796 856 paragraph, m
797 857  
(* Diagnostic pass over one corpus file: prints [name] on stderr, annotates
   every paragraph of [entries] with annotate, and for each top-level token
   for which disambiguate_capitalics reports [false] adds the key
   "<token rendering> <paragraph>" to [stats] with StringQMap.add.
   [typ] and [channel] are not used. Returns the updated [stats]. *)
let test_disambiguate_capitalics stats name typ channel entries =
  prerr_endline name;
  (* Records a single token in the statistics when its capitalisation check
     fails. *)
  let check_token paragraph stats t =
    match disambiguate_capitalics t with
    | _, true -> stats
    | _, false ->
        let key =
          Printf.sprintf "%s %s" (ENIAMtokens.string_of_tokens 0 t) paragraph in
        StringQMap.add stats key in
  (* The paragraph text returned by annotate replaces the one stored in the
     entry. *)
  let check_paragraph stats (_, sentences) =
    let paragraph, tokens = annotate name sentences in
    Xlist.fold tokens stats (check_token paragraph) in
  Xlist.fold entries stats (fun stats (_, _, paragraphs) ->
    Xlist.fold paragraphs stats check_paragraph)
  866 +
  867 +
798 868 let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
799 869 "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
800 870 "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
... ... @@ -839,6 +909,8 @@ let _ =
839 909 test_annotate name typ channel entries); *)
840 910 (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) ->
841 911 test_annotate name typ channel entries); *)
  912 + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
  913 + test_disambiguate_capitalics stats name typ channel entries) in *)
842 914 (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
843 915 Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *)
844 916 (* ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); *)
... ...
morphology/doc/model2.pdf
No preview for this file type
morphology/resources/alt_supplement.tab
... ... @@ -2,4 +2,5 @@ się się qub
2 2 siebie siebie siebie:acc.gen
3 3 sobie siebie siebie:dat.loc
4 4 sobą siebie siebie:inst
  5 +to to pred
5 6  
... ...