From e8d2c65cd63bb36a671eb2bbd0effd7a39eec931 Mon Sep 17 00:00:00 2001
From: Wojciech Jaworski <wjaworski@mimuw.edu.pl>
Date: Mon, 16 Oct 2017 12:08:47 +0200
Subject: [PATCH] Analiza różnic w lematyzacji

---
 NKJP2/validateMorphology.ml             | 390 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------
 NKJP2/validateTokenizer.ml              |  80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 morphology/doc/model2.pdf               | Bin 355865 -> 0 bytes
 morphology/resources/alt_supplement.tab |   1 +
 4 files changed, 421 insertions(+), 50 deletions(-)

diff --git a/NKJP2/validateMorphology.ml b/NKJP2/validateMorphology.ml
index 6667c5e..ba3a71c 100644
--- a/NKJP2/validateMorphology.ml
+++ b/NKJP2/validateMorphology.ml
@@ -26,10 +26,10 @@ let rec has_brev = function
   | _ :: l -> has_brev l
   | [] -> false
 
-let rec get_ntoken = function
-    (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp
-  | _ :: l -> get_ntoken l
-  | [] -> raise Not_found
+let rec get_brev = function
+    BrevLemma s :: _ -> s
+  | _ :: l -> get_brev l
+  | [] -> failwith "get_brev"
 
 let rec add_ntoken stats = function
     Token t ->
@@ -91,7 +91,7 @@ let lemmatize_string s =
       | Token ({token=FirstCap _} as t) -> t :: l
       | Token ({token=AllCap _} as t) -> t :: l
       | Token ({token=CapLetter _} as t) -> t :: l
-      | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l
+      | Token ({token=RomanDig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l
       | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l
       | Token ({token=Proper _} as t) -> t :: l
       | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l
@@ -125,34 +125,52 @@ let lemmatize_string s =
 
 let get_cat_interp = function
     "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]]
+  | "subst","subst",[n;c;["m1"]],[_;_;["m1"];col] -> "subst",[n;c;["m1"];col]
   | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]]
   | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]]
-  | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]]
-  | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]]
+  | "subst","subst",[n;c;["n"]],[_;_;["n"];col] -> "subst",[n;c;["n"];col]
   | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]]
-  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]]
-  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]]
-  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]]
-  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]]
-  | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]]
-  | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]]
+  | "subst","subst",[n;c;g],[_;_;_] -> "subst",[n;c;g]
+  | "subst","subst",[n;c;g],[_;_;_;_] -> "subst",[n;c;g]
+  | "subst","adj",[n;c;g],_ -> "subst",[n;c;g]
   | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]]
+  | "depr","subst",[["pl"];["acc"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["acc"];["m2"]]
   | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp
-  | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron3",ninterp
-  | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *)
-  | "num","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *)
+  | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron12",ninterp
+  | "numcol","num",ninterp,_ -> "num",ninterp
+  | "num","num",ninterp,_ -> "num",ninterp (* errors cannot be corrected at this stage *)
+  (* | "num","num",[["pl"];c;g;["rec"]],[["sg";"pl"];["nom";"gen";"acc"];["m1";"m2";"m3";"f";"n"];["rec"]] -> "num",[["pl"];c;g;["rec"]]
+  | "num","num",[["pl"];c;["m2"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m2"];["rec"]]
+  | "num","num",[["pl"];c;["m3"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["m3"];["rec"]]
+  | "num","num",[["pl"];c;["f"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["f"];["rec"]]
+  | "num","num",[["pl"];c;["n"];["rec"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["rec"];col] -> "num",[["pl"];c;["n"];["rec"];col]
+  | "num","num",[["pl"];c;["m1"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m1"];["congr"]]
+  | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]]
+  | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]]
+  | "num","num",[["pl"];c;["f"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["f"];["congr"]]
+  | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"f";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col]
+  | "num","num",[["pl"];c;["m2"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m2"];["congr"]]
+  | "num","num",[["pl"];c;["m3"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["m3"];["congr"]]
+  | "num","num",[["pl"];c;["n"];["congr"]],[["pl"];["nom";"acc";"voc"];["m2";"m3";"n"];["congr"];col] -> "num",[["pl"];c;["n"];["congr"];col] *)
   | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]]
   | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp
+  | "adj","adj",ninterp,[["sg";"pl"];["nom";"gen";"dat";"acc";"inst";"loc";"voc"];["m1";"m2";"m3";"f";"n"];["pos"]] -> "adj",ninterp
   | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp
   | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp
   | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp
+  | "adj","adj",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp
   | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]]
-  | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found
+  | "adv","adv",[],[["pos"]] -> "adv",[["pos"]]
+  | "adv",_,ninterp,_ -> "adv",ninterp
+  | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found
+  | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found
+  | "conj",_,ninterp,_ -> "conj",ninterp
+  | "prep","prep",[c1;w],[c2;_] -> if c1 = c2 then "prep",[c1;w] else raise Not_found
   | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found
   | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found
-  | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found
-  | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found
+  | "qub",_,ninterp,_ -> "qub",ninterp
   | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found
+  | "interj",_,ninterp,_ -> "interj",ninterp
   | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found
   | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found
   | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]]
@@ -163,6 +181,8 @@ let get_cat_interp = function
   | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]]
   | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]]
   | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]]
+  | "bedzie","inf",[n;p;["imperf"]],[["imperf"]] -> "bedzie",[n;p;["imperf"]]
+  | "aglt","inf",[n;p;["imperf"];w],[["imperf"]] -> "aglt",[n;p;["imperf"];w]
   | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]]
   | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]]
   | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]]
@@ -175,7 +195,7 @@ let get_cat_interp = function
   | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a]
   | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a]
   | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a]
-  | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]]
+  | "winien","winien",[n;g;["imperf"]],[_;_;["imperf"]] -> "winien",[n;g;["imperf"]]
   | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a]
   | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a]
   | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a]
@@ -202,6 +222,16 @@ let get_cat_interp = function
   | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]]
   | _ -> raise Not_found
 
+let get_lemma_cat_interp = function
+    nlemma,lemma,"adj","ppas",[n;c;g;["pos"]],[["sg"];["nom";"voc"];["m1";"m2";"m3"];a;aff] -> lemma,"ppas",[n;c;g;a;aff]
+  | nlemma,lemma,"adja","adja",[],[] -> lemma,"adja",[]
+  | nlemma,lemma,"subst","subst",[["pl"];c;g],[["pl"];["nom";"voc"];_] -> lemma,"subst",[["pl"];c;g]
+  (* | "5","5","adj","dig",ninterp,[] -> "piąty","adj",ninterp
+  | "6","6","adj","dig",ninterp,[] -> "szósty","adj",ninterp *)
+  (* | "adj","ppas",ninterp,interp -> print_endline (ENIAMtokens.string_of_interps [ninterp] ^ " " ^ ENIAMtokens.string_of_interps [interp]); raise Not_found *)
+  | _ -> raise Not_found
+
+
 let correct_nlemma = function
     "letnia  " -> "letnia"
   | "10minutowy" -> "minutowy"
@@ -233,7 +263,7 @@ let correct_nlemma = function
   | "16-latek" -> raise Not_found
   | s -> s
 
-let process_ntoken stats nlemma ncat ninterp =
+let process_ntoken stats q nlemma ncat ninterp =
   try
     let nlemma = correct_nlemma nlemma in
     let nl = lemmatize_string nlemma in
@@ -241,44 +271,291 @@ let process_ntoken stats nlemma ncat ninterp =
         {token=Lemma(lemma,cat,interp)} ->
           Xlist.fold interp nl (fun nl interp ->
             try
-              let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
-              if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl
+              if lemma = nlemma then
+                let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: nl else
+              let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
+              (Lemma(lemma,cat,[interp])) :: nl
+            with Not_found -> nl)
+      | {token=Dig(_,"dig")} -> nl (* FIXME: todo *)
+            (* (try
+              let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
+              (Lemma(lemma,cat,[interp])) :: nl
+            with Not_found -> nl) *)
+      | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl
+      | {token=Proper(lemma,cat,interp,_)} -> (*print_endline ("P " ^ nlemma);*) nl (* FIXME: todo *)
+      | _ -> nl) in
+    if nl2 = [] then StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q
+    else StringQMap.add_val stats "lemmatized" q
+  with Not_found -> StringQMap.add_val stats "incorrect" q
+
+let process_ntoken2 stats q name id_div orth beg paragraph nlemma ncat ninterp =
+  try
+    let nlemma = correct_nlemma nlemma in
+    let nl = lemmatize_string nlemma in
+    let nl2 = Xlist.fold nl [] (fun nl -> function
+        {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp nl (fun nl interp ->
+            try
+              if lemma = nlemma then
+                let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: nl else
+              let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
+              (Lemma(lemma,cat,[interp])) :: nl
             with Not_found -> nl)
       | {token=Dig _} -> nl (* FIXME: todo *)
+      | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl
       | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *)
       | _ -> nl) in
-    if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)))
-    else StringQMap.add stats "lemmatized"
-  with Not_found -> StringQMap.add stats "incorrect"
+    if nl2 = [] then
+      StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))
+        ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int beg ^ " " ^ orth ^ "\n" ^ paragraph) q
+    else StringQMap.add_val stats "lemmatized" q
+  with Not_found -> StringQMap.add_val stats "incorrect" q
+
+let validate_ntoken stats q (nlemma,ncat,ninterp) =
+  process_ntoken stats q nlemma ncat ninterp
 
-let validate_ntoken stats (nlemma,ncat,ninterp) =
-  process_ntoken stats nlemma ncat ninterp
+let rec validate_ntoken_token name id_div paragraph stats = function
+    Token t ->
+      (try
+        let nlemma,ncat,ninterp = get_ntoken t.attrs in
+        process_ntoken2 stats 1 name id_div t.orth t.beg paragraph nlemma ncat ninterp
+        (* print_endline (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]);
+        Printf.printf "%s\t%d\t%s\t%d\n" name id_div t.orth t.beg;
+        print_endline paragraph;
+        stats *)
+      with Not_found -> stats)
+  | Seq l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph)
+  | Variant l -> Xlist.fold l stats (validate_ntoken_token name id_div paragraph)
 
-let match_lemmatize stats t =
-  if has_brev t.attrs then StringQMap.add stats "brev" else
+let validate_ntoken_entry stats name typ channel entries =
+  prerr_endline name;
+  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
+    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
+      let paragraph,tokens = annotate name sentences in
+      Xlist.fold tokens stats (validate_ntoken_token name id_div paragraph)))
+
+let rec subset_list = function
+    [],[] -> true
+  | [x] :: l1, y :: l2 -> if Xlist.mem y x then subset_list (l1,l2) else false
+  | _ -> false
+
+let match_cat_interp = function
+  | "subst","subst",[nn;nc;ng],[n;c;g;col] -> if subset_list ([nn;nc;ng],[n;c;g]) then "subst",[nn;nc;ng;col] else raise Not_found
+(*  | "numcol","num",ninterp,_ -> "num",ninterp*)
+  | "num","num",[nn;nc;["n"];na],[n;c;g;a;col] -> if subset_list ([nn;nc;["n"];na],[n;c;g;a]) then "num",[nn;nc;["n"];na;col] else raise Not_found
+  | "num","num",[nn;nc;ng;na],[n;c;g;a;col] -> if subset_list ([nn;nc;ng;na],[n;c;g;a]) then "num",[nn;nc;ng;na] else raise Not_found
+  | "adv","adv",[],[["pos"]] -> "adv",[["pos"]]
+  | _ -> raise Not_found
+
+let match_cat_interp_substgender = function
+    "subst","subst",[nn;nc;ng],[n;c;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found
+  | "subst","subst",[nn;nc;ng],[n;c;_;_] -> if subset_list ([nn;nc],[n;c]) then "subst",[nn;nc;ng] else raise Not_found
+  | _ -> raise Not_found
+
+exception HasBrev
+exception NoNtoken
+exception LemmaNotMatched of string * string * string list list * token_env list
+exception MultipleLemmaMatched of string * string * string list list * token list
+
+let rec sort_uniq_rec rev = function
+    [] -> rev
+  | x :: y :: l -> if x = y then sort_uniq_rec rev (y :: l) else sort_uniq_rec (x :: rev) (y :: l)
+  | [x] -> x :: rev
+
+let sort_uniq l =
+  match sort_uniq_rec [] (Xlist.sort l compare) with
+    [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol"]]]);Lemma(lemma2,"subst",[[n2;c2;["n"];["col"]]])] as l ->
+       if lemma1 = lemma2 && n1 = n2 && c1 = c2 then [Lemma(lemma1,"subst",[[n1;c1;["n"];["ncol";"col"]]])] else l
+  | [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("kląsknięcie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l ->
+       if c1 = c2 then [Lemma("kląsknięcie","subst",[[["pl"];c1;["n"];["pt"]]])] else l
+  | [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]]);Lemma("wybrażenie","subst",[[["pl"];c2;["n"];["ncol"]]])] as l ->
+       if c1 = c2 then [Lemma("wybrażenie","subst",[[["pl"];c1;["n"];["pt"]]])] else l
+  | [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]]);Lemma(lemma2,"subst",[[["pl"];c2;["n"];["ncol"]]])] as l ->
+       (* print_endline lemma1; *)
+       if lemma1 = lemma2 && c1 = c2 then [Lemma(lemma1,"subst",[[["pl"];c1;["n"];["pt"]]])] else l
+  | l -> (*print_endline (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)));*) l
+
+type t = TokenMatched | TokenLowercase | TokenBrev | TokenSubstGender | TokenDeviated
+
+let match_lemmatize_simple t nlemma ncat ninterp =
+  let l1 = ENIAMpaths.lemmatize_token t in
+  let l2 = Xlist.fold l1 [] (fun l -> function
+        {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp l (fun l interp ->
+            try
+              if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
+              if lemma = nlemma then
+                let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: l else l
+            with Not_found -> l)
+      | {token=Dig _} -> l (* FIXME: todo *)
+      | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
+      | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
+      | _ -> l) in
+  match sort_uniq l2 with
+    [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1))
+  | [t] -> t, TokenMatched
+  | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
+
+let match_lemmatize_lowercase t nlemma ncat ninterp =
+  let t = match t.token with
+    | FirstCap(s,lower,cl,ll) -> {t with token=AllSmall lower}
+    | CapLetter(s,lower) -> {t with token=SmallLetter lower}
+    | AllCap(_,a,b) -> {t with token=FirstCap(a,b,"","")} (* FIXME: this should be disambiguated *)
+    | _ -> t in
   let l = ENIAMpaths.lemmatize_token t in
-  try
-    let nlemma,ncat,ninterp = get_ntoken t.attrs in
+  let l2 = Xlist.fold l [] (fun l -> function
+      {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp l (fun l interp ->
+            try
+          if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
+              if lemma = nlemma then
+                let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: l else l
+            with Not_found -> l)
+    | {token=Dig _} -> l (* FIXME: todo *)
+    | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
+    | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
+    | _ -> l) in
+  match sort_uniq l2 with
+    [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l))
+  | [t] -> t, TokenLowercase
+  | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
+
+let match_lemmatize_substgender t nlemma ncat ninterp =
+  let l1 = ENIAMpaths.lemmatize_token t in
+  let l2 = Xlist.fold l1 [] (fun l -> function
+        {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp l (fun l interp ->
+            try
+              if lemma = nlemma then
+                let cat,interp = match_cat_interp_substgender (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: l else l
+            with Not_found -> l)
+      | {token=Dig _} -> l (* FIXME: todo *)
+      | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
+      | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
+      | _ -> l) in
+  match sort_uniq l2 with
+    [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1))
+  | [t] -> t, TokenSubstGender
+  | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
+
+let match_lemmatize_deviated t nlemma ncat ninterp =
+  let l1 = ENIAMpaths.lemmatize_token t in
+  let nlemma = try correct_nlemma nlemma with Not_found -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1)) in
+  let nl = lemmatize_string nlemma in
+  let nl2 = Xlist.fold nl [] (fun nl -> function
+        {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp nl (fun nl interp ->
+            try
+              let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
+              (lemma,cat,interp) :: nl
+            with Not_found -> nl)
+      | _ -> nl) in
+  let l2 = Xlist.fold nl2 [] (fun l (nlemma,ncat,ninterp) ->
+    Xlist.fold l1 l (fun l -> function
+        {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp l (fun l interp ->
+            try
+              if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
+              if lemma = nlemma then
+                let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: l else l
+            with Not_found -> l)
+      | _ -> l)) in
+  match sort_uniq l2 with
+    [] -> raise (LemmaNotMatched(nlemma,ncat,ninterp,l1))
+  | [t] -> t, TokenDeviated
+  | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))
+
+let rec match_lemmatize_rec t nlemma ncat ninterp f0 = function
+    f :: l ->
+       (try f t nlemma ncat ninterp
+        with LemmaNotMatched _ -> match_lemmatize_rec t nlemma ncat ninterp f0 l)
+  | [] -> f0 t nlemma ncat ninterp
+
+let match_lemmatize (*stats q name id_div paragraph*) t =
+  if has_brev t.attrs then raise HasBrev (*StringQMap.add_val stats "brev" q*)
+(*    let nlemma = get_brev t.attrs in
+      (let l = ENIAMpaths.lemmatize_token t in
+      let l2 = Xlist.fold l [] (fun l -> function
+            {token=Lemma(lemma,cat,interp)} ->
+              Xlist.fold interp l (fun l interp ->
+                try
+                  if lemma = nlemma then (Lemma(nlemma,cat,[interp])) :: l else l
+                with Not_found -> l)
+          (* | {token=Dig _} -> l (* FIXME: todo *)
+          | {token=RomanDig(_,"roman")} ->
+              if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
+          | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *) *)
+          | _ -> l) in
+      match sort_uniq l2 with
+        [] -> raise (LemmaNotMatched(nlemma,"BREV",[],l))
+      | [t] -> t, TokenBrev
+      | _ -> raise (MultipleLemmaMatched(nlemma,"BREV",[],l2)))*)
+  else
+  let nlemma,ncat,ninterp = try get_ntoken t.attrs with Not_found -> raise NoNtoken in
+  match_lemmatize_rec t nlemma ncat ninterp match_lemmatize_simple
+    [match_lemmatize_simple; match_lemmatize_lowercase; match_lemmatize_substgender; match_lemmatize_deviated]
+  (* let ninterp = if ncat = "adv" && ninterp = [] then [["pos"]] else ninterp in *)
+(*  let l1 = ENIAMpaths.lemmatize_token t in
+  let l2 = Xlist.fold l1 [] (fun l -> function
+        {token=Lemma(lemma,cat,interp)} ->
+          Xlist.fold interp l (fun l interp ->
+            try
+              if lemma = nlemma && cat = ncat && subset_list (ninterp,interp) then (Lemma(nlemma,ncat,[ninterp])) :: l else
+              if lemma = nlemma then
+                let cat,interp = match_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: l else l
+            with Not_found -> l)
+      | {token=Dig _} -> l (* FIXME: todo *)
+      | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: l else l
+      | {token=Proper(lemma,cat,interp,_)} -> l (* FIXME: todo *)
+      | _ -> l) in
+  match sort_uniq l2 with
+    [] -> (*raise (LemmaNotMatched(nlemma,ncat,ninterp,l))*)
+lowercase
+  | [t] -> t, TokenMatched
+  | _ -> raise (MultipleLemmaMatched(nlemma,ncat,ninterp,l2))*)
+
+(*  try
     let nlemma = correct_nlemma nlemma in
     let nl = lemmatize_string nlemma in
     let nl2 = Xlist.fold nl [] (fun nl -> function
         {token=Lemma(lemma,cat,interp)} ->
           Xlist.fold interp nl (fun nl interp ->
             try
-              let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
-              if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl
+              if lemma = nlemma then
+                let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
+                (Lemma(lemma,cat,[interp])) :: nl else
+              let lemma,cat,interp = get_lemma_cat_interp (nlemma,lemma,ncat,cat,ninterp,interp) in
+              (Lemma(lemma,cat,[interp])) :: nl
             with Not_found -> nl)
       | {token=Dig _} -> nl (* FIXME: todo *)
+      | {token=RomanDig(_,"roman")} ->
+          if ncat = "adj" then (Lemma(nlemma,ncat,[ninterp])) :: nl else nl
       | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *)
       | _ -> nl) in
-    if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)))
+    if nl2 = [] then
+      StringQMap.add_val stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))
+        ^ "\n" ^ name ^ " " ^ string_of_int id_div ^ " " ^ string_of_int t.beg ^ " " ^ t.orth ^ "\n" ^ paragraph) q
+    (* if nl2 = [] then StringQMap.add_val stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) q *)
     (* let l2 = Xlist.fold l [] (fun l2 t2 ->
       match t2.token with
         Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2
       (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *)
       | _  -> l2) in
     if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *)
-    else StringQMap.add stats "lemmatized"
+    else StringQMap.add_val stats "lemmatized" q
 (*  let l3 = Xlist.fold l2 [] (fun l3 t ->
     match t.token with
       Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3
@@ -308,12 +585,30 @@ let match_lemmatize stats t =
   | [{token=Lemma _};{token=SmallLetter _}] -> stats
   | [{token=Lemma _};{token=FirstCap _}] -> stats
   | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*)
-  with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*)
+  with Not_found -> StringQMap.add_val stats "no ntoken/incorrect" q
+  (* with Not_found -> StringQMap.add_val stats "no ntoken" q (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) *)*)
 
-let rec validate_token stats = function
-    Token t -> match_lemmatize stats t
-  | Seq l -> Xlist.fold l stats validate_token
-  | Variant l -> Xlist.fold l stats validate_token
+let rec validate_token name id_div paragraph stats = function
+    Token t ->
+      (* if t.orth = "POWIŚLE" then Printf.printf "%s %d %s\n%s\n" name id_div paragraph (ENIAMtokens.string_of_token_env t); *)
+      (try let _,f = match_lemmatize (*stats 1 name id_div paragraph*) t in
+        match f with
+          TokenMatched -> StringQMap.add stats "validated"
+        | TokenLowercase -> StringQMap.add stats "validated as lowercase"
+        | TokenBrev -> StringQMap.add stats "validated abbreviation"
+        | TokenSubstGender -> StringQMap.add stats "validated substgender"
+        | TokenDeviated -> StringQMap.add stats "validated deviated"
+      with
+        HasBrev -> StringQMap.add stats ("has brev: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*))
+      (* | NoNtoken -> StringQMap.add stats ("no ntoken: " ^ t.orth (*^ " " ^ lemma ^ " " ^ cat ^ "\n"*)) *)
+      | NoNtoken -> StringQMap.add stats "no ntoken"
+      | LemmaNotMatched(nlemma,ncat,ninterp,l) ->
+          (* StringQMap.add stats (Printf.sprintf "lemma not matched: %s %s : %s \n%s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph) *)
+          StringQMap.add stats (Printf.sprintf "%s %s %s %d %s\n#%s\n#%s" ncat t.orth name id_div (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])))
+          (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t.token))) paragraph)
+      | MultipleLemmaMatched(nlemma,ncat,ninterp,l) -> StringQMap.add stats (Printf.sprintf "multiple lemma matched: %s %s : %s" t.orth (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp]))) (String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_token t)))))
+  | Seq l -> Xlist.fold l stats (validate_token name id_div paragraph)
+  | Variant l -> Xlist.fold l stats (validate_token name id_div paragraph)
 
 let validate_morphology stats name typ channel entries =
   prerr_endline name;
@@ -323,7 +618,7 @@ let validate_morphology stats name typ channel entries =
       (* print_endline paragraph; *)
       (*let s = "W Specjalnym Ośrodku Szkolno-Wychowawczym" in
       if String.length paragraph >= String.length s && String.sub paragraph 0 (String.length s) = s then*)
-        Xlist.fold tokens stats validate_token
+        Xlist.fold tokens stats (validate_token name id_div paragraph)
       (*else stats*)))
 
 let ntokens_filename = "results/ntokens.tab"
@@ -333,7 +628,7 @@ let parse_ninterp s =
 
 let fold_ntokens ntokens_filename s f =
   File.fold_tab ntokens_filename s (fun s -> function
-      [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp)
+      [q;nlemma;ncat;ninterp] -> f s (int_of_string q) (nlemma,ncat,parse_ninterp ninterp)
     | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l))
 
 let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
@@ -371,12 +666,15 @@ let _ =
     create_ntoken_list stats name typ channel entries) in *)
   (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
     create_ntoken_list stats name typ channel entries) in *)
-  let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in
+  (* let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in *)
+  (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
+    validate_ntoken_entry stats name typ channel entries) in *)
   (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
     validate_morphology stats name typ channel entries) in *)
-  (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
-    validate_morphology stats name typ channel entries) in *)
+  let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
+    validate_morphology stats name typ channel entries) in
   let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
   Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k);
+  flush stdout;
   ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\"");
   ()
diff --git a/NKJP2/validateTokenizer.ml b/NKJP2/validateTokenizer.ml
index 538bf78..cf33f58 100644
--- a/NKJP2/validateTokenizer.ml
+++ b/NKJP2/validateTokenizer.ml
@@ -609,7 +609,7 @@ let rec annotate_paragraph name paragraph l = function
       with Not_found -> (try
         let m,ets,l = annotate_apply_rules (et :: ets) l rules in
         m :: annotate_paragraph name paragraph l ets
-      with Not_found -> failwith "annotate_paragraph 1")))
+      with Not_found -> (*print_endline ("annotate_paragraph 1: " ^ (string_of_vtoken et));*)failwith "annotate_paragraph 1")))
   | [] -> if l = [] then [] else failwith "annotate_paragraph 2"
 
 let validate_segmentation stats name typ channel entries =
@@ -713,12 +713,15 @@ let transform_nkjp_interp cat interp1 =
   | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp
   | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp
 
+(* [transform_nkjp_interp_simple cat interp1] converts an NKJP interpretation
+   (a list of tag-value strings) into the list-of-alternatives form by
+   wrapping every value in a singleton list.  Unlike [transform_nkjp_interp]
+   it performs no tag translation; [cat] is unused and kept only so the two
+   functions stay interchangeable at call sites. *)
+let transform_nkjp_interp_simple cat interp1 =
+  Xlist.map interp1 (fun s -> [s])
+
+(* [merge_token (t,ns)] attaches NKJP annotation to the ENIAM token env [t]:
+   no matched ntoken -> plain [Token]; a single "brev" match -> its expansion
+   is recorded as a [BrevLemma] attribute; a single ordinary match on a
+   lemmatizable token -> a [Disamb] attribute built with the simple
+   (untranslated) interpretation.  The lemma "+/-" and non-lemmatizable
+   tokens keep only the sentence-boundary marking; more than one match is a
+   fatal error. *)
 let merge_token = function
     t,[] -> Token t
   | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs}
   | t,[n] ->
       if n.nlemma = "+/-" then set_sent n.nsent t else
-      if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: t.attrs}
+      if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: t.attrs}
       else set_sent n.nsent t
   | _ -> failwith "merge_token"
 
@@ -732,7 +735,7 @@ let merge_letni l seq =
   match List.rev seq with
     last :: l ->
       let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in
-      Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t))
+      Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp_simple n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t))
   | _ -> failwith "merge_letni"
 
 let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"]
@@ -751,7 +754,7 @@ let merge_paragraph name = function
   | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
   | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
   | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
-  | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *)
+  | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp_simple n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *)
   | t -> failwith ("merge_paragraph: " ^ string_of_atoken t)
 
 let test_annotate name typ channel entries =
@@ -783,6 +786,60 @@ let test_annotate name typ channel entries =
       (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *)
       ()))
 
+(* Capitalisation class of a string, judged by its first character. *)
+type cap = Capital | Small | Sign
+
+(* [classify_cap s] inspects the first Unicode character of the UTF-8 string
+   [s]: a (foreign) capital letter yields [Capital], a (foreign) small letter
+   yields [Small]; anything else — including the empty string — yields
+   [Sign]. *)
+let classify_cap s =
+   match Xunicode.classified_chars_of_utf8_string s with
+        Xunicode.Capital _ :: _ -> Capital
+      | Xunicode.ForeignCapital _ :: _ -> Capital
+      | Xunicode.Small _ :: _ -> Small
+      | Xunicode.ForeignSmall _ :: _ -> Small
+      | _ -> Sign
+
+(* [get_ntoken attrs] returns the (nlemma,ncat,ninterp) triple of the first
+   [Disamb] attribute found in the attribute list [attrs].
+   @raise Not_found when no [Disamb] attribute is present; the caller
+   ([disambiguate_capitalics]) treats that as "no NKJP disambiguation". *)
+let rec get_ntoken = function
+    (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp
+  | _ :: l -> get_ntoken l
+  | [] -> raise Not_found
+
+(* [disambiguate_capitalics t] walks the token tree [t] and checks, for each
+   token carrying a [Disamb] attribute, that the capitalisation class of the
+   NKJP lemma (via [classify_cap]) is compatible with the token's
+   orthographic kind.  Returns the tree paired with a flag: [true] means
+   consistent.  Inconsistent alternatives of a [Variant] are pruned; when no
+   alternative survives the original variant is kept and flagged [false]. *)
+let rec disambiguate_capitalics = function
+    Token t ->
+      (try
+        let nlemma,ncat,ninterp = get_ntoken t.attrs in
+        (* Case-sensitive token kinds must agree with the lemma's leading
+           character class; kinds that carry no case information (digits,
+           symbols, interpunction, ...) always pass. *)
+        let c = match t.token, classify_cap nlemma with
+          ENIAMtokenizerTypes.SmallLetter _, Small -> true
+        | ENIAMtokenizerTypes.CapLetter _, Capital -> true
+        | ENIAMtokenizerTypes.AllSmall _ , Small-> true
+        | ENIAMtokenizerTypes.AllCap _, Capital -> true
+        (* | ENIAMtokenizerTypes.AllCap _, Small -> true *)
+        | ENIAMtokenizerTypes.FirstCap _, Capital -> true
+        | ENIAMtokenizerTypes.SomeCap _, Capital -> true
+        | ENIAMtokenizerTypes.SomeCap _, Small -> true
+        | ENIAMtokenizerTypes.RomanDig _, Capital -> true
+        | ENIAMtokenizerTypes.Interp _, _ -> true
+        | ENIAMtokenizerTypes.Symbol _, _ -> true
+        | ENIAMtokenizerTypes.Dig _, _ -> true
+        | ENIAMtokenizerTypes.Other _, _ -> true
+        | ENIAMtokenizerTypes.Lemma _, _ -> true
+        | ENIAMtokenizerTypes.Proper _, _ -> true
+        | ENIAMtokenizerTypes.Compound _, _ -> true
+        | ENIAMtokenizerTypes.Tokens _, _ -> true
+        | _ -> false in
+        Token t, c
+        (* let nc = classify_cap nlemma in
+        let no = classify_cap t.orth in
+        if no = nc then Token t,true else Token t,false *)
+        (* A token without a [Disamb] attribute is trivially consistent. *)
+      with Not_found -> Token t,true)
+  | Seq l ->
+      (* A sequence is consistent iff every member is. *)
+      let l,c = Xlist.fold l ([],true) (fun (l,c) t ->
+        let t,d = disambiguate_capitalics t in
+        t :: l, c && d) in
+      Seq(List.rev l), c
+  | Variant l ->
+      (* Keep only the consistent alternatives. *)
+      let l2 = Xlist.fold l [] (fun l t ->
+        let t,d = disambiguate_capitalics t in
+        if d then t :: l else l) in
+      if l2 = [] then Variant l,false else Variant l2,true
+
 let annotate name sentences =
   let tokens = flatten_sentences sentences in
   let tokens = simple_allign "" "" [] tokens in
@@ -793,8 +850,21 @@ let annotate name sentences =
   let eniam_tokens = annotate_variants_par eniam_tokens in
   let m = annotate_paragraph name paragraph tokens eniam_tokens in
   let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in
+  let m = List.rev (Xlist.fold m [] (fun m t ->
+    let t,_ = disambiguate_capitalics t in
+    t :: m)) in
   paragraph, m
 
+(* [test_disambiguate_capitalics stats name typ channel entries] runs the
+   annotation pipeline ([annotate]) on every paragraph of [entries] and
+   records in the [stats] quantity map each token tree whose capitalisation
+   is judged inconsistent by [disambiguate_capitalics] (flag = false),
+   printed together with its paragraph for inspection.  [typ], [channel],
+   [id_div] and [has_ne] are accepted for interface parity but unused. *)
+let test_disambiguate_capitalics stats name typ channel entries =
+  prerr_endline name;
+  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
+    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
+      let paragraph,tokens = annotate name sentences in
+      Xlist.fold tokens stats (fun stats t ->
+        let _,c = disambiguate_capitalics t in
+        if c then stats else StringQMap.add stats (Printf.sprintf "%s %s" (ENIAMtokens.string_of_tokens 0 t) paragraph))))
+
+
 let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
 "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
 "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
@@ -839,6 +909,8 @@ let _ =
     test_annotate name typ channel entries); *)
   (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) ->
     test_annotate name typ channel entries); *)
+  (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
+    test_disambiguate_capitalics stats name typ channel entries) in *)
   (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
   Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *)
   (* ignore(Sys.command "mpg123 \"../../Inne/gong/gong_00m_30s.mp3\""); *)
diff --git a/morphology/doc/model2.pdf b/morphology/doc/model2.pdf
index 1ece0e0..c31b9b0 100644
Binary files a/morphology/doc/model2.pdf and b/morphology/doc/model2.pdf differ
diff --git a/morphology/resources/alt_supplement.tab b/morphology/resources/alt_supplement.tab
index 9fe48ec..f75e5ad 100644
--- a/morphology/resources/alt_supplement.tab
+++ b/morphology/resources/alt_supplement.tab
@@ -2,4 +2,5 @@ się	się	qub
 siebie	siebie	siebie:acc.gen
 sobie	siebie	siebie:dat.loc
 sobą	siebie	siebie:inst
+to	to	pred
 
--
libgit2 0.22.2