Commit 686d1a264f759063e9655eb298dd53b55f67dfab

Authored by Wojciech Jaworski
1 parent 644020fc

Początek walidacji lematyzacji

NKJP2/data/eniam-correct.tab
... ... @@ -33,3 +33,4 @@ rowecki@wp.pl rowecki @ wp . pl
33 33 28-29 28 -29
34 34 Praca ˝ Praca˝
35 35 marzycielem - marzycielem-
  36 +:)))))) :) ) ) ) ) )
... ...
NKJP2/makefile
... ... @@ -6,7 +6,7 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES=ENIAM_NKJP.ml validateTokenizer.ml #validateMorphology.ml #validateSubsyntax.ml
  9 +SOURCES=ENIAM_NKJP.ml validateTokenizer.ml validateMorphology.ml #validateSubsyntax.ml
10 10  
11 11 all: $(SOURCES)
12 12 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
... ...
NKJP2/validateMorphology.ml
... ... @@ -21,7 +21,33 @@ open ENIAMtokenizerTypes
21 21 open Xstd
22 22 open ValidateTokenizer
23 23  
24   -let rec select_interp = function (* przejście z m1 do m1.p1 *)
  24 +let rec has_brev = function
  25 + BrevLemma _ :: _ -> true
  26 + | _ :: l -> has_brev l
  27 + | [] -> false
  28 +
(* Return the (nlemma, ncat, ninterp) of the first Disamb attribute in the
   list; raises Not_found when the token carries no NKJP disambiguation. *)
let rec get_ntoken = function
    [] -> raise Not_found
  | (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp
  | _ :: rest -> get_ntoken rest
  33 +
(* Count NKJP disambiguation annotations: for every plain token that carries
   a Disamb attribute, add one "nlemma \t ncat \t interp" key to [stats];
   tokens without a Disamb are skipped (get_ntoken raises Not_found).
   Seq and Variant structures are folded recursively.
   NOTE(review): [ninterp] is wrapped in a singleton list for
   string_of_interps — assumes that function expects a list of interps;
   confirm against ENIAMtokens. *)
let rec add_ntoken stats = function
    Token t ->
      (try
        let nlemma,ncat,ninterp = get_ntoken t.attrs in
        StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp])
      with Not_found -> stats)
  | Seq l -> Xlist.fold l stats add_ntoken
  | Variant l -> Xlist.fold l stats add_ntoken
  42 +
(* Fold add_ntoken over every token of every paragraph of one NKJP source
   [name], accumulating annotation counts in [stats]; the source name is
   printed to stderr as progress output.  [typ] and [channel] are unused
   but kept so the signature matches the other per-source folds
   (cf. validate_morphology below). *)
let create_ntoken_list stats name typ channel entries =
  prerr_endline name;
  Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
    Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
      (* [annotate] (ValidateTokenizer) aligns NKJP tokens with ENIAM's *)
      let paragraph,tokens = annotate name sentences in
      Xlist.fold tokens stats add_ntoken))
  49 +
  50 +(* let rec select_interp = function (* przejście z m1 do m1.p1 *)
25 51 "n" :: l,["n1"] :: ll -> ["n1"] :: (select_interp (l,ll))
26 52 | "n" :: l,["n2"] :: ll -> ["n2"] :: (select_interp (l,ll))
27 53 | "n" :: l,["p2"] :: ll -> ["p2"] :: (select_interp (l,ll))
... ... @@ -39,7 +65,7 @@ let rec select_interp = function (* przejście z m1 do m1.p1 *)
39 65 | "n" :: l,["_"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll))
40 66 | a :: l,al :: ll -> if Xlist.mem al a then [a] :: (select_interp (l,ll)) else raise Not_found
41 67 | [],[] -> []
42   - | _ -> raise Not_found
  68 + | _ -> raise Not_found *)
43 69  
44 70 let lowercase s = function
45 71 AllSmall _ -> s
... ... @@ -50,17 +76,210 @@ let lowercase s = function
50 76 else failwith ("lowercase: " ^ s ^ " " ^ c)
51 77 | t -> failwith ("lowercase: " ^ ENIAMtokens.string_of_token t)
52 78  
53   -let match_lemmatize stats t lemma cat interp =
54   - if cat = "brev" then stats else
55   - if t.token = Symbol "." then stats else
(* Run the ENIAM tokenization pipeline on a single lemma string [s] and
   return the lemmatizations of the resulting word-like tokens.
   Steps: classify UTF-8 characters -> tokenize -> normalize; the result
   must be exactly one token (or a Variant of tokens) wrapped in
   <query>...</query> markers, otherwise "lemmatize_string 1" fails.
   Readings kept: pure letter/digit word tokens.  Readings dropped:
   RomanDig tokens, tokens the tokenizer already paired with a Lemma,
   and readings wrapped in sentence/clause markers.  Any other shape is
   a hard error ("lemmatize_string 3").
   NOTE(review): the empty-result [failwith] below reuses the label
   "lemmatize_string 3" of the preceding case; the commented-out draft at
   the end suggests it was meant to be a distinct label ("2"). *)
let lemmatize_string s =
  let l = Xunicode.classified_chars_of_utf8_string s in
  let l = ENIAMtokens.tokenize l in
  let l = ENIAMpatterns.normalize_tokens [] l in
  (* unwrap the <query> ... </query> envelope produced by the tokenizer *)
  let l = match l with
      [Token {token=Interp "<query>"};Variant l;Token {token=Interp "</query>"}] -> l
    | [Token {token=Interp "<query>"};t;Token {token=Interp "</query>"}] -> [t]
    | _ -> failwith ("lemmatize_string 1: " ^ s ^ " " ^ String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_tokens_simple t))) in
  let l = Xlist.fold l [] (fun l -> function
      (* plain word-like tokens: keep for lemmatization *)
      Token ({token=AllSmall _} as t) -> t :: l
    | Token ({token=SmallLetter _} as t) -> t :: l
    | Token ({token=SomeCap _} as t) -> t :: l
    | Token ({token=FirstCap _} as t) -> t :: l
    | Token ({token=AllCap _} as t) -> t :: l
    | Token ({token=CapLetter _} as t) -> t :: l
    | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l
    | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l
    | Token ({token=Proper _} as t) -> t :: l
      (* readings already lemmatized by the tokenizer: drop *)
    | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l
    | Seq[Token {token=SmallLetter _};Token {token=Lemma _}] -> l
    | Seq[Token {token=FirstCap _};Token {token=Lemma _}] -> l
    | Seq[Token {token=CapLetter _};Token {token=Lemma _}] -> l
    | Seq[Token {token=SomeCap _};Token {token=Lemma _}] -> l
    | Seq[Token {token=AllSmall _};Token {token=Lemma _};Token {token=Lemma _}] -> l
      (* readings wrapped in sentence/clause markers: drop *)
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllSmall _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=SmallLetter _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=FirstCap _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllCap _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=CapLetter _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=RomanDig _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=Dig _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllSmall _};Token {token=Lemma _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=FirstCap _};Token {token=Lemma _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=SmallLetter _};Token {token=Lemma _}] -> l
    | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=CapLetter _};Token {token=Lemma _}] -> l
    (* | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllSmall _};Token {token=Lemma _};Token {token=Lemma _}] -> l *)
    | t -> failwith ("lemmatize_string 3: " ^ ENIAMtokens.string_of_tokens_simple t)) in
  if l = [] then failwith "lemmatize_string 3" else
  List.flatten (Xlist.map l ENIAMpaths.lemmatize_token)
  (* match l with
    [] -> failwith "lemmatize_string 2"
  | [t] -> t
  | _ -> Xlist.iter l (fun t -> print_endline (ENIAMtokens.string_of_tokens_simple t)); failwith "lemmatize_string 3" *)
  (* Xlist.iter l (fun t -> print_endline (ENIAMtokens.string_of_tokens_simple t));
  print_endline "";
  Token empty_token_env *)
  125 +
(* Reconcile an NKJP annotation with one ENIAM dictionary reading.
   [get_cat_interp (ncat, cat, ninterp, interp)] returns the category and
   interpretation to keep when the NKJP pair [ncat]/[ninterp] is compatible
   with the ENIAM pair [cat]/[interp]; it raises Not_found otherwise.
   The table handles (a) narrowing ENIAM's fine-grained gender alternatives
   (n1/n2/p1/p2/p3) to the NKJP gender, and (b) verb forms that ENIAM
   stores under the infinitive ("inf") reading, narrowed by aspect.
   FIX(review): the "ppron12" case previously returned category "ppron3" —
   inconsistent with every other case, which preserves the NKJP category;
   corrected to "ppron12".
   @raise Not_found when the pair cannot be reconciled. *)
let get_cat_interp = function
    (* nouns: narrow ENIAM gender alternatives to the NKJP gender *)
    "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]]
  | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]]
  | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]]
  | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]]
  | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]]
  | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]]
  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]]
  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]]
  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]]
  | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]]
  | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]]
  | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]]
  | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]]
  (* pronouns and numerals *)
  | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp
  | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron12",ninterp
  | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: many entries will pass through *)
  | "num","num",ninterp,_ -> "num",ninterp (* FIXME: many entries will pass through *)
  | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]]
  (* adjectives and adverbs *)
  | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp
  | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp
  | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp
  | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp
  | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]]
  (* uninflected categories: require exact agreement *)
  | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found
  | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found
  | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found
  | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found
  | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found
  | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found
  | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found
  | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found
  (* verb forms stored under the "inf" reading: narrow by aspect *)
  | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]]
  | "fin","inf",[n;p;["imperf"]],[["imperf"]] -> "fin",[n;p;["imperf"]]
  | "fin","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "fin",[n;p;["perf"]]
  | "fin","inf",[n;p;["perf"]],[["perf"]] -> "fin",[n;p;["perf"]]
  | "impt","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "impt",[n;p;["imperf"]]
  | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]]
  | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]]
  | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]]
  | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]]
  | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]]
  | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]]
  | "inf","inf",[["perf"]],[["perf"]] -> "inf",[["perf"]]
  | "praet","inf",[n;g;["imperf"]],[["imperf";"perf"]] -> "praet",[n;g;["imperf"]]
  | "praet","inf",[n;g;["imperf"]],[["imperf"]] -> "praet",[n;g;["imperf"]]
  | "praet","inf",[n;g;["perf"]],[["imperf";"perf"]] -> "praet",[n;g;["perf"]]
  | "praet","inf",[n;g;["perf"]],[["perf"]] -> "praet",[n;g;["perf"]]
  | "praet","inf",[n;g;["imperf"];a],[["imperf";"perf"]] -> "praet",[n;g;["imperf"];a]
  | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a]
  | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a]
  | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a]
  | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]]
  | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a]
  | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a]
  | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a]
  | "ppas","inf",[n;c;g;["perf"];a],[["perf"]] -> "ppas",[n;c;g;["perf"];a]
  | "pact","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "pact",[n;c;g;["imperf"];a]
  | "pact","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "pact",[n;c;g;["imperf"];a]
  | "pact","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "pact",[n;c;g;["perf"];a]
  | "pact","inf",[n;c;g;["perf"];a],[["perf"]] -> "pact",[n;c;g;["perf"];a]
  | "pant","inf",[["imperf"]],[["imperf";"perf"]] -> "pant",[["imperf"]]
  | "pant","inf",[["imperf"]],[["imperf"]] -> "pant",[["imperf"]]
  | "pant","inf",[["perf"]],[["imperf";"perf"]] -> "pant",[["perf"]]
  | "pant","inf",[["perf"]],[["perf"]] -> "pant",[["perf"]]
  | "pcon","inf",[["imperf"]],[["imperf";"perf"]] -> "pcon",[["imperf"]]
  | "pcon","inf",[["imperf"]],[["imperf"]] -> "pcon",[["imperf"]]
  | "pcon","inf",[["perf"]],[["imperf";"perf"]] -> "pcon",[["perf"]]
  | "pcon","inf",[["perf"]],[["perf"]] -> "pcon",[["perf"]]
  | "ger","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ger",[n;c;g;["imperf"];a]
  | "ger","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ger",[n;c;g;["imperf"];a]
  | "ger","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ger",[n;c;g;["perf"];a]
  | "ger","inf",[n;c;g;["perf"];a],[["perf"]] -> "ger",[n;c;g;["perf"];a]
  | "imps","inf",[["imperf"]],[["imperf";"perf"]] -> "imps",[["imperf"]]
  | "imps","inf",[["imperf"]],[["imperf"]] -> "imps",[["imperf"]]
  | "imps","inf",[["perf"]],[["imperf";"perf"]] -> "imps",[["perf"]]
  | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]]
  | _ -> raise Not_found
  204 +
(* Manual corrections for glitched lemmas found in the NKJP annotation:
   trailing spaces and numeric prefixes are stripped via the [fixed]
   table; lemmas that cannot be repaired raise Not_found; anything not
   listed is returned unchanged. *)
let correct_nlemma s =
  let rejected = [
    "10-ta"(*"10."*); "10-tej"(*"10."*); "Kaznodziey'a";
    "Naczelna Rada Łowiecka"; "PR-owy"; "starać się"; "vis-à-vis";
    "Ewangelia wg św. Jana"; "`a"; "18-ka"(*"18"*); "16-latek"] in
  if List.mem s rejected then raise Not_found else
  let fixed = [
    "letnia ","letnia"; "10minutowy","minutowy"; "23-letni","letni";
    "40--letni","letni"; "5minutowy","minutowy"; "13-letni","letni";
    "itineraryjny ","itineraryjny"; "Składowy ","Składowy"; "tak ","tak";
    "letni ","letni"; "6-piętrowy","piętrowy"; "6-letni","letni";
    "5—lampowy","lampowy"; "4-piętrowy","piętrowy";
    "3-centymetrowy","centymetrowy"; "34-letni","letni";
    "185-osobowy","osobowy"] in
  try List.assoc s fixed with Not_found -> s
  235 +
(* Try to reproduce the NKJP lemma [nlemma] with the ENIAM lemmatizer and
   record the outcome in [stats]:
   - "incorrect"  when correct_nlemma rejects the lemma (Not_found);
   - "lemmatized" when some ENIAM reading yields the same lemma with a
     category/interpretation reconcilable via get_cat_interp;
   - otherwise a diagnostic line showing the expected token and every
     reading that was produced.
   NOTE(review): the outer [with Not_found] would also mask a Not_found
   escaping lemmatize_string or the folds — presumably only correct_nlemma
   raises here; confirm. *)
let process_ntoken stats nlemma ncat ninterp =
  try
    let nlemma = correct_nlemma nlemma in
    let nl = lemmatize_string nlemma in
    let nl2 = Xlist.fold nl [] (fun nl -> function
        {token=Lemma(lemma,cat,interp)} ->
          (* each reading may carry several interps; keep those that match *)
          Xlist.fold interp nl (fun nl interp ->
            try
              let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
              if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl
            with Not_found -> nl)
      | {token=Dig _} -> nl (* FIXME: todo *)
      | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *)
      | _ -> nl) in
    if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)))
    else StringQMap.add stats "lemmatized"
  with Not_found -> StringQMap.add stats "incorrect"
  253 +
(* Validate one (lemma, cat, interp) triple read back from the ntokens
   dump by delegating to process_ntoken. *)
let validate_ntoken stats ntoken =
  let nlemma, ncat, ninterp = ntoken in
  process_ntoken stats nlemma ncat ninterp
  256 +
  257 +let match_lemmatize stats t =
  258 + if has_brev t.attrs then StringQMap.add stats "brev" else
56 259 let l = ENIAMpaths.lemmatize_token t in
57   - let l2 = Xlist.fold l [] (fun l2 t2 ->
58   - match t2.token with
59   - Lemma(lemma2,cat2,interp2) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2
60   - | Proper(lemma2,cat2,interp2,_) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2
61   - | _ -> t2 :: l2) in
62   - if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ lemma) else
63   - let l3 = Xlist.fold l2 [] (fun l3 t ->
  260 + try
  261 + let nlemma,ncat,ninterp = get_ntoken t.attrs in
  262 + let nlemma = correct_nlemma nlemma in
  263 + let nl = lemmatize_string nlemma in
  264 + let nl2 = Xlist.fold nl [] (fun nl -> function
  265 + {token=Lemma(lemma,cat,interp)} ->
  266 + Xlist.fold interp nl (fun nl interp ->
  267 + try
  268 + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in
  269 + if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl
  270 + with Not_found -> nl)
  271 + | {token=Dig _} -> nl (* FIXME: todo *)
  272 + | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *)
  273 + | _ -> nl) in
  274 + if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token)))
  275 + (* let l2 = Xlist.fold l [] (fun l2 t2 ->
  276 + match t2.token with
  277 + Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2
  278 + (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *)
  279 + | _ -> l2) in
  280 + if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *)
  281 + else StringQMap.add stats "lemmatized"
  282 +(* let l3 = Xlist.fold l2 [] (fun l3 t ->
64 283 match t.token with
65 284 Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3
66 285 | Proper(lemma2,cat2,interp2,_) -> if cat = cat2 then t :: l3 else l3
... ... @@ -88,27 +307,13 @@ let match_lemmatize stats t lemma cat interp =
88 307 | [{token=Lemma _};{token=AllSmall _}] -> stats
89 308 | [{token=Lemma _};{token=SmallLetter _}] -> stats
90 309 | [{token=Lemma _};{token=FirstCap _}] -> stats
91   - | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))
92   -
93   -let is_lemmatizable = function
94   - | AllSmall _ -> true
95   - | SmallLetter _ -> true
96   - | FirstCap _ -> true
97   - | AllCap _ -> true
98   - | CapLetter _ -> true
99   - | SomeCap _ -> true
100   - | t -> false
101   -
102   -let validate_token stats = function
103   - AT(t,[sent,orth,lemma,"brev",interp]) -> StringQMap.add stats "brev"
104   - | AT(t,l(*[sent,orth,lemma,cat,interp]*)) ->
105   - if is_lemmatizable t.token then
106   - StringQMap.add stats "lemmatizable" else StringQMap.add stats "non lemmatizable"
107   - (*match_lemmatize stats t lemma cat interp*)
108   - (* | AT(_,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)*)
109   - | AV(tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)
110   - | AR(stat,tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)
111   - (* | _ -> StringQMap.add stats "validate_token: ni" *)
  310 + | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*)
  311 + with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*)
  312 +
(* Walk a token structure, validating the lemmatization of every plain
   token; Seq and Variant members are folded recursively. *)
let rec validate_token stats = function
    Seq members | Variant members -> Xlist.fold members stats validate_token
  | Token t -> match_lemmatize stats t
112 317  
113 318 let validate_morphology stats name typ channel entries =
114 319 prerr_endline name;
... ... @@ -121,18 +326,28 @@ let validate_morphology stats name typ channel entries =
121 326 Xlist.fold tokens stats validate_token
122 327 (*else stats*)))
123 328  
  329 +let ntokens_filename = "results/ntokens.tab"
  330 +
(* Parse a textual interp such as "sg:nom:m1.m2" into the
   list-of-alternatives form [[...];[...]]: ":" separates tag positions,
   "." separates alternative values within a position.
   NOTE(review): the "\\." pattern suggests Xstring.split treats its
   argument as a regex (the dot must be escaped) — confirm. *)
let parse_ninterp s =
  Xlist.map (Xstring.split ":" s) (fun s -> Xstring.split "\\." s)
  333 +
(* Fold [f] over the ntokens dump: each tab-separated line must have four
   fields; the leading (count) field is ignored and the interp field is
   parsed with parse_ninterp.  Malformed lines are a hard error. *)
let fold_ntokens ntokens_filename s f =
  File.fold_tab ntokens_filename s (fun s -> function
      [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp)
    | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l))
  338 +
124 339 let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
125 340 "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
126 341 "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
127 342 "711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*)
128   - (*"040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
  343 + (* "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
129 344 "120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267";
130 345 "130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078";
131 346 "200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030";
132 347 "330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108";
133 348 "620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071";
134 349 "720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska";
135   - "NIE";"SuperExpress";"TrybunaSlaska";*)
  350 + "NIE";"SuperExpress";"TrybunaSlaska"; *)
136 351 (* "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001";
137 352 "120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011";
138 353 "200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057";
... ... @@ -152,10 +367,15 @@ let selection = StringSet.of_list [(*&quot;Rzeczpospolita&quot;;&quot;200-4-000014&quot;;&quot;040-2-0000
(* Entry point: initialize the tokenizer and inflexion data, run the
   currently selected validation pass (reading the ntokens dump and
   validating each annotation), then print every statistic bucket as
   "<count> \t <key>", sorted ascending by count.
   [let () = ...] (rather than [let _ = ...]) asserts the body is unit. *)
let () =
  ENIAMtokenizer.initialize ();
  ENIAMinflexion.initialize ();
  (* alternative passes kept for switching during development: *)
  (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
    create_ntoken_list stats name typ channel entries) in *)
  (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
    create_ntoken_list stats name typ channel entries) in *)
  let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in
  (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) ->
    validate_morphology stats name typ channel entries) in *)
  (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
    validate_morphology stats name typ channel entries) in *)
  (* invert the map into (count, key) pairs for sorting *)
  let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
  Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k);
  ()
... ...
NKJP2/validateTokenizer.ml
... ... @@ -589,9 +589,9 @@ let rec match_and_combine name paragraph stats l = function
589 589 with Not_found ->
590 590 let e_tokens,n_tokens,ets,l = combine "" "" [] [] (et :: ets) l in
591 591 (* let stats = StringQMap.add stats (string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ name) in *)
592   - (* let stats = StringQMap.add stats (string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ paragraph) in *)
  592 + let stats = StringQMap.add stats (string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ paragraph) in
593 593 (* let stats = StringQMap.add stats ("[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\"" ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" ^ "\t" ^ name) in *)
594   - let stats = StringQMap.add stats ("[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\"" ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" ^ "\t" ^ paragraph) in
  594 + (* let stats = StringQMap.add stats ("[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\"" ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" ^ "\t" ^ paragraph) in *)
595 595 match_and_combine name paragraph stats l ets)))
596 596 | [] -> if l = [] then stats else StringQMap.add stats ("match_and_combine: " ^ name ^ "\t" ^ string_of_nkjp_token_orths l ^ "\t" ^ paragraph)
597 597  
... ... @@ -667,6 +667,16 @@ let set_sent sent t =
667 667 | SentBegEnd -> Token {t with attrs=SentBegEnd :: t.attrs}
668 668 | Space -> failwith "set_sent"
669 669  
(* Placeholder for propagating NKJP sentence-boundary markers ([nsent] of
   the tokens in the second argument) onto the ENIAM tokens [ets];
   currently the markers are ignored and [ets] is returned unchanged
   (FIXME: todo). *)
let set_sent_list ets _nkjp_tokens = ets
  679 +
670 680 let rec allign rev = function
671 681 {orth=""} as t :: ets,nts -> allign ((t,[]) :: rev) (ets,nts)
672 682 | [{orth="."} as x;{orth="''"} as y],[{north="''"};{north="."}] -> List.rev rev @ [x,[];y,[]]
... ... @@ -679,26 +689,69 @@ let rec allign rev = function
679 689 | [],[] -> List.rev rev
680 690 | _ -> failwith "allign 3"
681 691  
(* Convert an NKJP tag-value list into ENIAM's list-of-alternatives form,
   expanding NKJP's coarse genders into ENIAM's fine-grained ones for the
   nominal/participial categories: neuter -> n1/n2 (plus p2/p3 in the
   plural), plural m1 -> m1/p1; gerund neuter -> n2.  Categories listed
   explicitly pass through unchanged; an unknown category passes through
   with a diagnostic printed to stdout. *)
let transform_nkjp_interp cat interp1 =
  if interp1 = [] then [] else
  (* wrap each tag value as a singleton alternative list *)
  let interp = Xlist.map interp1 (fun s -> [s]) in
  match cat with
    "subst" | "ppron12" | "ppron3" | "ppas" | "pact" | "adj" | "num" | "depr" | "numcol" ->
      (match interp with
        ["sg"] :: case :: ["n"] :: l -> ["sg"] :: case :: ["n1";"n2"] :: l
      | ["pl"] :: case :: ["n"] :: l -> ["pl"] :: case :: ["n1";"n2";"p2";"p3"] :: l
      | ["pl"] :: case :: ["m1"] :: l -> ["pl"] :: case :: ["m1";"p1"] :: l
      | l -> l)
  | "ger" ->
      (match interp with
        num :: case :: ["n"] :: l -> num :: case :: ["n2"] :: l
      | l -> l)
  | "praet" | "winien" ->
      (match interp with
        ["sg"] :: ["n"] :: l -> ["sg"] :: ["n1";"n2"] :: l
      | ["pl"] :: ["n"] :: l -> ["pl"] :: ["n1";"n2";"p2";"p3"] :: l
      | ["pl"] :: ["m1"] :: l -> ["pl"] :: ["m1";"p1"] :: l
      | l -> l)
  | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp
  | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp
  714 +
(* Attach the NKJP annotation(s) [l] of one aligned pair to ENIAM token [t]:
   - no NKJP token: plain Token, unannotated;
   - abbreviation (ncat "brev"): remember the expansion as a BrevLemma;
   - lemma "+/-": left unannotated;
   - otherwise, lemmatizable tokens get a Disamb with the NKJP
     lemma/cat/interp (interp converted via transform_nkjp_interp).
   Sentence-boundary info is propagated via set_sent in every annotated
   case.  More than one NKJP token per ENIAM token is an error here. *)
let merge_token = function
    t,[] -> Token t
  | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs}
  | t,[n] ->
      if n.nlemma = "+/-" then set_sent n.nsent t else
      if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: t.attrs}
      else set_sent n.nsent t
  | _ -> failwith "merge_token"
689 723  
690   -let transform_nkjp_interp = function
691   - | l -> (*print_endline ("transform_nkjp_interp: " ^ String.concat ":" l);*) Xlist.map l (fun s -> [s])
(* Merge a "letni"-type resolution (e.g. "23-letni"): the disambiguated
   lemma is the part of the last NKJP token's lemma after the final "-";
   it is attached as a Disamb to the LAST token of the ENIAM sequence
   [seq], while sentence-begin/end markers taken from the last NKJP token
   [n] are attached to the first/last tokens respectively.
   NOTE(review): both markers are read from the same (last) NKJP token —
   confirm this is intended for multi-token [l]. *)
let merge_letni l seq =
  if l = [] then failwith "merge_letni" else
  let n = List.hd (List.rev l) in
  let lemma = List.hd (List.rev (Xstring.split "-" n.nlemma)) in
  let seq = match seq with
      first :: l -> if n.nsent=SentBeg || n.nsent=SentBegEnd then {first with attrs=SentBeg :: first.attrs} :: l else first :: l
    | _ -> failwith "merge_letni" in
  match List.rev seq with
    last :: l ->
      let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in
      Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t))
  | _ -> failwith "merge_letni"
  736 +
  737 +let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"]
  738 +
(* True for a two-token NKJP sequence <orth> "." where <orth> is one of
   the special blabla_orths entries.  (The inline comment is leftover
   tracing from development.) *)
let is_blabla = function
    [{north=s};{north="."}] -> StringSet.mem blabla_orths s (*then (print_endline ("blabla: " ^ s); true) else false*)
  | _ -> false
692 742  
(* Merge one aligned annotation item into the final token structure:
   - AT: a single ENIAM token with its NKJP annotations (merge_token);
   - AV: ambiguous segmentations — each variant is aligned token-by-token
     with the NKJP tokens via allign, except the hard-coded blabla_orths
     sequences which skip alignment (set_sent_list is still a no-op);
   - AR: resolution cases labelled by which tokenization was accepted;
     "letni" gets its own merge, "nkjp-correct" rebuilds tokens from the
     NKJP side (FIXME: set beg/len/next).
   Any other shape is a hard error. *)
let merge_paragraph name = function
    AT(t,l) -> merge_token (t,l)
  | AV(variants,l) ->
      if is_blabla l then Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) else
      Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (allign [] (ets,l)) merge_token)))
  | AR("tys",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
  | AR("letni",variants,l) -> Variant(Xlist.rev_map variants (merge_letni l)) (*in print_endline (ENIAMtokens.string_of_tokens 0 t); t*)
  | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
  | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
  | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t))))
  | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: set beg/len/next *)
  | t -> failwith ("merge_paragraph: " ^ string_of_atoken t)
702 755  
703 756 let test_annotate name typ channel entries =
704 757 (* if name = "620-3-010001854" then prerr_endline "620-3-010001854 omited" else ( *)
... ... @@ -723,7 +776,7 @@ let test_annotate name typ channel entries =
723 776 (* print_endline "test_annotate 2"; *)
724 777 let m = annotate_paragraph name paragraph tokens eniam_tokens in
725 778 (* print_endline "test_annotate 3"; *)
726   - (* check_annotation paragraph m; *)
  779 + check_annotation paragraph m;
727 780 let _ = List.rev (Xlist.rev_map m (merge_paragraph name)) in
728 781 ());
729 782 (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *)
... ... @@ -738,20 +791,21 @@ let annotate name sentences =
738 791 let eniam_tokens = convert_eniam_tokens [] eniam_tokens in
739 792 let eniam_tokens = annotate_variants_par eniam_tokens in
740 793 let m = annotate_paragraph name paragraph tokens eniam_tokens in
  794 + let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in
741 795 paragraph, m
742 796  
743 797 let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
744 798 "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
745 799 "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
746 800 "711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*)
747   - (* "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
  801 + "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083";
748 802 "120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267";
749 803 "130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078";
750 804 "200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030";
751 805 "330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108";
752 806 "620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071";
753 807 "720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska";
754   - "NIE";"SuperExpress";"TrybunaSlaska"; *)
  808 + "NIE";"SuperExpress";"TrybunaSlaska";
755 809 (* "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001";
756 810 "120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011";
757 811 "200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057";
... ... @@ -764,7 +818,7 @@ let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-0000
764 818 "KurierKwidzynski";"NIE";"Rzeczpospolita";"TrybunaSlaska" *)
765 819 (* "110-4-000000102";"120-2-000006";"120-2-900032";"130-5-000000507";"130-5-000001156";
766 820 "620-3-010000835";"GazetaGoleniowska";"KurierKwidzynski";"NIE";"Rzeczpospolita"; *)
767   - (*"110-4-000000102";"KurierKwidzynski";*)(*"620-3-010001496;"*)(*"130-5-000001341";*)(*"620-3-010001854"*)"620-3-010001106"
  821 + (*"110-4-000000102";"KurierKwidzynski";*)(*"620-3-010001496;"*)(*"130-5-000001341";*)(*"620-3-010001854"*)(*"620-3-010001106"*)
768 822 ]
769 823  
770 824 let _ =
... ... @@ -779,8 +833,8 @@ let _ =
779 833 validate_segmentation stats name typ channel entries) in *)
780 834 (* ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] () (fun () (name,typ,channel,entries) ->
781 835 test_annotate name typ channel entries); *)
782   - ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) ->
783   - test_annotate name typ channel entries);
  836 + (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) ->
  837 + test_annotate name typ channel entries); *)
784 838 (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
785 839 Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *)
786 840 ()
... ...
documentation/motto.txt 0 → 100644
  1 +Jeśli wyznajesz jakąkolwiek religię,
  2 +żeby się oświecić,
  3 +musisz ją porzucić.
  4 +Jeśli nie wierzysz w Boga,
  5 +żeby się oświecić,
  6 +musisz dostrzec boskość przenikającą rzeczywistość.
  7 +
  8 +IAAM
... ...
subsyntax/ENIAMsubsyntax.ml
... ... @@ -241,6 +241,11 @@ let rec calculate_quality q = function
241 241 | NotValProper :: l -> calculate_quality (q-1) l
242 242 | LemmLowercase :: l -> calculate_quality q l
243 243 | Roman :: l -> calculate_quality q l
  244 + | SentBeg :: l -> calculate_quality q l
  245 + | SentBegEnd :: l -> calculate_quality q l
  246 + | SentEnd :: l -> calculate_quality q l
  247 + | BrevLemma _ :: l -> calculate_quality q l
  248 + | Disamb _ :: l -> calculate_quality q l
244 249 | [] -> q
245 250  
246 251 let select_tokens2 paths =
... ...
tokenizer/ENIAMtokenizerTypes.ml
... ... @@ -44,7 +44,7 @@ type attr =
44 44 CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman
45 45 | SentBeg | SentEnd | SentBegEnd
46 46 | BrevLemma of string
47   - | Disamb of string * string * string list
  47 + | Disamb of string * string * string list list
48 48  
49 49 (* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających
50 50 informacje o poszczególnych tokenach *)
... ...
tokenizer/ENIAMtokens.ml
... ... @@ -100,7 +100,7 @@ let string_of_attr = function
100 100 | SentEnd -> "NKJP sentence end"
101 101 | SentBegEnd -> "NKJP sentence begin-end"
102 102 | BrevLemma s -> "NKJP brev lemma: " ^ s
103   - | Disamb(lemma,cat,interp) -> "NKJP disamb: " ^ lemma ^ ":" ^ cat ^ ":" ^ String.concat ":" interp
  103 + | Disamb(lemma,cat,interp) -> "NKJP disamb: " ^ lemma ^ ":" ^ cat ^ ":" ^ String.concat ":" (Xlist.map interp (String.concat "."))
104 104  
105 105 let string_of_token_env t =
106 106 sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s;weight=%.2f;attrs=[%s]}" t.orth t.beg t.len t.next (string_of_token t.token) t.weight
... ...