Commit 686d1a264f759063e9655eb298dd53b55f67dfab
1 parent 644020fc
Beginning of lemmatization validation
Showing 8 changed files with 347 additions and 59 deletions
NKJP2/data/eniam-correct.tab
NKJP2/makefile
... | ... | @@ -6,7 +6,7 @@ OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | -SOURCES=ENIAM_NKJP.ml validateTokenizer.ml #validateMorphology.ml #validateSubsyntax.ml | |
9 | +SOURCES=ENIAM_NKJP.ml validateTokenizer.ml validateMorphology.ml #validateSubsyntax.ml | |
10 | 10 | |
11 | 11 | all: $(SOURCES) |
12 | 12 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^ |
... | ... |
NKJP2/validateMorphology.ml
... | ... | @@ -21,7 +21,33 @@ open ENIAMtokenizerTypes |
21 | 21 | open Xstd |
22 | 22 | open ValidateTokenizer |
23 | 23 | |
24 | -let rec select_interp = function (* transition from m1 to m1.p1 *) | 
24 | +let rec has_brev = function | |
25 | + BrevLemma _ :: _ -> true | |
26 | + | _ :: l -> has_brev l | |
27 | + | [] -> false | |
28 | + | |
29 | +let rec get_ntoken = function | |
30 | + (Disamb(nlemma,ncat,ninterp) : attr) :: _ -> nlemma,ncat,ninterp | |
31 | + | _ :: l -> get_ntoken l | |
32 | + | [] -> raise Not_found | |
33 | + | |
34 | +let rec add_ntoken stats = function | |
35 | + Token t -> | |
36 | + (try | |
37 | + let nlemma,ncat,ninterp = get_ntoken t.attrs in | |
38 | + StringQMap.add stats (nlemma ^ "\t" ^ ncat ^ "\t" ^ ENIAMtokens.string_of_interps [ninterp]) | |
39 | + with Not_found -> stats) | |
40 | + | Seq l -> Xlist.fold l stats add_ntoken | |
41 | + | Variant l -> Xlist.fold l stats add_ntoken | |
42 | + | |
43 | +let create_ntoken_list stats name typ channel entries = | |
44 | + prerr_endline name; | |
45 | + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) -> | |
46 | + Xlist.fold paragraphs stats (fun stats (paragraph,sentences) -> | |
47 | + let paragraph,tokens = annotate name sentences in | |
48 | + Xlist.fold tokens stats add_ntoken)) | |
49 | + | |
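(* A hedged sketch of the round trip, inferred from the commented-out main at
   the bottom of this file: the histogram printed there ("%d\t%s\n") over the
   keys built in add_ntoken is what results/ntokens.tab stores, so fold_ntokens
   later in this file reads each line back as [count; nlemma; ncat; ninterp].
   The entry used here is a hypothetical sample: *)
let _ =
  let stats = StringQMap.add StringQMap.empty "kot\tsubst\tsg:nom:m2" in
  StringQMap.fold stats () (fun () k v -> Printf.printf "%d\t%s\n" v k)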
50 | +(* let rec select_interp = function (* transition from m1 to m1.p1 *) | 
25 | 51 | "n" :: l,["n1"] :: ll -> ["n1"] :: (select_interp (l,ll)) |
26 | 52 | | "n" :: l,["n2"] :: ll -> ["n2"] :: (select_interp (l,ll)) |
27 | 53 | | "n" :: l,["p2"] :: ll -> ["p2"] :: (select_interp (l,ll)) |
... | ... | @@ -39,7 +65,7 @@ let rec select_interp = function (* transition from m1 to m1.p1 *) |
39 | 65 | | "n" :: l,["_"] :: ll -> ["n1";"n2";"p2";"p3"] :: (select_interp (l,ll)) |
40 | 66 | | a :: l,al :: ll -> if Xlist.mem al a then [a] :: (select_interp (l,ll)) else raise Not_found |
41 | 67 | | [],[] -> [] |
42 | - | _ -> raise Not_found | |
68 | + | _ -> raise Not_found *) | |
43 | 69 | |
44 | 70 | let lowercase s = function |
45 | 71 | AllSmall _ -> s |
... | ... | @@ -50,17 +76,210 @@ let lowercase s = function |
50 | 76 | else failwith ("lowercase: " ^ s ^ " " ^ c) |
51 | 77 | | t -> failwith ("lowercase: " ^ ENIAMtokens.string_of_token t) |
52 | 78 | |
53 | -let match_lemmatize stats t lemma cat interp = | |
54 | - if cat = "brev" then stats else | |
55 | - if t.token = Symbol "." then stats else | |
79 | +let lemmatize_string s = | |
80 | + let l = Xunicode.classified_chars_of_utf8_string s in | |
81 | + let l = ENIAMtokens.tokenize l in | |
82 | + let l = ENIAMpatterns.normalize_tokens [] l in | |
83 | + let l = match l with | |
84 | + [Token {token=Interp "<query>"};Variant l;Token {token=Interp "</query>"}] -> l | |
85 | + | [Token {token=Interp "<query>"};t;Token {token=Interp "</query>"}] -> [t] | |
86 | + | _ -> failwith ("lemmatize_string 1: " ^ s ^ " " ^ String.concat " " (Xlist.map l (fun t -> ENIAMtokens.string_of_tokens_simple t))) in | |
87 | + let l = Xlist.fold l [] (fun l -> function | |
88 | + Token ({token=AllSmall _} as t) -> t :: l | |
89 | + | Token ({token=SmallLetter _} as t) -> t :: l | |
90 | + | Token ({token=SomeCap _} as t) -> t :: l | |
91 | + | Token ({token=FirstCap _} as t) -> t :: l | |
92 | + | Token ({token=AllCap _} as t) -> t :: l | |
93 | + | Token ({token=CapLetter _} as t) -> t :: l | |
94 | + | Token ({token=RomanDig _}) -> (*print_endline ("lemmatize_string: " ^ s);*) (*t ::*) l | |
95 | + | Token ({token=Dig _} as t) -> (*print_endline ("lemmatize_string: " ^ s);*) t :: l | |
96 | + | Token ({token=Proper _} as t) -> t :: l | |
97 | + | Seq[Token {token=AllSmall _};Token {token=Lemma _}] -> l | |
98 | + | Seq[Token {token=SmallLetter _};Token {token=Lemma _}] -> l | |
99 | + | Seq[Token {token=FirstCap _};Token {token=Lemma _}] -> l | |
100 | + | Seq[Token {token=CapLetter _};Token {token=Lemma _}] -> l | |
101 | + | Seq[Token {token=SomeCap _};Token {token=Lemma _}] -> l | |
102 | + | Seq[Token {token=AllSmall _};Token {token=Lemma _};Token {token=Lemma _}] -> l | |
103 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllSmall _}] -> l | |
104 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=SmallLetter _}] -> l | |
105 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=FirstCap _}] -> l | |
106 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllCap _}] -> l | |
107 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=CapLetter _}] -> l | |
108 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=RomanDig _}] -> l | |
109 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=Dig _}] -> l | |
110 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllSmall _};Token {token=Lemma _}] -> l | |
111 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=FirstCap _};Token {token=Lemma _}] -> l | |
112 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=SmallLetter _};Token {token=Lemma _}] -> l | |
113 | + | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=CapLetter _};Token {token=Lemma _}] -> l | |
114 | + (* | Seq[Token {token=Interp "<sentence>"};Token {token=Interp "<clause>"};Token {token=AllSmall _};Token {token=Lemma _};Token {token=Lemma _}] -> l *) | |
115 | + | t -> failwith ("lemmatize_string 3: " ^ ENIAMtokens.string_of_tokens_simple t)) in | |
116 | + if l = [] then failwith "lemmatize_string 2" else | 
117 | + List.flatten (Xlist.map l ENIAMpaths.lemmatize_token) | |
118 | + (* match l with | |
119 | + [] -> failwith "lemmatize_string 2" | |
120 | + | [t] -> t | |
121 | + | _ -> Xlist.iter l (fun t -> print_endline (ENIAMtokens.string_of_tokens_simple t)); failwith "lemmatize_string 3" *) | |
122 | + (* Xlist.iter l (fun t -> print_endline (ENIAMtokens.string_of_tokens_simple t)); | |
123 | + print_endline ""; | |
124 | + Token empty_token_env *) | |
125 | + | |
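(* A minimal usage sketch of lemmatize_string, not a definitive trace: the two
   initialize calls mirror the entry point at the bottom of this file, the
   sample word is arbitrary, and the result type (a token_env list) follows
   from ENIAMpaths.lemmatize_token: *)
let _ =
  ENIAMtokenizer.initialize ();
  ENIAMinflexion.initialize ();
  Xlist.iter (lemmatize_string "kotem") (fun t ->
    print_endline (ENIAMtokens.string_of_token t.token))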
126 | +let get_cat_interp = function | |
127 | + "subst","subst",[n;c;["m1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] | |
128 | + | "subst","subst",[n;c;["m2"]],[_;_;["m2"]] -> "subst",[n;c;["m2"]] | |
129 | + | "subst","subst",[n;c;["m3"]],[_;_;["m3"]] -> "subst",[n;c;["m3"]] | |
130 | + | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] | |
131 | + | "subst","subst",[n;c;["n1";"n2"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] | |
132 | + | "subst","subst",[n;c;["f"]],[_;_;["f"]] -> "subst",[n;c;["f"]] | |
133 | + | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n1"]] -> "subst",[n;c;["n1"]] | |
134 | + | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["n2"]] -> "subst",[n;c;["n2"]] | |
135 | + | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p2"]] -> "subst",[n;c;["p2"]] | |
136 | + | "subst","subst",[n;c;["n1";"n2";"p2";"p3"]],[_;_;["p3"]] -> "subst",[n;c;["p3"]] | |
137 | + | "subst","subst",[n;c;["m1";"p1"]],[_;_;["m1"]] -> "subst",[n;c;["m1"]] | |
138 | + | "subst","subst",[n;c;["m1";"p1"]],[_;_;["p1"]] -> "subst",[n;c;["p1"]] | |
139 | + | "depr","subst",[["pl"];["nom"];["m2"]],[["sg"];["nom"];["m1"]] -> "depr",[["pl"];["nom"];["m2"]] | |
140 | + | "ppron3","ppron3",ninterp,[["sg"];["nom"];["m1";"m2";"m3"];["ter"];_;_] -> "ppron3",ninterp | |
141 | + | "ppron12","ppron12",ninterp,[_;["nom"];_;_] -> "ppron3",ninterp | |
142 | + | "numcol","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) | |
143 | + | "num","num",ninterp,_ -> "num",ninterp (* FIXME: wiele wpisów przejdzie *) | |
144 | + | "siebie","siebie",[[c]],[["acc";"gen"]] -> "siebie",[[c]] | |
145 | + | "adj","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adj",ninterp | |
146 | + | "adja","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adja",ninterp | |
147 | + | "adjc","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjc",ninterp | |
148 | + | "adjp","adj",ninterp,[["sg"];["nom";"voc"];["m1";"m2";"m3"];["pos"]] -> "adjp",ninterp | |
149 | + | "adv","adv",[[g]],[["pos"]] -> "adv",[[g]] | |
150 | + | "adv","adv",ninterp,interp -> if ninterp = interp then "adv",ninterp else raise Not_found | |
151 | + | "prep","prep",ninterp,interp -> if ninterp = interp then "prep",ninterp else raise Not_found | |
152 | + | "qub","qub",ninterp,interp -> if ninterp = interp then "qub",ninterp else raise Not_found | |
153 | + | "conj","conj",ninterp,interp -> if ninterp = interp then "conj",ninterp else raise Not_found | |
154 | + | "comp","comp",ninterp,interp -> if ninterp = interp then "comp",ninterp else raise Not_found | |
155 | + | "interj","interj",ninterp,interp -> if ninterp = interp then "interj",ninterp else raise Not_found | |
156 | + | "burk","burk",ninterp,interp -> if ninterp = interp then "burk",ninterp else raise Not_found | |
157 | + | "pred","pred",ninterp,interp -> if ninterp = interp then "pred",ninterp else raise Not_found | |
158 | + | "fin","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "fin",[n;p;["imperf"]] | |
159 | + | "fin","inf",[n;p;["imperf"]],[["imperf"]] -> "fin",[n;p;["imperf"]] | |
160 | + | "fin","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "fin",[n;p;["perf"]] | |
161 | + | "fin","inf",[n;p;["perf"]],[["perf"]] -> "fin",[n;p;["perf"]] | |
162 | + | "impt","inf",[n;p;["imperf"]],[["imperf";"perf"]] -> "impt",[n;p;["imperf"]] | |
163 | + | "impt","inf",[n;p;["imperf"]],[["imperf"]] -> "impt",[n;p;["imperf"]] | |
164 | + | "impt","inf",[n;p;["perf"]],[["imperf";"perf"]] -> "impt",[n;p;["perf"]] | |
165 | + | "impt","inf",[n;p;["perf"]],[["perf"]] -> "impt",[n;p;["perf"]] | |
166 | + | "inf","inf",[["imperf"]],[["imperf";"perf"]] -> "inf",[["imperf"]] | |
167 | + | "inf","inf",[["imperf"]],[["imperf"]] -> "inf",[["imperf"]] | |
168 | + | "inf","inf",[["perf"]],[["imperf";"perf"]] -> "inf",[["perf"]] | |
169 | + | "inf","inf",[["perf"]],[["perf"]] -> "inf",[["perf"]] | |
170 | + | "praet","inf",[n;g;["imperf"]],[["imperf";"perf"]] -> "praet",[n;g;["imperf"]] | |
171 | + | "praet","inf",[n;g;["imperf"]],[["imperf"]] -> "praet",[n;g;["imperf"]] | |
172 | + | "praet","inf",[n;g;["perf"]],[["imperf";"perf"]] -> "praet",[n;g;["perf"]] | |
173 | + | "praet","inf",[n;g;["perf"]],[["perf"]] -> "praet",[n;g;["perf"]] | |
174 | + | "praet","inf",[n;g;["imperf"];a],[["imperf";"perf"]] -> "praet",[n;g;["imperf"];a] | |
175 | + | "praet","inf",[n;g;["imperf"];a],[["imperf"]] -> "praet",[n;g;["imperf"];a] | |
176 | + | "praet","inf",[n;g;["perf"];a],[["imperf";"perf"]] -> "praet",[n;g;["perf"];a] | |
177 | + | "praet","inf",[n;g;["perf"];a],[["perf"]] -> "praet",[n;g;["perf"];a] | |
178 | + | "winien","inf",[n;g;["imperf"]],[["imperf"]] -> "winien",[n;g;["imperf"]] | |
179 | + | "ppas","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["imperf"];a] | |
180 | + | "ppas","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ppas",[n;c;g;["imperf"];a] | |
181 | + | "ppas","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ppas",[n;c;g;["perf"];a] | |
182 | + | "ppas","inf",[n;c;g;["perf"];a],[["perf"]] -> "ppas",[n;c;g;["perf"];a] | |
183 | + | "pact","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "pact",[n;c;g;["imperf"];a] | |
184 | + | "pact","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "pact",[n;c;g;["imperf"];a] | |
185 | + | "pact","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "pact",[n;c;g;["perf"];a] | |
186 | + | "pact","inf",[n;c;g;["perf"];a],[["perf"]] -> "pact",[n;c;g;["perf"];a] | |
187 | + | "pant","inf",[["imperf"]],[["imperf";"perf"]] -> "pant",[["imperf"]] | |
188 | + | "pant","inf",[["imperf"]],[["imperf"]] -> "pant",[["imperf"]] | |
189 | + | "pant","inf",[["perf"]],[["imperf";"perf"]] -> "pant",[["perf"]] | |
190 | + | "pant","inf",[["perf"]],[["perf"]] -> "pant",[["perf"]] | |
191 | + | "pcon","inf",[["imperf"]],[["imperf";"perf"]] -> "pcon",[["imperf"]] | |
192 | + | "pcon","inf",[["imperf"]],[["imperf"]] -> "pcon",[["imperf"]] | |
193 | + | "pcon","inf",[["perf"]],[["imperf";"perf"]] -> "pcon",[["perf"]] | |
194 | + | "pcon","inf",[["perf"]],[["perf"]] -> "pcon",[["perf"]] | |
195 | + | "ger","inf",[n;c;g;["imperf"];a],[["imperf";"perf"]] -> "ger",[n;c;g;["imperf"];a] | |
196 | + | "ger","inf",[n;c;g;["imperf"];a],[["imperf"]] -> "ger",[n;c;g;["imperf"];a] | |
197 | + | "ger","inf",[n;c;g;["perf"];a],[["imperf";"perf"]] -> "ger",[n;c;g;["perf"];a] | |
198 | + | "ger","inf",[n;c;g;["perf"];a],[["perf"]] -> "ger",[n;c;g;["perf"];a] | |
199 | + | "imps","inf",[["imperf"]],[["imperf";"perf"]] -> "imps",[["imperf"]] | |
200 | + | "imps","inf",[["imperf"]],[["imperf"]] -> "imps",[["imperf"]] | |
201 | + | "imps","inf",[["perf"]],[["imperf";"perf"]] -> "imps",[["perf"]] | |
202 | + | "imps","inf",[["perf"]],[["perf"]] -> "imps",[["perf"]] | |
203 | + | _ -> raise Not_found | |
204 | + | |
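(* A worked call with hypothetical values shaped after the "subst" cases above:
   the NKJP gender list, already widened to ["n1";"n2"] by transform_nkjp_interp
   in validateTokenizer.ml, is narrowed to the single gender attested by the
   ENIAM reading: *)
let _ =
  assert (get_cat_interp ("subst","subst",
                          [["sg"];["nom"];["n1";"n2"]],
                          [["sg"];["nom"];["n1"]])
          = ("subst",[["sg"];["nom"];["n1"]]))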
205 | +let correct_nlemma = function | |
206 | + "letnia " -> "letnia" | |
207 | + | "10minutowy" -> "minutowy" | |
208 | + | "23-letni" -> "letni" | |
209 | + | "40--letni" -> "letni" | |
210 | + | "5minutowy" -> "minutowy" | |
211 | + | "10-ta" -> (*"10."*)raise Not_found | |
212 | + | "10-tej" -> (*"10."*)raise Not_found | |
213 | + | "13-letni" -> "letni" | |
214 | + | "itineraryjny " -> "itineraryjny" | |
215 | + | "Składowy " -> "Składowy" | |
216 | + | "tak " -> "tak" | |
217 | + | "letni " -> "letni" | |
218 | + | "Kaznodziey'a" -> raise Not_found | |
219 | + | "Naczelna Rada Łowiecka" -> raise Not_found | |
220 | + | "PR-owy" -> raise Not_found | |
221 | + | "starać się" -> raise Not_found | |
222 | + | "vis-à-vis" -> raise Not_found | |
223 | + | "Ewangelia wg św. Jana" -> raise Not_found | |
224 | + | "`a" -> raise Not_found | |
225 | + | "6-piętrowy" -> "piętrowy" | |
226 | + | "6-letni" -> "letni" | |
227 | + | "5—lampowy" -> "lampowy" | |
228 | + | "4-piętrowy" -> "piętrowy" | |
229 | + | "3-centymetrowy" -> "centymetrowy" | |
230 | + | "34-letni" -> "letni" | |
231 | + | "18-ka" -> (*"18"*)raise Not_found | |
232 | + | "185-osobowy" -> "osobowy" | |
233 | + | "16-latek" -> raise Not_found | |
234 | + | s -> s | |
235 | + | |
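(* The cases above are ad hoc repairs of annotation slips in the corpus:
   trailing spaces and numeric prefixes are stripped, while lemmas judged
   unrecoverable raise Not_found and are counted as "incorrect" below.
   For example: *)
let _ =
  assert (correct_nlemma "23-letni" = "letni");
  assert (correct_nlemma "tak " = "tak")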
236 | +let process_ntoken stats nlemma ncat ninterp = | |
237 | + try | |
238 | + let nlemma = correct_nlemma nlemma in | |
239 | + let nl = lemmatize_string nlemma in | |
240 | + let nl2 = Xlist.fold nl [] (fun nl -> function | |
241 | + {token=Lemma(lemma,cat,interp)} -> | |
242 | + Xlist.fold interp nl (fun nl interp -> | |
243 | + try | |
244 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
245 | + if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl | |
246 | + with Not_found -> nl) | |
247 | + | {token=Dig _} -> nl (* FIXME: todo *) | |
248 | + | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) | |
249 | + | _ -> nl) in | |
250 | + if nl2 = [] then StringQMap.add stats (ncat ^ " " ^ ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) | |
251 | + else StringQMap.add stats "lemmatized" | |
252 | + with Not_found -> StringQMap.add stats "incorrect" | |
253 | + | |
254 | +let validate_ntoken stats (nlemma,ncat,ninterp) = | |
255 | + process_ntoken stats nlemma ncat ninterp | |
256 | + | |
257 | +let match_lemmatize stats t = | |
258 | + if has_brev t.attrs then StringQMap.add stats "brev" else | |
56 | 259 | let l = ENIAMpaths.lemmatize_token t in |
57 | - let l2 = Xlist.fold l [] (fun l2 t2 -> | |
58 | - match t2.token with | |
59 | - Lemma(lemma2,cat2,interp2) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2 | |
60 | - | Proper(lemma2,cat2,interp2,_) -> if lemma = lemma2 || lemma = lowercase lemma2 t.token then t2 :: l2 else l2 | |
61 | - | _ -> t2 :: l2) in | |
62 | - if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ lemma) else | |
63 | - let l3 = Xlist.fold l2 [] (fun l3 t -> | |
260 | + try | |
261 | + let nlemma,ncat,ninterp = get_ntoken t.attrs in | |
262 | + let nlemma = correct_nlemma nlemma in | |
263 | + let nl = lemmatize_string nlemma in | |
264 | + let nl2 = Xlist.fold nl [] (fun nl -> function | |
265 | + {token=Lemma(lemma,cat,interp)} -> | |
266 | + Xlist.fold interp nl (fun nl interp -> | |
267 | + try | |
268 | + let cat,interp = get_cat_interp (ncat,cat,ninterp,interp) in | |
269 | + if lemma = nlemma then (Lemma(lemma,cat,[interp])) :: nl else nl | |
270 | + with Not_found -> nl) | |
271 | + | {token=Dig _} -> nl (* FIXME: todo *) | |
272 | + | {token=Proper(lemma,cat,interp,_)} -> nl (* FIXME: todo *) | |
273 | + | _ -> nl) in | |
274 | + if nl2 = [] then StringQMap.add stats (ENIAMtokens.string_of_token (Lemma(nlemma,ncat,[ninterp])) ^ ": " ^ String.concat " " (Xlist.map nl (fun t -> ENIAMtokens.string_of_token t.token))) | |
275 | + (* let l2 = Xlist.fold l [] (fun l2 t2 -> | |
276 | + match t2.token with | |
277 | + Lemma(lemma,cat,interp) -> if lemma = nlemma (*|| lemma = lowercase nlemma t.token*) then t2 :: l2 else l2 | |
278 | + (* | Proper(lemma,cat,interp,_) -> if lemma = nlemma || lemma = lowercase nlemma t.token then t2 :: l2 else l2 *) | |
279 | + | _ -> l2) in | |
280 | + if l2 = [] then StringQMap.add stats ("no lemma: " ^ t.orth ^ " " ^ nlemma) else *) | |
281 | + else StringQMap.add stats "lemmatized" | |
282 | +(* let l3 = Xlist.fold l2 [] (fun l3 t -> | |
64 | 283 | match t.token with |
65 | 284 | Lemma(lemma2,cat2,interp2) -> if cat = cat2 then t :: l3 else l3 |
66 | 285 | | Proper(lemma2,cat2,interp2,_) -> if cat = cat2 then t :: l3 else l3 |
... | ... | @@ -88,27 +307,13 @@ let match_lemmatize stats t lemma cat interp = |
88 | 307 | | [{token=Lemma _};{token=AllSmall _}] -> stats |
89 | 308 | | [{token=Lemma _};{token=SmallLetter _}] -> stats |
90 | 309 | | [{token=Lemma _};{token=FirstCap _}] -> stats |
91 | - | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env)) | |
92 | - | |
93 | -let is_lemmatizable = function | |
94 | - | AllSmall _ -> true | |
95 | - | SmallLetter _ -> true | |
96 | - | FirstCap _ -> true | |
97 | - | AllCap _ -> true | |
98 | - | CapLetter _ -> true | |
99 | - | SomeCap _ -> true | |
100 | - | t -> false | |
101 | - | |
102 | -let validate_token stats = function | |
103 | - AT(t,[sent,orth,lemma,"brev",interp]) -> StringQMap.add stats "brev" | |
104 | - | AT(t,l(*[sent,orth,lemma,cat,interp]*)) -> | |
105 | - if is_lemmatizable t.token then | |
106 | - StringQMap.add stats "lemmatizable" else StringQMap.add stats "non lemmatizable" | |
107 | - (*match_lemmatize stats t lemma cat interp*) | |
108 | - (* | AT(_,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t)*) | |
109 | - | AV(tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t) | |
110 | - | AR(stat,tl,l) as t -> StringQMap.add stats ("validate_token: " ^ string_of_atoken t) | |
111 | - (* | _ -> StringQMap.add stats "validate_token: ni" *) | |
310 | + | l -> StringQMap.add stats ("multiple interp: " ^ t.orth ^ " " ^ lemma ^ " " ^ cat ^ "\n" ^ String.concat "\n" (Xlist.map l ENIAMtokens.string_of_token_env))*) | |
311 | + with Not_found -> StringQMap.add stats "no ntoken" (*("no ntoken for: " ^ t.orth ^ " " ^ ENIAMtokens.string_of_token t.token)*) | |
312 | + | |
313 | +let rec validate_token stats = function | |
314 | + Token t -> match_lemmatize stats t | |
315 | + | Seq l -> Xlist.fold l stats validate_token | |
316 | + | Variant l -> Xlist.fold l stats validate_token | |
112 | 317 | |
113 | 318 | let validate_morphology stats name typ channel entries = |
114 | 319 | prerr_endline name; |
... | ... | @@ -121,18 +326,28 @@ let validate_morphology stats name typ channel entries = |
121 | 326 | Xlist.fold tokens stats validate_token |
122 | 327 | (*else stats*))) |
123 | 328 | |
329 | +let ntokens_filename = "results/ntokens.tab" | |
330 | + | |
331 | +let parse_ninterp s = | |
332 | + Xlist.map (Xstring.split ":" s) (fun s -> Xstring.split "\\." s) | |
333 | + | |
334 | +let fold_ntokens ntokens_filename s f = | |
335 | + File.fold_tab ntokens_filename s (fun s -> function | |
336 | + [_;nlemma;ncat;ninterp] -> f s (nlemma,ncat,parse_ninterp ninterp) | |
337 | + | l -> failwith ("fold_ntokens: " ^ String.concat "\t" l)) | |
338 | + | |
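(* A worked example of parse_ninterp on a hypothetical tag: positions are
   colon-separated and alternatives within a position are dot-separated,
   matching the Disamb printing format in tokenizer/ENIAMtokens.ml: *)
let _ =
  assert (parse_ninterp "sg:nom:m1.m2" = [["sg"];["nom"];["m1";"m2"]])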
124 | 339 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; |
125 | 340 | "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; |
126 | 341 | "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; |
127 | 342 | "711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*) |
128 | - (*"040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083"; | |
343 | + (* "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083"; | |
129 | 344 | "120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267"; |
130 | 345 | "130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078"; |
131 | 346 | "200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030"; |
132 | 347 | "330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108"; |
133 | 348 | "620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071"; |
134 | 349 | "720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska"; |
135 | - "NIE";"SuperExpress";"TrybunaSlaska";*) | |
350 | + "NIE";"SuperExpress";"TrybunaSlaska"; *) | |
136 | 351 | (* "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001"; |
137 | 352 | "120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011"; |
138 | 353 | "200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057"; |
... | ... | @@ -152,10 +367,15 @@ let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-0000 |
152 | 367 | let _ = |
153 | 368 | ENIAMtokenizer.initialize (); |
154 | 369 | ENIAMinflexion.initialize (); |
155 | - let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
156 | - validate_morphology stats name typ channel entries) in | |
370 | + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
371 | + create_ntoken_list stats name typ channel entries) in *) | |
372 | + (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
373 | + create_ntoken_list stats name typ channel entries) in *) | |
374 | + let stats = fold_ntokens ntokens_filename StringQMap.empty validate_ntoken in | |
375 | + (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] StringQMap.empty (fun stats (name,typ,channel,entries) -> | |
376 | + validate_morphology stats name typ channel entries) in *) | |
157 | 377 | (* let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) -> |
158 | - validate_segmentation stats name typ channel entries) in *) | |
378 | + validate_morphology stats name typ channel entries) in *) | |
159 | 379 | let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in |
160 | 380 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); |
161 | 381 | () |
... | ... |
NKJP2/validateTokenizer.ml
... | ... | @@ -589,9 +589,9 @@ let rec match_and_combine name paragraph stats l = function |
589 | 589 | with Not_found -> |
590 | 590 | let e_tokens,n_tokens,ets,l = combine "" "" [] [] (et :: ets) l in |
591 | 591 | (* let stats = StringQMap.add stats (string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ name) in *) |
592 | - (* let stats = StringQMap.add stats (string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ paragraph) in *) | |
592 | + let stats = StringQMap.add stats (string_of_eniam_token_orths e_tokens ^ "\t" ^ string_of_nkjp_token_orths n_tokens ^ "\t" ^ paragraph) in | |
593 | 593 | (* let stats = StringQMap.add stats ("[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\"" ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" ^ "\t" ^ name) in *) |
594 | - let stats = StringQMap.add stats ("[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\"" ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" ^ "\t" ^ paragraph) in | |
594 | + (* let stats = StringQMap.add stats ("[\"" ^ string_of_eniam_token_orths2 e_tokens ^ "\"],[\"" ^ string_of_nkjp_token_orths2 n_tokens ^ "\"];" ^ "\t" ^ paragraph) in *) | |
595 | 595 | match_and_combine name paragraph stats l ets))) |
596 | 596 | | [] -> if l = [] then stats else StringQMap.add stats ("match_and_combine: " ^ name ^ "\t" ^ string_of_nkjp_token_orths l ^ "\t" ^ paragraph) |
597 | 597 | |
... | ... | @@ -667,6 +667,16 @@ let set_sent sent t = |
667 | 667 | | SentBegEnd -> Token {t with attrs=SentBegEnd :: t.attrs} |
668 | 668 | | Space -> failwith "set_sent" |
669 | 669 | |
670 | +let set_sent_list ets l = (* FIXME: todo *) | |
671 | + (* print_endline (String.concat " " (Xlist.map l (fun n -> | |
672 | + match n.nsent with | |
673 | + SentBeg -> "B" | |
674 | + | SentEnd -> "E" | |
675 | + | Inside -> "I" | |
676 | + | SentBegEnd -> "BE" | |
677 | + | Space -> "S"))); *) | |
678 | + ets | |
679 | + | |
670 | 680 | let rec allign rev = function |
671 | 681 | {orth=""} as t :: ets,nts -> allign ((t,[]) :: rev) (ets,nts) |
672 | 682 | | [{orth="."} as x;{orth="''"} as y],[{north="''"};{north="."}] -> List.rev rev @ [x,[];y,[]] |
... | ... | @@ -679,26 +689,69 @@ let rec allign rev = function |
679 | 689 | | [],[] -> List.rev rev |
680 | 690 | | _ -> failwith "allign 3" |
681 | 691 | |
692 | +let transform_nkjp_interp cat interp1 = | |
693 | + if interp1 = [] then [] else | |
694 | + let interp = Xlist.map interp1 (fun s -> [s]) in | |
695 | + match cat with | |
696 | + "subst" | "ppron12" | "ppron3" | "ppas" | "pact" | "adj" | "num" | "depr" | "numcol" -> | |
697 | + (match interp with | |
698 | + ["sg"] :: case :: ["n"] :: l -> ["sg"] :: case :: ["n1";"n2"] :: l | |
699 | + | ["pl"] :: case :: ["n"] :: l -> ["pl"] :: case :: ["n1";"n2";"p2";"p3"] :: l | |
700 | + | ["pl"] :: case :: ["m1"] :: l -> ["pl"] :: case :: ["m1";"p1"] :: l | |
701 | + | l -> l) | |
702 | + | "ger" -> | |
703 | + (match interp with | |
704 | + num :: case :: ["n"] :: l -> num :: case :: ["n2"] :: l | |
705 | + | l -> l) | |
706 | + | "praet" | "winien" -> | |
707 | + (match interp with | |
708 | + ["sg"] :: ["n"] :: l -> ["sg"] :: ["n1";"n2"] :: l | |
709 | + | ["pl"] :: ["n"] :: l -> ["pl"] :: ["n1";"n2";"p2";"p3"] :: l | |
710 | + | ["pl"] :: ["m1"] :: l -> ["pl"] :: ["m1";"p1"] :: l | |
711 | + | l -> l) | |
712 | + | "prep" | "adv" | "fin" | "inf" | "imps" | "pcon" | "bedzie" | "impt" | "siebie" | "aglt" | "pant" | "brev" | "qub" -> interp | |
713 | + | _ -> print_endline ("transform_nkjp_interp: " ^ cat ^ " " ^ String.concat ":" interp1); interp | |
714 | + | |
682 | 715 | let merge_token = function |
683 | 716 | t,[] -> Token t |
684 | 717 | | t,[{ncat="brev"} as n] -> set_sent n.nsent {t with attrs=BrevLemma n.nlemma :: t.attrs} |
685 | 718 | | t,[n] -> |
686 | - if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,n.ninterp) :: t.attrs} | |
719 | + if n.nlemma = "+/-" then set_sent n.nsent t else | |
720 | + if is_lemmatizable t.token then set_sent n.nsent {t with attrs=Disamb(n.nlemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: t.attrs} | |
687 | 721 | else set_sent n.nsent t |
688 | 722 | | _ -> failwith "merge_token" |
689 | 723 | |
690 | -let transform_nkjp_interp = function | |
691 | - | l -> (*print_endline ("transform_nkjp_interp: " ^ String.concat ":" l);*) Xlist.map l (fun s -> [s]) | |
724 | +let merge_letni l seq = | |
725 | + if l = [] then failwith "merge_letni" else | |
726 | + let n = List.hd (List.rev l) in | |
727 | + let lemma = List.hd (List.rev (Xstring.split "-" n.nlemma)) in | |
728 | + let seq = match seq with | |
729 | + first :: l -> if n.nsent=SentBeg || n.nsent=SentBegEnd then {first with attrs=SentBeg :: first.attrs} :: l else first :: l | |
730 | + | _ -> failwith "merge_letni" in | |
731 | + match List.rev seq with | |
732 | + last :: l -> | |
733 | + let attrs = if n.nsent=SentEnd || n.nsent=SentBegEnd then (SentEnd : attr) :: last.attrs else last.attrs in | |
734 | + Seq(Xlist.rev_map ({last with attrs=Disamb(lemma,n.ncat,transform_nkjp_interp n.ncat n.ninterp) :: attrs} :: l) (fun t -> Token t)) | |
735 | + | _ -> failwith "merge_letni" | |
736 | + | |
737 | +let blabla_orths = StringSet.of_list ["8.12"; "9.11"; "1.1"; "1.2"] | |
738 | + | |
739 | +let is_blabla = function | |
740 | + [{north=s};{north="."}] -> StringSet.mem blabla_orths s (*then (print_endline ("blabla: " ^ s); true) else false*) | |
741 | + | _ -> false | |
692 | 742 | |
693 | 743 | let merge_paragraph name = function |
694 | 744 | AT(t,l) -> merge_token (t,l) |
695 | - | AV(variants,l) as t -> (*print_endline (string_of_atoken t);*) Variant(Xlist.rev_map variants (fun ets -> | |
696 | - Seq(Xlist.map (allign [] (ets,l)) merge_token))) | |
697 | - | AR("tys",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map ets (fun t -> Token t)))) | |
698 | - | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map ets (fun t -> Token t)))) | |
699 | - | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map ets (fun t -> Token t)))) | |
700 | - | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ninterp])})) | |
701 | - | t -> (*print_endline (string_of_atoken t);*) Token empty_token_env | |
745 | + | AV(variants,l) -> | |
746 | + if is_blabla l then Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) else | |
747 | + Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (allign [] (ets,l)) merge_token))) | |
748 | + | AR("tys",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | |
749 | + | AR("letni",variants,l) -> Variant(Xlist.rev_map variants (merge_letni l)) (*in print_endline (ENIAMtokens.string_of_tokens 0 t); t*) | |
750 | + | AR("brev",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | |
751 | + | AR("both-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | |
752 | + | AR("eniam-correct",variants,l) -> Variant(Xlist.rev_map variants (fun ets -> Seq(Xlist.map (set_sent_list ets l) (fun t -> Token t)))) | |
753 | + | AR("nkjp-correct",variants,l) -> Seq(Xlist.map l (fun n -> set_sent n.nsent {empty_token_env with orth=n.north; token=Lemma(n.nlemma,n.ncat,[transform_nkjp_interp n.ncat n.ninterp])})) (* FIXME: ustalenie beg len next *) | |
754 | + | t -> failwith ("merge_paragraph: " ^ string_of_atoken t) | |
702 | 755 | |
703 | 756 | let test_annotate name typ channel entries = |
704 | 757 | (* if name = "620-3-010001854" then prerr_endline "620-3-010001854 omitted" else ( *) 
... | ... | @@ -723,7 +776,7 @@ let test_annotate name typ channel entries = |
723 | 776 | (* print_endline "test_annotate 2"; *) |
724 | 777 | let m = annotate_paragraph name paragraph tokens eniam_tokens in |
725 | 778 | (* print_endline "test_annotate 3"; *) |
726 | - (* check_annotation paragraph m; *) | |
779 | + check_annotation paragraph m; | |
727 | 780 | let _ = List.rev (Xlist.rev_map m (merge_paragraph name)) in |
728 | 781 | ()); |
729 | 782 | (* print_endline (String.concat "\n" (Xlist.map m string_of_atoken))); *) |
... | ... | @@ -738,20 +791,21 @@ let annotate name sentences = |
738 | 791 | let eniam_tokens = convert_eniam_tokens [] eniam_tokens in |
739 | 792 | let eniam_tokens = annotate_variants_par eniam_tokens in |
740 | 793 | let m = annotate_paragraph name paragraph tokens eniam_tokens in |
794 | + let m = List.rev (Xlist.rev_map m (merge_paragraph name)) in | |
741 | 795 | paragraph, m |
742 | 796 | |
743 | 797 | let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005"; |
744 | 798 | "620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727"; |
745 | 799 | "620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056"; |
746 | 800 | "711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";*) |
747 | - (* "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083"; | |
801 | + "040-2-000001";"040-2-000007";"040-4-000000103";"120-2-000003";"120-2-000007";"120-2-000009";"120-2-000010";"120-2-900017";"120-2-900041";"120-2-900044";"120-2-900083"; | |
748 | 802 | "120-2-900092";"120-2-900094";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900008";"120-4-900010";"130-3-900001";"130-3-910001";"130-5-000000267"; |
749 | 803 | "130-5-000000406";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001338";"130-5-000001628";"130-5-000001742";"200-1-000011";"200-1-000026";"200-2-000078"; |
750 | 804 | "200-2-000173";"200-2-000175";"200-4-000000307";"200-4-000000316";"310-2-000007";"320-2-000000094";"320-2-000034";"320-2-000064";"320-3-000226";"330-2-000000030"; |
751 | 805 | "330-2-000000033";"330-2-000000200";"330-2-000000213";"330-2-000003";"330-2-000013";"620-3-010000057";"620-3-010000838";"620-3-010001103";"620-3-010001107";"620-3-010001108"; |
752 | 806 | "620-3-010001109";"620-3-010001125";"620-3-010001274";"620-3-010001448";"620-3-010001732";"620-3-010001772";"711-3-010000021";"712-1-900003";"712-1-900004";"720-3-000071"; |
753 | 807 | "720-3-010000323";"DP1999";"DP2002";"DP2003";"EkspressWieczorny";"forumowisko.pl_20218";"forumowisko.pl_42911";"forumowisko.pl_724";"GazetaGoleniowska";"GazetaTczewska"; |
754 | - "NIE";"SuperExpress";"TrybunaSlaska"; *) | |
808 | + "NIE";"SuperExpress";"TrybunaSlaska"; | |
755 | 809 | (* "120-2-000009";"120-2-000010";"120-2-000012";"120-2-900019";"120-2-900041";"120-2-900044";"120-2-900092";"120-2-900123";"120-2-910000011";"120-4-900000001";"120-4-900001"; |
756 | 810 | "120-4-900008";"130-3-900001";"130-5-000000267";"130-5-000000817";"130-5-000001188";"130-5-000001274";"130-5-000001628";"130-5-000001635";"130-5-000001742";"200-1-000011"; |
757 | 811 | "200-2-000078";"200-2-000181";"200-4-000000314";"200-4-000026";"200-4-000059";"310-2-000007";"320-2-000000087";"320-2-000000094";"320-2-000034";"330-2-000013";"620-3-010000057"; |
... | ... | @@ -764,7 +818,7 @@ let selection = StringSet.of_list [(*"Rzeczpospolita";"200-4-000014";"040-2-0000 |
764 | 818 | "KurierKwidzynski";"NIE";"Rzeczpospolita";"TrybunaSlaska" *) |
765 | 819 | (* "110-4-000000102";"120-2-000006";"120-2-900032";"130-5-000000507";"130-5-000001156"; |
766 | 820 | "620-3-010000835";"GazetaGoleniowska";"KurierKwidzynski";"NIE";"Rzeczpospolita"; *) |
767 | - (*"110-4-000000102";"KurierKwidzynski";*)(*"620-3-010001496;"*)(*"130-5-000001341";*)(*"620-3-010001854"*)"620-3-010001106" | |
821 | + (*"110-4-000000102";"KurierKwidzynski";*)(*"620-3-010001496;"*)(*"130-5-000001341";*)(*"620-3-010001854"*)(*"620-3-010001106"*) | |
768 | 822 | ] |
769 | 823 | |
770 | 824 | let _ = |
... | ... | @@ -779,8 +833,8 @@ let _ = |
779 | 833 | validate_segmentation stats name typ channel entries) in *) |
780 | 834 | (* ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection [] [] () (fun () (name,typ,channel,entries) -> |
781 | 835 | test_annotate name typ channel entries); *) |
782 | - ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> | |
783 | - test_annotate name typ channel entries); | |
836 | + (* ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path () (fun () (name,typ,channel,entries) -> | |
837 | + test_annotate name typ channel entries); *) | |
784 | 838 | (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in |
785 | 839 | Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\t%s\n" v k); *) |
786 | 840 | () |
... | ... |
documentation/motto.txt
0 → 100644
subsyntax/ENIAMsubsyntax.ml
... | ... | @@ -241,6 +241,11 @@ let rec calculate_quality q = function |
241 | 241 | | NotValProper :: l -> calculate_quality (q-1) l |
242 | 242 | | LemmLowercase :: l -> calculate_quality q l |
243 | 243 | | Roman :: l -> calculate_quality q l |
244 | + | SentBeg :: l -> calculate_quality q l | |
245 | + | SentBegEnd :: l -> calculate_quality q l | |
246 | + | SentEnd :: l -> calculate_quality q l | |
247 | + | BrevLemma _ :: l -> calculate_quality q l | |
248 | + | Disamb _ :: l -> calculate_quality q l | |
244 | 249 | | [] -> q |
245 | 250 | |
246 | 251 | let select_tokens2 paths = |
... | ... |
tokenizer/ENIAMtokenizerTypes.ml
... | ... | @@ -44,7 +44,7 @@ type attr = |
44 | 44 | CS | MaybeCS | ReqValLemm | MWE | LemmNotVal | TokNotFound | NotValProper | LemmLowercase | Roman |
45 | 45 | | SentBeg | SentEnd | SentBegEnd |
46 | 46 | | BrevLemma of string |
47 | - | Disamb of string * string * string list | |
47 | + | Disamb of string * string * string list list | |
48 | 48 | |
49 | 49 | (* Tekst reprezentuję jako zbiór obiektów typu token_record zawierających |
50 | 50 | informacje o poszczególnych tokenach *) |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -100,7 +100,7 @@ let string_of_attr = function |
100 | 100 | | SentEnd -> "NKJP sentence end" |
101 | 101 | | SentBegEnd -> "NKJP sentence begin-end" |
102 | 102 | | BrevLemma s -> "NKJP brev lemma: " ^ s |
103 | - | Disamb(lemma,cat,interp) -> "NKJP disamb: " ^ lemma ^ ":" ^ cat ^ ":" ^ String.concat ":" interp | |
103 | + | Disamb(lemma,cat,interp) -> "NKJP disamb: " ^ lemma ^ ":" ^ cat ^ ":" ^ String.concat ":" (Xlist.map interp (String.concat ".")) | |
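(* A worked example of the new list-of-lists format (hypothetical value): *)
let _ =
  assert (string_of_attr (Disamb("kot","subst",[["sg"];["nom"];["m1";"m2"]]))
          = "NKJP disamb: kot:subst:sg:nom:m1.m2")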
104 | 104 | |
105 | 105 | let string_of_token_env t = |
106 | 106 | sprintf "{orth=%s;beg=%d;len=%d;next=%d;token=%s;weight=%.2f;attrs=[%s]}" t.orth t.beg t.len t.next (string_of_token t.token) t.weight |
... | ... |