Commit 59cdc551498fb155f90b97a52cde992cc90eb84c
Parent: b4c2dd39

poprawki w LCGlexicon i walidacja tokenizacji
(fixes in LCGlexicon and tokenization validation)

Showing 7 changed files with 90 additions and 32 deletions
LCGlexicon/ENIAM_LCGlexicon.ml

@@ -58,7 +58,7 @@ let assign_quantifiers (selectors,rule,weight) =
   let pos = find_selector Pos selectors in
   let categories =
     try StringMap.find pos_categories pos
-    with Not_found -> failwith ("assign_quantifiers: " ^ pos) in
+    with Not_found -> failwith ("assign_quantifiers: unknown part of speech " ^ pos) in
   let categories = Xlist.map categories (fun s -> s,Top) in
   let syntax,rule = get_syntax [] rule in
   let quant,rule = get_quant [] rule in
@@ -111,7 +111,7 @@ let make_rules x_flag filename =
   dict_of_grammar lexicon

 let find_rules rules cats =
-  let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith "find_rules 1" in
+  let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith ("find_rules: unable to find rules for category " ^ cats.pos) in
   (* Printf.printf "find_rules: %s %s |rules|=%d\n" cats.lemma cats.pos (Xlist.size rules); *)
   let rules = try StringMap.find lex_rules cats.lemma @ rules with Not_found -> rules in
   Xlist.fold rules [] (fun rules (selectors,syntax,semantics) ->
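Both hunks apply the same pattern: a bare failwith tag such as "find_rules 1" becomes a message that names the missing key. A minimal, self-contained sketch of that lookup-with-context idiom, not part of this commit (it uses the stdlib Map, whose find takes the key first, whereas the project's StringMap.find is applied as StringMap.find map key):

module StringMap = Map.Make (String)

(* Hypothetical helper illustrating the pattern: report the missing key
   and the calling context instead of an opaque numeric tag. *)
let find_or_fail context map key =
  try StringMap.find key map
  with Not_found -> failwith (context ^ ": unknown key " ^ key)

(* usage sketch: let categories = find_or_fail "assign_quantifiers" pos_categories pos *)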
LCGlexicon/ENIAM_LCGlexiconParser.ml

@@ -152,7 +152,7 @@ let find_internal_grammar_symbols atoms = function
   | s -> if StringSet.mem selector_names s then B (AVar s) else
          if StringSet.mem atoms s then B (Atom s) else
          if StringSet.mem operators s then A s else
-         failwith ("find_internal_grammar_symbols: " ^ s)
+         failwith ("find_internal_grammar_symbols: unknown symbol " ^ s)

 let rec find_tensor = function
     B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: A "*" :: B s8 :: l -> failwith "find_tensor 1"
@@ -163,7 +163,7 @@ let rec find_tensor = function
   | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: l -> C (Tensor[s1;s2;s3]) :: find_tensor l
   | B s1 :: A "*" :: B s2 :: l -> C (Tensor[s1;s2]) :: find_tensor l
   | B s1 :: l -> C (Tensor[s1]) :: find_tensor l
-  | A "*" :: _ -> failwith "find_tensor 2"
+  | A "*" :: _ -> failwith "find_tensor 2: unexpected '*'"
   | t :: l -> t :: find_tensor l
   | [] -> []

@@ -174,7 +174,7 @@ let rec find_plus = function
   | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: l -> C (Plus[s1;s2;s3;s4]) :: find_plus l
   | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: l -> C (Plus[s1;s2;s3]) :: find_plus l
   | C s1 :: A "+" :: C s2 :: l -> C (Plus[s1;s2]) :: find_plus l
-  | A "+" :: _ -> failwith "find_plus 2"
+  | A "+" :: _ -> failwith "find_plus 2: unexpected '+'"
   | t :: l -> t :: find_plus l
   | [] -> []

@@ -192,7 +192,7 @@ let rec find_imp = function

 let rec find_maybe = function
   | A "?" :: C s2 :: l -> C (Maybe s2) :: find_maybe l
-  | A "?" :: _ -> failwith "find_maybe 1"
+  | A "?" :: _ -> failwith "find_maybe 1: unexpected '?'"
   | s :: l -> s :: find_maybe l
   | [] -> []

@@ -203,11 +203,11 @@ let rec find_mult_imp = function
   | A "," :: A "/" :: C s2 :: l -> A "," :: D (Forward,s2) :: find_mult_imp l
   | A "," :: A "|" :: C s2 :: l -> A "," :: D (Both,s2) :: find_mult_imp l
   | A "," :: A "\\" :: C s2 :: l -> A "," :: D (Backward,s2) :: find_mult_imp l
-  | A "/" :: _ -> failwith "find_mult_imp 1"
-  | A "|" :: _ -> failwith "find_mult_imp 2"
-  | A "\\" :: _ -> failwith "find_mult_imp 3"
-  | A "(" :: _ -> failwith "find_mult_imp 4"
-  | A ")" :: _ -> failwith "find_mult_imp 5"
+  | A "/" :: _ -> failwith "find_mult_imp 1: unexpected '/'"
+  | A "|" :: _ -> failwith "find_mult_imp 2: unexpected '|'"
+  | A "\\" :: _ -> failwith "find_mult_imp 3: unexpected '\\'"
+  | A "(" :: _ -> failwith "find_mult_imp 4: unexpected '('"
+  | A ")" :: _ -> failwith "find_mult_imp 5: unexpected ')'"
   | s :: l -> s :: find_mult_imp l
   | [] -> []

@@ -220,9 +220,9 @@ let rec find_mult = function
   | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3] :: find_mult l
   | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "}" :: l -> E[s1,t1;s2,t2] :: find_mult l
   | A "{" :: D(s1,t1) :: A "}" :: l -> E[s1,t1] :: find_mult l
-  | A "{" :: _ -> failwith "find_mult 2"
-  | A "}" :: _ -> failwith "find_mult 3"
-  | A "," :: _ -> failwith "find_mult 4"
+  | A "{" :: _ -> failwith "find_mult 2: unexpected '{'"
+  | A "}" :: _ -> failwith "find_mult 3: unexpected '}'"
+  | A "," :: _ -> failwith "find_mult 4: unexpected ','"
   | t :: l -> t :: find_mult l
   | [] -> []

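The functions touched here all share one rewrite-pass style: each pass walks the token list, folds a recognised operator pattern into a constructor, copies everything else through, and now reports the dangling operator it could not attach. A simplified, self-contained sketch of that style (the token type is invented for illustration and much smaller than the parser's real A/B/C/D/E constructors):

(* Toy token type: A = operator symbol, C = already-built category. *)
type tok = A of string | C of string list

(* One pass in the style of find_plus: merge "c1 + c2" into a single C node,
   fail on a '+' with nothing to attach to, and pass other tokens through. *)
let rec find_plus = function
    C s1 :: A "+" :: C s2 :: l -> find_plus (C (s1 @ s2) :: l)
  | A "+" :: _ -> failwith "find_plus: unexpected '+'"
  | t :: l -> t :: find_plus l
  | [] -> []

(* find_plus [C ["np"]; A "+"; C ["adjp"]; A "*"; C ["gen"]]
   evaluates to [C ["np"; "adjp"]; A "*"; C ["gen"]] *)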
LCGlexicon/ENIAMcategoriesPL.ml

@@ -488,8 +488,8 @@ let pos_categories = Xlist.fold [
   "ppron12",[Lemma;Number;Case;Gender;Person;];
   "ppron3",[Lemma;Number;Case;Gender;Person;Praep;];
   "siebie",[Lemma;Number;Case;Gender;Person;];
-  "prep",[Lemma;Case;];
-  "compar",[Lemma;Case;];
+  "prep",[Lemma;Cat;Case;];
+  "compar",[Lemma;Cat;Case;];
   "num",[Lemma;Number;Case;Gender;Person;Acm;];
   "intnum",[Lemma;Number;Case;Gender;Person;Acm;];
   "realnum",[Lemma;Number;Case;Gender;Person;Acm;];
NKJP2/ENIAM_NKJP.ml

@@ -401,6 +401,20 @@ let fold path s f =
     let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in
     f s (name,typ,channel,entries))

+let fold_selected path selection s f =
+  let names = get_folders path in
+  Xlist.fold names s (fun s name ->
+    if not (StringSet.mem selection name) then s else
+    (* print_endline name; *)
+    if name = "030-2-000000012" then s else
+    let typ,channel = load_header path name in
+    let text = load_text path name in
+    let segmentation = load_segmentation path name in
+    let morphosyntax = load_morphosyntax path name in
+    let named = load_named path name in
+    let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in
+    f s (name,typ,channel,entries))
+
 let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"

 let calculate_statistics stats typ channel entries =
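fold_selected is fold restricted to a whitelist of corpus folders, so individual problematic NKJP documents can be revalidated without a full pass. Its intended call site is the commented-out block added at the bottom of validateTokenizer.ml below; a usage sketch, assuming selection is the StringSet defined there:

(* Run validation only over the folders named in `selection`
   (mirrors the commented-out call added in validateTokenizer.ml). *)
let stats =
  ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection StringQMap.empty
    (fun stats (name,typ,channel,entries) ->
      validate stats name typ channel entries)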
NKJP2/validateTokenizer.ml

@@ -28,13 +28,20 @@ let sencence_end = {empty_token_env with token=Interp "</sentence>"}
 let clause_beg = {empty_token_env with token=Interp "<clause>"}
 let clause_end = {empty_token_env with token=Interp "</clause>"}

-type sent = SentBeg | SentEnd | Inside
+type sent = SentBeg | SentEnd | Inside | SentBegEnd

 let set_sent_end = function
-    (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
+    (Inside,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
       (SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
+  | (SentBeg,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
+      (SentBegEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
   | _ -> failwith "set_sent_end"

+let set_beg_as_zero = function
+    (sent,_,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l ->
+      (sent,0,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
+  | [] -> failwith "set_beg_as_zero"
+
 let flatten_sentences sentences =
   List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) ->
     set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) ->
@@ -72,11 +79,17 @@ let is_space_required prev_orth prev_cat orth cat =
 let rec allign prev_orth prev_cat rev = function
     (SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
       allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l
-  | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
+  | (SentBegEnd,0,_,_,_,orth,lemma,cat,interp) :: l ->
+      allign orth cat (List.rev [query_beg;sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) l
+  | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith ("allign 1: " ^ orth)
   | (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
       let rev =
         if no_spaces > 0 then space :: rev else
         if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in
+      if sent = SentBegEnd then
+        let rev = (List.rev [sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) @ rev in
+        allign orth cat rev l
+      else
       let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
       let rev = (make_token orth lemma cat interp) :: rev in
       let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
@@ -115,6 +128,13 @@ let rec get_next = function
   | Variant [] -> failwith "get_next"
   | Variant l -> get_next (List.hd l)

+let rec get_beg = function
+    Token t -> t.beg
+  | Seq [] -> failwith "get_beg"
+  | Seq l -> get_beg (List.hd l)
+  | Variant [] -> failwith "get_beg"
+  | Variant l -> get_beg (List.hd l)
+
 let make_seq = function
     [] -> failwith "make_seq"
   | [t] -> t
@@ -130,7 +150,10 @@ let rec match_token_sequence erev nrev rev = function
       match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
     else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
   | [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
-  | _ -> failwith "match_token_sequence"
+  | ets,nts ->
+      let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq ets)) in
+      let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq nts)) in
+      (*failwith*)print_endline (Printf.sprintf "match_token_sequence: %s\n\n%s\n" s t); []

 let rec compare_tokens = function
     Token et, Token nt ->
@@ -146,28 +169,49 @@ let rec compare_tokens = function
   | Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
   | _ -> false

+let rec shift_token_rec beg = function
+    Token t -> Token{t with beg=t.beg-beg; next=t.next-beg}
+  | Seq l -> Seq(Xlist.map l (shift_token_rec beg))
+  | Variant l -> Variant(Xlist.map l (shift_token_rec beg))
+
+let shift_token t =
+  let beg = get_beg t in
+  shift_token_rec beg t
+
 let validate stats name typ channel entries =
-  (* if name = "120-2-900066" then ( *)
   print_endline name;
   Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
+    (* if id_div = 3 then *)
     Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
+      (* Printf.printf "%d\t%s\n" id_div paragraph; *)
       let tokens = flatten_sentences sentences in
-      let tokens = allign "" "" [] tokens in
+      let tokens = allign "" "" [] (set_beg_as_zero tokens) in
       let paragraph = render_paragraph tokens in
+      (* Printf.printf "rend:\t%s\n" paragraph; *)
       let tokens = set_lengths 0 [] tokens in
      let tokens = set_special_tokens_lengths [] tokens in
       let tokens = ENIAMpatterns.remove_spaces [] tokens in
       let eniam_tokens = ENIAMtokenizer.parse paragraph in
+      (* Printf.printf "eniam_tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq eniam_tokens));
+      Printf.printf "tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq tokens)); *)
       let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in
       Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
         if compare_tokens (eniam_token,nkjp_token) then stats else (
-          let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in
-          let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in
-          Printf.printf "%s\n%s\n\n%!" s t;
-          StringQMap.add stats (s ^ "\n" ^ t)))))
+          let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token eniam_token)) in
+          let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token nkjp_token)) in
+          (* Printf.printf "%s\n%s\n\n%!" s t; *)
+          StringQMap.add stats (s ^ "\n" ^ t)))) (*else stats*))

+let selection = StringSet.of_list ["040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
+"620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
+"620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
+"711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";]

-(*let _ =
+let _ =
   let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
     validate stats name typ channel entries) in
-  ()*)
+  (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection StringQMap.empty (fun stats (name,typ,channel,entries) ->
+    validate stats name typ channel entries) in *)
+  (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
+  Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\n%s\n" v k); *)
+  ()
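The new get_beg / shift_token_rec / shift_token helpers renormalise a token sequence so that it starts at offset 0 before it is printed and counted; presumably this lets identical ENIAM/NKJP mismatches found at different positions in the corpus collapse into a single entry of the StringQMap statistics. A minimal, self-contained sketch of the idea (the token type is invented here; the real token_env carries many more fields):

(* Toy structure: only the beg/next offsets matter for the illustration. *)
type tokens =
    Token of int * int              (* beg, next *)
  | Seq of tokens list
  | Variant of tokens list

let rec get_beg = function
    Token (beg, _) -> beg
  | Seq (t :: _) | Variant (t :: _) -> get_beg t
  | Seq [] | Variant [] -> failwith "get_beg"

let rec shift_rec d = function
    Token (beg, next) -> Token (beg - d, next - d)
  | Seq l -> Seq (List.map (shift_rec d) l)
  | Variant l -> Variant (List.map (shift_rec d) l)

(* Normalise a sequence so its first token starts at 0. *)
let shift_token t = shift_rec (get_beg t) t

(* shift_token (Seq [Token (120, 123); Token (123, 127)])
   evaluates to Seq [Token (0, 3); Token (3, 7)] *)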
tokenizer/ENIAMtokens.ml

@@ -768,12 +768,12 @@ let rec recognize_sign_group poss_s_beg i = function
       Token{empty_token_env with beg=i+20;len=factor-20;next=i+factor;token=Interp "<clause>"}],i+factor,l,false*)
   | (Sign "'") :: (Sign "'") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "'";Sign "'"] l
   | (Sign "'") :: (Sign "'") :: l ->
-      let t,i = create_empty_sign_token i [Sign "”"] in
+      let t,i = create_empty_sign_token i [Sign "'";Sign "'"] in
       Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
   | (Sign "'") :: l -> create_sign_token poss_s_beg i [Sign "'"] l (Symbol "’")
   | (Sign "’") :: (Sign "’") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "’";Sign "’"] l
   | (Sign "’") :: (Sign "’") :: l ->
-      let t,i = create_empty_sign_token i [Sign "”"] in
+      let t,i = create_empty_sign_token i [Sign "’";Sign "’"] in
       Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
   | (Sign "’") :: l -> create_sign_token poss_s_beg i [Sign "’"] l (Symbol "’")
   | (Sign ";") :: (Sign "*") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "*") :: []) l (make_lemma (";*","sinterj"))
tokenizer/TODO

@@ -1,4 +1,4 @@
-- liczba przed kropką kończącą zdanie jest interpretowana jako ordnum
+- liczba przed kropką kończącą zdanie jest interpretowana jako ordnum - to WAŻNE !!!

 - przenieść ustalanie weight do następnego modułu

@@ -7,4 +7,4 @@ Odkryłem gdzie jest problem z kodowaniem utf.
 Mianowicie dla zdania "Fan: Niech nie straszą, Że to bomba jest kalorii." preprocesor zwraca segmentacje w której jeden z segmentów zawiera pole lemma z niepoprawnym znakiem.
 Pole wygląda następująco: "Ż\BCe".

-- przecinek "," nie jest traktowany jako Symbol a jedynie jako Interp, co może stwarzać problemy przy parsowaniu MWE uwzględniającym fleksję.
+- przecinek "," nie jest traktowany jako Symbol a jedynie jako Interp, co może stwarzać problemy przy parsowaniu MWE uwzględniającym fleksję.

(In English: the first TODO item, "a number before a sentence-final period is interpreted as ordnum", is now flagged as important; the last item, "the comma is treated only as Interp rather than Symbol, which may cause problems when parsing inflection-aware MWEs", is unchanged in wording.)