Commit 59cdc551498fb155f90b97a52cde992cc90eb84c

Authored by Wojciech Jaworski
1 parent b4c2dd39

fixes in LCGlexicon and tokenization validation

LCGlexicon/ENIAM_LCGlexicon.ml
@@ -58,7 +58,7 @@ let assign_quantifiers (selectors,rule,weight) =
let pos = find_selector Pos selectors in
let categories =
try StringMap.find pos_categories pos
- with Not_found -> failwith ("assign_quantifiers: " ^ pos) in
+ with Not_found -> failwith ("assign_quantifiers: unknown part of speech " ^ pos) in
let categories = Xlist.map categories (fun s -> s,Top) in
let syntax,rule = get_syntax [] rule in
let quant,rule = get_quant [] rule in
@@ -111,7 +111,7 @@ let make_rules x_flag filename =
dict_of_grammar lexicon

let find_rules rules cats =
- let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith "find_rules 1" in
+ let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith ("find_rules: unable to find rules for category " ^ cats.pos) in
(* Printf.printf "find_rules: %s %s |rules|=%d\n" cats.lemma cats.pos (Xlist.size rules); *)
let rules = try StringMap.find lex_rules cats.lemma @ rules with Not_found -> rules in
Xlist.fold rules [] (fun rules (selectors,syntax,semantics) ->
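Both hunks above apply the same fix: a failed StringMap lookup no longer aborts with a bare label but with a message that names the missing key. A minimal standalone sketch of the pattern (using the stdlib Map as a stand-in for the project's own StringMap, whose find takes the map before the key):

    (* Sketch only, not the project code: wrap the lookup so the exception
       message names the key that could not be found. *)
    module SMap = Map.Make (String)

    let find_categories pos_categories pos =
      try SMap.find pos pos_categories
      with Not_found ->
        failwith ("assign_quantifiers: unknown part of speech " ^ pos)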
LCGlexicon/ENIAM_LCGlexiconParser.ml
@@ -152,7 +152,7 @@ let find_internal_grammar_symbols atoms = function
| s -> if StringSet.mem selector_names s then B (AVar s) else
if StringSet.mem atoms s then B (Atom s) else
if StringSet.mem operators s then A s else
- failwith ("find_internal_grammar_symbols: " ^ s)
+ failwith ("find_internal_grammar_symbols: unknown symbol " ^ s)

let rec find_tensor = function
B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: A "*" :: B s8 :: l -> failwith "find_tensor 1"
@@ -163,7 +163,7 @@ let rec find_tensor = function
| B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: l -> C (Tensor[s1;s2;s3]) :: find_tensor l
| B s1 :: A "*" :: B s2 :: l -> C (Tensor[s1;s2]) :: find_tensor l
| B s1 :: l -> C (Tensor[s1]) :: find_tensor l
- | A "*" :: _ -> failwith "find_tensor 2"
+ | A "*" :: _ -> failwith "find_tensor 2: unexpected '*'"
| t :: l -> t :: find_tensor l
| [] -> []

@@ -174,7 +174,7 @@ let rec find_plus = function
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: l -> C (Plus[s1;s2;s3;s4]) :: find_plus l
| C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: l -> C (Plus[s1;s2;s3]) :: find_plus l
| C s1 :: A "+" :: C s2 :: l -> C (Plus[s1;s2]) :: find_plus l
- | A "+" :: _ -> failwith "find_plus 2"
+ | A "+" :: _ -> failwith "find_plus 2: unexpected '+'"
| t :: l -> t :: find_plus l
| [] -> []

@@ -192,7 +192,7 @@ let rec find_imp = function

let rec find_maybe = function
| A "?" :: C s2 :: l -> C (Maybe s2) :: find_maybe l
- | A "?" :: _ -> failwith "find_maybe 1"
+ | A "?" :: _ -> failwith "find_maybe 1: unexpected '?'"
| s :: l -> s :: find_maybe l
| [] -> []

@@ -203,11 +203,11 @@ let rec find_mult_imp = function
| A "," :: A "/" :: C s2 :: l -> A "," :: D (Forward,s2) :: find_mult_imp l
| A "," :: A "|" :: C s2 :: l -> A "," :: D (Both,s2) :: find_mult_imp l
| A "," :: A "\\" :: C s2 :: l -> A "," :: D (Backward,s2) :: find_mult_imp l
- | A "/" :: _ -> failwith "find_mult_imp 1"
- | A "|" :: _ -> failwith "find_mult_imp 2"
- | A "\\" :: _ -> failwith "find_mult_imp 3"
- | A "(" :: _ -> failwith "find_mult_imp 4"
- | A ")" :: _ -> failwith "find_mult_imp 5"
+ | A "/" :: _ -> failwith "find_mult_imp 1: unexpected '/'"
+ | A "|" :: _ -> failwith "find_mult_imp 2: unexpected '|'"
+ | A "\\" :: _ -> failwith "find_mult_imp 3: unexpected '\\'"
+ | A "(" :: _ -> failwith "find_mult_imp 4: unexpected '('"
+ | A ")" :: _ -> failwith "find_mult_imp 5: unexpected ')'"
| s :: l -> s :: find_mult_imp l
| [] -> []

@@ -220,9 +220,9 @@ let rec find_mult = function
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3] :: find_mult l
| A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "}" :: l -> E[s1,t1;s2,t2] :: find_mult l
| A "{" :: D(s1,t1) :: A "}" :: l -> E[s1,t1] :: find_mult l
- | A "{" :: _ -> failwith "find_mult 2"
- | A "}" :: _ -> failwith "find_mult 3"
- | A "," :: _ -> failwith "find_mult 4"
+ | A "{" :: _ -> failwith "find_mult 2: unexpected '{'"
+ | A "}" :: _ -> failwith "find_mult 3: unexpected '}'"
+ | A "," :: _ -> failwith "find_mult 4: unexpected ','"
| t :: l -> t :: find_mult l
| [] -> []

LCGlexicon/ENIAMcategoriesPL.ml
@@ -488,8 +488,8 @@ let pos_categories = Xlist.fold [
"ppron12",[Lemma;Number;Case;Gender;Person;];
"ppron3",[Lemma;Number;Case;Gender;Person;Praep;];
"siebie",[Lemma;Number;Case;Gender;Person;];
- "prep",[Lemma;Case;];
- "compar",[Lemma;Case;];
+ "prep",[Lemma;Cat;Case;];
+ "compar",[Lemma;Cat;Case;];
"num",[Lemma;Number;Case;Gender;Person;Acm;];
"intnum",[Lemma;Number;Case;Gender;Person;Acm;];
"realnum",[Lemma;Number;Case;Gender;Person;Acm;];
NKJP2/ENIAM_NKJP.ml
@@ -401,6 +401,20 @@ let fold path s f =
let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in
f s (name,typ,channel,entries))

+let fold_selected path selection s f =
+ let names = get_folders path in
+ Xlist.fold names s (fun s name ->
+ if not (StringSet.mem selection name) then s else
+ (* print_endline name; *)
+ if name = "030-2-000000012" then s else
+ let typ,channel = load_header path name in
+ let text = load_text path name in
+ let segmentation = load_segmentation path name in
+ let morphosyntax = load_morphosyntax path name in
+ let named = load_named path name in
+ let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in
+ f s (name,typ,channel,entries))
+
let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"

let calculate_statistics stats typ channel entries =
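The new fold_selected mirrors fold but processes only the folders named in the selection whitelist (and, like fold, skips the hard-coded problematic folder 030-2-000000012). A sketch of the intended call site, matching the commented-out invocation added to NKJP2/validateTokenizer.ml further down; the two folder names are taken from the selection list defined there:

    (* Sketch of intended usage: fold only over the whitelisted NKJP folders. *)
    let selection = StringSet.of_list ["040-2-000007"; "120-2-900126"]

    let stats =
      ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection StringQMap.empty
        (fun stats (name,typ,channel,entries) ->
          validate stats name typ channel entries)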
NKJP2/validateTokenizer.ml
@@ -28,13 +28,20 @@ let sencence_end = {empty_token_env with token=Interp "</sentence>"}
let clause_beg = {empty_token_env with token=Interp "<clause>"}
let clause_end = {empty_token_env with token=Interp "</clause>"}

-type sent = SentBeg | SentEnd | Inside
+type sent = SentBeg | SentEnd | Inside | SentBegEnd

let set_sent_end = function
- (_,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
+ (Inside,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
(SentEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
+ | (SentBeg,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l,_ ->
+ (SentBegEnd,beg,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
| _ -> failwith "set_sent_end"

+let set_beg_as_zero = function
+ (sent,_,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l ->
+ (sent,0,len,no_spaces,real_orth,orth,lemma,cat,interp) :: l
+ | [] -> failwith "set_beg_as_zero"
+
let flatten_sentences sentences =
List.rev (Xlist.fold sentences [] (fun l (id_s,tokens,named_tokens) ->
set_sent_end (Xlist.fold tokens (l,SentBeg) (fun (l,sent) (beg,len,no_spaces,real_orth,orth,lemma,cat,interp) ->
@@ -72,11 +79,17 @@ let is_space_required prev_orth prev_cat orth cat =
let rec allign prev_orth prev_cat rev = function
(SentBeg,0,_,_,_,orth,lemma,cat,interp) :: l ->
allign orth cat ((make_token orth lemma cat interp) :: clause_beg :: sencence_beg :: query_beg :: rev) l
- | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith "allign"
+ | (SentBegEnd,0,_,_,_,orth,lemma,cat,interp) :: l ->
+ allign orth cat (List.rev [query_beg;sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) l
+ | (_,0,_,_,_,orth,lemma,cat,interp) :: l -> failwith ("allign 1: " ^ orth)
| (sent,beg,_,no_spaces,_,orth,lemma,cat,interp) :: l ->
let rev =
if no_spaces > 0 then space :: rev else
if is_space_required prev_orth prev_cat orth cat then space :: rev else rev in
+ if sent = SentBegEnd then
+ let rev = (List.rev [sencence_beg;clause_beg;make_token orth lemma cat interp;clause_end;sencence_end]) @ rev in
+ allign orth cat rev l
+ else
let rev = if sent = SentBeg then clause_beg :: sencence_beg :: rev else rev in
let rev = (make_token orth lemma cat interp) :: rev in
let rev = if sent = SentEnd then sencence_end :: clause_end :: rev else rev in
@@ -115,6 +128,13 @@ let rec get_next = function
| Variant [] -> failwith "get_next"
| Variant l -> get_next (List.hd l)

+let rec get_beg = function
+ Token t -> t.beg
+ | Seq [] -> failwith "get_beg"
+ | Seq l -> get_beg (List.hd l)
+ | Variant [] -> failwith "get_beg"
+ | Variant l -> get_beg (List.hd l)
+
let make_seq = function
[] -> failwith "make_seq"
| [t] -> t
@@ -130,7 +150,10 @@ let rec match_token_sequence erev nrev rev = function
match_token_sequence (et :: erev) nrev rev (ets, nt :: nts)
else match_token_sequence erev (nt :: nrev) rev (et :: ets, nts)
| [],[] -> Xlist.fold rev [] (fun l (et,nt) -> (make_seq et, make_seq nt) :: l)
- | _ -> failwith "match_token_sequence"
+ | ets,nts ->
+ let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq ets)) in
+ let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (Seq nts)) in
+ (*failwith*)print_endline (Printf.sprintf "match_token_sequence: %s\n\n%s\n" s t); []

let rec compare_tokens = function
Token et, Token nt ->
@@ -146,28 +169,49 @@ let rec compare_tokens = function
| Seq(et::ets),Seq(nt::nts) -> if compare_tokens (et,nt) then compare_tokens (Seq ets,Seq nts) else false
| _ -> false

+let rec shift_token_rec beg = function
+ Token t -> Token{t with beg=t.beg-beg; next=t.next-beg}
+ | Seq l -> Seq(Xlist.map l (shift_token_rec beg))
+ | Variant l -> Variant(Xlist.map l (shift_token_rec beg))
+
+let shift_token t =
+ let beg = get_beg t in
+ shift_token_rec beg t
+
let validate stats name typ channel entries =
- (* if name = "120-2-900066" then ( *)
print_endline name;
Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
+ (* if id_div = 3 then *)
Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
+ (* Printf.printf "%d\t%s\n" id_div paragraph; *)
let tokens = flatten_sentences sentences in
- let tokens = allign "" "" [] tokens in
+ let tokens = allign "" "" [] (set_beg_as_zero tokens) in
let paragraph = render_paragraph tokens in
+ (* Printf.printf "rend:\t%s\n" paragraph; *)
let tokens = set_lengths 0 [] tokens in
let tokens = set_special_tokens_lengths [] tokens in
let tokens = ENIAMpatterns.remove_spaces [] tokens in
let eniam_tokens = ENIAMtokenizer.parse paragraph in
+ (* Printf.printf "eniam_tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq eniam_tokens));
+ Printf.printf "tokens: %s\n" (ENIAMtokens.string_of_tokens 0 (Seq tokens)); *)
let l = match_token_sequence [] [] [] (eniam_tokens,tokens) in
Xlist.fold l stats (fun stats (eniam_token,nkjp_token) ->
if compare_tokens (eniam_token,nkjp_token) then stats else (
- let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 eniam_token) in
- let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 nkjp_token) in
- Printf.printf "%s\n%s\n\n%!" s t;
- StringQMap.add stats (s ^ "\n" ^ t)))))
+ let s = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token eniam_token)) in
+ let t = Printf.sprintf "%s" (ENIAMtokens.string_of_tokens 0 (shift_token nkjp_token)) in
+ (* Printf.printf "%s\n%s\n\n%!" s t; *)
+ StringQMap.add stats (s ^ "\n" ^ t)))) (*else stats*))

+let selection = StringSet.of_list ["040-2-000007";"120-2-900126";"120-2-910000001";"120-2-910000002";"120-4-900005";
+"620-3-010001110";"620-3-010001449";"620-3-010001622";"620-3-010001727";
+"620-3-010001731";"620-3-010001741";"620-3-010001854";"711-3-010000051";"711-3-010000056";
+"711-3-010000079";"720-3-010000217";"720-3-010000335";"720-3-010000341";"forumowisko.pl_18535";"forumowisko.pl_424";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";"";]

-(*let _ =
+let _ =
let stats = ENIAM_NKJP.fold ENIAM_NKJP.nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
validate stats name typ channel entries) in
- ()*)
+ (* let stats = ENIAM_NKJP.fold_selected ENIAM_NKJP.nkjp_path selection StringQMap.empty (fun stats (name,typ,channel,entries) ->
+ validate stats name typ channel entries) in *)
+ (* let stats = StringQMap.fold stats [] (fun stats k v -> (v,k) :: stats) in
+ Xlist.iter (Xlist.sort stats compare) (fun (v,k) -> Printf.printf "%d\n%s\n" v k); *)
+ ()
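The shift_token helper added above rebases all character offsets of a token tree so that its first token starts at 0; mismatching ENIAM/NKJP token pairs found at different positions in a paragraph then render identically and can be counted together in the StringQMap statistics. A minimal standalone sketch of that rebase, with simplified stand-ins for the token types from ENIAMtokenizerTypes:

    (* Standalone sketch of the rebase done by get_beg / shift_token_rec /
       shift_token; tok is a simplified stand-in for the real token_env record. *)
    type tok = { beg : int; next : int }
    type tokens = Token of tok | Seq of tokens list | Variant of tokens list

    let rec get_beg = function
        Token t -> t.beg
      | Seq (t :: _) | Variant (t :: _) -> get_beg t
      | Seq [] | Variant [] -> failwith "get_beg"

    let rec shift_token_rec beg = function
        Token t -> Token { beg = t.beg - beg; next = t.next - beg }
      | Seq l -> Seq (List.map (shift_token_rec beg) l)
      | Variant l -> Variant (List.map (shift_token_rec beg) l)

    (* e.g. shift_token (Seq [Token {beg=120; next=125}; Token {beg=125; next=130}])
       = Seq [Token {beg=0; next=5}; Token {beg=5; next=10}] *)
    let shift_token t = shift_token_rec (get_beg t) t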
tokenizer/ENIAMtokens.ml
@@ -768,12 +768,12 @@ let rec recognize_sign_group poss_s_beg i = function
Token{empty_token_env with beg=i+20;len=factor-20;next=i+factor;token=Interp "<clause>"}],i+factor,l,false*)
| (Sign "'") :: (Sign "'") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "'";Sign "'"] l
| (Sign "'") :: (Sign "'") :: l ->
- let t,i = create_empty_sign_token i [Sign ""] in
+ let t,i = create_empty_sign_token i [Sign "'";Sign "'"] in
Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
| (Sign "'") :: l -> create_sign_token poss_s_beg i [Sign "'"] l (Symbol "’")
| (Sign "’") :: (Sign "’") :: (Sign ".") :: l -> create_quot_digit_token i [Sign "’";Sign "’"] l
| (Sign "’") :: (Sign "’") :: l ->
- let t,i = create_empty_sign_token i [Sign ""] in
+ let t,i = create_empty_sign_token i [Sign "’";Sign "’"] in
Variant[Token{t with token=Interp "”"};Token{t with token=Interp "”s"}],i,l,poss_s_beg
| (Sign "’") :: l -> create_sign_token poss_s_beg i [Sign "’"] l (Symbol "’")
| (Sign ";") :: (Sign "*") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "*") :: []) l (make_lemma (";*","sinterj"))
tokenizer/TODO
-- a number before a sentence-final period is interpreted as ordnum
+- a number before a sentence-final period is interpreted as ordnum - this is IMPORTANT !!!

- move the weight assignment to the next module

@@ -7,4 +7,4 @@ I found where the utf encoding problem is.
Namely, for the sentence "Fan: Niech nie straszą, Że to bomba jest kalorii." the preprocessor returns a segmentation in which one of the segments contains a lemma field with an invalid character.
The field looks as follows: "Ż\BCe".

-- the comma "," is not treated as a Symbol but only as an Interp, which may cause problems when parsing MWEs that take inflection into account.
+- the comma "," is not treated as a Symbol but only as an Interp, which may cause problems when parsing MWEs that take inflection into account.