Commit aedafaff92de27ab899e4b1ae8df24b9e034cf51

Authored by Wojciech Jaworski
1 parent 06c25d4e

ne i przygotowanie do walidacji tokenizacji

NKJP2/ENIAM_NKJP.ml
... ... @@ -19,9 +19,9 @@
19 19  
20 20 open Xstd
21 21  
22   -type id = {corref: string; prefix: string; suffix: string; numbers: int list}
  22 +type id = {corref: string; prefix: string; suffix: string; suffix2: string; numbers: int list}
23 23  
24   -let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []}
  24 +let empty_id = {corref = ""; prefix = ""; suffix = ""; suffix2 = ""; numbers = []}
25 25  
26 26 let parse_id id =
27 27 (* if String.length s = 0 then empty_id else *)
... ... @@ -30,14 +30,15 @@ let parse_id id =
30 30 [corref;id] -> corref,id
31 31 | [id] -> "",id
32 32 | _ -> failwith ("parse_id 1: " ^ id) in
33   - let prefix,id = match Xstring.split "_" id with
34   - [prefix;id] -> prefix,id
  33 + let prefix,id,suffix2 = match Xstring.split "_" id with
  34 + [prefix;id] -> prefix,id,""
  35 + | [prefix;id;suffix2] -> prefix,id,suffix2
35 36 | _ -> failwith ("parse_id 2: " ^ id) in
36 37 let suffix,id = match Xstring.split "-" id with
37 38 [id;suffix] -> suffix,id
38 39 | _ -> failwith ("parse_id 3: " ^ id) in
39 40 let numbers = try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in
40   - {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers}
  41 + {corref=corref; prefix=prefix; suffix=suffix; suffix2=suffix2; numbers=numbers}
41 42  
42 43 let process_header_type typ =
43 44 if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ
... ... @@ -158,6 +159,59 @@ let load_morphosyntax path name =
158 159 List.rev (Xlist.rev_map entries load_morph_entry)
159 160 | _ -> failwith "load_morphosyntax"
160 161  
  162 +type named = {typ: string; orth: string; base: string; cert: string; subtype: string; derived: string*string; wheen: string; }
  163 +
  164 +let empty_named = {typ=""; orth=""; base=""; cert=""; subtype=""; derived="",""; wheen=""}
  165 +
  166 +let load_named_feature named = function
  167 + Xml.Element("f",["name","type"],[Xml.Element("symbol",["value",v],[])]) -> {named with typ=v}
  168 + | Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]) -> {named with orth=orth}
  169 + | Xml.Element("f",["name","base"],[Xml.Element("string",[],[Xml.PCData base])]) -> {named with base=base}
  170 + | Xml.Element("f",["name","certainty"],[Xml.Element("symbol",["value",cert],[])]) -> {named with cert=cert}
  171 + | Xml.Element("f",["name","subtype"],[Xml.Element("symbol",["value",v],[])]) -> {named with subtype=v}
  172 + | Xml.Element("f",["name","derived"],[Xml.Element("fs",["type","derivation"],[
  173 + Xml.Element("f",["name","derivType"],[Xml.Element("symbol",["value",v],[])]);
  174 + Xml.Element("f",["name","derivedFrom"],[Xml.Element("string",[],[Xml.PCData from])])])]) -> {named with derived=(v,from)}
  175 + | Xml.Element("f",["name","derived"],[Xml.Element("fs",["type","derivation"],[
  176 + Xml.Element("f",["name","derivType"],[Xml.Element("symbol",["value",v],[])]);
  177 + Xml.Element("f",["name","derivedFrom"],[Xml.Element("string",[],[])])])]) -> {named with derived=(v,"")}
  178 + | Xml.Element("f",["name","when"],[Xml.Element("string",[],[Xml.PCData w])]) -> {named with wheen=w}
  179 + | Xml.Element("f",["name","when"],[Xml.Element("string",[],[])]) -> {named with wheen=""}
  180 + | Xml.Element("f",["name","comment"],[Xml.Element("string",[],[Xml.PCData base])]) -> named
  181 + | Xml.Element("f",["name","comment"],[Xml.Element("string",[],[])]) -> named
  182 + | xml -> failwith ("load_named_feature: " ^ Xml.to_string_fmt xml)
  183 +
  184 +let load_ptr = function
  185 + | Xml.Element("ptr",["target",target],[]) -> parse_id target
  186 + | xml -> failwith ("load_ptr: " ^ Xml.to_string_fmt xml)
  187 +
  188 +let load_named_token = function
  189 + Xml.Element("seg",["xml:id",id_seg],Xml.Element("fs",["type","named"],features) :: ptrs) ->
  190 + let named = Xlist.fold features empty_named load_named_feature in
  191 + let ptrs = Xlist.fold ptrs [] (fun ptrs xml -> load_ptr xml :: ptrs) in
  192 + parse_id id_seg,named,List.rev ptrs
  193 + | xml -> failwith ("load_named_token: " ^ Xml.to_string_fmt xml)
  194 +
  195 +let load_named_sentence = function
  196 + Xml.Element("s",["xml:id",id_s;"corresp",corresp],tokens) ->
  197 + parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_named_token)
  198 + | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml)
  199 +
  200 +let load_named_entry = function
  201 + Xml.Element("p",["xml:id",id_p;"corresp",corresp],sentences) ->
  202 + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_named_sentence)
  203 + | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml)
  204 +
  205 +let load_named path name =
  206 + try
  207 + match Xml.parse_file (path ^ name ^ "/ann_named.xml") with
  208 + Xml.Element("teiCorpus", _,[Xml.Element("xi:include",_,_);
  209 + Xml.Element("TEI",[],[Xml.Element("xi:include",_,_);
  210 + Xml.Element("text",["xml:lang","pl"],[Xml.Element("body",[],entries)])])]) ->
  211 + List.rev (Xlist.rev_map entries load_named_entry)
  212 + | _ -> failwith "load_morphosyntax"
  213 + with Xml.File_not_found _ -> []
  214 +
161 215 let parse_seg_corresp corresp =
162 216 if not (Xstring.check_prefix "text.xml#string-range(" corresp) then failwith "parse_seg_corresp" else
163 217 if not (Xstring.check_sufix ")" corresp) then failwith "parse_seg_corresp" else
... ... @@ -216,7 +270,33 @@ let print_tokens tokens =
216 270 let rec merge_sentences name id_p rev = function
217 271 ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation,
218 272 ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"},
219   - {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax ->
  273 + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax,
  274 + ({corref="ann_morphosyntax.xml"; prefix="morph"; numbers=[c_morph_p;c_morph_s]; suffix="s"},
  275 + {corref=""; prefix="named"; numbers=[id_named_p;id_named_s]; suffix="s"},named_tokens) :: named ->
  276 + (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
  277 + if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p || id_segm_p <> c_morph_p || id_segm_p <> id_named_p then failwith "merge_sentences 2" else
  278 + if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s || c_segm_s <> c_morph_s || c_segm_s <> id_named_s then failwith "merge_sentences 3" else
  279 + let tokens = merge_tokens name id_p [] (segm_tokens,morph_tokens) in
  280 + (* let _ = print_tokens tokens in *)
  281 + let id_s = string_of_int id_segm_p ^ "." ^ string_of_int id_segm_s in
  282 + if tokens = [] then failwith "merge_sentences 4" else
  283 + let id_div,id_ab,token = List.hd tokens in
  284 + let l = match split_sentences id_div id_ab [token] [] (List.tl tokens) with
  285 + [id_div,id_ab,tokens] -> [id_div,id_ab,id_s,tokens]
  286 + | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2] -> [id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1]
  287 + | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2;id_div3,id_ab3,tokens3] -> [id_div3,id_ab3,id_s^"c",tokens3;id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1]
  288 + | _ -> failwith (Printf.sprintf "merge_sentences 5: %s %d %d" name id_div id_ab) in
  289 + let named_tokens = Xlist.fold named_tokens [] (fun named_tokens (id,n,ptrs) ->
  290 + (StringSet.of_list (Xstring.split " " n.orth),id,n,ptrs) :: named_tokens) in
  291 + let l = Xlist.map l (fun (id_div,id_ab,id_s,tokens) ->
  292 + let orths = Xlist.fold tokens StringSet.empty (fun orths (_,_,_,orth,_,_,_) -> StringSet.add orths orth) in
  293 + let named_tokens = Xlist.fold named_tokens [] (fun named_tokens (n_orths,id,n,ptrs) ->
  294 + if StringSet.size (StringSet.intersection orths n_orths) = StringSet.size n_orths then (id,n,ptrs) :: named_tokens else named_tokens) in
  295 + id_div,id_ab,id_s,tokens,named_tokens) in
  296 + merge_sentences name id_p (l @ rev) (segmentation,morphosyntax,named)
  297 + | ({corref=""; prefix="segm"; numbers=[id_segm_p;id_segm_s]; suffix="s"},segm_tokens) :: segmentation,
  298 + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p;c_segm_s]; suffix="s"},
  299 + {corref=""; prefix="morph"; numbers=[id_morph_p;id_morph_s]; suffix="s"},morph_tokens) :: morphosyntax, [] ->
220 300 (* if id_p <> id_segm_p then Printf.printf "merge_sentences inconsistent numbering: %s segm_%d-p segm_%d.%d-s\n" name id_p id_segm_p id_segm_s; *)
221 301 if id_segm_p <> c_segm_p || id_segm_p <> id_morph_p then failwith "merge_sentences 2" else
222 302 if id_segm_s <> c_segm_s || c_segm_s <> id_morph_s then failwith "merge_sentences 3" else
... ... @@ -230,14 +310,15 @@ let rec merge_sentences name id_p rev = function
230 310 | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2] -> [id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1]
231 311 | [id_div1,id_ab1,tokens1;id_div2,id_ab2,tokens2;id_div3,id_ab3,tokens3] -> [id_div3,id_ab3,id_s^"c",tokens3;id_div2,id_ab2,id_s^"b",tokens2;id_div1,id_ab1,id_s^"a",tokens1]
232 312 | _ -> failwith (Printf.sprintf "merge_sentences 5: %s %d %d" name id_div id_ab) in
233   - merge_sentences name id_p (l @ rev) (segmentation,morphosyntax)
234   - | [],[] -> List.rev rev
  313 + let l = Xlist.map l (fun (id_div,id_ab,id_s,tokens) -> id_div,id_ab,id_s,tokens,[]) in
  314 + merge_sentences name id_p (l @ rev) (segmentation,morphosyntax,[])
  315 + | [],[],[] -> List.rev rev
235 316 | _ -> failwith "merge_sentences"
236 317  
237 318 let rec merge_paragraph id_div id_ab rev = function
238   - (id_div2,id_ab2,id_s,tokens) :: sentences ->
239   - if id_div <> id_div2 || id_ab <> id_ab2 then List.rev rev, (id_div2,id_ab2,id_s,tokens) :: sentences
240   - else merge_paragraph id_div id_ab ((id_s,tokens) :: rev) sentences
  319 + (id_div2,id_ab2,id_s,tokens,named_tokens) :: sentences ->
  320 + if id_div <> id_div2 || id_ab <> id_ab2 then List.rev rev, (id_div2,id_ab2,id_s,tokens,named_tokens) :: sentences
  321 + else merge_paragraph id_div id_ab ((id_s,tokens,named_tokens) :: rev) sentences
241 322 | [] -> List.rev rev, []
242 323  
243 324 let rec get_spaces n = function
... ... @@ -252,7 +333,7 @@ let rec split_front rev n p =
252 333 let match_tokens name id_p s sentences =
253 334 let p = Xunicode.utf8_chars_of_utf8_string s in
254 335 let len = Xlist.size p in
255   - let i,p,sentences = Xlist.fold sentences (0,p,[]) (fun (i,p,sentences) (id_s,tokens) ->
  336 + let i,p,sentences = Xlist.fold sentences (0,p,[]) (fun (i,p,sentences) (id_s,tokens,named_tokens) ->
256 337 let i,p,tokens = Xlist.fold tokens (i,p,[]) (fun (i,p,tokens) (beg,len,nps,orth,lemma,cat,interp) ->
257 338 (* Printf.printf "match_tokens: %s %n i=%d beg=%d len=%d\n" name id_p i beg len; *)
258 339 let no_spaces,p = get_spaces 0 p in
... ... @@ -261,17 +342,17 @@ let match_tokens name id_p s sentences =
261 342 let real_orth,p = split_front [] len p in
262 343 if beg = i then i+len, p, (beg,len,no_spaces,String.concat "" real_orth,orth,lemma,cat,interp) :: tokens else
263 344 failwith (Printf.sprintf "match_tokens 1: %s %n i=%d beg=%d len=%d" name id_p i beg len)) in
264   - i,p,(id_s,List.rev tokens) :: sentences) in
  345 + i,p,(id_s,List.rev tokens,named_tokens) :: sentences) in
265 346 let no_spaces,p = get_spaces 0 p in
266 347 if i+no_spaces <> len then failwith (Printf.sprintf "match_tokens 2: %s %n i=%d len=%d p='%s'" name id_p i len (String.concat "" p))
267 348 else List.rev sentences
268 349  
269 350 let rec merge_paragraphs name id_p rev = function
270 351 ({corref=""; prefix="txt"; numbers=[id_div;id_ab]; suffix="ab"},paragraph) :: paragraphs,
271   - (id_div2,id_ab2,id_s,tokens) :: sentences ->
  352 + (id_div2,id_ab2,id_s,tokens,named_tokens) :: sentences ->
272 353 (* print_endline ("B " ^ string_of_int id_p ^ " " ^ string_of_int id_p ^ " " ^ paragraph); *)
273 354 if id_div <> id_div2 && id_ab <> id_ab2 then failwith "merge_paragraphs 1" else
274   - let l,sentences = merge_paragraph id_div id_ab [id_s,tokens] sentences in
  355 + let l,sentences = merge_paragraph id_div id_ab [id_s,tokens,named_tokens] sentences in
275 356 (* Printf.printf "%d.%d: %s\n" id_div id_ab (String.concat " " (Xlist.map l fst)); *)
276 357 let l =
277 358 try match_tokens name id_p paragraph l
... ... @@ -285,24 +366,50 @@ let rec merge_entries name rev = function
285 366 ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
286 367 {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
287 368 ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
288   - {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax ->
  369 + {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax,
  370 + ({corref="ann_morphosyntax.xml"; prefix="morph"; numbers=[c_morph_p]; suffix="p"},
  371 + {corref=""; prefix="named"; numbers=[id_named_p]; suffix="p"},named_sentences) :: named ->
  372 + (* print_endline ("A " ^ string_of_int id_div); *)
  373 + if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p ||
  374 + c_segm_p <> id_morph_p || id_morph_p <> c_morph_p || c_morph_p <> id_named_p then failwith "merge_entries 2" else
  375 + let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences,named_sentences) in
  376 + let paragraphs = merge_paragraphs name id_div [] (paragraphs,sentences) in
  377 + merge_entries name ((id_div,true,paragraphs) :: rev) (text,segmentation,morphosyntax,named)
  378 + | ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text,
  379 + ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
  380 + {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
  381 + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
  382 + {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax, [] ->
289 383 (* print_endline ("A " ^ string_of_int id_div); *)
290 384 if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else
291   - let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences) in
  385 + let sentences = merge_sentences name id_div [] (segm_sentences,morph_sentences,[]) in
292 386 let paragraphs = merge_paragraphs name id_div [] (paragraphs,sentences) in
293   - merge_entries name ((id_div,paragraphs) :: rev) (text,segmentation,morphosyntax)
294   - | [],[],[] -> List.rev rev
  387 + merge_entries name ((id_div,false,paragraphs) :: rev) (text,segmentation,morphosyntax,[])
  388 + | [],[],[],[] -> List.rev rev
295 389 | _ -> failwith "merge_entries"
296 390  
  391 +let fold path s f =
  392 + let names = get_folders path in
  393 + Xlist.fold names s (fun s name ->
  394 + (* print_endline name; *)
  395 + if name = "030-2-000000012" then s else
  396 + let typ,channel = load_header path name in
  397 + let text = load_text path name in
  398 + let segmentation = load_segmentation path name in
  399 + let morphosyntax = load_morphosyntax path name in
  400 + let named = load_named path name in
  401 + let entries = merge_entries name [] (text,segmentation,morphosyntax,named) in
  402 + f s (name,typ,channel,entries))
  403 +
297 404 let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"
298 405  
299 406 let calculate_statistics stats typ channel entries =
300   - Xlist.fold entries stats (fun stats (id_div,paragraphs) ->
  407 + Xlist.fold entries stats (fun stats (id_div,has_ne,paragraphs) ->
301 408 Xlist.fold paragraphs stats (fun stats (paragraph,sentences) ->
302   - Xlist.fold sentences stats (fun stats (id_s,tokens) ->
  409 + Xlist.fold sentences stats (fun stats (id_s,tokens,named_tokens) ->
303 410 let bad_tokens = Xlist.fold tokens 0 (fun n (_,_,_,real_orth,orth,_,_,_) ->
304 411 if real_orth = orth then n else n+1) in
305   - let all_tokens = Xlist.size tokens in
  412 + (* let all_tokens = Xlist.size tokens in *)
306 413 let s = Printf.sprintf "%s %s %d" typ channel bad_tokens in
307 414 StringQMap.add stats s)))
308 415  
... ... @@ -310,22 +417,11 @@ let print_stats stats =
310 417 StringQMap.iter stats (fun k v ->
311 418 Printf.printf "%5d %s\n" v k)
312 419  
313   -let _ =
314   - let names = get_folders nkjp_path in
315   - let stats = Xlist.fold names StringQMap.empty (fun stats name ->
316   - (* print_endline name; *)
317   - if name = "030-2-000000012" then stats else
318   - let typ,channel = load_header nkjp_path name in
319   - (* print_endline typ; *)
320   - (* print_endline channel; *)
321   - (* print_endline (typ ^ "\t" ^ channel); *)
322   - let text = load_text nkjp_path name in
323   - let segmentation = load_segmentation nkjp_path name in
324   - let morphosyntax = load_morphosyntax nkjp_path name in
325   - let entries = merge_entries name [] (text,segmentation,morphosyntax) in
  420 +(* let _ =
  421 + let stats = fold nkjp_path StringQMap.empty (fun stats (name,typ,channel,entries) ->
326 422 calculate_statistics stats typ channel entries) in
327   - print_stats stats
328   -(*130-3-900001 - poprawione w korpusie*)
  423 + print_stats stats *)
  424 +
329 425 (*
330 426 frekwencje typów:
331 427 127 fakt
... ...
NKJP2/makefile
... ... @@ -3,10 +3,10 @@ OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4 4 INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam
5 5 OCAMLFLAGS=$(INCLUDES) -g
6   -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa #eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa
  6 +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa #eniam-morphology.cmxa eniam-subsyntax.cmxa
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES=ENIAM_NKJP.ml
  9 +SOURCES=ENIAM_NKJP.ml validateTokenizer.ml
10 10  
11 11 all: $(SOURCES)
12 12 $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $^
... ...