diff --git a/NKJP2/ENIAM_NKJP.ml b/NKJP2/ENIAM_NKJP.ml index 9d1de42..5390230 100644 --- a/NKJP2/ENIAM_NKJP.ml +++ b/NKJP2/ENIAM_NKJP.ml @@ -19,6 +19,34 @@ open Xstd +type id = {corref: string; prefix: string; suffix: string; numbers: int list} + +let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []} + +let parse_id id = + (* if String.length s = 0 then empty_id else *) + if String.length id < 6 then failwith "parse_id: za krótkie id" else + let corref,id = match Xstring.split "#" id with + [corref;id] -> corref,id + | [id] -> "",id + | _ -> failwith ("parse_id 1: " ^ id) in + let prefix,id = match Xstring.split "_" id with + [prefix;id] -> prefix,id + | _ -> failwith ("parse_id 2: " ^ id) in + let suffix,id = match Xstring.split "-" id with + [id;suffix] -> suffix,id + | _ -> failwith ("parse_id 3: " ^ id) in + let numbers = try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in + {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers} + +let process_header_type typ = + if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ + else failwith ("process_header_type: " ^ typ) + +let process_header_channel c = + if Xstring.check_prefix "#kanal_" c then Xstring.cut_prefix "#kanal_" c + else failwith ("process_header_channel: " ^ c) + let load_header path name = match Xml.parse_file (path ^ name ^ "/header.xml") with Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_); @@ -26,7 +54,7 @@ let load_header path name = Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]); Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]); Xml.Element("revisionDesc",_,_)]) -> - typ,channel + process_header_type typ,process_header_channel channel | _ -> failwith "load_header" let get_folders path = @@ -35,12 +63,12 @@ let get_folders path = let load_paragraph = function Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) -> - id_ab,paragraph + parse_id id_ab,paragraph | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml) let load_text_entry = function Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) -> - id_div,List.rev (Xlist.rev_map paragraphs load_paragraph) + parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph) | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml) let load_text path name = @@ -64,9 +92,9 @@ let remove_rejected rev = function let rec load_segm_token = function Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) -> - [corresp,false,id_seg] + [corresp,false,parse_id id_seg] | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) -> - [corresp,true,id_seg] + [corresp,true,parse_id id_seg] | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token) | Xml.Element("choice",[],alt) as xml -> let alt = Xlist.fold alt [] remove_rejected in @@ -77,12 +105,12 @@ let rec load_segm_token = function let load_segm_sentence = function Xml.Element("s",["xml:id",id_s],tokens) -> - id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token)) + parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token)) | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml) let load_segm_entry = function Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) -> - corresp,id_p,List.rev (Xlist.rev_map sentences load_segm_sentence) + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence) | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml) let load_segmentation path name = @@ -104,22 +132,22 @@ let load_morph_token = function Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> - corresp,id_seg,orth,load_disamb disamb + parse_id corresp,parse_id id_seg,orth,load_disamb disamb | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]); Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> - corresp,id_seg,orth,load_disamb disamb + parse_id corresp,parse_id id_seg,orth,load_disamb disamb | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml) let load_morph_sentence = function Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) -> - corresp,id_s,List.rev (Xlist.rev_map tokens load_morph_token) + parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token) | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml) let load_morph_entry = function Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) -> - corresp,id_p,List.rev (Xlist.rev_map sentences load_morph_sentence) + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence) | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml) let load_morphosyntax path name = @@ -130,6 +158,16 @@ let load_morphosyntax path name = List.rev (Xlist.rev_map entries load_morph_entry) | _ -> failwith "load_morphosyntax" +let rec merge_entries rev = function + ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, + ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, + {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation, + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"}, + {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> + if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else + merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax) + | [],[],[] -> List.rev rev + | _ -> failwith "merge_entries" let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" @@ -137,13 +175,66 @@ let _ = let names = get_folders nkjp_path in Xlist.iter names (fun name -> print_endline name; - let header = load_text nkjp_path name in - (* let text = load_text nkjp_path name in *) - (* let segmentation = load_segmentation nkjp_path name in *) - (* let morphosyntax = load_morphosyntax nkjp_path name in *) + let typ,channel = load_header nkjp_path name in + (* print_endline typ; *) + (* print_endline channel; *) + (* print_endline (typ ^ "\t" ^ channel); *) + let text = load_text nkjp_path name in + let segmentation = load_segmentation nkjp_path name in + let morphosyntax = load_morphosyntax nkjp_path name in + let entries = merge_entries [] (text,segmentation,morphosyntax) in ()) (* +frekwencje typów: + 127 fakt + 56 inf-por + 283 konwers + 2 listy + 376 lit + 1 lit_poezja + 80 media + 175 nd + 161 net_interakt + 227 net_nieinterakt + 20 nklas + 1986 publ + 8 qmow + 387 urzed + +frekwencje kanałów + 388 internet + 817 ksiazka + 363 mowiony + 146 prasa + 1744 prasa_dziennik + 398 prasa_inne + 5 prasa_miesiecznik + 28 prasa_tygodnik + +frekwencje łączne typów-kanałów + 127 fakt ksiazka + 56 inf-por ksiazka + 283 konwers mowiony + 2 listy ksiazka + 376 lit ksiazka + 1 lit_poezja ksiazka + 80 media mowiony + 175 nd ksiazka + 161 net_interakt internet + 227 net_nieinterakt internet + 20 nklas ksiazka + 60 publ ksiazka + 146 publ prasa + 1744 publ prasa_dziennik + 3 publ prasa_inne + 5 publ prasa_miesiecznik + 28 publ prasa_tygodnik + 8 qmow prasa_inne + 387 urzed prasa_inne + + *) +(* type id = {hash: bool; suffix: string; numbers: int list}