From caeb305ab86cefe8f85c04835fae2a47f2715eae Mon Sep 17 00:00:00 2001
From: Wojciech Jaworski <wjaworski@mimuw.edu.pl>
Date: Fri, 24 Mar 2017 09:58:56 +0100
Subject: [PATCH] łączenie na poziomie haseł
---
NKJP2/ENIAM_NKJP.ml | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 106 insertions(+), 15 deletions(-)
diff --git a/NKJP2/ENIAM_NKJP.ml b/NKJP2/ENIAM_NKJP.ml
index 9d1de42..5390230 100644
--- a/NKJP2/ENIAM_NKJP.ml
+++ b/NKJP2/ENIAM_NKJP.ml
@@ -19,6 +19,34 @@
open Xstd
+(* Decomposed identifier of the shape [corref#]prefix_n1.n2...-suffix,
+   as produced by parse_id. *)
+type id = {
+  corref: string;    (* text before '#'; "" when the identifier has no '#' *)
+  prefix: string;    (* text between the optional '#' part and '_' *)
+  suffix: string;    (* text after '-' *)
+  numbers: int list; (* dot-separated integers found between '_' and '-' *)
+}
+
+(* Identifier whose string fields are all "" and whose number list is empty. *)
+let empty_id = {
+  corref = "";
+  prefix = "";
+  suffix = "";
+  numbers = [];
+}
+
+(* [parse_id id] decomposes an identifier of the shape
+   [corref#]prefix_n1.n2...-suffix into an [id] record.
+   Raises Failure when:
+   - the string is shorter than 6 characters (so "" is rejected as well);
+   - splitting on "#" yields more than two parts ("parse_id 1");
+   - splitting on "_" does not yield exactly two parts ("parse_id 2");
+   - splitting on "-" does not yield exactly two parts ("parse_id 3");
+   - a dot-separated component is not an integer ("parse_id 4"). *)
+let parse_id id =
+  (* Too-short input (including "") is an error rather than empty_id.
+     The Polish message reads "id too short". *)
+  if String.length id < 6 then failwith "parse_id: za krótkie id" else
+  let corref,id = match Xstring.split "#" id with
+      [corref;id] -> corref,id
+    | [id] -> "",id
+    | _ -> failwith ("parse_id 1: " ^ id) in
+  let prefix,id = match Xstring.split "_" id with
+      [prefix;id] -> prefix,id
+    | _ -> failwith ("parse_id 2: " ^ id) in
+  let suffix,id = match Xstring.split "-" id with
+      [id;suffix] -> suffix,id
+    | _ -> failwith ("parse_id 3: " ^ id) in
+  let numbers =
+    (* int_of_string reports malformed input with Failure; catch only that so
+       unrelated exceptions are not rewritten into a parse error. *)
+    try Xlist.map (Xstring.split "\\." id) int_of_string
+    with Failure _ -> failwith ("parse_id 4: " ^ id) in
+  {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers}
+
+(* Checks that a header type reference starts with the "#typ_" marker and
+   returns what Xstring.cut_prefix yields for it (presumably the text after
+   the marker). Raises Failure carrying the offending value otherwise. *)
+let process_header_type typ =
+  let marker = "#typ_" in
+  if not (Xstring.check_prefix marker typ) then failwith ("process_header_type: " ^ typ)
+  else Xstring.cut_prefix marker typ
+
+(* Checks that a header channel reference starts with the "#kanal_" marker
+   and returns what Xstring.cut_prefix yields for it (presumably the text
+   after the marker). Raises Failure carrying the offending value otherwise. *)
+let process_header_channel c =
+  let marker = "#kanal_" in
+  if not (Xstring.check_prefix marker c) then failwith ("process_header_channel: " ^ c)
+  else Xstring.cut_prefix marker c
+
let load_header path name =
match Xml.parse_file (path ^ name ^ "/header.xml") with
Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_);
@@ -26,7 +54,7 @@ let load_header path name =
Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]);
Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]);
Xml.Element("revisionDesc",_,_)]) ->
- typ,channel
+ process_header_type typ,process_header_channel channel
| _ -> failwith "load_header"
let get_folders path =
@@ -35,12 +63,12 @@ let get_folders path =
+(* Converts one <ab> paragraph element into (parsed xml:id, paragraph text).
+   Raises Failure, with the offending XML in the message, on any other shape. *)
let load_paragraph = function
Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) ->
- id_ab,paragraph
+ parse_id id_ab,paragraph
-| xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)
+| xml -> failwith ("load_paragraph: " ^ Xml.to_string_fmt xml)
let load_text_entry = function
Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) ->
- id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
+ parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
| xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)
let load_text path name =
@@ -64,9 +92,9 @@ let remove_rejected rev = function
let rec load_segm_token = function
Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) ->
- [corresp,false,id_seg]
+ [corresp,false,parse_id id_seg]
| Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) ->
- [corresp,true,id_seg]
+ [corresp,true,parse_id id_seg]
| Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token)
| Xml.Element("choice",[],alt) as xml ->
let alt = Xlist.fold alt [] remove_rejected in
@@ -77,12 +105,12 @@ let rec load_segm_token = function
let load_segm_sentence = function
Xml.Element("s",["xml:id",id_s],tokens) ->
- id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
+ parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
| xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml)
let load_segm_entry = function
Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
- corresp,id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
+ parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
| xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml)
let load_segmentation path name =
@@ -104,22 +132,22 @@ let load_morph_token = function
Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
[Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
- corresp,id_seg,orth,load_disamb disamb
+ parse_id corresp,parse_id id_seg,orth,load_disamb disamb
| Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
[Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]);
Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
- corresp,id_seg,orth,load_disamb disamb
+ parse_id corresp,parse_id id_seg,orth,load_disamb disamb
| xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml)
let load_morph_sentence = function
Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) ->
- corresp,id_s,List.rev (Xlist.rev_map tokens load_morph_token)
+ parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token)
| xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml)
let load_morph_entry = function
Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
- corresp,id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
+ parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
| xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml)
let load_morphosyntax path name =
@@ -130,6 +158,16 @@ let load_morphosyntax path name =
List.rev (Xlist.rev_map entries load_morph_entry)
| _ -> failwith "load_morphosyntax"
+(* [merge_entries rev (text, segmentation, morphosyntax)] consumes the three
+   entry lists in lockstep and returns one tuple
+   (number, paragraphs, segm_sentences, morph_sentences) per entry, in input
+   order. [rev] is the reversed accumulator; callers start it at [].
+   Each step requires the heads to be a txt_N-div text entry, a segm_N-p
+   entry whose reference points into text.xml, and a morph_N-p entry whose
+   reference points into ann_segmentation.xml.
+   Raises Failure "merge_entries 2" when the five numbers involved are not
+   all equal, and Failure "merge_entries" when the lists differ in length or
+   a head has any other identifier shape. *)
+let rec merge_entries rev = function
+    [],[],[] -> List.rev rev
+  | ({corref=""; prefix="txt"; numbers=[txt_no]; suffix="div"},paragraphs) :: text,
+    ({corref="text.xml"; prefix="txt"; numbers=[segm_ref_no]; suffix="div"},
+     {corref=""; prefix="segm"; numbers=[segm_no]; suffix="p"},segm_sentences) :: segmentation,
+    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[morph_ref_no]; suffix="p"},
+     {corref=""; prefix="morph"; numbers=[morph_no]; suffix="p"},morph_sentences) :: morphosyntax ->
+      (* All three layers must describe the same numbered entry. *)
+      let aligned =
+        txt_no = segm_ref_no && segm_ref_no = segm_no
+        && segm_no = morph_ref_no && morph_ref_no = morph_no in
+      if not aligned then failwith "merge_entries 2" else
+      merge_entries ((txt_no,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax)
+  | _ -> failwith "merge_entries"
let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"
@@ -137,13 +175,66 @@ let _ =
let names = get_folders nkjp_path in
Xlist.iter names (fun name ->
print_endline name;
- let header = load_text nkjp_path name in
- (* let text = load_text nkjp_path name in *)
- (* let segmentation = load_segmentation nkjp_path name in *)
- (* let morphosyntax = load_morphosyntax nkjp_path name in *)
+ let typ,channel = load_header nkjp_path name in
+ (* print_endline typ; *)
+ (* print_endline channel; *)
+ (* print_endline (typ ^ "\t" ^ channel); *)
+ let text = load_text nkjp_path name in
+ let segmentation = load_segmentation nkjp_path name in
+ let morphosyntax = load_morphosyntax nkjp_path name in
+ let entries = merge_entries [] (text,segmentation,morphosyntax) in
())
(*
+frekwencje typów:
+ 127 fakt
+ 56 inf-por
+ 283 konwers
+ 2 listy
+ 376 lit
+ 1 lit_poezja
+ 80 media
+ 175 nd
+ 161 net_interakt
+ 227 net_nieinterakt
+ 20 nklas
+ 1986 publ
+ 8 qmow
+ 387 urzed
+
+frekwencje kanałów:
+ 388 internet
+ 817 ksiazka
+ 363 mowiony
+ 146 prasa
+ 1744 prasa_dziennik
+ 398 prasa_inne
+ 5 prasa_miesiecznik
+ 28 prasa_tygodnik
+
+frekwencje łączne typów-kanałów:
+ 127 fakt ksiazka
+ 56 inf-por ksiazka
+ 283 konwers mowiony
+ 2 listy ksiazka
+ 376 lit ksiazka
+ 1 lit_poezja ksiazka
+ 80 media mowiony
+ 175 nd ksiazka
+ 161 net_interakt internet
+ 227 net_nieinterakt internet
+ 20 nklas ksiazka
+ 60 publ ksiazka
+ 146 publ prasa
+ 1744 publ prasa_dziennik
+ 3 publ prasa_inne
+ 5 publ prasa_miesiecznik
+ 28 publ prasa_tygodnik
+ 8 qmow prasa_inne
+ 387 urzed prasa_inne
+
+ *)
+(*
type id = {hash: bool; suffix: string; numbers: int list}
--
libgit2 0.22.2