Commit caeb305ab86cefe8f85c04835fae2a47f2715eae

Authored by Wojciech Jaworski
1 parent 584f9657

łączenie na poziomie haseł (merging at the level of entries)

Showing 1 changed file with 106 additions and 15 deletions
NKJP2/ENIAM_NKJP.ml
@@ -19,6 +19,34 @@

open Xstd

+type id = {corref: string; prefix: string; suffix: string; numbers: int list}
+
+let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []}
+
+let parse_id id =
+  (* if String.length s = 0 then empty_id else *)
+  if String.length id < 6 then failwith "parse_id: za krótkie id" else
+  let corref,id = match Xstring.split "#" id with
+      [corref;id] -> corref,id
+    | [id] -> "",id
+    | _ -> failwith ("parse_id 1: " ^ id) in
+  let prefix,id = match Xstring.split "_" id with
+      [prefix;id] -> prefix,id
+    | _ -> failwith ("parse_id 2: " ^ id) in
+  let suffix,id = match Xstring.split "-" id with
+      [id;suffix] -> suffix,id
+    | _ -> failwith ("parse_id 3: " ^ id) in
+  let numbers = try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in
+  {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers}
+
+let process_header_type typ =
+  if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ
+  else failwith ("process_header_type: " ^ typ)
+
+let process_header_channel c =
+  if Xstring.check_prefix "#kanal_" c then Xstring.cut_prefix "#kanal_" c
+  else failwith ("process_header_channel: " ^ c)
+
let load_header path name =
  match Xml.parse_file (path ^ name ^ "/header.xml") with
    Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_);
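
Note on the identifier format: the hunk above introduces the id record and parse_id, which decomposes NKJP-style identifiers of the shape [file "#"] prefix "_" numbers "-" suffix. A minimal sketch of the intended behaviour follows; the concrete id strings are illustrative assumptions inferred from the patterns matched by merge_entries later in this commit, not values read from the corpus.

(* Illustrative only: the id strings below are assumed; relies on parse_id as defined above. *)
let _ =
  assert (parse_id "txt_1-div" = {corref=""; prefix="txt"; suffix="div"; numbers=[1]});
  assert (parse_id "text.xml#txt_1-div" = {corref="text.xml"; prefix="txt"; suffix="div"; numbers=[1]});
  assert (parse_id "morph_2.1-seg" = {corref=""; prefix="morph"; suffix="seg"; numbers=[2;1]})
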
@@ -26,7 +54,7 @@ let load_header path name =
      Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]);
      Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]);
    Xml.Element("revisionDesc",_,_)]) ->
-      typ,channel
+      process_header_type typ,process_header_channel channel
  | _ -> failwith "load_header"

let get_folders path =
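
The header hunk above assumes that the catRef targets in header.xml carry the "#typ_" and "#kanal_" prefixes, which process_header_type and process_header_channel strip. A hedged example, with target values chosen to match the frequency lists quoted at the end of this commit:

(* Illustrative only: the input strings are assumed, not taken from a specific header.xml. *)
let _ =
  assert (process_header_type "#typ_publ" = "publ");
  assert (process_header_channel "#kanal_prasa_dziennik" = "prasa_dziennik")
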
@@ -35,12 +63,12 @@ let get_folders path =

let load_paragraph = function
    Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) ->
-      id_ab,paragraph
+      parse_id id_ab,paragraph
  | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)

let load_text_entry = function
    Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) ->
-      id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
+      parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph)
  | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)

let load_text path name =
@@ -64,9 +92,9 @@ let remove_rejected rev = function

let rec load_segm_token = function
    Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) ->
-      [corresp,false,id_seg]
+      [corresp,false,parse_id id_seg]
  | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) ->
-      [corresp,true,id_seg]
+      [corresp,true,parse_id id_seg]
  | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token)
  | Xml.Element("choice",[],alt) as xml ->
      let alt = Xlist.fold alt [] remove_rejected in
@@ -77,12 +105,12 @@ let rec load_segm_token = function

let load_segm_sentence = function
    Xml.Element("s",["xml:id",id_s],tokens) ->
-      id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
+      parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token))
  | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml)

let load_segm_entry = function
    Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
-      corresp,id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
+      parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence)
  | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml)

let load_segmentation path name =
@@ -104,22 +132,22 @@ let load_morph_token = function
    Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
      [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
       Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
-      corresp,id_seg,orth,load_disamb disamb
+      parse_id corresp,parse_id id_seg,orth,load_disamb disamb
  | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"],
      [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]);
       Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]);
       Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) ->
-      corresp,id_seg,orth,load_disamb disamb
+      parse_id corresp,parse_id id_seg,orth,load_disamb disamb
  | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml)

let load_morph_sentence = function
    Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) ->
-      corresp,id_s,List.rev (Xlist.rev_map tokens load_morph_token)
+      parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token)
  | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml)

let load_morph_entry = function
    Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
-      corresp,id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
+      parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence)
  | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml)

let load_morphosyntax path name =
@@ -130,6 +158,16 @@ let load_morphosyntax path name =
    List.rev (Xlist.rev_map entries load_morph_entry)
  | _ -> failwith "load_morphosyntax"

+let rec merge_entries rev = function
+    ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text,
+    ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"},
+     {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation,
+    ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"},
+     {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax ->
+      if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else
+      merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax)
+  | [],[],[] -> List.rev rev
+  | _ -> failwith "merge_entries"

let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"

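The merge_entries function added above walks the three annotation layers (text.xml, ann_segmentation.xml, ann_morphosyntax.xml) in parallel: at each position it requires the div entry from the text layer, the div and p ids from the segmentation layer, and the p ids from the morphosyntax layer to agree on their numeric index, and fails otherwise. A minimal sketch with a single consistent entry per layer; the id strings are illustrative and the paragraph/sentence payloads are left trivial.

(* Illustrative only: builds one entry per layer, all carrying index 1. *)
let _ =
  let text = [ (parse_id "txt_1-div", ["paragraph text"]) ] in
  let segmentation = [ (parse_id "text.xml#txt_1-div", parse_id "segm_1-p", []) ] in
  let morphosyntax = [ (parse_id "ann_segmentation.xml#segm_1-p", parse_id "morph_1-p", []) ] in
  match merge_entries [] (text, segmentation, morphosyntax) with
    [(1, _, _, _)] -> ()  (* indices agree: one merged entry *)
  | _ -> failwith "unexpected merge result"
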
@@ -137,13 +175,66 @@ let _ =
  let names = get_folders nkjp_path in
  Xlist.iter names (fun name ->
    print_endline name;
-    let header = load_text nkjp_path name in
-    (* let text = load_text nkjp_path name in *)
-    (* let segmentation = load_segmentation nkjp_path name in *)
-    (* let morphosyntax = load_morphosyntax nkjp_path name in *)
+    let typ,channel = load_header nkjp_path name in
+    (* print_endline typ; *)
+    (* print_endline channel; *)
+    (* print_endline (typ ^ "\t" ^ channel); *)
+    let text = load_text nkjp_path name in
+    let segmentation = load_segmentation nkjp_path name in
+    let morphosyntax = load_morphosyntax nkjp_path name in
+    let entries = merge_entries [] (text,segmentation,morphosyntax) in
    ())

(*
+frekwencje typów:
+ 127 fakt
+ 56 inf-por
+ 283 konwers
+ 2 listy
+ 376 lit
+ 1 lit_poezja
+ 80 media
+ 175 nd
+ 161 net_interakt
+ 227 net_nieinterakt
+ 20 nklas
+ 1986 publ
+ 8 qmow
+ 387 urzed
+
+frekwencje kanałów
+ 388 internet
+ 817 ksiazka
+ 363 mowiony
+ 146 prasa
+ 1744 prasa_dziennik
+ 398 prasa_inne
+ 5 prasa_miesiecznik
+ 28 prasa_tygodnik
+
+frekwencje łączne typów-kanałów
+ 127 fakt ksiazka
+ 56 inf-por ksiazka
+ 283 konwers mowiony
+ 2 listy ksiazka
+ 376 lit ksiazka
+ 1 lit_poezja ksiazka
+ 80 media mowiony
+ 175 nd ksiazka
+ 161 net_interakt internet
+ 227 net_nieinterakt internet
+ 20 nklas ksiazka
+ 60 publ ksiazka
+ 146 publ prasa
+ 1744 publ prasa_dziennik
+ 3 publ prasa_inne
+ 5 publ prasa_miesiecznik
+ 28 publ prasa_tygodnik
+ 8 qmow prasa_inne
+ 387 urzed prasa_inne
+
+ *)
+(*

type id = {hash: bool; suffix: string; numbers: int list}
