Commit caeb305ab86cefe8f85c04835fae2a47f2715eae
1 parent
584f9657
łączenie na poziomie haseł
Showing
1 changed file
with
106 additions
and
15 deletions
NKJP2/ENIAM_NKJP.ml
... | ... | @@ -19,6 +19,34 @@ |
19 | 19 | |
20 | 20 | open Xstd |
21 | 21 | |
(* Structured form of an identifier string, as built by parse_id:
   - corref:  the part before a # separator, or the empty string when the
              identifier contains no # ;
   - prefix:  the part before the _ separator;
   - suffix:  the part after the - separator;
   - numbers: the dot-separated integers found between prefix and suffix.
   merge_entries matches on values such as corref=text.xml, prefix=txt,
   suffix=div, so an input presumably looks like text.xml#txt_1-div
   (TODO confirm against the corpus files). *)
22 | +type id = {corref: string; prefix: string; suffix: string; numbers: int list} | |
23 | + | |
(* Identifier with every field empty. Within this view it is referenced only
   by the commented-out line at the top of parse_id. *)
24 | +let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []} | |
25 | + | |
(* parse_id id splits an identifier string into an id record.
   Steps, in order:
   1. fail if the string is shorter than 6 characters (the Polish message
      means: id too short);
   2. split on # : two parts give corref and the remainder, one part gives an
      empty corref, anything else fails with parse_id 1;
   3. split the remainder on _ : exactly two parts (prefix, rest) or fail
      with parse_id 2;
   4. split the rest on - : exactly two parts (numeric part, suffix) or fail
      with parse_id 3;
   5. split the numeric part with the escaped-dot pattern and convert each
      piece with int_of_string; any exception fails with parse_id 4.
   All failures are raised through failwith, i.e. as Failure.
   Each let rebinds id to the remaining text, so messages 2-4 show only the
   remainder, not the full input.
   NOTE(review): Xstring.split presumably treats its first argument as a
   regular expression (hence the escaped dot) - confirm in Xstd.
   NOTE(review): the handler in step 5 is a catch-all (with _), so it also
   hides exceptions other than the int_of_string failure.
   NOTE(review): the commented-out guard below refers to s, which is not
   bound here; it would have to test id instead. *)
26 | +let parse_id id = | |
27 | + (* if String.length s = 0 then empty_id else *) | |
28 | + if String.length id < 6 then failwith "parse_id: za krótkie id" else | |
29 | + let corref,id = match Xstring.split "#" id with | |
30 | + [corref;id] -> corref,id | |
31 | + | [id] -> "",id | |
32 | + | _ -> failwith ("parse_id 1: " ^ id) in | |
33 | + let prefix,id = match Xstring.split "_" id with | |
34 | + [prefix;id] -> prefix,id | |
35 | + | _ -> failwith ("parse_id 2: " ^ id) in | |
36 | + let suffix,id = match Xstring.split "-" id with | |
37 | + [id;suffix] -> suffix,id | |
38 | + | _ -> failwith ("parse_id 3: " ^ id) in | |
39 | + let numbers = try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in | |
40 | + {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers} | |
41 | + | |
(* process_header_type typ returns typ without its leading #typ_ marker and
   fails (Failure, message prefixed with process_header_type: ) when the
   marker is missing. load_header applies it to the target attribute of the
   #taxonomy-NKJP-type catRef element.
   Assumes Xstring.cut_prefix returns the text after the given prefix -
   TODO confirm in Xstd. *)
42 | +let process_header_type typ = | |
43 | + if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ | |
44 | + else failwith ("process_header_type: " ^ typ) | |
45 | + | |
(* process_header_channel c returns c without its leading #kanal_ marker and
   fails (Failure, message prefixed with process_header_channel: ) when the
   marker is missing. load_header applies it to the target attribute of the
   #taxonomy-NKJP-channel catRef element.
   Assumes Xstring.cut_prefix returns the text after the given prefix -
   TODO confirm in Xstd. *)
46 | +let process_header_channel c = | |
47 | + if Xstring.check_prefix "#kanal_" c then Xstring.cut_prefix "#kanal_" c | |
48 | + else failwith ("process_header_channel: " ^ c) | |
49 | + | |
22 | 50 | let load_header path name = |
23 | 51 | match Xml.parse_file (path ^ name ^ "/header.xml") with |
24 | 52 | Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_); |
... | ... | @@ -26,7 +54,7 @@ let load_header path name = |
26 | 54 | Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]); |
27 | 55 | Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]); |
28 | 56 | Xml.Element("revisionDesc",_,_)]) -> |
29 | - typ,channel | |
57 | + process_header_type typ,process_header_channel channel | |
30 | 58 | | _ -> failwith "load_header" |
31 | 59 | |
32 | 60 | let get_folders path = |
... | ... | @@ -35,12 +63,12 @@ let get_folders path = |
35 | 63 | |
(* load_paragraph converts one ab element into a pair (id, paragraph text).
   The element must carry exactly the attributes n and xml:id, in that order,
   and a single PCData child; any other XML raises Failure with the
   pretty-printed element appended to the message.
   Diff rows below: the row marked - is the version before this commit, which
   returned the raw xml:id string; the row marked + passes it through parse_id.
   NOTE(review): the failure message is labelled load_text_entry although it
   is raised from load_paragraph - looks like a copy-paste slip. *)
36 | 64 | let load_paragraph = function
37 | 65 | Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) ->
38 | - id_ab,paragraph | |
66 | + parse_id id_ab,paragraph | |
39 | 67 | | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)
40 | 68 | |
(* load_text_entry converts one div element into a pair
   (id, paragraphs), where every child is loaded with load_paragraph.
   The element must carry exactly the attributes xml:id and decls, in that
   order; any other XML raises Failure with the pretty-printed element.
   The children are mapped with Xlist.rev_map and then reversed again,
   presumably to keep document order without deep recursion - confirm in Xstd.
   Diff rows below: the row marked - is the version before this commit, which
   returned the raw xml:id string; the row marked + passes it through parse_id. *)
41 | 69 | let load_text_entry = function
42 | 70 | Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) ->
43 | - id_div,List.rev (Xlist.rev_map paragraphs load_paragraph) | |
71 | + parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph) | |
44 | 72 | | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml)
45 | 73 | |
46 | 74 | let load_text path name = |
... | ... | @@ -64,9 +92,9 @@ let remove_rejected rev = function |
64 | 92 | |
65 | 93 | let rec load_segm_token = function |
66 | 94 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) -> |
67 | - [corresp,false,id_seg] | |
95 | + [corresp,false,parse_id id_seg] | |
68 | 96 | | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) -> |
69 | - [corresp,true,id_seg] | |
97 | + [corresp,true,parse_id id_seg] | |
70 | 98 | | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token) |
71 | 99 | | Xml.Element("choice",[],alt) as xml -> |
72 | 100 | let alt = Xlist.fold alt [] remove_rejected in |
... | ... | @@ -77,12 +105,12 @@ let rec load_segm_token = function |
77 | 105 | |
(* load_segm_sentence converts one s element of the segmentation layer into a
   pair (id, tokens). The element must carry xml:id as its only attribute; any
   other XML raises Failure with the pretty-printed element.
   load_segm_token returns a list for every child, so the per-child lists are
   flattened into a single token list.
   Diff rows below: the row marked - is the version before this commit, which
   returned the raw xml:id string; the row marked + passes it through parse_id. *)
78 | 106 | let load_segm_sentence = function
79 | 107 | Xml.Element("s",["xml:id",id_s],tokens) ->
80 | - id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token)) | |
108 | + parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token)) | |
81 | 109 | | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml)
82 | 110 | |
(* load_segm_entry converts one p element of the segmentation layer into a
   triple (corresp, id, sentences), where every child is loaded with
   load_segm_sentence. The element must carry exactly the attributes corresp
   and xml:id, in that order; any other XML raises Failure with the
   pretty-printed element.
   Diff rows below: the row marked - is the version before this commit, which
   returned both attributes as raw strings; the row marked + passes both
   through parse_id. merge_entries later matches on these parsed records. *)
83 | 111 | let load_segm_entry = function
84 | 112 | Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
85 | - corresp,id_p,List.rev (Xlist.rev_map sentences load_segm_sentence) | |
113 | + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence) | |
86 | 114 | | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml)
87 | 115 | |
88 | 116 | let load_segmentation path name = |
... | ... | @@ -104,22 +132,22 @@ let load_morph_token = function |
104 | 132 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], |
105 | 133 | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); |
106 | 134 | Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> |
107 | - corresp,id_seg,orth,load_disamb disamb | |
135 | + parse_id corresp,parse_id id_seg,orth,load_disamb disamb | |
108 | 136 | | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], |
109 | 137 | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); |
110 | 138 | Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]); |
111 | 139 | Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> |
112 | - corresp,id_seg,orth,load_disamb disamb | |
140 | + parse_id corresp,parse_id id_seg,orth,load_disamb disamb | |
113 | 141 | | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml) |
114 | 142 | |
(* load_morph_sentence converts one s element of the morphosyntax layer into a
   triple (corresp, id, tokens), where every child is loaded with
   load_morph_token. The element must carry exactly the attributes corresp and
   xml:id, in that order; any other XML raises Failure with the pretty-printed
   element.
   Diff rows below: the row marked - is the version before this commit, which
   returned both attributes as raw strings; the row marked + passes both
   through parse_id. *)
115 | 143 | let load_morph_sentence = function
116 | 144 | Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) ->
117 | - corresp,id_s,List.rev (Xlist.rev_map tokens load_morph_token) | |
145 | + parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token) | |
118 | 146 | | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml)
119 | 147 | |
(* load_morph_entry converts one p element of the morphosyntax layer into a
   triple (corresp, id, sentences), where every child is loaded with
   load_morph_sentence. The element must carry exactly the attributes corresp
   and xml:id, in that order; any other XML raises Failure with the
   pretty-printed element.
   Diff rows below: the row marked - is the version before this commit, which
   returned both attributes as raw strings; the row marked + passes both
   through parse_id. merge_entries later matches on these parsed records. *)
120 | 148 | let load_morph_entry = function
121 | 149 | Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) ->
122 | - corresp,id_p,List.rev (Xlist.rev_map sentences load_morph_sentence) | |
150 | + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence) | |
123 | 151 | | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml)
124 | 152 | |
125 | 153 | let load_morphosyntax path name = |
... | ... | @@ -130,6 +158,16 @@ let load_morphosyntax path name = |
130 | 158 | List.rev (Xlist.rev_map entries load_morph_entry) |
131 | 159 | | _ -> failwith "load_morphosyntax" |
132 | 160 | |
(* merge_entries rev (text, segmentation, morphosyntax) walks the three entry
   lists in lockstep and joins the entries that carry the same number.
   At each step the heads must have exactly this shape:
   - text:         an id with empty corref, prefix txt, suffix div;
   - segmentation: a reference with corref text.xml, prefix txt, suffix div,
                   and an own id with empty corref, prefix segm, suffix p;
   - morphosyntax: a reference with corref ann_segmentation.xml, prefix segm,
                   suffix p, and an own id with empty corref, prefix morph,
                   suffix p;
   each with a single-element numbers list. The five numbers are compared
   pairwise along the chain and must all be equal, otherwise the function
   fails with merge_entries 2.
   A matching step pushes (number, paragraphs, segm_sentences, morph_sentences)
   onto the accumulator rev. When all three lists are empty the accumulator is
   reversed and returned, so results come out in input order. Any other case -
   lists of different length or a head that does not fit the patterns above -
   fails with merge_entries.
   The caller in this file passes [] as the initial rev. *)
161 | +let rec merge_entries rev = function | |
162 | + ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, | |
163 | + ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, | |
164 | + {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation, | |
165 | + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"}, | |
166 | + {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> | |
167 | + if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else | |
168 | + merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax) | |
169 | + | [],[],[] -> List.rev rev | |
170 | + | _ -> failwith "merge_entries" | |
133 | 171 | |
(* Relative path of the corpus directory handed to the loaders in the main
   loop. It ends with a slash because load_header concatenates it directly
   with the folder name (path ^ name ^ /header.xml). *)
134 | 172 | let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/"
135 | 173 | |
... | ... | @@ -137,13 +175,66 @@ let _ = |
137 | 175 | let names = get_folders nkjp_path in |
138 | 176 | Xlist.iter names (fun name -> |
139 | 177 | print_endline name; |
140 | - let header = load_text nkjp_path name in | |
141 | - (* let text = load_text nkjp_path name in *) | |
142 | - (* let segmentation = load_segmentation nkjp_path name in *) | |
143 | - (* let morphosyntax = load_morphosyntax nkjp_path name in *) | |
178 | + let typ,channel = load_header nkjp_path name in | |
179 | + (* print_endline typ; *) | |
180 | + (* print_endline channel; *) | |
181 | + (* print_endline (typ ^ "\t" ^ channel); *) | |
182 | + let text = load_text nkjp_path name in | |
183 | + let segmentation = load_segmentation nkjp_path name in | |
184 | + let morphosyntax = load_morphosyntax nkjp_path name in | |
185 | + let entries = merge_entries [] (text,segmentation,morphosyntax) in | |
144 | 186 | ()) |
145 | 187 | |
146 | 188 | (* |
189 | +frekwencje typów: | |
190 | + 127 fakt | |
191 | + 56 inf-por | |
192 | + 283 konwers | |
193 | + 2 listy | |
194 | + 376 lit | |
195 | + 1 lit_poezja | |
196 | + 80 media | |
197 | + 175 nd | |
198 | + 161 net_interakt | |
199 | + 227 net_nieinterakt | |
200 | + 20 nklas | |
201 | + 1986 publ | |
202 | + 8 qmow | |
203 | + 387 urzed | |
204 | + | |
205 | +frekwencje kanałów | |
206 | + 388 internet | |
207 | + 817 ksiazka | |
208 | + 363 mowiony | |
209 | + 146 prasa | |
210 | + 1744 prasa_dziennik | |
211 | + 398 prasa_inne | |
212 | + 5 prasa_miesiecznik | |
213 | + 28 prasa_tygodnik | |
214 | + | |
215 | +frekwencje łączne typów-kanałów | |
216 | + 127 fakt ksiazka | |
217 | + 56 inf-por ksiazka | |
218 | + 283 konwers mowiony | |
219 | + 2 listy ksiazka | |
220 | + 376 lit ksiazka | |
221 | + 1 lit_poezja ksiazka | |
222 | + 80 media mowiony | |
223 | + 175 nd ksiazka | |
224 | + 161 net_interakt internet | |
225 | + 227 net_nieinterakt internet | |
226 | + 20 nklas ksiazka | |
227 | + 60 publ ksiazka | |
228 | + 146 publ prasa | |
229 | + 1744 publ prasa_dziennik | |
230 | + 3 publ prasa_inne | |
231 | + 5 publ prasa_miesiecznik | |
232 | + 28 publ prasa_tygodnik | |
233 | + 8 qmow prasa_inne | |
234 | + 387 urzed prasa_inne | |
235 | + | |
236 | + *) | |
237 | +(* | |
147 | 238 | |
148 | 239 | type id = {hash: bool; suffix: string; numbers: int list} |
149 | 240 | |
... | ... |