Commit caeb305ab86cefe8f85c04835fae2a47f2715eae
1 parent
584f9657
łączenie na poziomie haseł
Showing
1 changed file
with
106 additions
and
15 deletions
NKJP2/ENIAM_NKJP.ml
@@ -19,6 +19,34 @@ | @@ -19,6 +19,34 @@ | ||
19 | 19 | ||
20 | open Xstd | 20 | open Xstd |
21 | 21 | ||
22 | +type id = {corref: string; prefix: string; suffix: string; numbers: int list} | ||
23 | + | ||
24 | +let empty_id = {corref = ""; prefix = ""; suffix = ""; numbers = []} | ||
25 | + | ||
26 | +let parse_id id = | ||
27 | + (* if String.length s = 0 then empty_id else *) | ||
28 | + if String.length id < 6 then failwith "parse_id: za krótkie id" else | ||
29 | + let corref,id = match Xstring.split "#" id with | ||
30 | + [corref;id] -> corref,id | ||
31 | + | [id] -> "",id | ||
32 | + | _ -> failwith ("parse_id 1: " ^ id) in | ||
33 | + let prefix,id = match Xstring.split "_" id with | ||
34 | + [prefix;id] -> prefix,id | ||
35 | + | _ -> failwith ("parse_id 2: " ^ id) in | ||
36 | + let suffix,id = match Xstring.split "-" id with | ||
37 | + [id;suffix] -> suffix,id | ||
38 | + | _ -> failwith ("parse_id 3: " ^ id) in | ||
39 | + let numbers = try Xlist.map (Xstring.split "\\." id) int_of_string with _ -> failwith ("parse_id 4: " ^ id) in | ||
40 | + {corref=corref; prefix=prefix; suffix=suffix; numbers=numbers} | ||
41 | + | ||
42 | +let process_header_type typ = | ||
43 | + if Xstring.check_prefix "#typ_" typ then Xstring.cut_prefix "#typ_" typ | ||
44 | + else failwith ("process_header_type: " ^ typ) | ||
45 | + | ||
46 | +let process_header_channel c = | ||
47 | + if Xstring.check_prefix "#kanal_" c then Xstring.cut_prefix "#kanal_" c | ||
48 | + else failwith ("process_header_channel: " ^ c) | ||
49 | + | ||
22 | let load_header path name = | 50 | let load_header path name = |
23 | match Xml.parse_file (path ^ name ^ "/header.xml") with | 51 | match Xml.parse_file (path ^ name ^ "/header.xml") with |
24 | Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_); | 52 | Xml.Element("teiHeader",_,[Xml.Element("fileDesc",[],_); |
@@ -26,7 +54,7 @@ let load_header path name = | @@ -26,7 +54,7 @@ let load_header path name = | ||
26 | Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]); | 54 | Xml.Element("catRef",["scheme","#taxonomy-NKJP-type";"target",typ],[]); |
27 | Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]); | 55 | Xml.Element("catRef",["scheme","#taxonomy-NKJP-channel";"target",channel],[])])]); |
28 | Xml.Element("revisionDesc",_,_)]) -> | 56 | Xml.Element("revisionDesc",_,_)]) -> |
29 | - typ,channel | 57 | + process_header_type typ,process_header_channel channel |
30 | | _ -> failwith "load_header" | 58 | | _ -> failwith "load_header" |
31 | 59 | ||
32 | let get_folders path = | 60 | let get_folders path = |
@@ -35,12 +63,12 @@ let get_folders path = | @@ -35,12 +63,12 @@ let get_folders path = | ||
35 | 63 | ||
36 | let load_paragraph = function | 64 | let load_paragraph = function |
37 | Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) -> | 65 | Xml.Element("ab",["n",_;"xml:id",id_ab],[Xml.PCData paragraph]) -> |
38 | - id_ab,paragraph | 66 | + parse_id id_ab,paragraph |
39 | | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml) | 67 | | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml) |
40 | 68 | ||
41 | let load_text_entry = function | 69 | let load_text_entry = function |
42 | Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) -> | 70 | Xml.Element("div",["xml:id",id_div;"decls",_],paragraphs) -> |
43 | - id_div,List.rev (Xlist.rev_map paragraphs load_paragraph) | 71 | + parse_id id_div,List.rev (Xlist.rev_map paragraphs load_paragraph) |
44 | | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml) | 72 | | xml -> failwith ("load_text_entry: " ^ Xml.to_string_fmt xml) |
45 | 73 | ||
46 | let load_text path name = | 74 | let load_text path name = |
@@ -64,9 +92,9 @@ let remove_rejected rev = function | @@ -64,9 +92,9 @@ let remove_rejected rev = function | ||
64 | 92 | ||
65 | let rec load_segm_token = function | 93 | let rec load_segm_token = function |
66 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) -> | 94 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[]) -> |
67 | - [corresp,false,id_seg] | 95 | + [corresp,false,parse_id id_seg] |
68 | | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) -> | 96 | | Xml.Element("seg",["corresp",corresp;"nkjp:nps","true";"xml:id",id_seg],[]) -> |
69 | - [corresp,true,id_seg] | 97 | + [corresp,true,parse_id id_seg] |
70 | | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token) | 98 | | Xml.Element("nkjp:paren",[],tokens) -> List.flatten (Xlist.map tokens load_segm_token) |
71 | | Xml.Element("choice",[],alt) as xml -> | 99 | | Xml.Element("choice",[],alt) as xml -> |
72 | let alt = Xlist.fold alt [] remove_rejected in | 100 | let alt = Xlist.fold alt [] remove_rejected in |
@@ -77,12 +105,12 @@ let rec load_segm_token = function | @@ -77,12 +105,12 @@ let rec load_segm_token = function | ||
77 | 105 | ||
78 | let load_segm_sentence = function | 106 | let load_segm_sentence = function |
79 | Xml.Element("s",["xml:id",id_s],tokens) -> | 107 | Xml.Element("s",["xml:id",id_s],tokens) -> |
80 | - id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token)) | 108 | + parse_id id_s,List.flatten (List.rev (Xlist.rev_map tokens load_segm_token)) |
81 | | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml) | 109 | | xml -> failwith ("load_segm_sentence: " ^ Xml.to_string_fmt xml) |
82 | 110 | ||
83 | let load_segm_entry = function | 111 | let load_segm_entry = function |
84 | Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) -> | 112 | Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) -> |
85 | - corresp,id_p,List.rev (Xlist.rev_map sentences load_segm_sentence) | 113 | + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_segm_sentence) |
86 | | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml) | 114 | | xml -> failwith ("load_segm_entry: " ^ Xml.to_string_fmt xml) |
87 | 115 | ||
88 | let load_segmentation path name = | 116 | let load_segmentation path name = |
@@ -104,22 +132,22 @@ let load_morph_token = function | @@ -104,22 +132,22 @@ let load_morph_token = function | ||
104 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], | 132 | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], |
105 | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); | 133 | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); |
106 | Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> | 134 | Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> |
107 | - corresp,id_seg,orth,load_disamb disamb | 135 | + parse_id corresp,parse_id id_seg,orth,load_disamb disamb |
108 | | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], | 136 | | Xml.Element("seg",["corresp",corresp;"xml:id",id_seg],[Xml.Element("fs",["type","morph"], |
109 | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); | 137 | [Xml.Element("f",["name","orth"],[Xml.Element("string",[],[Xml.PCData orth])]); |
110 | Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]); | 138 | Xml.Element("f",["name","nps"],[Xml.Element("binary",["value","true"],[])]); |
111 | Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> | 139 | Xml.Element("f",["name","interps"],_);Xml.Element("f",["name","disamb"],[disamb])])]) -> |
112 | - corresp,id_seg,orth,load_disamb disamb | 140 | + parse_id corresp,parse_id id_seg,orth,load_disamb disamb |
113 | | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml) | 141 | | xml -> failwith ("load_morph_token: " ^ Xml.to_string_fmt xml) |
114 | 142 | ||
115 | let load_morph_sentence = function | 143 | let load_morph_sentence = function |
116 | Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) -> | 144 | Xml.Element("s",["corresp",corresp;"xml:id",id_s],tokens) -> |
117 | - corresp,id_s,List.rev (Xlist.rev_map tokens load_morph_token) | 145 | + parse_id corresp,parse_id id_s,List.rev (Xlist.rev_map tokens load_morph_token) |
118 | | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml) | 146 | | xml -> failwith ("load_morph_sentence: " ^ Xml.to_string_fmt xml) |
119 | 147 | ||
120 | let load_morph_entry = function | 148 | let load_morph_entry = function |
121 | Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) -> | 149 | Xml.Element("p",["corresp",corresp;"xml:id",id_p],sentences) -> |
122 | - corresp,id_p,List.rev (Xlist.rev_map sentences load_morph_sentence) | 150 | + parse_id corresp,parse_id id_p,List.rev (Xlist.rev_map sentences load_morph_sentence) |
123 | | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml) | 151 | | xml -> failwith ("load_morph_entry: " ^ Xml.to_string_fmt xml) |
124 | 152 | ||
125 | let load_morphosyntax path name = | 153 | let load_morphosyntax path name = |
@@ -130,6 +158,16 @@ let load_morphosyntax path name = | @@ -130,6 +158,16 @@ let load_morphosyntax path name = | ||
130 | List.rev (Xlist.rev_map entries load_morph_entry) | 158 | List.rev (Xlist.rev_map entries load_morph_entry) |
131 | | _ -> failwith "load_morphosyntax" | 159 | | _ -> failwith "load_morphosyntax" |
132 | 160 | ||
161 | +let rec merge_entries rev = function | ||
162 | + ({corref=""; prefix="txt"; numbers=[id_div]; suffix="div"},paragraphs) :: text, | ||
163 | + ({corref="text.xml"; prefix="txt"; numbers=[c_div]; suffix="div"}, | ||
164 | + {corref=""; prefix="segm"; numbers=[id_segm_p]; suffix="p"},segm_sentences) :: segmentation, | ||
165 | + ({corref="ann_segmentation.xml"; prefix="segm"; numbers=[c_segm_p]; suffix="p"}, | ||
166 | + {corref=""; prefix="morph"; numbers=[id_morph_p]; suffix="p"},morph_sentences) :: morphosyntax -> | ||
167 | + if id_div <> c_div || c_div <> id_segm_p || id_segm_p <> c_segm_p || c_segm_p <> id_morph_p then failwith "merge_entries 2" else | ||
168 | + merge_entries ((id_div,paragraphs,segm_sentences,morph_sentences) :: rev) (text,segmentation,morphosyntax) | ||
169 | + | [],[],[] -> List.rev rev | ||
170 | + | _ -> failwith "merge_entries" | ||
133 | 171 | ||
134 | let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" | 172 | let nkjp_path = "../../NLP resources/NKJP-PodkorpusMilionowy-1.2/" |
135 | 173 | ||
@@ -137,13 +175,66 @@ let _ = | @@ -137,13 +175,66 @@ let _ = | ||
137 | let names = get_folders nkjp_path in | 175 | let names = get_folders nkjp_path in |
138 | Xlist.iter names (fun name -> | 176 | Xlist.iter names (fun name -> |
139 | print_endline name; | 177 | print_endline name; |
140 | - let header = load_text nkjp_path name in | ||
141 | - (* let text = load_text nkjp_path name in *) | ||
142 | - (* let segmentation = load_segmentation nkjp_path name in *) | ||
143 | - (* let morphosyntax = load_morphosyntax nkjp_path name in *) | 178 | + let typ,channel = load_header nkjp_path name in |
179 | + (* print_endline typ; *) | ||
180 | + (* print_endline channel; *) | ||
181 | + (* print_endline (typ ^ "\t" ^ channel); *) | ||
182 | + let text = load_text nkjp_path name in | ||
183 | + let segmentation = load_segmentation nkjp_path name in | ||
184 | + let morphosyntax = load_morphosyntax nkjp_path name in | ||
185 | + let entries = merge_entries [] (text,segmentation,morphosyntax) in | ||
144 | ()) | 186 | ()) |
145 | 187 | ||
146 | (* | 188 | (* |
189 | +frekwencje typów: | ||
190 | + 127 fakt | ||
191 | + 56 inf-por | ||
192 | + 283 konwers | ||
193 | + 2 listy | ||
194 | + 376 lit | ||
195 | + 1 lit_poezja | ||
196 | + 80 media | ||
197 | + 175 nd | ||
198 | + 161 net_interakt | ||
199 | + 227 net_nieinterakt | ||
200 | + 20 nklas | ||
201 | + 1986 publ | ||
202 | + 8 qmow | ||
203 | + 387 urzed | ||
204 | + | ||
205 | +frekwencje kanałów | ||
206 | + 388 internet | ||
207 | + 817 ksiazka | ||
208 | + 363 mowiony | ||
209 | + 146 prasa | ||
210 | + 1744 prasa_dziennik | ||
211 | + 398 prasa_inne | ||
212 | + 5 prasa_miesiecznik | ||
213 | + 28 prasa_tygodnik | ||
214 | + | ||
215 | +frekwencje łączne typów-kanałów | ||
216 | + 127 fakt ksiazka | ||
217 | + 56 inf-por ksiazka | ||
218 | + 283 konwers mowiony | ||
219 | + 2 listy ksiazka | ||
220 | + 376 lit ksiazka | ||
221 | + 1 lit_poezja ksiazka | ||
222 | + 80 media mowiony | ||
223 | + 175 nd ksiazka | ||
224 | + 161 net_interakt internet | ||
225 | + 227 net_nieinterakt internet | ||
226 | + 20 nklas ksiazka | ||
227 | + 60 publ ksiazka | ||
228 | + 146 publ prasa | ||
229 | + 1744 publ prasa_dziennik | ||
230 | + 3 publ prasa_inne | ||
231 | + 5 publ prasa_miesiecznik | ||
232 | + 28 publ prasa_tygodnik | ||
233 | + 8 qmow prasa_inne | ||
234 | + 387 urzed prasa_inne | ||
235 | + | ||
236 | + *) | ||
237 | +(* | ||
147 | 238 | ||
148 | type id = {hash: bool; suffix: string; numbers: int list} | 239 | type id = {hash: bool; suffix: string; numbers: int list} |
149 | 240 |