Commit 2efead34b139c9989719482d8d6e6d0c3db33b61
1 parent
9f53f85c
rozwinięcie drzew zależnościowych
Showing
11 changed files
with
485 additions
and
32959 deletions
Too many changes to show.
To preserve performance only 1 of 11 files are displayed.
corpora/CONLL.ml
... | ... | @@ -132,6 +132,71 @@ let match_corpus corpus = |
132 | 132 | |
133 | 133 | (******************) |
134 | 134 | |
(** Flattened view of one token used while rebuilding the sentence text:
    [t_orth] is the orthographic form; [t_cat] and [t_interp] are the second
    and third components of a [Lemma] token, or empty placeholders for any
    other kind of token. *)
type to_text = { t_orth: string; t_cat: string; t_interp: string list list list }

(** Placeholder entry: empty form, empty category, [[[[]]]] interpretation.
    Used to pad the token window at both ends of the sentence. *)
let empty_to_text = { t_orth = ""; t_cat = ""; t_interp = [[[]]] }

(** [get_text tokens] rebuilds a plain-text sentence from the token array
    [tokens] by concatenating the orthographic forms and deciding, token by
    token, whether a separating space is needed.

    The decision looks at a window of four entries (two preceding tokens, the
    current token, the following token) and at two pieces of running state:
    whether a straight double quote is currently open and whether the previous
    token was a joining hyphen.

    @return the reconstructed text; the empty string when there are no tokens.
    @raise Failure only from the defensive fallback case of the window walk,
    which cannot be reached because the walked list always starts with two
    padding entries. *)
let get_text tokens =
  (* Read slot [i] once and project it onto [to_text]. *)
  let get i =
    let tok = ExtArray.get tokens i in
    let cat, interp = match tok.token with
        Lemma(_,cat,interp) -> cat, interp
      | _ -> "", [[[]]] in
    { t_orth = tok.orth; t_cat = cat; t_interp = interp } in

  (* All slots of [tokens] collected into a list.
     NOTE(review): presumably in ascending index order - confirm against
     Int.fold_down. *)
  let n_tokens = Int.fold_down (ExtArray.size tokens - 1) 0 []
      (fun acc i -> (get i) :: acc) in

  (* Running state shared by successive calls of [maybe_add_space]. *)
  let quote_open = ref false in
  let hyphenated = ref false in

  (* Returns the text to append for [token]: its form, preceded by a space
     unless one of the attachment rules below applies. *)
  let maybe_add_space pre_previous previous token next =
    (* A straight quote directly after an empty form opens a quotation. *)
    if previous.t_orth = "" && token.t_orth = "\""
    then quote_open := true;
    (* NOTE(review): a straight quote emitted through this no-space branch for
       another reason (for instance right after an opening parenthesis) does
       not toggle [quote_open] - confirm whether that is intended. *)
    if token.t_cat = "aglt" ||
       (token.t_orth = "by" && previous.t_cat = "praet") ||
       (previous.t_orth = "\"" && !quote_open) ||
       previous.t_orth = "(" ||
       previous.t_orth = "„" ||
       previous.t_orth = "" ||
       token.t_orth = "ń" || (* the expression nań *)
       (token.t_orth = "że" && (previous.t_orth = "czym" || previous.t_orth = "Czym")) || (* the expression czymże *)
(*     (token.orth = "r" && token.cat = "brev") || (* abbreviation r. - e.g. 1991r. *) *)
       (pre_previous.t_cat = "adj" && previous.t_orth = "." &&
        token.t_cat = "num" && token.t_interp = [[["pl"];["nom"];["f"];["rec"]]]) (* clock time - e.g. 13.15 *)
    then token.t_orth
    else if !hyphenated
    (* The token right after a joining hyphen is glued to it. *)
    then (hyphenated := false; token.t_orth)
    else match token.t_orth with
        "." -> "."
      | "…" -> "…"
      | "?" -> "?"
      | "!" -> "!"
      | "," -> ","
      | ":" -> ":"
      | ";" -> ";"
      | ")" -> ")"
      | "”" -> "”"
      | "-" ->
          (* Joining hyphen: after an [adja] token, or between two [subst]
             tokens with equal interpretations; otherwise a spaced dash. *)
          if previous.t_cat = "adja" ||
             (previous.t_cat = "subst" && next.t_cat = "subst" && previous.t_interp = next.t_interp)
          then (hyphenated := true; "-")
          else " -"
      | "\"" ->
          (* Closing quote attaches to the left, opening quote gets a space. *)
          if !quote_open
          then (quote_open := false; "\"")
          else (quote_open := true; " \"")
      | s -> " " ^ s in

  (* Pieces are accumulated in a buffer instead of repeated [^], which would
     copy the whole prefix on every step. *)
  let buf = Buffer.create 256 in
  let rec fold4 = function
      a :: (b :: c :: d :: _ as rest) ->
        (Buffer.add_string buf (maybe_add_space a b c d);
         fold4 rest)
    | a :: (b :: c :: [] as rest) ->
        (* Last token: there is no successor, so pad with the placeholder. *)
        (Buffer.add_string buf (maybe_add_space a b c empty_to_text);
         fold4 rest)
    | _ :: _ :: [] -> ()
    | _ -> failwith "get_text" in
  fold4 (empty_to_text :: empty_to_text :: n_tokens);
  Buffer.contents buf
197 | + | |
198 | +(******************) | |
199 | + | |
135 | 200 | let establish_next tokens paths = |
136 | 201 | let n = ExtArray.size tokens in |
137 | 202 | Int.iter 1 (n - 2) (fun i -> |
... | ... | @@ -156,12 +221,16 @@ let rec establish_for_token i text tokens = function |
156 | 221 | if Xstring.check_prefix " " text |
157 | 222 | then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l |
158 | 223 | else if Xstring.check_prefix h.orth text |
224 | + (* || (h.orth = "m.in." && Xstring.check_prefix "m.in" text) *) | |
159 | 225 | then |
160 | 226 | let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in |
161 | 227 | let n_h = {h with beg = i ; len = n} in |
162 | 228 | ExtArray.set tokens id n_h; |
229 | + (* if Xstring.check_prefix h.orth text then *) | |
163 | 230 | establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t |
164 | - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text) | |
231 | + (* else establish_for_token (i+n) (Xstring.cut_prefix "m.in" text) tokens t *) | |
232 | + else (prerr_endline ("establish_for_token :" ^ h.orth ^ " " ^ text); | |
233 | + failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)) | |
165 | 234 | | [] -> 100, i |
166 | 235 | |
167 | 236 | let rec establish_lengths text paths tokens = |
... | ... | @@ -196,16 +265,24 @@ let match_sentence (p_record,tokens) = |
196 | 265 | then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) |
197 | 266 | else failwith ("match_sentence: no CONLL mode in AltSentence")*) in |
198 | 267 | let info_token, paths = info_token p_record.psentence in |
199 | - try | |
200 | - let id, text = StringMap.find info_map info_token in | |
268 | + (* try *) | |
269 | + let id, text = try | |
270 | + StringMap.find info_map info_token | |
271 | + with | |
272 | + | _ -> p_record.pid, get_text tokens in | |
201 | 273 | let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in |
202 | 274 | AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; |
203 | 275 | psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] |
204 | 276 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) |
205 | - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] | |
277 | + (* with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] *) | |
206 | 278 | |
(** [match_corpus corpus] applies [match_sentence] to every element of
    [corpus], in order, and returns the results of the calls that succeeded,
    in the same order. An element for which [match_sentence] raises any
    exception is dropped from the result without any report (deliberate
    best-effort behaviour). *)
let match_corpus corpus =
  (* Only the [match_sentence] call itself is guarded: an exception is handled
     once, for the sentence that raised it, and never causes the rest of the
     corpus to be processed again. The fold keeps the traversal
     tail-recursive. *)
  let keep_matched acc sentence =
    match (try Some (match_sentence sentence) with _ -> None) with
      Some matched -> matched :: acc
    | None -> acc in
  List.rev (List.fold_left keep_matched [] corpus)
209 | 286 | |
210 | 287 | (******************) |
211 | 288 | |
... | ... | @@ -232,8 +309,11 @@ let load_token in_channel = |
232 | 309 | else if line.[0] = '#' |
233 | 310 | then |
234 | 311 | if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line |
235 | - then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in | |
236 | - raise (Id_line id) | |
312 | + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in | |
313 | + raise (Id_line id) | |
314 | + else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line | |
315 | + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in | |
316 | + raise (Id_line id) | |
237 | 317 | else failwith ("load_token: " ^ line) |
238 | 318 | else |
239 | 319 | match Xstring.split "\t" line with |
... | ... | @@ -283,5 +363,6 @@ let load_corpus in_channel = |
283 | 363 | try |
284 | 364 | let conll_sentence, tokens = load_sentence in_channel in |
285 | 365 | pom ((conll_sentence, tokens) :: res) |
286 | - with End_of_file -> res in | |
366 | + with End_of_file -> res | |
367 | + | e -> prerr_endline (Printexc.to_string e); res in | |
287 | 368 | (* match_corpus @@ *) List.rev @@ pom [] |
... | ... |