Commit 2efead34b139c9989719482d8d6e6d0c3db33b61

Authored by Daniel Oklesiński
1 parent 9f53f85c

rozwinięcie drzew zależnościowych

Too many changes to show.

To preserve performance only 1 of 11 files is displayed.

corpora/CONLL.ml
@@ -132,6 +132,71 @@ let match_corpus corpus = @@ -132,6 +132,71 @@ let match_corpus corpus =
132 132
133 (******************) 133 (******************)
134 134
  135 +type to_text = { t_orth: string; t_cat: string; t_interp: string list list list } (* Per-token view used by get_text: the orth form of a token plus the category and interpretation taken from its Lemma, when it has one. *)
  136 +
  137 +let empty_to_text = { t_orth = ""; t_cat = ""; t_interp = [[[]]] } (* Placeholder record with empty orth and category; get_text uses it as padding in front of the first token and as the [next] neighbour of the last one. *)
  138 +
  139 +let get_text tokens = (* Rebuilds a single text string from the orth forms of the entries of [tokens], deciding for each token whether a space is put in front of it. *)
  140 + let get i = (* Projects entry [i] of [tokens] onto a to_text record. *)
  141 + let cat = match (ExtArray.get tokens i).token with
  142 + Lemma(_,cat,_) -> cat
  143 + | _ -> "" in (* any token that is not a Lemma gets an empty category *)
  144 + let interp = match (ExtArray.get tokens i).token with
  145 + Lemma(_,_,i) -> i
  146 + | _ -> [[[]]] in (* same placeholder interpretation as in empty_to_text *)
  147 + { t_orth = (ExtArray.get tokens i).orth; t_cat = cat; t_interp = interp } in
  148 +
  149 + let n_tokens = Int.fold_down (ExtArray.size tokens - 1) 0 [] (* one record per index between size-1 and 0; presumably the consing below leaves them in ascending index order - TODO confirm against Int.fold_down *)
  150 + (fun acc i -> (get i)::acc)in
  151 +
  152 + let quote_open = ref false in (* true while a straight double quote has been opened and not yet closed *)
  153 + let hyphenated = ref false in (* set when a joining hyphen was emitted, so that the following token is attached without a space *)
  154 +
  155 + let maybe_add_space pre_previous previous token next = (* Returns the orth form of [token], prefixed with a space unless one of the rules below suppresses it; updates quote_open and hyphenated as a side effect. *)
  156 + if previous.t_orth = "" && token.t_orth = "\""
  157 + then quote_open := true; (* a straight quote right after an empty orth form is treated as an opening quote *)
  158 + if token.t_cat = "aglt" || (* first group of rules: any of these conditions emits the token with no leading space *)
  159 + (token.t_orth = "by" && previous.t_cat = "praet") ||
  160 + (previous.t_orth = "\"" && !quote_open) || (* directly after a straight quote while one is open *)
  161 + previous.t_orth = "(" ||
  162 + previous.t_orth = "„" ||
  163 + previous.t_orth = "" || (* covers the first real token, whose predecessor is the padding record *)
  164 + token.t_orth = "ń" || (* the expression nań *)
  165 + (token.t_orth = "że" && (previous.t_orth = "czym" || previous.t_orth = "Czym")) || (* the expression czymże *)
  166 +(* (token.orth = "r" && token.cat = "brev") || (* abbreviation r. - e.g. 1991r. *) *)
  167 + (pre_previous.t_cat = "adj" && previous.t_orth = "." &&
  168 + token.t_cat = "num" && token.t_interp = [[["pl"];["nom"];["f"];["rec"]]]) (* clock time - e.g. 13.15 *)
  169 + then token.t_orth
  170 + else if !hyphenated
  171 + then (hyphenated := false; token.t_orth) (* token right after a joining hyphen: clear the flag and attach without a space *)
  172 + else match token.t_orth with (* the punctuation marks listed below are emitted without a leading space *)
  173 + "." -> "."
  174 + | "…" -> "…"
  175 + | "?" -> "?"
  176 + | "!" -> "!"
  177 + | "," -> ","
  178 + | ":" -> ":"
  179 + | ";" -> ";"
  180 + | ")" -> ")"
  181 + | "”" -> "”"
  182 + | "-" -> if previous.t_cat = "adja" || (* hyphen joins without spaces after an adja token, or between two subst tokens with equal interpretations *)
  183 + (previous.t_cat = "subst" && next.t_cat = "subst" && previous.t_interp = next.t_interp)
  184 + then (hyphenated := true; "-")
  185 + else " -" (* otherwise the hyphen gets a leading space and the flag stays unset *)
  186 + | "\"" -> if !quote_open (* straight quote: closes without a space when one is open, otherwise opens with a leading space *)
  187 + then (quote_open := false; "\"")
  188 + else (quote_open := true; " \"")
  189 + | s -> " "^s in (* default: separate the token from the preceding text with one space *)
  190 +
  191 + let rec fold4 acc = function (* Slides a window of four consecutive records over the list; the third record of each window is the current token and its text is appended to acc. *)
  192 + a::b::c::d::t -> fold4 (acc^maybe_add_space a b c d) (b::c::d::t)
  193 + | a::b::c::[] -> fold4 (acc^maybe_add_space a b c empty_to_text) (b::c::[]) (* last token: empty_to_text stands in for the missing next record *)
  194 + | a::b::[] -> acc (* only the two look-behind records remain: done *)
  195 + | _ -> failwith ("get_sentence") in (* lists shorter than two records; the call below always supplies at least the two padding records *)
  196 + fold4 "" (empty_to_text::empty_to_text::n_tokens) (* two empty records pad the front so that the first token has a previous and a pre_previous *)
  197 +
  198 +(******************)
  199 +
135 let establish_next tokens paths = 200 let establish_next tokens paths =
136 let n = ExtArray.size tokens in 201 let n = ExtArray.size tokens in
137 Int.iter 1 (n - 2) (fun i -> 202 Int.iter 1 (n - 2) (fun i ->
@@ -156,12 +221,16 @@ let rec establish_for_token i text tokens = function @@ -156,12 +221,16 @@ let rec establish_for_token i text tokens = function
156 if Xstring.check_prefix " " text 221 if Xstring.check_prefix " " text
157 then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l 222 then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
158 else if Xstring.check_prefix h.orth text 223 else if Xstring.check_prefix h.orth text
  224 + (* || (h.orth = "m.in." && Xstring.check_prefix "m.in" text) *)
159 then 225 then
160 let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in 226 let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
161 let n_h = {h with beg = i ; len = n} in 227 let n_h = {h with beg = i ; len = n} in
162 ExtArray.set tokens id n_h; 228 ExtArray.set tokens id n_h;
  229 + (* if Xstring.check_prefix h.orth text then *)
163 establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t 230 establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
164 - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text) 231 + (* else establish_for_token (i+n) (Xstring.cut_prefix "m.in" text) tokens t *)
  232 + else (prerr_endline ("establish_for_token :" ^ h.orth ^ " " ^ text);
  233 + failwith ("establish_for_token :" ^ h.orth ^ " " ^ text))
165 | [] -> 100, i 234 | [] -> 100, i
166 235
167 let rec establish_lengths text paths tokens = 236 let rec establish_lengths text paths tokens =
@@ -196,16 +265,24 @@ let match_sentence (p_record,tokens) = @@ -196,16 +265,24 @@ let match_sentence (p_record,tokens) =
196 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) 265 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
197 else failwith ("match_sentence: no CONLL mode in AltSentence")*) in 266 else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
198 let info_token, paths = info_token p_record.psentence in 267 let info_token, paths = info_token p_record.psentence in
199 - try  
200 - let id, text = StringMap.find info_map info_token in 268 + (* try *)
  269 + let id, text = try
  270 + StringMap.find info_map info_token
  271 + with
  272 + | _ -> p_record.pid, get_text tokens in
201 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in 273 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
202 AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix=""; 274 AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
203 psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)] 275 psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
204 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) 276 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
205 - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] 277 + (* with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] *)
206 278
207 let match_corpus corpus = 279 let match_corpus corpus = (* Applies match_sentence to every element of corpus and collects the results; in the added version, elements for which it raises are left out of the result. *)
208 - Xlist.map corpus match_sentence 280 + let rec pom f = function (* pom maps f over a list, skipping the elements on which f raises *)
  281 + [] -> []
  282 + | a::l -> try
  283 + let r = f a in r :: pom f l (* NOTE(review): the recursive call sits inside the try, so it is not a tail call and an exception escaping from the rest of the list would be caught by this handler as well *)
  284 + with e -> (*print_endline (Printexc.to_string e);*) pom f l in (* NOTE(review): every exception is discarded without any report, so a failing sentence silently disappears from the output *)
  285 + pom match_sentence corpus
209 286
210 (******************) 287 (******************)
211 288
@@ -232,8 +309,11 @@ let load_token in_channel = @@ -232,8 +309,11 @@ let load_token in_channel =
232 else if line.[0] = '#' 309 else if line.[0] = '#'
233 then 310 then
234 if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line 311 if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
235 - then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in  
236 - raise (Id_line id) 312 + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
  313 + raise (Id_line id)
  314 + else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line
  315 + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in
  316 + raise (Id_line id)
237 else failwith ("load_token: " ^ line) 317 else failwith ("load_token: " ^ line)
238 else 318 else
239 match Xstring.split "\t" line with 319 match Xstring.split "\t" line with
@@ -283,5 +363,6 @@ let load_corpus in_channel = @@ -283,5 +363,6 @@ let load_corpus in_channel =
283 try 363 try
284 let conll_sentence, tokens = load_sentence in_channel in 364 let conll_sentence, tokens = load_sentence in_channel in
285 pom ((conll_sentence, tokens) :: res) 365 pom ((conll_sentence, tokens) :: res)
286 - with End_of_file -> res in 366 + with End_of_file -> res
  367 + | e -> prerr_endline (Printexc.to_string e); res in
287 (* match_corpus @@ *) List.rev @@ pom [] 368 (* match_corpus @@ *) List.rev @@ pom []