Commit 2efead34b139c9989719482d8d6e6d0c3db33b61

Authored by Daniel Oklesiński
1 parent 9f53f85c

rozwinięcie drzew zależnościowych

Too many changes to show.

To preserve performance, only 1 of 11 files is displayed.

corpora/CONLL.ml
... ... @@ -132,6 +132,71 @@ let match_corpus corpus =
132 132  
133 133 (******************)
134 134  
  135 +type to_text = { t_orth: string; t_cat: string; t_interp: string list list list } (* per-token summary used by get_text: the orth string plus the category and interpretation taken from a Lemma token *)
  136 +
  137 +let empty_to_text = { t_orth = ""; t_cat = ""; t_interp = [[[]]] } (* neutral record; get_text uses it as filler before the first token and after the last one *)
  138 +
  139 +let get_text tokens = (* Rebuilds a plain-text sentence from the orth fields of tokens, deciding heuristically where a space goes before each token. *)
  140 + let get i = (* summarises token i as a to_text record *)
  141 + let cat = match (ExtArray.get tokens i).token with
  142 + Lemma(_,cat,_) -> cat
  143 + | _ -> "" in (* tokens that are not Lemma get an empty category *)
  144 + let interp = match (ExtArray.get tokens i).token with
  145 + Lemma(_,_,i) -> i
  146 + | _ -> [[[]]] in (* same placeholder value as in empty_to_text *)
  147 + { t_orth = (ExtArray.get tokens i).orth; t_cat = cat; t_interp = interp } in
  148 +
  149 + let n_tokens = Int.fold_down (ExtArray.size tokens - 1) 0 [] (* one record per token; presumably listed in ascending index order - TODO confirm Int.fold_down semantics *)
  150 + (fun acc i -> (get i)::acc)in
  151 +
  152 + let quote_open = ref false in (* true while an ASCII double quote has been opened and not yet closed *)
  153 + let hyphenated = ref false in (* set after a joining hyphen so that the following token is glued on without a space *)
  154 +
  155 + let maybe_add_space pre_previous previous token next = (* returns token.t_orth, with a leading space unless one of the rules below suppresses it *)
  156 + if previous.t_orth = "" && token.t_orth = "\""
  157 + then quote_open := true; (* a double quote that follows an empty orth is treated as an opening quote *)
  158 + if token.t_cat = "aglt" ||
  159 + (token.t_orth = "by" && previous.t_cat = "praet") ||
  160 + (previous.t_orth = "\"" && !quote_open) ||
  161 + previous.t_orth = "(" ||
  162 + previous.t_orth = "„" ||
  163 + previous.t_orth = "" ||
  164 + token.t_orth = "ń" || (* the contraction nań *)
  165 + (token.t_orth = "że" && (previous.t_orth = "czym" || previous.t_orth = "Czym")) || (* the expression czymże *)
  166 +(* (token.orth = "r" && token.cat = "brev") || (*abbreviation r. - e.g. 1991r. *) *)
  167 + (pre_previous.t_cat = "adj" && previous.t_orth = "." &&
  168 + token.t_cat = "num" && token.t_interp = [[["pl"];["nom"];["f"];["rec"]]]) (* clock time - e.g. 13.15 *)
  169 + then token.t_orth
  170 + else if !hyphenated
  171 + then (hyphenated := false; token.t_orth)
  172 + else match token.t_orth with (* the punctuation marks listed here are emitted without a leading space *)
  173 + "." -> "."
  174 + | "…" -> "…"
  175 + | "?" -> "?"
  176 + | "!" -> "!"
  177 + | "," -> ","
  178 + | ":" -> ":"
  179 + | ";" -> ";"
  180 + | ")" -> ")"
  181 + | "”" -> "”"
  182 + | "-" -> if previous.t_cat = "adja" || (* joining hyphen: after an adja token, or between two subst tokens with equal interpretations *)
  183 + (previous.t_cat = "subst" && next.t_cat = "subst" && previous.t_interp = next.t_interp)
  184 + then (hyphenated := true; "-")
  185 + else " -"
  186 + | "\"" -> if !quote_open (* a closing quote gets no space before it; an opening quote gets one *)
  187 + then (quote_open := false; "\"")
  188 + else (quote_open := true; " \"")
  189 + | s -> " "^s in
  190 +
  191 + let rec fold4 acc = function (* slides a four-element window over the list; the third element of each window is the token being emitted *)
  192 + a::b::c::d::t -> fold4 (acc^maybe_add_space a b c d) (b::c::d::t)
  193 + | a::b::c::[] -> fold4 (acc^maybe_add_space a b c empty_to_text) (b::c::[]) (* last token: empty_to_text stands in for the missing next token *)
  194 + | a::b::[] -> acc
  195 + | _ -> failwith ("get_sentence") in (* needs fewer than two elements; the call below always prepends two *)
  196 + fold4 "" (empty_to_text::empty_to_text::n_tokens) (* two filler records give the first token a previous and a pre_previous *)
  197 +
  198 +(******************)
  199 +
135 200 let establish_next tokens paths =
136 201 let n = ExtArray.size tokens in
137 202 Int.iter 1 (n - 2) (fun i ->
... ... @@ -156,12 +221,16 @@ let rec establish_for_token i text tokens = function
156 221 if Xstring.check_prefix " " text
157 222 then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
158 223 else if Xstring.check_prefix h.orth text
  224 + (* || (h.orth = "m.in." && Xstring.check_prefix "m.in" text) *)
159 225 then
160 226 let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
161 227 let n_h = {h with beg = i ; len = n} in
162 228 ExtArray.set tokens id n_h;
  229 + (* if Xstring.check_prefix h.orth text then *)
163 230 establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
164   - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
  231 + (* else establish_for_token (i+n) (Xstring.cut_prefix "m.in" text) tokens t *)
  232 + else (prerr_endline ("establish_for_token :" ^ h.orth ^ " " ^ text);
  233 + failwith ("establish_for_token :" ^ h.orth ^ " " ^ text))
165 234 | [] -> 100, i
166 235  
167 236 let rec establish_lengths text paths tokens =
... ... @@ -196,16 +265,24 @@ let match_sentence (p_record,tokens) =
196 265 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
197 266 else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
198 267 let info_token, paths = info_token p_record.psentence in
199   - try
200   - let id, text = StringMap.find info_map info_token in
  268 + (* try *)
  269 + let id, text = try
  270 + StringMap.find info_map info_token
  271 + with
  272 + | _ -> p_record.pid, get_text tokens in
201 273 let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
202 274 AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
203 275 psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
204 276 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
205   - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
  277 + (* with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)] *)
206 278  
207 279 let match_corpus corpus = (* Applies match_sentence to every element of corpus and keeps only the results of calls that did not raise. *)
208   - Xlist.map corpus match_sentence
  280 + let rec pom f = function (* exception-tolerant map; the recursive call sits under the cons and inside a try, so it is not tail-recursive *)
  281 + [] -> []
  282 + | a::l -> try
  283 + let r = f a in r :: pom f l
  284 + with e -> (*print_endline (Printexc.to_string e);*) pom f l in (* any exception raised inside the try: continue with the remaining elements, nothing is reported *)
  285 + pom match_sentence corpus
209 286  
210 287 (******************)
211 288  
... ... @@ -232,8 +309,11 @@ let load_token in_channel =
232 309 else if line.[0] = '#'
233 310 then
234 311 if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
235   - then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
236   - raise (Id_line id)
  312 + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
  313 + raise (Id_line id)
  314 + else if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.tree" line
  315 + then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.tree" line in
  316 + raise (Id_line id)
237 317 else failwith ("load_token: " ^ line)
238 318 else
239 319 match Xstring.split "\t" line with
... ... @@ -283,5 +363,6 @@ let load_corpus in_channel =
283 363 try
284 364 let conll_sentence, tokens = load_sentence in_channel in
285 365 pom ((conll_sentence, tokens) :: res)
286   - with End_of_file -> res in
  366 + with End_of_file -> res
  367 + | e -> prerr_endline (Printexc.to_string e); res in
287 368 (* match_corpus @@ *) List.rev @@ pom []
... ...