CONLL.ml 6.89 KB
open Xstd
open PreTypes

let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
      then f mode (snd @@ List.find (fun (m,_) -> m = mode) alts)
      else f mode (snd @@ List.find (fun (m,_) -> m = Struct) alts)

let string_of_token mode token = match mode with
  | Raw -> token.orth
  | Struct -> failwith ("function string_of_token for mode Struct is not defined")
  | CONLL -> let lemma,cat,interp = match token.token with
      | Lemma(a,b,c) -> a,b,if c = [[]]
                   then "_"
                   else String.concat "][" @@ Xlist.map c (fun x ->
                          String.concat "|" @@ Xlist.map x ( fun y ->
                            String.concat "." y))
      | _ -> failwith ("string_of_token: not Lemma") in
    String.concat "\t" [token.conll_id;
                 token.orth; lemma; cat; cat; interp; "_"; "_";
                 string_of_int token.beg; string_of_int token.len]
  | _ -> failwith "string_of_token: ni"

let rec string_of_sentence mode = function
      RawSentence s -> if mode = Raw then s else ""
    | StructSentence (tokens, _) -> String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)
    | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence")
    | AltSentence alts -> alternative_string string_of_sentence mode alts

let string_of_p_record mode p_record =
  (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
  string_of_sentence mode p_record.psentence

let rec string_of_paragraph mode = function
    RawParagraph s -> if mode = Raw then s else ""
  | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode)
  | AltParagraph alts -> alternative_string string_of_paragraph mode alts

let rec string_of_text mode = function
    RawText s -> if mode = Raw then s else ""
  | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode)
  | AltText alts -> alternative_string string_of_text mode alts


(******************)

let rec establish_for_token i res text = function
    h :: t -> if Xstring.check_prefix " " text
      then establish_for_token (i+100) res (Xstring.cut_prefix " " text) (h :: t)
      else if Xstring.check_prefix h.orth text
        then
          let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth)*100 in
          let n_h = {h with beg = i ; len = n} in
          establish_for_token (i+n) (n_h :: res) (Xstring.cut_prefix h.orth text) t
        else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
  | [] -> 100, i, res

let rec establish_lengths text = function
    RawSentence text -> failwith ("establish_lengths: " ^ text)
  | StructSentence (tokens, n) -> let pbeg, plen, rev_tokens = establish_for_token 100 [] text tokens in
       pbeg, plen, StructSentence (List.rev rev_tokens, n)
  | ORSentence (_,_,_,_) -> failwith ("establish_lengths: ORSentence")
  | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
        then establish_lengths text (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("establish_lengths: no CONLL mode in AltSentence")


(******************)

exception ErrorInfoFile of string

let info_file = "../corpora/info_sentences.txt"

let info = Xstring.split "\n\n" @@ File.load_file_gen info_file

let add_to_map map info_str =
  match Xstring.split "\n" info_str with
    [id; text; info_token] ->  StringMap.add map info_token (id, text)
  | _ -> raise (ErrorInfoFile info_str)

let info_map =
  Xlist.fold info StringMap.empty add_to_map

let match_sentence p_record =
  let rec info_token s = match s with
      RawSentence text -> failwith ("match_sentence: " ^ text)
    | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens
    | ORSentence (_,_,_,_) -> failwith ("match_sentence: ORSentence")
    | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
        then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("match_sentence: no CONLL mode in AltSentence") in
  let info_token = info_token p_record.psentence in
  try
    let id, text = StringMap.find info_map info_token in
    let beg, len, n_sentence = establish_lengths text p_record.psentence (* -1, -1, p_record.psentence *) in
    AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len;
     psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}]],-1)]
(*  {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
  with _ -> StructText([StructParagraph[p_record]],-1)

let match_corpus corpus =
  Xlist.map corpus match_sentence

(******************)

exception Empty_line
exception Empty_sentence
exception Id_line of string

let load_token in_channel =
  let fail line =
    (* failwith ("load_token: " ^ line) *)
    () in
  let n_token id orth lemma cat interp super label =
    let interp = if interp = "_"
            then [[]]
            else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
    {empty_token with conll_id = id; orth = orth; token = Lemma(lemma,cat,interp); conll_super=super; conll_label=label} in
  let line = input_line in_channel in
  if line = ""
   then raise Empty_line
   else if line.[0] = '#'
     then
       if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
         then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
              raise (Id_line id)
         else failwith ("load_token: " ^ line)
     else
       match Xstring.split "\t" line with
         [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
          (if cat <> cat2 then fail line; n_token id orth lemma cat interp super label)
       | id :: orth :: lemma :: cat :: cat2 :: interp :: e ->
          (fail line; n_token id orth lemma cat interp "" "") (* FIXME: "" "" trzeba na coś zmienic *)
       | _ -> failwith ("load_token: " ^ line)
(*     {c_id = List.nth pom 1;
       c_lemma = List.nth pom 2;
       c_cat = List.nth pom 3;
       c_interp = (let interp = List.nth pom 5 in
         if interp = "_"
           then []
           else Str.split (Str.regexp "|") interp);
       c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)

let load_sentence in_channel =
  let rec pom rev_tokens id =
    try
      let token = load_token in_channel in
      pom (token :: rev_tokens) id
    with Id_line new_id -> pom rev_tokens new_id
      | Empty_line -> rev_tokens, id
      | End_of_file -> if rev_tokens = []
          then raise End_of_file
          else rev_tokens, id in
  let rev_tokens, id = pom [] "" in
  {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
(*  {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)

let load_corpus in_channel =
  let rec pom res =
    try
      let conll_sentence = load_sentence in_channel in
      pom (conll_sentence :: res)
    with End_of_file -> res in
  (* match_corpus @@ *) List.rev @@ pom []