CONLL.ml 3.64 KB
open PreTypes

let string_of_token token =
  String.concat "\t" [string_of_int token.c_id;
                 token.c_orth;
                 token.c_lemma;
                 token.c_cat;
                 token.c_cat;
                 (if token.c_interp = []
                   then "_"
                   else String.concat "|" token.c_interp);
                 (if token.c_super = -1
                   then "_"
                   else string_of_int token.c_super);
                 (if token.c_label = ""
                   then "_"
                   else token.c_label);
                 "_";"_"]

let string_of_sentence sentence =
  String.concat "\n" (sentence.s_id :: (List.map string_of_token sentence.s_tokens)) ^ "\n"

(******************)

exception Empty_line
exception Empty_sentence
exception Id_line of string

let load_token stream =
  let line = input_line stream in
  if line = ""
   then raise Empty_line
   else if line.[0] = '#'
     then
       let id = String.sub line 8 ((String.length line)-17) in
       raise (Id_line id)
     else
       match Str.split (Str.regexp "\t") line with
         [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
            if cat <> cat2 then failwith ("load_token: " ^ line) else
            let interp = if interp = "_" then [[]] else
              [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])]
            (* {empty_token with id = id; orth = orth; token = Lemma(lemma,cat,interp)}
       c_lemma = List.nth pom 2;
       c_cat = List.nth pom 3;
       c_interp = (let interp = List.nth pom 5 in
         if interp = "_"
           then []
           else Str.split (Str.regexp "|") interp);
       c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)
       | _ -> failwith ("load_token: " ^ line)

let load_sentence stream =
  let rec pom rev_tokens id =
    try
      let token = load_token stream in
      pom (token :: rev_tokens) id
    with Id_line new_id -> pom rev_tokens new_id
      | Empty_line -> rev_tokens, id
      | End_of_file -> if rev_tokens = []
          then raise Empty_sentence
          else rev_tokens, id in
  let rev_tokens, id = pom [] "" in
  {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)}
  (* {pid=pi; pbeg=-1; plen=-1; psentence=StructSentence(List.rev rev_tokens,-1)} *)

let load_corpus stream =
  let rec pom res =
    try
      let conll_sentence = load_sentence stream in
      pom (conll_sentence :: res)
    with _ -> res in
  pom []

(******************)

exception ErrorInfoFile

module Info =
  struct
    type t = string
    let compare a b = Pervasives.compare a b
  end

module InfoMap = Map.Make(Info)

let info_file = "resources/info_sentences.txt"

let file_in filename f =
  let file = open_in filename in
  let x = f file in
  close_in file;
  x

let load_file filename =
  let size = (Unix.stat filename).Unix.st_size in
  let buf = Bytes.create size in
  file_in filename (fun file ->
    ignore (really_input file buf 0 size));
  buf

let info = Str.split (Str.regexp "\n\n") @@ load_file info_file

let add_to_map map info_str =
  match Str.split (Str.regexp "\n") info_str with
    [id; text; info_token] ->  InfoMap.add info_token (id, text) map
  | _ -> raise ErrorInfoFile

let info_map =
  List.fold_left add_to_map InfoMap.empty info

let match_sentence sentence =
  let info_token = String.concat " " @@ List.map (fun x -> x.c_orth) sentence.s_tokens in
  try
    let id, text = InfoMap.find info_token info_map in
    {s_id = id; s_text = text; s_tokens = sentence.s_tokens}
    (* {pid=sentence.pid; psentence=AltSentence[Raw, RawSentence text; CONLL, psentence.s_tokens]} *)
  with _ -> sentence

let match_corpus corpus =
  List.map match_sentence corpus