CONLL.ml 5.65 KB
open Xstd
open PreTypes

let string_of_token token =
  let lemma,cat,interp = match token.token with
      Lemma(a,b,c) -> a,b,if c = [[]]
                   then "_"
                   else String.concat "][" @@ Xlist.map c (fun x ->
                          String.concat "|" @@ Xlist.map x ( fun y ->
                            String.concat "." y))
    | _ -> failwith ("string_of_token: not Lemma") in
  String.concat "\t" [string_of_int token.id;
                 token.orth; lemma; cat; cat; interp; "_"; "_"; "_"; "_"]

let string_of_sentence sentence =
  let rec pom = function
      RawSentence text -> failwith ("string_of_sentence: " ^ text)
    | StructSentence (tokens, n) -> String.concat "\n" @@ List.map (fun x -> string_of_token x) tokens
    | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence")
    | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts 
        then pom (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("string_of_sentence: no CONLL mode in AltSentence") in
  (if sentence.pid = ""
    then ""
    else sentence.pid ^ "\n") ^ (pom sentence.psentence)

(******************)

exception Empty_line
exception Empty_sentence
exception Id_line of string

let load_token stream =
  let line = input_line stream in
  if line = ""
   then raise Empty_line
   else if line.[0] = '#'
     then
       if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
         then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
              raise (Id_line id)
         else failwith ("load_token: " ^ line)
     else
       match Xstring.split "\t" line with
         [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
          (*if cat <> cat2
            then failwith ("load_token: " ^ line)
            else *)
              let interp = if interp = "_"
                then [[]]
                else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
            {empty_token with id = int_of_string id; orth = orth; token = Lemma(lemma,cat,interp)}
       | _ -> failwith ("load_token: " ^ line)
(*     {c_id = List.nth pom 1;
       c_lemma = List.nth pom 2;
       c_cat = List.nth pom 3;
       c_interp = (let interp = List.nth pom 5 in
         if interp = "_"
           then []
           else Str.split (Str.regexp "|") interp);
       c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)

let load_sentence stream =
  let rec pom rev_tokens id =
    try
      let token = load_token stream in
      pom (token :: rev_tokens) id
    with Id_line new_id -> pom rev_tokens new_id
      | Empty_line -> rev_tokens, id
      | End_of_file -> if rev_tokens = []
          then raise End_of_file
          else rev_tokens, id in
  let rev_tokens, id = pom [] "" in
  {pid = id; pbeg = -1; plen = -1; psentence = StructSentence(List.rev rev_tokens,-1)}
(*  {s_id = id; s_text = ""; s_tokens = (List.rev rev_tokens)} *)

let load_corpus stream =
  let rec pom res =
    try
      let conll_sentence = load_sentence stream in
      pom (conll_sentence :: res)
    with e -> print_endline (Printexc.to_string e); res in
  pom []

(******************)

let rec establish_for_token i res text = function
    h :: t -> if Xstring.check_prefix " " text
      then establish_for_token (i+1) res (Xstring.cut_prefix " " text) (h :: t)
      else if Xstring.check_prefix h.orth text
        then 
          let n = List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth in
          let n_h = {h with beg = i ; len = n} in
          establish_for_token (i+n) (n_h :: res) (Xstring.cut_prefix h.orth text) t
        else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
  | [] -> 0, i, res

let rec establish_lengths text = function
    RawSentence text -> failwith ("establish_lengths: " ^ text)
  | StructSentence (tokens, n) -> let pbeg, plen, rev_tokens = establish_for_token 0 [] text tokens in
       pbeg, plen, StructSentence (List.rev rev_tokens, n)
  | ORSentence (_,_,_,_) -> failwith ("establish_lengths: ORSentence")
  | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts 
        then establish_lengths text (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("establish_lengths: no CONLL mode in AltSentence")
  

(******************)

exception ErrorInfoFile of string

let info_file = "../corpora/info_sentences.txt"

let info = Xstring.split "\n\n" @@ File.load_file_gen info_file

let add_to_map map info_str =
  match Xstring.split "\n" info_str with
    [id; text; info_token] ->  StringMap.add map info_token (id, text)
  | _ -> raise (ErrorInfoFile info_str)

let info_map =
  Xlist.fold info StringMap.empty add_to_map

let match_sentence sentence =
  let rec info_token s = match s with
      RawSentence text -> failwith ("match_sentence: " ^ text)
    | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens
    | ORSentence (_,_,_,_) -> failwith ("match_sentence: ORSentence")
    | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts 
        then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("match_sentence: no CONLL mode in AltSentence") in
  let info_token = info_token sentence.psentence in
  try
    let id, text = StringMap.find info_map info_token in
    let pbeg, plen, n_sentence = establish_lengths text sentence.psentence (* -1, -1, sentence.psentence *) in
    {pid = sentence.pid; pbeg = pbeg; plen = plen; psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}
(*  {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
  with _ -> sentence

let match_corpus corpus =
  Xlist.map corpus match_sentence