CONLL.ml 12.5 KB
open Xstd
open PreTypes

let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
      then f (snd @@ List.find (fun (m,_) -> m = mode) alts)
      else f (snd @@ List.find (fun (m,_) -> m = PreTypes.Struct) alts)

let string_of_token mode token conll_id super label =
  let decompose_lemma = function
    | PreTypes.Lemma(a,b,c) -> a,b,if c = [[]]
                 then "_"
                 else String.concat "][" @@ Xlist.map c (fun x ->
                        String.concat "|" @@ Xlist.map x ( fun y ->
                          String.concat "." y))
    | t -> failwith ("string_of_token: not Lemma") in
  match mode with
    | PreTypes.Raw -> token.PreTypes.orth
    | PreTypes.Struct -> failwith ("function string_of_token for mode Struct is not defined")
    | PreTypes.CONLL -> let lemma,cat,interp = decompose_lemma token.PreTypes.token in
        String.concat "\t" [string_of_int conll_id;
                 token.PreTypes.orth; lemma; cat; cat; interp; "_"; "_";
                 string_of_int token.PreTypes.beg; string_of_int token.PreTypes.len]
    | PreTypes.Mate -> let lemma,cat,interp = decompose_lemma token.PreTypes.token in
        String.concat "\t" [string_of_int conll_id;
                 token.PreTypes.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
    | _ -> failwith "string_of_token: ni"

let string_of_paths mode tokens paths =
  let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id ->
    let id,super,label = paths.(conll_id) in
    (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in
  String.concat "\n" (List.rev l) ^ "\n\n"

let rec string_of_sentence mode tokens = function
      RawSentence s -> if mode = Raw then s else ""
    | StructSentence (tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
    | DepSentence (paths) -> string_of_paths mode tokens paths
    | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
    | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts

let string_of_p_record mode tokens p_record =
  (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
  string_of_sentence mode tokens p_record.psentence

(*let rec string_of_paragraph mode tokens = function
    RawParagraph s -> if mode = Raw then s else ""
  | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)
  | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts

let rec string_of_text mode tokens = function
    RawText s -> if mode = Raw then s else ""
  | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)
  | AltText alts -> alternative_string (string_of_text mode) mode alts*)


(******************)

let establish_next tokens paths =
  let n = ExtArray.size tokens in
  Int.iter 1 (n - 2) (fun i ->
    let f = ExtArray.get tokens i in
    let s = ExtArray.get tokens (i+1) in
    ExtArray.set tokens i {f with next = s.beg});
  let last = ExtArray.get tokens (n-1) in
  ExtArray.set tokens (n-1) {last with next = last.beg + last.len}


  (*let rec pom res = function
    h :: t -> let next = if res = []
        then h.beg+h.len
        else (List.hd res).beg in
      pom ({h with next = next} :: res) t
  | [] -> res in
  pom [] rev_tokens*)

let rec establish_for_token i text tokens = function
    (id,_,_) :: t as l->
      let h = ExtArray.get tokens id in
      if Xstring.check_prefix " " text
      then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
      else if Xstring.check_prefix h.orth text
        then
          let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
          let n_h = {h with beg = i ; len = n} in
          ExtArray.set tokens id n_h;
          establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
        else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
  | [] -> 100, i

let rec establish_lengths text paths tokens =
  let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in
  establish_next tokens paths;
  pbeg, plen-100

(******************)

exception ErrorInfoFile of string

let info_file = "../corpora/info_sentences.txt"

let info = Xstring.split "\n\n" @@ File.load_file_gen info_file

let add_to_map map info_str =
  match Xstring.split "\n" info_str with
    [id; text; info_token] ->  StringMap.add map info_token (id, text)
  | _ -> raise (ErrorInfoFile info_str)

let info_map =
  Xlist.fold info StringMap.empty add_to_map

let match_sentence (p_record,tokens) =
  let rec info_token s = match s with
      RawSentence text -> failwith ("match_sentence: " ^ text)
    | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
    | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
    | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
    | AltSentence alts -> failwith ("match_sentence: AltSentence")
        (*if List.exists (fun (mode, s) -> mode = CONLL) alts
        then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
  let info_token, paths = info_token p_record.psentence in
  try
    let id, text = StringMap.find info_map info_token in
    let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
    AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
     psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
(*  {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
  with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]

let match_corpus corpus =
  Xlist.map corpus match_sentence

(******************)

let establish_next tokens paths =
  let n = ExtArray.size tokens in
  Int.iter 1 (n - 2) (fun i ->
    let f = ExtArray.get tokens i in
    let s = ExtArray.get tokens (i+1) in
    ExtArray.set tokens i {f with next = s.beg});
  let last = ExtArray.get tokens (n-1) in
  ExtArray.set tokens (n-1) {last with next = last.beg + last.len}


  (*let rec pom res = function
    h :: t -> let next = if res = []
        then h.beg+h.len
        else (List.hd res).beg in
      pom ({h with next = next} :: res) t
  | [] -> res in
  pom [] rev_tokens*)

let rec establish_for_token i text tokens = function
    (id,_,_) :: t as l->
      let h = ExtArray.get tokens id in
      if Xstring.check_prefix " " text
      then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
      else if Xstring.check_prefix h.orth text
        then
          let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
          let n_h = {h with beg = i ; len = n} in
          ExtArray.set tokens id n_h;
          establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
        else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
  | [] -> 100, i

let rec establish_lengths text paths tokens =
  let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in
  establish_next tokens paths;
  pbeg, plen-100

(******************)

exception ErrorInfoFile of string

let info_file = "../corpora/info_sentences.txt"

let info = Xstring.split "\n\n" @@ File.load_file_gen info_file

let add_to_map map info_str =
  match Xstring.split "\n" info_str with
    [id; text; info_token] ->  StringMap.add map info_token (id, text)
  | _ -> raise (ErrorInfoFile info_str)

let info_map =
  Xlist.fold info StringMap.empty add_to_map

let match_sentence (p_record,tokens) =
  let rec info_token s = match s with
      RawSentence text -> failwith ("match_sentence: " ^ text)
    | StructSentence (tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
    | DepSentence (paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
    | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
    | AltSentence alts -> failwith ("match_sentence: AltSentence")
        (*if List.exists (fun (mode, s) -> mode = CONLL) alts
        then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
        else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
  let info_token, paths = info_token p_record.psentence in
  try
    let id, text = StringMap.find info_map info_token in
    let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
    AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len; pfile_prefix="";
     psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence paths]}]],tokens)]
(*  {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
  with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]

let match_corpus corpus =
  Xlist.map corpus match_sentence

(******************)

exception Empty_line
exception Empty_sentence
exception Id_line of string

let load_token in_channel =
  let fail line =
    (* failwith ("load_token: " ^ line) *)
    () in
  let int_of_super = function
     "_" -> -1
   | s -> int_of_string s in
  let n_token id orth lemma cat interp super label =
    let interp = if interp = "_"
            then [[]]
            else [Xlist.map (Xstring.split_delim "|" interp) (fun tag -> [tag])] in
    {empty_token with orth = orth; token = Lemma(lemma,cat,interp);}, int_of_string id, int_of_super super, label in
  let line = input_line in_channel in
  (* print_endline ("load_token: " ^ line); *)
  if line = ""
   then raise Empty_line
   else if line.[0] = '#'
     then
       if Xstring.check_prefix "# trees/" line && Xstring.check_sufix ".xml.trees" line
         then let id = Xstring.cut_prefix "# trees/" @@ Xstring.cut_sufix ".xml.trees" line in
              raise (Id_line id)
         else failwith ("load_token: " ^ line)
     else
       match Xstring.split "\t" line with
         [id; orth; lemma; cat; cat2; interp; super; label; "_"; "_"] ->
          (if cat <> cat2 then fail line;
           n_token id orth lemma cat interp super label)
       | [id; orth; lemma; lemma2; cat; cat2; interp; interp2; "-1"; super; "_"; label; "_"; "_"] ->
          (if (cat, lemma, interp) <> (cat2, lemma2, interp2) then fail line;
           n_token id orth lemma cat interp super label)
       | [id; orth; lemma; cat; cat2; interp; super; label_err; "_"] ->
          (if cat <> cat2 && Xstring.check_sufix "_" label_err then fail line;
           let label = Xstring.cut_sufix "_" label_err in
           n_token id orth lemma cat interp super label)
       | _ -> failwith ("load_token: " ^ line)
(*     {c_id = List.nth pom 1;
       c_lemma = List.nth pom 2;
       c_cat = List.nth pom 3;
       c_interp = (let interp = List.nth pom 5 in
         if interp = "_"
           then []
           else Str.split (Str.regexp "|") interp);
       c_super = -1; c_label = ""; c_beg = -1; c_len = -1} *)

let load_sentence in_channel =
  let tokens = ExtArray.make 100 empty_token in
  let _ = ExtArray.add tokens {empty_token with token = Interp "<conll_root>"} in
  let rec pom rev_paths id =
    (* print_endline "pom 1"; *)
    try
      (* print_endline "pom 2"; *)
      let token, conll_id, super, label = load_token in_channel in
      let id_a = ExtArray.add tokens token in
      if id_a <> conll_id then failwith "load_sentence: different ids" else
      (* print_endline "pom 3"; *)
      pom ((id_a,super,label) :: rev_paths) id
    with Id_line new_id -> (*print_endline "pom 4";*)pom rev_paths new_id
      | Empty_line -> (*print_endline "pom 5";*)rev_paths, id
      | End_of_file -> (*print_endline "pom 6";*)if rev_paths = []
          then raise End_of_file
          else rev_paths, id in
  let rev_paths, id = pom [] "" in
  {pid = id; pbeg = -1; plen = -1; pnext = -1; pfile_prefix = ""; psentence = DepSentence(Array.of_list ((0,-1,"") :: List.rev rev_paths))}, tokens
(*  {s_id = id; s_text = ""; s_paths = (List.rev rev_paths)} *)

let load_corpus in_channel =
  let rec pom res =
    try
      let conll_sentence, tokens = load_sentence in_channel in
      pom ((conll_sentence, tokens) :: res)
    with End_of_file -> res in
  (* match_corpus @@ *) List.rev @@ pom []