(* resources.ml 3.9 KB *)
(*open Types*)

(* Ordered key type for sentence identifiers.
   Uses [String.compare] rather than the polymorphic [Pervasives.compare]:
   the ordering on strings is the same, but [Pervasives] is deprecated and
   no longer exists in recent OCaml releases. *)
module Id =
  struct
    type t = string
    let compare a b = String.compare a b
  end

(* Maps keyed by sentence identifier. *)
module IdMap = Map.Make(Id)

(* Path handed to [get_filenames] in this file, which scans it as a
   directory tree.
   NOTE(review): the identifier says "frazowa" while the path points at
   "skladnica_walencyjna" - confirm this is intended. *)
let skladnica_frazowa_filename = "resources/skladnica_walencyjna"

(* [get_filenames path] returns the paths [path/folder1/folder2/file] of
   every entry found exactly three levels below [path].
   Entries named ".DS_Store" are skipped at each of the three levels.
   Paths are consed onto an accumulator, so the result is in reverse
   traversal order.
   [Sys.readdir] raises [Sys_error] if [path], or any other entry on the
   first two levels, cannot be read as a directory. *)
let get_filenames path =
  (* Names inside [dir], in [Sys.readdir] order, without ".DS_Store". *)
  let entries dir =
    List.filter
      (fun name -> name <> ".DS_Store")
      (Array.to_list (Sys.readdir dir)) in
  List.fold_left
    (fun acc folder1 ->
      let dir1 = path ^ "/" ^ folder1 in
      List.fold_left
        (fun acc folder2 ->
          let dir2 = dir1 ^ "/" ^ folder2 in
          List.fold_left
            (fun acc file -> (dir2 ^ "/" ^ file) :: acc)
            acc
            (entries dir2))
        acc
        (entries dir1))
    []
    (entries path)

(* [add_to_map map xml] binds a sentence id to its text in [map].
   [xml] must be a "forest" element whose first attribute is "sent_id" and
   whose first child is an attribute-less "text" element holding a single
   PCData node; any other shape raises [Failure "add_to_map"]. *)
let add_to_map map xml =
  match xml with
  | Xml.Element
      ( "forest",
        ("sent_id", id) :: _,
        Xml.Element ("text", [], [Xml.PCData sentence]) :: _ ) ->
      IdMap.add id sentence map
  | _ -> failwith "add_to_map"

(* Progress counter: number of the file about to be parsed, starting at 1. *)
let i = ref 1

(* map(id,text): built at module initialisation by parsing every file
   returned by [get_filenames skladnica_frazowa_filename]. The current
   value of [i] is printed on stdout before each file is parsed. *)
let sentencesIdText =
  let load acc filename =
    print_endline (string_of_int !i);
    incr i;
    add_to_map acc (Xml.parse_file filename) in
  List.fold_left load IdMap.empty (get_filenames skladnica_frazowa_filename)

(* Number of files found under [skladnica_frazowa_filename]. The directory
   tree is scanned again here; the result of the scan done for
   [sentencesIdText] is not reused. *)
let number_of_sentences_skladnica_frazowa =
  get_filenames skladnica_frazowa_filename |> List.length

(*************************************************************************************************************)

(* Ordered key type for sequences of word forms.
   The comparison is lexicographic: element by element with
   [String.compare], a proper prefix sorting before the longer list. This
   is the same order the polymorphic [Pervasives.compare] gives on string
   lists, without depending on the deprecated [Pervasives] module. *)
module Info =
  struct
    type t = string list
    let rec compare a b =
      match a, b with
      | [], [] -> 0
      | [], _ :: _ -> -1
      | _ :: _, [] -> 1
      | x :: xs, y :: ys ->
          let c = String.compare x y in
          if c <> 0 then c else compare xs ys
  end

(* Maps keyed by a list of word forms. *)
module InfoMap = Map.Make(Info)

(* Path of the .conll file that this module reads through [load_krzaki]
   to build [data_conll] and [number_of_sentences_krzaki]. *)
let krzaki_filename = "resources/krzaki.conll"

(* [file_in filename f] opens [filename] for reading, applies [f] to the
   channel and returns its result.
   The channel is closed on both paths: normally after [f] returns, and
   with [close_in_noerr] before re-raising when [f] raises, so a failing
   reader no longer leaks the file descriptor.
   Raises [Sys_error] if the file cannot be opened, and whatever [f]
   raises. *)
let file_in filename f =
  let file = open_in filename in
  let result =
    try f file
    with e -> (close_in_noerr file; raise e) in
  close_in file;
  result

(* [load_file filename] returns the whole contents of [filename] as a
   string.
   The result is a [string] rather than a [Bytes.t] buffer because the
   caller in this file passes it straight to [Str.split], which only
   accepts strings when strings are immutable. The length is taken from
   the opened channel, so no separate [Unix.stat] call is needed.
   Raises [Sys_error] if the file cannot be opened and [End_of_file] if it
   yields fewer characters than its reported length. *)
let load_file filename =
  file_in filename (fun file ->
    really_input_string file (in_channel_length file))

(* [load_krzaki filename] reads [filename] in full and splits its contents
   at every occurrence of two consecutive newlines, returning the pieces
   in file order. *)
let load_krzaki filename =
  let blank_line = Str.regexp "\n\n" in
  load_file filename |> Str.split blank_line
(* Debugging variant kept for reference:
  let krzaki = Str.split (Str.regexp "\n\n") (load_file filename) in
  rev_map (fun krzak ->
    print_endline ("krzak: " ^ krzak);  krzak) krzaki *)

(* [split_word stringname] builds a token record from one tab-separated
   line. Columns 0-3 and 5-7 are used; column 4 is ignored. Column 5 is
   further split on the literal character '|'. [c_beg] and [c_len] are
   initialised to -1.
   Raises [Failure] when the line has fewer than eight columns (from
   [List.nth]) or when column 0 or 6 is not an integer (from
   [int_of_string]). *)
let split_word stringname =
  let columns = Str.split (Str.regexp "\t") stringname in
  let column n = List.nth columns n in
  { c_id = int_of_string (column 0);
    c_orth = column 1;
    c_lemma = column 2;
    c_cat = column 3;
    c_interp = Str.split (Str.regexp "|") (column 5);
    c_super = int_of_string (column 6);
    c_label = column 7;
    c_beg = -1;
    c_len = -1 }

(* [split_krzak stringname] turns one chunk of the .conll file into a
   sentence record.
   The first line of the chunk is a header: dropping its first 8 and last
   9 characters gives the sentence id. NOTE(review): the exact header
   format is not visible in this file - confirm the 8/9 offsets against
   the data.
   The id is sliced from the header line itself, not from [stringname]:
   [Str.split] ignores leading newlines, so offsets taken in [stringname]
   would be shifted whenever the chunk starts with one.
   [s_text] is looked up in [sentencesIdText]; when the id is unknown it
   is printed on stderr and the text becomes "not_found". Only [Not_found]
   is handled, so unrelated exceptions are no longer swallowed.
   Every line after the header is parsed with [split_word].
   Raises [Failure "hd"] when the chunk has no lines, [Invalid_argument]
   when the header is shorter than 17 characters, and whatever
   [split_word] raises. *)
let split_krzak stringname =
  let pom = Str.split (Str.regexp "\n") stringname in
  let header = List.hd pom in
  let s_id = String.sub header 8 (String.length header - 17) in
    { s_id = s_id;
      s_text =
        (try
          IdMap.find s_id sentencesIdText
        with Not_found -> prerr_endline s_id; "not_found");
      s_tokens = List.map split_word (List.tl pom)}

(* [parse_krzaki list_of_string] applies [split_krzak] to every chunk,
   preserving order. *)
let parse_krzaki list_of_string = List.map split_krzak list_of_string

(* Number of chunks [load_krzaki] finds in [krzaki_filename]. The file is
   read here independently of [data_conll]. *)
let number_of_sentences_krzaki =
  krzaki_filename |> load_krzaki |> List.length

(* conll_sequence list: every chunk of [krzaki_filename] parsed with
   [parse_krzaki], computed at module initialisation. *)
let data_conll = krzaki_filename |> load_krzaki |> parse_krzaki

(* map(form_sequence,conll_sentence): each sentence of [data_conll] keyed
   by the list of its tokens' [c_orth] values. When two sentences share
   the same key, the one later in [data_conll] replaces the earlier. *)
let conll_info =
  let forms sentence = List.map (fun token -> token.c_orth) sentence.s_tokens in
  let register acc sentence = InfoMap.add (forms sentence) sentence acc in
  List.fold_left register InfoMap.empty data_conll

(*let info_file () = 
  let oc = open_out "info_sentences.txt" in
  List.iter (fun (key, sentence) ->
    output_string oc (sentence.s_id^"\n"^sentence.s_text^"\n"^(String.concat " " key)^"\n\n")) (InfoMap.bindings conll_info)*)

(*let frazowa_info =
  let got_info = List.map (fun (_, sentence) -> sentence.s_id, sentence.s_text) (InfoMap.bindings conll_info) in
  List.fold_left (fun map (id, text) -> if List.mem (id, text) got_info
    then map
    else IdMap.add text id map) IdMap.empty (IdMap.bindings sentencesIdText) *)