(* conllParser.ml 4 KB *)
(*open Types*)

(* Path of the .conll input file that [processSkladnica] loads. *)
let skladnica_zaleznosciowa_filename = "resources/skladnica_zaleznosciowa.conll"

(* Output channel that [print_info] writes to and flushes.
   [open_out] runs as soon as this definition is evaluated (it creates the
   file, or truncates it if it already exists), and nothing in this section
   closes the channel. *)
let oc = open_out "resources/info_sentences.txt"

(* Placeholder token: empty strings, an empty interpretation list and 0 for
   every numeric field.  [getSentence] pads the token list with it at both
   ends, and [maybe_add_space] recognises it by its empty [c_orth]. *)
let empty_token =
  { c_id = 0;
    c_orth = "";
    c_lemma = "";
    c_cat = "";
    c_interp = [];
    c_super = 0;
    c_label = "";
    c_beg = 0;
    c_len = 0;
  }

(* Flag read and updated by [maybe_add_space]: set when a straight double
   quote has been opened, cleared when the matching one is emitted. *)
let quote_open = ref false

(* Flag read and updated by [maybe_add_space]: set after a hyphen that must
   be glued to the token following it. *)
let hyphenated = ref false

(* [reset ()] clears both flags; [getSentence] calls it before each
   sentence. *)
let reset () =
  List.iter (fun flag -> flag := false) [quote_open; hyphenated]

(* [maybe_add_space pre_previous previous token next] returns the text to
   append for [token] when a sentence is rebuilt from its tokens: either
   [token.c_orth] alone (glued to what precedes it) or [token.c_orth] with a
   single leading space.

   [pre_previous] and [previous] are the two tokens before [token], [next] is
   the one after it.  The function reads and updates the [quote_open] and
   [hyphenated] flags, so the result depends on the tokens seen since the
   last [reset ()].

   NOTE(review): when the first no-space condition holds, [hyphenated] is
   left untouched, so a pending hyphen flag carries over to a later token --
   confirm this is intended. *)
let maybe_add_space pre_previous previous token next =
  (* A double quote whose predecessor has an empty orth (the padding token at
     the start of a sentence) is recorded as an opening quote. *)
  if previous.c_orth = "" && token.c_orth = "\""
    then quote_open := true;
  (* Cases in which the token is attached to the preceding text with no
     space in between. *)
  if token.c_cat = "aglt" ||
    (token.c_orth = "by" && previous.c_cat = "praet") ||
    (previous.c_orth = "\"" && !quote_open) ||
    previous.c_orth = "(" ||
    previous.c_orth = "„" ||
    previous.c_orth = "" ||
    token.c_orth = "ń" || (* the expression nań *)
    (token.c_orth = "że" && (previous.c_orth  = "czym" || previous.c_orth  = "Czym")) || (* the expression czymże *)
(*    (token.c_orth = "r" && token.c_cat = "brev") || (* abbreviation r. - e.g. 1991r. *) *)
    (pre_previous.c_cat = "adj" && previous.c_orth = "." && token.c_cat = "num" && token.c_interp = ["pl";"nom";"f";"rec"]) (* time of day - e.g. 13.15 *)
      then token.c_orth
      else if !hyphenated
        (* The previous token was a glued hyphen: no space, and the flag is
           consumed. *)
        then (hyphenated := false; token.c_orth)
        else match token.c_orth with
      (* Closing punctuation is emitted without a leading space. *)
      "." -> "."
    | "…" -> "…"
    | "?" -> "?"
    | "!" -> "!"
    | "," -> ","
    | ":" -> ":"
    | ";" -> ";"
    | ")" -> ")"
    | "”" -> "”"
    (* A hyphen is glued on both sides after an adja token, or between two
       subst tokens with identical interpretations; otherwise it gets a
       leading space. *)
    | "-" -> if previous.c_cat = "adja" ||
               (previous.c_cat = "subst" && next.c_cat = "subst" && previous.c_interp = next.c_interp)
               then (hyphenated := true; "-")
               else " -"
    (* A straight double quote closes (no space) when one is open, and
       otherwise opens (leading space); the flag is toggled either way. *)
    | "\"" -> if !quote_open
                then (quote_open := false; "\"")
                else (quote_open := true; " \"")
    (* Any other token is separated from the preceding text by a space. *)
    | s -> " "^s
(*FIXME: quotation marks*)
    
(* [getSentence tokens] rebuilds the surface text of a sentence by
   concatenating the result of [maybe_add_space] for every token, in order.

   The list is padded with two [empty_token]s in front so that the first real
   token has a [pre_previous] and a [previous]; the last real token gets
   [empty_token] as its [next].  The [quote_open] / [hyphenated] flags are
   reset before the walk.

   The pieces are collected in a [Buffer] rather than with repeated [^], and
   the inner match is exhaustive (the short-list cases cannot occur because of
   the padding, but they no longer trigger a non-exhaustive-match warning). *)
let getSentence tokens =
  let buf = Buffer.create 256 in
  let rec walk = function
    | pre_previous :: (previous :: token :: next :: _ as rest) ->
        Buffer.add_string buf (maybe_add_space pre_previous previous token next);
        walk rest
    | [pre_previous; previous; token] ->
        (* Last real token: there is no following token. *)
        Buffer.add_string buf
          (maybe_add_space pre_previous previous token empty_token)
    | [] | [_] | [_; _] -> ()
  in
  reset ();
  walk (empty_token :: empty_token :: tokens);
  Buffer.contents buf

(* [split_word stringname] parses one tab-separated line into a token record.

   Fields used (0-based): 0 -> [c_id], 1 -> [c_orth], 2 -> [c_lemma],
   3 -> [c_cat], 5 -> [c_interp] (split on the pipe character),
   6 -> [c_super], 7 -> [c_label].  Field 4 and anything after field 7 are
   ignored.  [c_beg] and [c_len] are set to -1.

   The line is destructured with a single pattern instead of repeated
   [List.nth] calls, so a short line is reported with its content.

   @raise Failure if the line has fewer than 8 fields, or if field 0 or
   field 6 is not an integer literal accepted by [int_of_string]. *)
let split_word stringname =
  match Str.split (Str.regexp "\t") stringname with
  | id :: orth :: lemma :: cat :: _ :: interp :: super :: label :: _ ->
      { c_id = int_of_string id;
        c_orth = orth;
        c_lemma = lemma;
        c_cat = cat;
        c_interp = Str.split (Str.regexp "|") interp;
        c_super = int_of_string super;
        c_label = label;
        c_beg = -1;
        c_len = -1}
  | _ ->
      failwith
        ("split_word: expected at least 8 tab-separated fields: " ^ stringname)

(* [any_difference string1 string2] tells whether the two strings differ in a
   way that matters.  It returns [false] when they are equal, and also when
   [string1] is [string2] with one space inserted before its last character
   (e.g. a detached final full stop).  Otherwise it returns [true].

   An empty [string2] that differs from [string1] yields [true]; previously
   this case raised [Invalid_argument] from [String.sub]. *)
let any_difference string1 string2 =
  let len = String.length string2 in
  if string1 = string2 then false
  else if len = 0 then true
  else
    let last = len - 1 in
    let spaced =
      (String.sub string2 0 last) ^ " " ^ (String.sub string2 last 1) in
    spaced <> string1

(* [find_info tokens] builds the sentence record for [tokens].

   The list of token orths is looked up in [conll_info] through
   [InfoMap.find] (both defined outside this section; the original marks them
   as coming from Resources).
   - Entry found with text other than "not_found": its id and text are used.
   - Entry found with text "not_found": its id is kept and the text becomes
     "Auto-generated text: " followed by the text rebuilt by [getSentence].
   - No entry: the id is "Id not found" and the text is the rebuilt one.

   Only [Not_found] is handled, and only around the lookup itself, so
   unrelated exceptions are no longer silently swallowed.
   NOTE(review): this assumes [InfoMap.find] signals a missing key with
   [Not_found], as the standard [Map] does -- confirm against the definition
   of [InfoMap]. *)
let find_info tokens =
  let text_generated = getSentence tokens in
  let orths = List.map (fun token -> token.c_orth) tokens in
  let found =
    try Some ((*Resources.*)InfoMap.find orths (*Resources.*)conll_info)
    with Not_found -> None in
  match found with
  | Some sentence ->
      let id, text = sentence.s_id, sentence.s_text in
      if text = "not_found"
        then { s_id = id;
               s_text = "Auto-generated text: "^text_generated;
               s_tokens = tokens}
        else { s_id = id;
               s_text = text;
               s_tokens = tokens}
  | None ->
      { s_id = "Id not found";
        s_text = text_generated;
        s_tokens = tokens}

(* [process_sentence sentenceString] splits the text into lines, parses each
   line with [split_word] and hands the resulting tokens to [find_info]. *)
let process_sentence sentenceString =
  sentenceString
  |> Str.split (Str.regexp "\n")
  |> List.map split_word
  |> find_info

(* [print_info sentence] processes one raw sentence and appends an entry to
   [oc]: the sentence id, its text and the space-separated token orths, one
   per line, followed by an empty line.  The channel is flushed after each
   entry. *)
let print_info sentence =
  let parsed = process_sentence sentence in
  let form_sequence =
    parsed.s_tokens
    |> List.map (fun token -> token.c_orth)
    |> String.concat " " in
  let entry =
    String.concat ""
      [parsed.s_id; "\n"; parsed.s_text; "\n"; form_sequence; "\n\n"] in
  output_string oc entry;
  flush oc

(* Loads the file named by [skladnica_zaleznosciowa_filename], splits it on
   empty lines and runs [print_info] on every chunk.
   This is a plain unit value, not a function: the whole pipeline runs once,
   when this definition is evaluated. *)
let processSkladnica =
  (*Resources.*)load_file skladnica_zaleznosciowa_filename
  |> Str.split (Str.regexp "\n\n")
  |> List.iter print_info