ENIAMpreIntegration.ml 8.42 KB
(*
 *  ENIAMintegration, a library that integrates ENIAM with other parsers.
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>, Jan Lupa, Daniel Oklesiński
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes

let concraft_enabled = ref false
let mate_parser_enabled = ref false
let swigra_enabled = ref false
let polfie_enabled = ref false

let concraft_model_filename = "../tools/concraft/nkjp-model-0.2.gz"
let concraft_server_pid = ref (-1)

let mate_parser_path = "../tools/mate-tools/"
let mate_in = ref stdin
let mate_out = ref stdout

let concraft_exists () =
  let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl")
    [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  let close_check () = Unix.close_process_full (check_in, check_out, check_err) in
  try
    ignore @@ input_line check_in;
    ignore @@ close_check ();
    true
  with End_of_file -> ignore @@ close_check (); false

let wait_for_server () =
  let rec wait s a =
    try Unix.connect s a
    with e -> Unix.sleep 1; wait s a in
  let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in
  let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in
  wait s a;
  Unix.shutdown s Unix.SHUTDOWN_SEND;
  Unix.close s

let start_server m =
  let client_out, server_out = Unix.pipe () in
  let client_err, server_err = Unix.pipe () in
  let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; m|]
    Unix.stdin server_out server_err in
  List.iter Unix.close [client_out; server_out; client_err; server_err];
  wait_for_server ();
  pid

let stop_server pid =
  Unix.kill pid Sys.sigint

let initialize () =
  if !concraft_enabled then (
    if not (concraft_exists ()) then failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly." else
    if not (Sys.file_exists concraft_model_filename) then failwith "Concraft model file does not exist." else
    print_endline "Starting Concraft Server";
    concraft_server_pid := start_server concraft_model_filename;
    print_endline "Server started");
  if !mate_parser_enabled then (
    let m_in, m_out = (*Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"*)
      Unix.open_process ("java -jar " ^ mate_parser_path ^ "dist/anna-3.5.jar -model " ^
        mate_parser_path ^ "examples/160622_Polish_MateParser.mdl -test") in
    mate_in := m_in;
    mate_out := m_out)


let read_whole_channel c =
  let r = ref [] in
  try
    while true do
      r := (input_line c) :: !r
    done;
    !r
  with End_of_file -> List.rev (!r)

let concraft_parse s =
  let concraft_in, concraft_out, concraft_err =
    Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
      [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
  let result = read_whole_channel concraft_in in
  ignore (Unix.close_process_full (concraft_in, concraft_out, concraft_err));
  if err_msg <> "" then failwith err_msg else
  result

let rec process_concraft_result orth lemma interp others rev = function
    [] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
  | "" :: l -> process_concraft_result orth lemma interp others rev l
  | line :: l ->
      (match Xstring.split_delim "\t" line with
        [orth2;s] when s = "none" || s = "space" ->
           if orth = "" then process_concraft_result orth2 lemma interp others rev l
           else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l
      | ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l
      | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
      | _ -> failwith ("process_concraft_result: " ^ line))

let make_token (orth,l) =
  if l = [] then failwith "make_token 1" else
  let lemma,interp = List.hd l in
  let cat,interp = match Xstring.split ":" interp with
      cat :: l -> cat, [Xlist.map l (fun tag -> [tag])]
    | _ -> failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ interp) in
  {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp)}

let parse_mate tokens pbeg s =
  let result = concraft_parse s in
  let l = process_concraft_result "" "" "" [] [] result in
  let l = Xlist.map l make_token in
  let l = {empty_token_env with token = Interp "<conll_root>"} :: l in
  let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
  let _ = ENIAM_CONLL.establish_lengths pbeg s l tokens in
  let dep_paths = Array.of_list l in
  (* parse_conll tokens dep_paths; *)
  dep_paths

let get_paths old_paths = function
    {sentence=DepSentence(paths)},_ ->
       Int.iter 0 (Array.length paths - 1) (fun i ->
         let id,_,_ = old_paths.(i) in
         let _,super,label = paths.(i) in
         paths.(i) <- id,super,label);
       paths
  | _ -> failwith "get_paths"

let rec parse_mate_sentence tokens pbeg s =
  let paths = parse_mate tokens pbeg s in
  print_endline "parse_mate2 1";
          (* print_endline (Visualization.html_of_dep_sentence tokens paths); *)
  let conll = ENIAM_CONLL.string_of_paths Mate tokens paths in
  print_endline "parse_mate2 2";
          (* printf "|%s|\n" conll; *)
  Printf.fprintf !mate_out "%s%!" conll;
  print_endline "parse_mate2 3";
  let new_paths = get_paths paths (ENIAM_CONLL.load_sentence !mate_in) in
  print_endline "parse_mate2 4";
          (* print_endline (Visualization.html_of_dep_sentence tokens new_paths); *)
  DepSentence new_paths

let compare_mode (x,_) (y,_) = compare_mode x y

let rec parse_sentence mode tokens pbeg = function
    RawSentence s ->
      [Raw,RawSentence s] @
      (if !mate_parser_enabled && !concraft_enabled then [Mate,parse_mate_sentence tokens pbeg s] else []) @
      (if !swigra_enabled then [Swigra,RawSentence s] else []) @
      (if !polfie_enabled then [POLFIE,RawSentence s] else [])
  | StructSentence(paths,last) -> [mode,StructSentence(paths,last)]
  | DepSentence(paths) -> [mode,DepSentence paths]
  | QuotedSentences sentences ->
      let sentences =Xlist.rev_map sentences (fun p ->
        let sentence = parse_sentence mode tokens p.beg p.sentence in (* FIXME: p.pbeg czy pbeg *)
        let sentence = match sentence with
            [_,s] -> s
          | _ -> failwith "ENIAMpreIntegration.parse_sentence" in
        {p with sentence=sentence}) in
      [mode,QuotedSentences(List.rev sentences)]
  | AltSentence l ->
      let l = List.flatten (Xlist.rev_map l (fun (mode,sentence) ->
        parse_sentence mode tokens pbeg sentence)) in
      [mode,AltSentence(Xlist.sort l compare_mode)]

let rec parse_paragraph mode tokens = function
    RawParagraph s -> RawParagraph s
  | StructParagraph sentences ->
      let sentences = Xlist.rev_map sentences (fun p ->
        let sentence = parse_sentence mode tokens p.beg p.sentence in
        let sentence = match sentence with
            [_,s] -> s
          | _ -> failwith "ENIAMpreIntegration.parse_paragraph" in
        {p with sentence=sentence}) in
      StructParagraph(List.rev sentences)
  | AltParagraph l ->
      let l = Xlist.rev_map l (fun (mode,paragraph) ->
        mode, parse_paragraph mode tokens paragraph) in
      AltParagraph(List.rev l)

let rec parse_text mode tokens = function
    RawText s -> RawText s
  | StructText paragraphs ->
      let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
        parse_paragraph mode tokens paragraph) in
      StructText(List.rev paragraphs)
  | AltText l -> AltText(Xlist.map l (fun (mode,text) ->
       mode, parse_text mode tokens text))

let catch_parse_text mode tokens text =
  try
    parse_text mode tokens text,""
  with e ->
    text, Printexc.to_string e