parser.ml 8.38 KB
(*
 *  ENIAMexec implements ENIAM processing stream
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMsubsyntaxTypes
open Xstd

(* Grammar rules obtained by calling [ENIAM_LCGlexicon.make_rules] with
   [ENIAM_LCGlexiconTypes.rules_filename]. The value is computed once, when
   this module is initialised, and is later passed to [ENIAMexec.parse] by
   [main_loop].
   NOTE(review): the meaning of the [false] argument is not visible in this
   file - confirm against [ENIAM_LCGlexicon.make_rules]. *)
let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.rules_filename

(* Run-time configuration. Most of these references are updated by the
   command-line options declared in [spec_list] and read by [main_loop] and
   the program entry point. *)

(* [true]: call ENIAMsubsyntax/ENIAMlexSemantics in-process; [false]: talk to
   an external ENIAMlexSemantics server over a socket. Set to [true] by [-b]
   and to [false] by [--port] and [--host]. *)
let lexSemantics_built_in = ref true
(* Host name of the external ENIAMlexSemantics server (option [--host]). *)
let lexSemantics_host = ref "localhost"
(* Port of the external ENIAMlexSemantics server (option [--port]). *)
let lexSemantics_port = ref 5739
(* Verbosity level (option [-v]); passed to [ENIAMexec.parse] and to
   [ENIAMvisualization.print_html_text]. *)
let verbosity = ref 1
(* Image selection level for the HTML output (option [--img]); passed to
   [ENIAMvisualization.print_html_text]. *)
let img = ref 1
(* Parser timeout in seconds (option [--timeout]); passed to
   [ENIAMexec.parse]. *)
let timeout = ref 30.
(* Whether [ENIAMselectSent.select_sentence_modes_text] is applied
   (options [--sel_modes] / [--no_sel_modes]). *)
let select_sentence_modes_flag = ref false
(* Whether [ENIAMselectSent.select_sentences_text] is applied
   (options [--sel_sent] / [--no_sel_sent]). *)
let select_sentences_flag = ref true
(* Whether [ENIAMsemValence.assign] and [ENIAMsemValence.reduce] are applied
   (options [--sem_valence] / [--no_sem_valence]). *)
let assign_semantic_valence_flag = ref true
(* Directory handed to [ENIAMvisualization.print_html_text]
   (option [--output]). *)
let output_dir = ref "results/"
(* Command-line options handed to [Arg.parse]. Every entry is a triple
   (key, action, help text); the actions only update the configuration
   references declared earlier in this file.
   Note that [--port] and [--host] both switch off the built-in
   ENIAMlexSemantics in addition to storing their argument.
   The options kept in comments below are currently disabled. *)
let spec_list = [
(*  "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
  "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
  "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
  "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
  "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
  "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
  "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
  "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
  "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";*)
  (* "-r", Arg.String (fun p ->
        ENIAMtokenizerTypes.set_resource_path p;
        ENIAMmorphologyTypes.set_resource_path p;
        ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *)
  (* Source of lexical semantics: in-process or remote server. *)
  "-b", Arg.Unit (fun () -> lexSemantics_built_in:=true), "Use built in version of ENIAMlexSemantics (default)";
  "--port", Arg.Int (fun p -> lexSemantics_built_in:=false; lexSemantics_port:=p), "<port> Connect to ENIAMlexSemantics on a given port";
  "--host", Arg.String (fun s -> lexSemantics_built_in:=false; lexSemantics_host:=s), "<hostname> Connect to ENIAMlexSemantics on a given host (by default localhost)";
  (* Parser and output settings. *)
  "--timeout", Arg.Float (fun x -> timeout:=x), "<seconds> Sets timeout value for parser (default 30 seconds)";
  "-v", Arg.Int (fun v -> verbosity:=v), "<val> Sets verbosity level of parser\n     0 - print only status information\n     1 - print data relevant to the status of a given sentence (default)\n     2 - print all data structures";
  "--img", Arg.Int (fun v -> img:=v), "<val> Selects which images are included in output html page \n     0 - no images included\n     1 - simple dependency trees included (default)\n     2 - dependency trees included";
  "--output", Arg.String (fun s -> output_dir:=s), "<dir> Sets output directory (by default results/)";
  (* Optional post-processing stages; the help text previously misspelled
     the word sentence in the two mode-selection options. *)
  "--sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=true), "Select sentence modes";
  "--no_sel_modes", Arg.Unit (fun () -> select_sentence_modes_flag:=false), "Do not select sentence modes (default)";
  "--sel_sent", Arg.Unit (fun () -> select_sentences_flag:=true), "Select parsed sentences (default)";
  "--no_sel_sent", Arg.Unit (fun () -> select_sentences_flag:=false), "Do not select parsed sentences";
  "--sem_valence", Arg.Unit (fun () -> assign_semantic_valence_flag:=true), "Assign semantic valence (default)";
  "--no_sem_valence", Arg.Unit (fun () -> assign_semantic_valence_flag:=false), "Do not assign semantic valence";
  ]

(* Usage banner passed to [Arg.parse] by the program entry point. The string
   is split with line-continuation escapes, which drop the newline and the
   leading blanks of the following source line, so the value is three lines
   separated by single newline characters. *)
let usage_msg =
  "Usage: semparser <options>\n\
   Input is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\n\
   Options are:"

(* Start-up banner: the program name followed by two copyright lines. It is
   written to stderr by the program entry point. *)
let message =
  "ENIAM_LCGsemparser, semantic parser for Logical Categorial Grammar formalism\n"
  ^ "Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n"
  ^ "Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences"

(* Handler for anonymous (non-option) command-line arguments: none are
   accepted, so every such argument [s] is rejected.
   Raises [Arg.Bad] with a message naming the offending argument. *)
let anon_fun s =
  let reason = "invalid argument: " ^ s in
  raise (Arg.Bad reason)

(* [input_text channel] reads consecutive lines from [channel] until an empty
   line or the end of the channel is met, and returns the lines read so far
   joined with newline characters. The terminating empty line is consumed but
   not included. Returns the empty string when the first line is empty or the
   channel is already exhausted. *)
let input_text channel =
  (* End of input is treated exactly like an empty line. *)
  let next_line () = try input_line channel with End_of_file -> "" in
  (* Tail-recursive: the accumulator holds the lines in reverse order. *)
  let rec collect acc =
    match next_line () with
    | "" -> List.rev acc
    | line -> collect (line :: acc) in
  String.concat "\n" (collect [])

(* Main processing loop. Each iteration reads one block of text from stdin
   with [input_text], preprocesses it, parses it and writes an HTML report;
   the loop ends when an empty block is read.
   - [sub_in], [sub_out]: channels connected to an external ENIAMlexSemantics
     server; they are used only when [lexSemantics_built_in] is false.
   When preprocessing returns a non-empty message, that message is printed on
   stdout and the block is not parsed. A status line is written to stderr
   after every processed block. *)
let rec main_loop sub_in sub_out =
  match input_text stdin with
  | "" -> ()
  | raw ->
    let text,tokens,lex_sems,msg =
      if not !lexSemantics_built_in then (
        (* Remote mode: send the block followed by an empty line, then read
           the marshalled answer.
           NOTE(review): [Marshal.from_channel] is not type-safe, so the
           server at the other end must be trusted. *)
        Printf.fprintf sub_out "%s\n\n%!" raw;
        (Marshal.from_channel sub_in : ENIAMsubsyntaxTypes.text * ENIAMtokenizerTypes.token_env ExtArray.t * ENIAMlexSemanticsTypes.lex_sem ExtArray.t * string))
      else (
        (* Built-in mode: lexical semantics is assigned only when the
           subsyntax step reported no message. *)
        let text,tokens,msg = ENIAMsubsyntax.catch_parse_text raw in
        if msg = "" then (
          let lex_sems,msg = ENIAMlexSemantics.catch_assign tokens text in
          text,tokens,lex_sems,msg)
        else
          (text,tokens,ExtArray.make 0 ENIAMlexSemanticsTypes.empty_lex_sem,msg)) in
    if msg = "" then (
      let translated = ENIAMexec.translate_text text in
      let parsed = ENIAMexec.parse !timeout !verbosity rules tokens lex_sems translated in
      let parsed =
        if !select_sentence_modes_flag then ENIAMselectSent.select_sentence_modes_text parsed
        else parsed in
      let parsed =
        if !select_sentences_flag then ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct parsed
        else parsed in
      (* Semantic valence is assigned and then reduced under the same flag. *)
      let parsed =
        if !assign_semantic_valence_flag then (
          let assigned = ENIAMsemValence.assign tokens lex_sems parsed in
          ENIAMsemValence.reduce tokens lex_sems assigned)
        else parsed in
      ENIAMvisualization.print_html_text !output_dir "parsed_text" parsed !img !verbosity tokens)
    else print_endline msg;
    prerr_endline "Done!";
    main_loop sub_in sub_out

(* [get_sock_addr host_name port] looks [host_name] up with
   [Unix.gethostbyname] and returns an Internet socket address made of the
   first address of the resulting entry and [port].
   Raises [Not_found] when the host is unknown (as documented for
   [Unix.gethostbyname]) and [Invalid_argument] when the entry carries no
   address. *)
let get_sock_addr host_name port =
  let { Unix.h_addr_list = addresses; _ } = Unix.gethostbyname host_name in
  Unix.ADDR_INET (addresses.(0), port)

(* Program entry point: prints the banner, initialises the categories,
   parses the command line, initialises the built-in ENIAMlexSemantics when
   it is selected, compacts the heap, and then runs [main_loop].
   In built-in mode the standard channels are passed to [main_loop] as
   placeholders; otherwise a connection to the configured host and port is
   opened first. *)
let () =
  prerr_endline message;
  ENIAMcategoriesPL.initialize ();
  Arg.parse spec_list anon_fun usage_msg;
  (* The mode is fixed once the command line has been parsed. *)
  let built_in = !lexSemantics_built_in in
  if built_in then ENIAMlexSemantics.initialize ();
  Gc.compact ();
  let sub_in,sub_out =
    match built_in with
    | true -> stdin,stdout
    | false ->
      let address = get_sock_addr !lexSemantics_host !lexSemantics_port in
      Unix.open_connection address in
  prerr_endline "Ready!";
  main_loop sub_in sub_out

(* Sample inputs as (name, sentence) pairs. Every entry is currently
   commented out, so the list is empty. The only consumer visible in this
   file is the commented-out driver that follows this definition. *)
let examples = [
  (* "Szpak","Szpak śpiewa."; *)
  (* "miał","Miałem miał."; *)
(*  "Ala","Ala ma kota.";
  "Ale","Ale mają kota:"; *)
  (*  "zima","Szpak frunie zimą.";*)
  (* "październik","Kot miauczy w październiku."; *)
(*  "Szpak-Kot","Szpak frunie. Kot miauczy.";
    "powiedział","Szpak powiedział: „Frunę. Kiszę.”";*)
    (* "teraz","Teraz frunie jakiś szpak.";
      "chłopcy","Chłopcy mają ulicę kwiatami."; *)
     (*  "arabia","Arabia Saudyjska biegnie.";*)
(*  "Tom","Tom idzie."; *)
  (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie.";
  "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994."; *)
  (* "przez_nią","Frunę przez nią.";  *)
  (* "o_nie","Witold frasuje się o nie."; *)
  (* "or1","- Frunę.";  *)
  (* "or2","- Frunę - powiedział szpak."; *)
  (*"or3","- Frunę! - powiedział szpak.";*)
]
(*
let _ =
  ENIAMsubsyntax.initialize ();
  ENIAMcategoriesPL.initialize ();
  ENIAMwalParser.initialize ();
  ENIAMwalReduce.initialize ();
  Xlist.iter examples (fun (name,example) ->
  let text,tokens,msg = ENIAMsubsyntax.catch_parse_text example in
  if msg <> "" then print_endline msg else (
    let lex_sems = ENIAMlexSemantics.assign tokens text in
    let text = ENIAMexec.translate_text text in
    let text = ENIAMexec.parse 30. !verbosity rules tokens lex_sems text in
    (* let text = ENIAMselectSent.select_sentence_modes_text text in *)
    let text = ENIAMselectSent.select_sentences_text ENIAMexecTypes.Struct text in
    ENIAMvisualization.print_html_text "results/" "parsed_text" text !img !verbosity tokens))
    *)