exec.ml 16.1 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open LCGtypes
open ExecTypes

let empty_result = {
  query="";
  status=Idle;
  msg="";
  pre_time1=0.;
  pre_time2=0.;
  lex_time=0.;
  parse_time=0.;
  reduction_time=0.;
  sem_time=0.;
  paths_size=0;
  graph_size=0;
  term_size=0;
  graph=[| |];
  term=[| |];
  disamb=[| |];
  sem=[| |];
  sem2=[| |];
  sem3=LCGtypes.Dot;
  trees=[];
  mrls=[];
  paths=[| |];
  (*structs=SemTypes.Atom "",SemTypes.Label "",SemTypes.Label "",[],""*)}

let empty_sum_result = {
  no_queries=0;
  no_pre_error=0;
  no_lex_error=0;
  no_parse_error=0;
  no_timeout=0;
  no_reduction_error=0;
  no_sem_error=0;
  no_not_parsed=0;
  no_not_reduced=0;
  no_too_many_nodes=0;
  no_not_translated=0;
  no_parsed=0;
  sum_pre_time1=0.;
  sum_pre_time2=0.;
  sum_lex_time=0.;
  sum_parse_time=0.;
  sum_reduction_time=0.;
  sum_sem_time=0.;
  }

open Printf

let make_paths_array (paths,_,next_id) =
  let a = Array.make next_id PreTypes.empty_token in
  Xlist.iter paths (fun t -> a.(t.PreTypes.id) <- t);
  a

let extend_paths_array2 paths_array = function
    Node t ->
      if paths_array.(t.id).PreTypes.id = 0 then
        paths_array.(t.id) <- {paths_array.(t.id) with
          PreTypes.token=PreTypes.Lemma(t.pred,t.cat,[]);
          PreTypes.senses=if t.cat="pro" then [t.pred,["0"],0.] else [];
          PreTypes.id=t.id}
  | t -> failwith ("extend_paths_array2: " ^ LCGstringOf.linear_term 0 t)

let extend_paths_array old_paths_array references = (* FIXME: wstawić pos *)
  let paths_array = Array.make (!LCGrenderer.pro_id_counter + 1) PreTypes.empty_token in
  Int.iter 0 (Array.length old_paths_array - 1) (fun i ->
    paths_array.(i) <- old_paths_array.(i));
(*  Int.iter (Array.lenght old_paths_array - 1) (!LCGrenderer.pro_id_counter) (fun i ->
    paths_array.(i) <- {paths_array.(i) with id=i});*)
  Array.iter (extend_paths_array2 paths_array) references;
  paths_array

open PreTypes

let process_sentence = function
    AltSentence[Raw,RawSentence s;Struct,StructSentence(paths,last)] ->
       printf "SENTENCE: %s\n" s;
       ()
  | _ -> failwith "process_text"

let process_paragraph = function
    AltParagraph[Raw,RawParagraph s;Struct,StructParagraph(sentences,next_id)] ->
       printf "PAR: %s\n" s;
       Xlist.iter sentences (fun p -> process_sentence p.psentence);
       ()
  | _ -> failwith "process_paragraph"

let process_text = function
    AltText[Raw,RawText s;Struct,StructText paragraphs] ->
       printf "TEXT: %s\n" s;
       Xlist.iter paragraphs process_paragraph;
       [],0,0
  | PreTypes.RawText "" -> [],0,0
  | _ -> failwith "process_text"

let process_query ic oc timeout test_only_flag id query max_n =
  let result = {empty_result with query=query} in
  let time1 = time_fun () in
  (* Printf.fprintf oc "%s\n%!" query; *)
  print_endline "process_query 1";
  Marshal.to_channel oc (PreTypes.RawText query) [];
  flush oc;
  print_endline "process_query 2";
  let text,msg,pre_time1 = (Marshal.from_channel ic : PreTypes.text * string * float) in
  print_endline "process_query 3";
  let paths = process_text text in
  (* let paths = match paths with
      PreTypes.RawText "" -> [],0,0
    | PreTypes.StructText [PreTypes.StructParagraph [{PreTypes.pcontents=PreTypes.StructSentence(paths,last,next_id)}]] -> paths,last,next_id
    | _ -> failwith "process_query: pre format" in *)
  let paths_array = make_paths_array paths in
  let result = if test_only_flag then result else {result with paths=paths_array} in
(*   if not test_only_flag then print_endline (paths_to_string_indexed paths);  *)
  let time2 = time_fun () in
  let result = {result with pre_time1=pre_time1; pre_time2=time2 -. time1;
    paths_size=let _,_,next_id = paths in next_id-1} in
  if msg <> "" then {result with status=PreprocessingError; msg=msg} else
  try
    let graph = LCGlexicon.create query paths in
    let graph,references,next_reference = LCGchart.lazify graph in
    let time3 = time_fun () in
    let result = {result with lex_time=time3 -. time2} in
    try
      let graph,references,next_reference = LCGchart.parse timeout graph references next_reference time_fun in
      let time4 = time_fun () in
      let result = if test_only_flag then result else {result with graph=graph} in
      let result = {result with parse_time=time4 -. time3; graph_size=LCGchart.get_no_entries graph} in
      if LCGchart.is_parsed graph then
        try
          let term = LCGchart.get_parsed_term graph in
          let references = LCGreductions.reduce term references next_reference in
          let time5 = time_fun () in
          let result = if test_only_flag then result else {result with term=references} in
          let result = {result with reduction_time=time5 -. time4; term_size=Array.length references} in
          if LCGreductions.is_reduced_references references then
            try
              LCGreductions.assign_labels references; (* uwaga: niejawna zmiana imperatywna w result *)
              LCGreductions.remove_cuts references; (* uwaga: niejawna zmiana imperatywna w result *)
(*               if Array.length references < 10000 then print_xml_graph "results/trees/" id references;  *)
              let paths_array = extend_paths_array paths_array references in
              let result = if test_only_flag then result else {result with paths=paths_array} in
              let (*references2*)(*sem*)disamb = LCGvalence.assign_frames_and_senses paths_array references(*disamb*) in (* FIXME: wstawić jako nowy etap i na nową zmienną *)
              let disamb(*sem*) = DisambSelPref.fit_sel_prefs DisambSelPref.fit_node1 (*references2*)disamb in
              let (*sem*)disamb = DisambLemma.disambiguate_nodes (*references*)(*sem*)disamb in
              let (*sem*)disamb = DisambLemma.remove_unused(*disambiguate_nodes*) (*references*)(*sem*)disamb in
              let (*sem*)disamb = DisambLemma.remove_unused_choices(*disambiguate_nodes*) (*references*)(*sem*)disamb in
              let (*disamb*)sem = DisambSelPref.fit_sel_prefs DisambSelPref.fit_node2 (*references2*)disamb in
              let result = if test_only_flag then result else {result with disamb=disamb} in
              let sem = DisambLemma.disambiguate_meanings (*references*)sem in
              let sem(*disamb*) = DisambLemma.remove_unused_choices(*disambiguate_nodes*) (*references*)sem(*disamb*) in
              let result = if test_only_flag then result else {result with sem=sem} in
              let sem2 = SemGraph.translate paths_array (*disamb*)sem in
              let result = if test_only_flag then result else {result with sem2=sem2} in
              let sem3(*disamb*) = SemGraph.make_tree(*disambiguate_nodes*) (*references*)sem2(*disamb*) in
              let sem3(*disamb*) = SemGraph.simplify_tree(*disambiguate_nodes*) (*references*)sem3(*disamb*) in
(*               let sem3(*disamb*) = SemGraph.manage_quantification(*disambiguate_nodes*) (*references*)sem3(*disamb*) in  *)
              let sem3(*disamb*) = SemGraph.simplify_gender(*disambiguate_nodes*) (*references*)sem3(*disamb*) in
(*               if Array.length disamb < 10000 then print_xml_graph "results/trees/" (id ^ "dis") disamb; *)
              let result = if test_only_flag then result else {result with sem3=sem3} in
              let time6 = time_fun () in
              if SemGraph.validate_semantics sem3 then
                let trees = SemGraph.draw_trees max_n sem3 in
                let trees2 = Xlist.map trees SemMrl.variable_alpha_convertion in
                let mrls = Xlist.map trees2 SemMrl.make_mrl in
                let mrls = Xlist.map mrls SemMrl.move_requirements in
                let mrss = Xlist.map mrls SemMrl.make_mrs_of_mrl in
                let mrss = Xlist.map mrss SemMrl.mrs_handle_alpha_convertion in
                let fols = Xlist.map mrss (fun mrs ->
                  let l = SemMrl.foll_of_mrs_greedy mrs in
                  if l = [] then failwith "empty fol" else
                  List.hd l) in
                let result = if test_only_flag then result else {result with trees=trees; mrls=fols(*mrls*)} in
                {result with status=Parsed; sem_time=time6 -. time5}
              else {result with status=NotTranslated; sem_time=time6 -. time5}
            with e ->
              let time6 = time_fun () in
              {result with status=SemError; msg=Printexc.to_string e; sem_time=time6 -. time5}
          else
            {result with status=NotReduced}
        with
        | SemTooBig ->
          let time5 = time_fun () in
          {result with status=TooManyNodes; reduction_time=time5 -. time4}
        | e ->
          let time5 = time_fun () in
          {result with status=ReductionError; msg=Printexc.to_string e; reduction_time=time5 -. time4}
      else {result with status=NotParsed}
    with
      Timeout t ->
        let time4 = time_fun () in
        {result with status=ParseTimeout; msg=Printf.sprintf "%f" t; parse_time=time4 -. time3}
    | e ->
        let time4 = time_fun () in
        {result with status=ParseError; msg=Printexc.to_string e; parse_time=time4 -. time3}
  with e ->
    let time3 = time_fun () in
    {result with status=LexiconError; msg=Printexc.to_string e; lex_time=time3 -. time2}

let print_result file result =
  Printf.fprintf file "query: %s\n" result.query;
  (match result.status with
    Idle -> Printf.fprintf file "idle\n"
  | PreprocessingError -> Printf.fprintf file "error_pre: %s\n" result.msg
  | LexiconError -> Printf.fprintf file "error_lex: %s\n" result.msg
  | ParseError -> Printf.fprintf file "error_parse: %s\n" result.msg
  | ParseTimeout -> Printf.fprintf file "timeout: %s\n" result.msg
  | NotParsed -> Printf.fprintf file "not_parsed: paths_size=%d graph_size=%d\n" result.paths_size result.graph_size
  | ReductionError -> Printf.fprintf file "error_reduction: %s\n" result.msg
  | TooManyNodes -> Printf.fprintf file "to_many_nodes: paths_size=%d graph_size=%d\n" result.paths_size result.graph_size
  | NotReduced -> Printf.fprintf file "not_reduced: paths_size=%d graph_size=%d\n" result.paths_size result.graph_size
  | SemError -> Printf.fprintf file "error_sem: %s\n" result.msg
  | NotTranslated -> Printf.fprintf file "not_translated: \n"
  | Parsed -> Printf.fprintf file "parsed: paths_size=%d graph_size=%d term_size=%d\n" result.paths_size result.graph_size result.term_size);
  Printf.fprintf file "times: pre_time1=%f pre_time2=%f lex_time=%f parse_time=%f reduction_time=%f sem_time=%f\n%!"
    result.pre_time1 result.pre_time2 result.lex_time result.parse_time result.reduction_time result.sem_time

let add_result sum_result result =
  let sum_result = {sum_result with no_queries=sum_result.no_queries+1} in
  let sum_result = match result.status with
    Idle -> failwith "sum_result"
  | PreprocessingError -> {sum_result with no_pre_error=sum_result.no_pre_error+1}
  | LexiconError -> {sum_result with no_lex_error=sum_result.no_lex_error+1}
  | ParseError -> {sum_result with no_parse_error=sum_result.no_parse_error+1}
  | ParseTimeout -> {sum_result with no_timeout=sum_result.no_timeout+1}
  | NotParsed -> {sum_result with no_not_parsed=sum_result.no_not_parsed+1}
  | ReductionError -> {sum_result with no_reduction_error=sum_result.no_reduction_error+1}
  | TooManyNodes -> {sum_result with no_too_many_nodes=sum_result.no_too_many_nodes+1}
  | NotReduced -> {sum_result with no_not_reduced=sum_result.no_not_reduced+1}
  | SemError -> {sum_result with no_sem_error=sum_result.no_sem_error+1}
  | NotTranslated -> {sum_result with no_not_translated=sum_result.no_not_translated+1}
  | Parsed -> {sum_result with no_parsed=sum_result.no_parsed+1} in
  {sum_result with
     sum_pre_time1=sum_result.sum_pre_time1 +. result.pre_time1;
     sum_pre_time2=sum_result.sum_pre_time2 +. result.pre_time2;
     sum_lex_time=sum_result.sum_lex_time +. result.lex_time;
     sum_parse_time=sum_result.sum_parse_time +. result.parse_time;
     sum_reduction_time=sum_result.sum_reduction_time +. result.reduction_time;
     sum_sem_time=sum_result.sum_sem_time +. result.sem_time}

let print_sum_result file r =
  Printf.fprintf file "avg_times: pre_time1=%f pre_time2=%f lex_time=%f parse_time=%f reduction_time=%f sem_time=%f\n"
    (r.sum_pre_time1 /. float r.no_queries)
    (r.sum_pre_time2 /. float r.no_queries)
    (r.sum_lex_time /. float r.no_queries)
    (r.sum_parse_time /. float r.no_queries)
    (r.sum_reduction_time /. float r.no_queries)
    (r.sum_sem_time /. float r.no_queries);
  Printf.fprintf file "sum_results: pre_error=%d (%f%%) lex_error=%d (%f%%) parse_error=%d (%f%%) timeout=%d (%f%%) not_parsed=%d (%f%%) reduction_error=%d (%f%%) too_many_nodes=%d (%f%%) not_reduced=%d (%f%%) sem_error=%d (%f%%) not_translated=%d (%f%%) parsed=%d (%f%%)\n%!"
    r.no_pre_error (float r.no_pre_error /. float r.no_queries *. 100.)
    r.no_lex_error (float r.no_lex_error /. float r.no_queries *. 100.)
    r.no_parse_error (float r.no_parse_error /. float r.no_queries *. 100.)
    r.no_timeout (float r.no_timeout /. float r.no_queries *. 100.)
    r.no_not_parsed (float r.no_not_parsed /. float r.no_queries *. 100.)
    r.no_reduction_error (float r.no_reduction_error /. float r.no_queries *. 100.)
    r.no_too_many_nodes (float r.no_too_many_nodes /. float r.no_queries *. 100.)
    r.no_not_reduced (float r.no_not_reduced /. float r.no_queries *. 100.)
    r.no_sem_error (float r.no_sem_error /. float r.no_queries *. 100.)
    r.no_not_translated (float r.no_not_translated /. float r.no_queries *. 100.)
    r.no_parsed (float r.no_parsed /. float r.no_queries *. 100.)

let get_sock_addr host_name port =
  let he = Unix.gethostbyname host_name in
  let addr = he.Unix.h_addr_list in
  Unix.ADDR_INET(addr.(0),port)

let generate_queries filename timeout =
  let queries = File.load_lines filename in
  List.rev (fst (Xlist.fold queries ([],1) (fun (l,id) query ->
    let query = try List.hd (Str.split (Str.regexp "\t") query) with _ -> "" in
    (string_of_int id,(query,timeout)) :: l, id+1)))

let generate_queries_id filename timeout =
  let queries = File.load_lines filename in
  List.rev (Xlist.rev_map queries (fun line ->
    match Str.split (Str.regexp "\t") line with
      [id;query] -> id,(query,timeout)
    | _ -> failwith ("generate_queries_id: " ^ line)))

let test_process_file filename output_filename timeout =
  let queries = generate_queries filename timeout in
  let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
  File.file_out output_filename (fun file ->
    let _ = Xlist.fold queries empty_sum_result (fun sum_result (id,(query,timeout)) ->
      let result = process_query ic oc timeout true id query 10 in
      print_result file result;
      let sum_result = add_result sum_result result in
      print_sum_result file sum_result;
      sum_result) in
    ());
  Printf.fprintf oc "\n%!";
  let _ = Unix.shutdown_connection ic in
  ()

let process_file_id filename output_filename timeout =
  let queries = generate_queries_id filename timeout in
  let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
  File.file_out output_filename (fun file ->
    let _ = Xlist.fold queries empty_sum_result (fun sum_result (id,(query,timeout)) ->
      let result = process_query ic oc timeout true id query 10 in
      print_result file result;
      let sum_result = add_result sum_result result in
      print_sum_result file sum_result;
      sum_result) in
    ());
  Printf.fprintf oc "\n%!";
  let _ = Unix.shutdown_connection ic in
  ()