semparser.ml 13 KB

Edit Raw Blame History

open Xstd
open ENIAMsubsyntaxTypes

let rules = ENIAM_LCGlexicon.make_rules false ENIAM_LCGlexiconTypes.user_lexicon_filename

let load_senses_map filename =
  File.fold_tab filename StringMap.empty (fun map -> function
    [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l)
  | l -> failwith ("load_senses_map: " ^ String.concat "\t" l))

let senses_map = load_senses_map ENIAM_LCGlexiconTypes.user_senses_filename


let examples = [
  (* "liceum","W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie."; *)
  "studia","Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994.";
]

let clarify_categories token =
  match token.ENIAMtokenizerTypes.token with
    ENIAMtokenizerTypes.Lemma(lemma,pos,interp) ->
      let senses = try StringMap.find senses_map lemma with Not_found -> ["X"] in
      List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories false senses (lemma,pos,interp)))
  | ENIAMtokenizerTypes.Proper(lemma,pos,interp,senses) -> List.flatten (Xlist.map interp (fun interp -> ENIAMcategoriesPL.clarify_categories true senses (lemma,pos,interp)))
  | ENIAMtokenizerTypes.Interp lemma -> ENIAMcategoriesPL.clarify_categories false ["X"] (lemma,"interp",[])
  | _ -> []

let create_chart tokens paths last =
  ENIAM_LCGrenderer.reset_variable_numbers ();
  let chart = ENIAM_LCGchart.make last in
  let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
      let t = ExtArray.get tokens id in
      ENIAM_LCGrenderer.reset_variable_names ();
      ENIAM_LCGrenderer.add_variable_numbers ();
      let cats = clarify_categories t in
      let l = ENIAM_LCGlexicon.create_entries rules id t.ENIAMtokenizerTypes.orth cats [] in
      ENIAM_LCGchart.add_inc_list chart lnode rnode l 0) in
  chart

let test_example name tokens paths last =
  ENIAM_LCGreductions.reset_variant_label ();
  let chart = create_chart tokens paths last in
  ENIAM_LCGlatexOf.print_chart "results/" (name^"1_chart") "a1" chart;
  let chart,references = ENIAM_LCGchart.lazify chart in
  ENIAM_LCGlatexOf.print_chart "results/" (name^"2_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references "results/" (name^"2_references") "a4" references;
  let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in (* uwaga: niejawna zmiana imperatywna w references *)
  ENIAM_LCGlatexOf.print_chart "results/" (name^"3_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references "results/" (name^"3_references") "a4" references;
  if ENIAM_LCGchart.is_parsed chart then (
    let term = ENIAM_LCGchart.get_parsed_term chart in
    Xlatex.latex_file_out "results/" (name^"4_term") "a4" false (fun file ->
        Printf.fprintf file "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
    Xlatex.latex_compile_and_clean "results/" (name^"4_term");
    let dependency_tree = ENIAM_LCGreductions.reduce term references in
    ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"4_dependency_tree") "a0" dependency_tree;
    if ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree then (
      ENIAM_LCGreductions.assign_labels dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
      ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"5_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGreductions.remove_cuts dependency_tree; (* uwaga: niejawna zmiana imperatywna w dependency_tree *)
      ENIAM_LCGlatexOf.print_dependency_tree "results/" (name^"6_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGgraphOf.print_dependency_tree "results/" (name^"6_dependency_tree") dependency_tree;
      ENIAM_LCGgraphOf.print_simplified_dependency_tree "results/" (name^"6_simple_dependency_tree") dependency_tree;
      ())
    else print_endline "not reduced")
  else print_endline "not parsed"

let rec parse_sentence name id tokens = function
    RawSentence s -> id
  | StructSentence(paths,last) ->
    test_example (name ^ string_of_int id ^ "_") tokens paths last;
    id + 1
  | DepSentence(paths) -> id
  | QuotedSentences sentences ->
    Xlist.fold sentences id (fun id p ->
        parse_sentence name id tokens p.sentence)
  | AltSentence l ->
    Xlist.fold l id (fun id (mode,sentence) ->
        parse_sentence name id tokens sentence)

let rec parse_paragraph name id tokens = function
    RawParagraph s -> id
  | StructParagraph sentences ->
    Xlist.fold sentences id (fun id p ->
        parse_sentence name id tokens p.sentence)
  | AltParagraph l ->
    Xlist.fold l id (fun id (mode,paragraph) ->
        parse_paragraph name id tokens paragraph)

let rec parse_text name id tokens = function
    RawText s -> id
  | StructText paragraphs ->
    Xlist.fold paragraphs id (fun id paragraph ->
      parse_paragraph name id tokens paragraph)
  | AltText l ->
    Xlist.fold l id (fun id (mode,text) ->
      parse_text name id tokens text)


(* let _ =
  Xlist.iter examples (fun (name,example) ->
      let text,tokens = ENIAMsubsyntax.parse_text example in
      ignore(parse_text name 1 tokens text)) *)

(*
type entry = {title: string; info:string; biogram:string; (*primary:string; secondary:string;*) author:string}

let process_xml = function
    Xml.Element("entries",[],entries) ->
    List.rev (Xlist.rev_map entries (function
          Xml.Element("entry",[],[title;info;biogram(*;primary;secondary*);author]) ->
          {title=Xml.to_string title; info=Xml.to_string info; biogram=Xml.to_string biogram;
           (*primary=Xml.to_string primary; secondary=Xml.to_string secondary;*) author=Xml.to_string author}
        | _ -> failwith "process_xml 1"))
  | _ -> failwith "process_xml 2"


let load_ppibl filename =
  let ppibl = File.load_file_gen ("data/" ^ filename) in
  process_xml (Xml.parse_string ppibl)

let named_entities =
  File.fold_tab "data/ne.tab" StringMap.empty (fun map -> function
        [lemma;cat] -> StringMap.add_inc map lemma [cat] (fun l -> cat :: l)
      | _ -> failwith "named_entities")

let assign_named_entities t =
  match t.token with
    Lemma(lemma,"subst",interp) ->
    (try
      let cat = StringMap.find named_entities lemma in
      {t with token=Proper(lemma,"subst",interp,cat)}
     with Not_found -> t)
  | Proper(lemma,"subst",interp,_) ->
      (try
        let cat = StringMap.find named_entities lemma in
        {t with token=Proper(lemma,"subst",interp,cat)}
       with Not_found -> t)
  | _ -> t

let test_strings = [
  (* "Debiutował opowiadaniem pt. <i>Zlecenie na dostawę</i>.";  *)
  "W 1984-89 uczęszczał do VII Liceum Ogólnokształcącego im. K.K. Baczyńskiego w Szczecinie.";
  (* "Następnie studiował architekturę na Politechnice Szczecińskiej, dyplom uzyskał w 1994." *)
  (* "W 2003 obronił doktorat nauk technicznych w zakresie architektury i urbanistyki na Politechnice Krakowskiej i został adiunktem w Zakładzie Teorii Architektury, Historii i Konserwacji Zabytków IAiPP." *)
]

(* let _ =
  let entries = load_ppibl "ak322269.xml" in
  Xlist.iter entries (fun entry -> print_endline entry.biogram) *)

(*
let test_strings = [
  "Szpak frunie.";
  "Kot np. miauczy.";
  "Ala ma kota.";
  "Ale mają kota:"
  ]

let test_strings2 = [
  "Szpak frunie. Kot miauczy.";
  "Szpak powiedział: „Frunę. Kiszę.”";
  ]
*)

let grammar = [
  "pos=year",                 Basic "year",symbol_weight;
  "pos=year-interval",        Basic "year-interval",symbol_weight;
  "lemma=w,pos=prep,case=loc", Basic "time/(year+year-interval)",0.;
  "lemma=w,pos=prep,case=loc", Basic "locat/np*MIASTO*T*loc*T",0.;

  "lemma=uczęszczać,pos=praet|fin,person=ter,negation=aff,mood=indicative", Basic "ip*number*gender{|(1+time),|(1+pp*ORGANIZACJA*do*gen),|(1+locat)}",0.;
  "lemma=do,pos=prep,case=gen", Basic "pp*sense*lemma*case/np*sense*T*case*T",0.;

]

let _ =
  print_endline "Testy wbudowane";
  Xlist.iter test_strings (fun s ->
    print_endline ("\nTEST: " ^ s);
    let paths = ENIAMsubsyntax.parse s in
    let paths = Xlist.map paths assign_named_entities in
    (* print_endline (ENIAMtokenizer.xml_of tokens); *)
    print_endline (ENIAMpaths.to_string (paths,0)));
(*  Xlist.iter test_strings2 (fun s ->
    print_endline ("\nTEST: " ^ s);
    let text,tokens = ENIAMsubsyntax.parse_text s in
    (* print_endline (ENIAMtokenizer.xml_of tokens); *)
    print_endline (ENIAMsubsyntaxStringOf.tokens tokens);
    print_endline "";
    print_endline (ENIAMsubsyntaxStringOf.text "" tokens text));*)
(*  print_endline "Testy użytkownika.";
  print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
  let s = ref (read_line ()) in
  while !s <> "" do
    let tokens = ENIAMtokenizer.parse !s in
    (* print_endline (ENIAMtokenizer.xml_of tokens); *)
    Xlist.iter tokens (fun token -> print_endline (ENIAMtokenizer.string_of 0 token));
    print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy.";
    s := read_line ()
  done;*)
  ()

open ENIAM_LCGlexiconTypes
open ENIAM_LCGtypes


(*
type output = Text | Xml | Html | Marsh | Graphviz

let output = ref Text
let comm_stdio = ref true
let sentence_split = ref true
let port = ref 0

let spec_list = [
  "-s", Arg.Unit (fun () -> sentence_split:=true), "Split input into sentences (default)";
  "-n", Arg.Unit (fun () -> sentence_split:=false), "Do not split input into sentences";
  "-i", Arg.Unit (fun () -> comm_stdio:=true), "Communication using stdio (default)";
  "-p", Arg.Int (fun p -> comm_stdio:=false; port:=p), "<port> Communication using sockets on given port number";
  "-t", Arg.Unit (fun () -> output:=Text), "Output as plain text (default)";
  "-x", Arg.Unit (fun () -> output:=Xml), "Output as XML";
  "-m", Arg.Unit (fun () -> output:=Marsh), "Output as marshalled Ocaml data structure";
  "-h", Arg.Unit (fun () -> output:=Html), "Output as HTML";
  "-g", Arg.Unit (fun () -> output:=Graphviz; sentence_split:=false), "Output as graphviz dot file; turns sentence split off";
  (* "-r", Arg.String (fun p ->
        ENIAMtokenizerTypes.set_resource_path p;
        ENIAMmorphologyTypes.set_resource_path p;
        ENIAMsubsyntaxTypes.set_resource_path p), "<path> Set resource path"; *)
  ]

let usage_msg =
  "Usage: subsyntax <options>\nInput is a sequence of lines. Empty line ends the sequence and invoke parsing. Double empty line shutdown parser.\nOptions are:"
*)*)
let message = "ENIAM_LCGparser, a parser for Logical Categorial Grammar formalism\n\
Copyright (C) 2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>\n\
Copyright (C) 2017 Institute of Computer Science Polish Academy of Sciences"
(*
let anon_fun s = raise (Arg.Bad ("invalid argument: " ^ s))
*)
let input_text channel =
  let s = ref (try input_line channel with End_of_file -> "") in
  let lines = ref [] in
  while !s <> "" do
    lines := !s :: !lines;
    s := try input_line channel with End_of_file -> ""
  done;
  String.concat "\n" (List.rev !lines)

let rec main_loop sub_in sub_out in_chan out_chan =
  let text = input_text in_chan in
  if text = "" then () else (
    Printf.fprintf sub_out "%s\n\n%!" text;
    let text,tokens = (Marshal.from_channel sub_in : ENIAMsubsyntaxTypes.text * ENIAMtokenizerTypes.token_env ExtArray.t) in
      (* let text,tokens = ENIAMsubsyntax.parse_text text in *)
    ignore(parse_text "E"(*name*) 1 tokens text)
    (* print_endline "input text begin";
    print_endline text;
    print_endline "input text end"; *)
    (*if !sentence_split then
       let text,tokens = ENIAMsubsyntax.parse_text text in
       (match !output with
          Text -> output_string out_chan (ENIAMsubsyntaxStringOf.text "" tokens text ^ "\n" ^ ENIAMsubsyntaxStringOf.token_extarray tokens ^ "\n\n")
        | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.text_and_tokens text tokens) ^ "\n\n")
        | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.text_and_tokens text tokens ^ "\n\n")
        | Marsh -> Marshal.to_channel out_chan (text,tokens) []
        | Graphviz -> failwith "main_loop: ni")
    else
      let tokens = ENIAMsubsyntax.parse text in
      (match !output with
         Text -> output_string out_chan (ENIAMsubsyntaxStringOf.token_list tokens ^ "\n\n")
       | Xml -> output_string out_chan (Xml.to_string (ENIAMsubsyntaxXMLof.token_list tokens) ^ "\n\n")
       | Html -> output_string out_chan (ENIAMsubsyntaxHTMLof.token_list tokens ^ "\n\n")
       | Marsh -> Marshal.to_channel out_chan tokens []
       | Graphviz -> output_string out_chan (ENIAMsubsyntaxGraphOf.token_list tokens ^ "\n\n"))*);
    flush out_chan;
    main_loop sub_in sub_out in_chan out_chan)

let get_sock_addr host_name port =
  let he = Unix.gethostbyname host_name in
  let addr = he.Unix.h_addr_list in
  Unix.ADDR_INET(addr.(0),port)

let sub_host = "localhost"
let sub_port = 5739

let _ =
  prerr_endline message;
  (* ENIAMsubsyntax.initialize (); *)
  ENIAMcategoriesPL.initialize ();
  (* Arg.parse spec_list anon_fun usage_msg; *)
  Gc.compact ();
  let sub_in,sub_out = Unix.open_connection (get_sock_addr sub_host sub_port) in
  prerr_endline "Ready!";
  (*if !comm_stdio then*) main_loop sub_in sub_out stdin stdout
  (*else
    let sockaddr = Unix.ADDR_INET(Unix.inet_addr_any,!port) in
    Unix.establish_server main_loop sockaddr*)