(* test2.ml *)
(*
 *  ENIAM_LCGlexicon is a library that provides an LCG lexicon for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAM_LCGlexiconTypes
open ENIAM_LCGtypes
open ENIAMsubsyntaxTypes

(* Lexicon rules used by [create_chart], built once when the module is
   initialised from the file named by [ENIAM_LCGlexiconTypes.rules_filename].
   NOTE(review): the meaning of the [false] flag is defined by
   [ENIAM_LCGlexicon.make_rules] - confirm against its interface. *)
let rules =
  let filename = ENIAM_LCGlexiconTypes.rules_filename in
  ENIAM_LCGlexicon.make_rules false filename

(* Test inputs as (name, text) pairs.  [name] becomes the prefix of the
   output file names produced for that text; [text] is the raw input that is
   parsed.  Entries inside comments are examples that are currently disabled. *)
let examples = [
  (* "Szpak","Szpak śpiewa."; *)
  (* "miał","Miałem miał."; *)
  (* "Ala","Ala ma kota.";
     "Ale","Ale mają kota:"; *)
  (* "zima","Szpak frunie zimą."; *)
  (* "październik","Kot miauczy w październiku."; *)
  (* "Szpak-Kot","Szpak frunie. Kot miauczy.";
     "powiedział","Szpak powiedział: „Frunę. Kiszę.”"; *)
  "teraz","Teraz frunie jakiś szpak.";
  "chłopcy","Chłopcy mają ulicę kwiatami.";
  (* "arabia","Arabia Saudyjska biegnie."; *)
  (* "Tom","Tom idzie."; *)
]

(* [clarify_categories senses token] collects the results of
   [ENIAMcategoriesPL.clarify_categories] for the content of [token]:
   - [Lemma] and [Proper] tokens: one call per interpretation, results
     concatenated; the boolean argument is [true] only for [Proper];
   - [Interp] tokens: a single call with part of speech "interp" and an
     empty interpretation;
   - any other token kind: the empty list. *)
let clarify_categories senses token =
  (* Runs the clarification once per interpretation and flattens the result. *)
  let expand is_proper lemma pos interps =
    List.flatten
      (Xlist.map interps (fun interp ->
           ENIAMcategoriesPL.clarify_categories is_proper senses (lemma,pos,interp))) in
  match token.ENIAMtokenizerTypes.token with
  | ENIAMtokenizerTypes.Lemma(lemma,pos,interps) ->
    expand false lemma pos interps
  | ENIAMtokenizerTypes.Proper(lemma,pos,interps,_) ->
    expand true lemma pos interps
  | ENIAMtokenizerTypes.Interp lemma ->
    ENIAMcategoriesPL.clarify_categories false senses (lemma,"interp",[])
  | _ -> []

(* [create_chart tokens lex_sems paths last] builds a chart created with
   [ENIAM_LCGchart.make last] and adds, for every [(id, lnode, rnode)] in
   [paths], the lexicon entries generated for token [id] between nodes
   [lnode] and [rnode].

   [tokens] and [lex_sems] are [ExtArray]s indexed by token id.  The
   renderer's variable counters are reset before the chart is built and
   updated before each token is processed. *)
let create_chart tokens lex_sems paths last =
  ENIAM_LCGrenderer.reset_variable_numbers ();
  let initial_chart = ENIAM_LCGchart.make last in
  (* Adds the entries for a single path edge to the chart. *)
  let add_edge chart (id,lnode,rnode) =
    let token = ExtArray.get tokens id in
    let lex_sem = ExtArray.get lex_sems id in
    ENIAM_LCGrenderer.reset_variable_names ();
    ENIAM_LCGrenderer.add_variable_numbers ();
    let cats = clarify_categories ["X"] token in
    let entries =
      ENIAM_LCGlexicon.create_entries rules id
        token.ENIAMtokenizerTypes.orth cats
        lex_sem.ENIAMlexSemanticsTypes.schemata in
    ENIAM_LCGchart.add_inc_list chart lnode rnode entries 0 in
  Xlist.fold paths initial_chart add_edge

(* [test_example name tokens lex_sems paths last] runs the parsing pipeline
   on one sentence and hands every intermediate structure to the
   [ENIAM_LCGlatexOf] / [ENIAM_LCGgraphOf] printers with output directory
   "results/" and file names prefixed by [name]:
   1. build the chart, 2. lazify it, 3. parse it,
   4. extract and reduce the parsed term into a dependency tree,
   5. assign labels, 6. remove cuts.
   Prints "not parsed" or "not reduced" on standard output when the
   corresponding stage fails; later stages are then skipped. *)
let test_example name tokens lex_sems paths last =
  let dir = "results/" in
  let out_name suffix = name ^ suffix in
  ENIAM_LCGreductions.reset_variant_label ();
  let chart = create_chart tokens lex_sems paths last in
  ENIAM_LCGlatexOf.print_chart dir (out_name "1_chart") "a1" chart;
  let chart,references = ENIAM_LCGchart.lazify chart in
  ENIAM_LCGlatexOf.print_chart dir (out_name "2_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references dir (out_name "2_references") "a4" references;
  (* Note: [parse] implicitly modifies [references] in place.
     NOTE(review): 30. together with [Sys.time] looks like a time limit in
     seconds - confirm against [ENIAM_LCGchart.parse]. *)
  let chart = ENIAM_LCGchart.parse chart references 30. Sys.time in
  ENIAM_LCGlatexOf.print_chart dir (out_name "3_chart") "a4" chart;
  ENIAM_LCGlatexOf.print_references dir (out_name "3_references") "a4" references;
  if not (ENIAM_LCGchart.is_parsed chart) then print_endline "not parsed"
  else begin
    let term = ENIAM_LCGchart.get_parsed_term chart in
    Xlatex.latex_file_out dir (out_name "4_term") "a4" false (fun out ->
        Printf.fprintf out "\\[%s\\]\n" (ENIAM_LCGlatexOf.linear_term 0 term));
    Xlatex.latex_compile_and_clean dir (out_name "4_term");
    let dependency_tree = ENIAM_LCGreductions.reduce term references in
    ENIAM_LCGlatexOf.print_dependency_tree dir (out_name "4_dependency_tree") "a0" dependency_tree;
    if not (ENIAM_LCGreductions.is_reduced_dependency_tree dependency_tree) then
      print_endline "not reduced"
    else begin
      (* Note: [assign_labels] implicitly modifies [dependency_tree] in place. *)
      ENIAM_LCGreductions.assign_labels dependency_tree;
      ENIAM_LCGlatexOf.print_dependency_tree dir (out_name "5_dependency_tree") "a4" dependency_tree;
      (* Note: [remove_cuts] implicitly modifies [dependency_tree] in place. *)
      ENIAM_LCGreductions.remove_cuts dependency_tree;
      ENIAM_LCGlatexOf.print_dependency_tree dir (out_name "6_dependency_tree") "a4" dependency_tree;
      ENIAM_LCGgraphOf.print_dependency_tree dir (out_name "6_dependency_tree") dependency_tree;
      ENIAM_LCGgraphOf.print_simplified_dependency_tree dir (out_name "6_simple_dependency_tree") dependency_tree
    end
  end

(* [parse_sentence name id tokens lex_sems sentence] runs [test_example] on
   every [StructSentence] reachable from [sentence], descending through
   quoted and alternative sentences.  Each processed sentence gets the
   output prefix [name ^ string_of_int id ^ "_"].  Raw and dependency
   sentences are skipped.
   Returns the next unused id: [id] plus the number of sentences processed. *)
let rec parse_sentence name id tokens lex_sems sentence =
  let recurse id sentence = parse_sentence name id tokens lex_sems sentence in
  match sentence with
  | RawSentence _ | DepSentence _ -> id
  | StructSentence(paths,last) ->
    let prefix = name ^ string_of_int id ^ "_" in
    test_example prefix tokens lex_sems paths last;
    id + 1
  | QuotedSentences quoted ->
    Xlist.fold quoted id (fun id p -> recurse id p.sentence)
  | AltSentence alternatives ->
    Xlist.fold alternatives id (fun id (_,alternative) -> recurse id alternative)

(* [parse_paragraph name id tokens lex_sems paragraph] applies
   [parse_sentence] to every sentence of a structured paragraph, descending
   through alternative paragraphs and threading the sentence counter [id]
   through.  Raw paragraphs are skipped.  Returns the next unused id. *)
let rec parse_paragraph name id tokens lex_sems paragraph =
  match paragraph with
  | RawParagraph _ -> id
  | StructParagraph sentences ->
    Xlist.fold sentences id (fun next_id p ->
        parse_sentence name next_id tokens lex_sems p.sentence)
  | AltParagraph alternatives ->
    Xlist.fold alternatives id (fun next_id (_,alternative) ->
        parse_paragraph name next_id tokens lex_sems alternative)

(* [parse_text name id tokens lex_sems text] applies [parse_paragraph] to
   every paragraph of a structured text, descending through alternative
   texts and threading the sentence counter [id] through.  Raw texts are
   skipped.  Returns the next unused id. *)
let rec parse_text name id tokens lex_sems text =
  match text with
  | RawText _ -> id
  | StructText paragraphs ->
    Xlist.fold paragraphs id (fun next_id paragraph ->
        parse_paragraph name next_id tokens lex_sems paragraph)
  | AltText alternatives ->
    Xlist.fold alternatives id (fun next_id (_,alternative) ->
        parse_text name next_id tokens lex_sems alternative)


(* Entry point: initialises the subsyntax and category modules, then for
   every (name, text) pair in [examples] tokenises the text, assigns lexical
   semantics and runs the parser on each sentence, numbering sentences from 1.

   [let () =] is used instead of [let _ =] so that the compiler checks the
   whole body has type [unit]; the discarded result of [parse_text] (the next
   free sentence id) is annotated so that an accidental partial application
   is a type error instead of being silently ignored. *)
let () =
  ENIAMsubsyntax.initialize ();
  ENIAMcategoriesPL.initialize ();
  Xlist.iter examples (fun (name,example) ->
      let text,tokens = ENIAMsubsyntax.parse_text example in
      let lex_sems = ENIAMlexSemantics.assign tokens text in
      ignore (parse_text name 1 tokens lex_sems text : int))