ENIAM_LCGrendererPL.ml 12.6 KB
(*
 *  ENIAM_LCGgrammarPL is a library that provides LCG lexicon form Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAM_LCGgrammarPLtypes
open ENIAM_LCGtypes

let rec extract_category pat rev = function
    (cat,rel,v) :: l -> if cat = pat then rel,v,(List.rev rev @ l) else extract_category pat ((cat,rel,v) :: rev) l
  | [] -> raise Not_found

let dict_of_grammar grammar =
  print_endline "dict_of_grammar";
  Xlist.fold grammar StringMap.empty (fun dict (selectors,(bracket,quant,syntax,semantics),weight) ->
      let pos_rel,poss,selectors = try extract_category Pos [] selectors with Not_found -> failwith "dict_of_grammar 1" in
      let lemma_rel,lemmas,selectors = try extract_category Lemma [] selectors with Not_found -> Eq,[],selectors in
      if pos_rel <> Eq || lemma_rel <> Eq then failwith "dict_of_grammar 2" else
        let rule = selectors,(bracket,quant,syntax,semantics),weight in
        Xlist.fold poss dict (fun dict pos ->
            let dict2,l = try StringMap.find dict pos with Not_found -> StringMap.empty,[] in
            let dict2,l =
              if lemmas = [] then dict2,rule :: l else
                Xlist.fold lemmas dict2 (fun dict2 lemma ->
                    StringMap.add_inc dict2 lemma [rule] (fun l -> rule :: l)),l in
            StringMap.add dict pos (dict2,l)))

let rules = dict_of_grammar ENIAM_LCGlexiconPL.grammar

let match_selector cats = function
    Lemma -> [cats.lemma]
  (* | NewLemma -> [] *)
  | Number -> cats.numbers
  | Case -> cats.cases
  | Gender -> cats.genders
  | Person -> cats.persons
  | Grad -> cats.grads
  | Praep -> cats.praeps
  | Acm -> cats.acms
  | Aspect -> cats.aspects
  | Negation -> cats.negations
  | Mood -> cats.moods
  | Tense -> cats.tenses
  | Nsyn -> cats.nsyn
  | Nsem -> cats.nsem
  | c -> failwith ("match_selector: " ^ string_of_cat c)

let set_selector cats vals = function
    Number -> {cats with numbers=vals}
  | Case -> {cats with cases=vals}
  | Gender -> {cats with genders=vals}
  | Person -> {cats with persons=vals}
  | Grad -> {cats with grads=vals}
  | Praep -> {cats with praeps=vals}
  | Acm -> {cats with acms=vals}
  | Aspect -> {cats with aspects=vals}
  | Negation -> {cats with negations=vals}
  | Mood -> {cats with moods=vals}
  | Tense -> {cats with tenses=vals}
  | Nsyn -> {cats with nsyn=vals}
  | Nsem -> {cats with nsem=vals}
  | c -> failwith ("set_selector: " ^ string_of_cat c)

let rec apply_selectors cats = function
    [] -> cats
  | (sel,Eq,vals) :: l ->
    let vals = StringSet.intersection (StringSet.of_list (match_selector cats sel)) (StringSet.of_list vals) in
    if StringSet.is_empty vals then raise Not_found else
      apply_selectors (set_selector cats (StringSet.to_list vals) sel) l
  | (sel,Neq,vals) :: l ->
    let vals = StringSet.difference (StringSet.of_list (match_selector cats sel)) (StringSet.of_list vals) in
    if StringSet.is_empty vals then raise Not_found else
      apply_selectors (set_selector cats (StringSet.to_list vals) sel) l

(* let translate_negation = function
    (Negation:negation) -> ["neg"]
   | Aff -> ["aff"]
   | NegationUndef -> ["aff";"neg"]
   | NegationNA -> []

   let translate_aspect = function
    (Aspect s:aspect) -> [s]
   | AspectUndef -> ["imperf";"perf"]
   | AspectNA -> []

   let translate_case = function
    (Case s:case) -> [s]
   | CaseUndef -> all_cases
   | _ -> failwith "translate_case"

   let translate_nsem = function
    Common s -> [s]
   | Time -> ["time"]


   let define_valence_selectors = function
    DefaultAtrs(m,r,o,neg,p,a) -> failwith "apply_valence_selectors"
   | EmptyAtrs m -> []
   | NounAtrs(m,nsyn,nsem) -> [Nsyn,Eq,[nsyn];Nsem,Eq,translate_nsem nsem]
   | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> [Case,Eq,translate_case c]
   | PersAtrs(m,le,neg,mo,t,au,a) -> [Negation,Eq,translate_negation neg;Mood,Eq,[mo];Tense,Eq,[t];Aspect,Eq,translate_aspect a]
   | GerAtrs(m,le,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
   | NonPersAtrs(m,le,role,role_attr,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
   | ComprepAtrs _ -> failwith "apply_valence_selectors" *)

let find_rules rules cats =
  let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith "find_rules 1" in
  let rules = try StringMap.find lex_rules cats.lemma @ rules with Not_found -> rules in
  Xlist.fold rules [] (fun rules (selectors,(bracket,quant,syntax,semantics),weight) ->
      try
        let cats = apply_selectors cats selectors in
        (cats,(bracket,quant,syntax,semantics),weight) :: rules
      with Not_found -> rules)

(* let render_schema schema =
   Xlist.map schema (function
        {morfs=[Multi args]} as s -> LCGrenderer.dir_of_dir s.dir, Maybe(Plus(Xlist.map args LCGrenderer.make_arg_phrase))
      | s -> LCGrenderer.dir_of_dir s.dir, Plus(Xlist.map s.morfs (LCGrenderer.make_arg []))) *)

(* FIXME: pomijam NewLemma *)
(* let assign_valence valence rules =
   Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
      Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence);
      if LCGrenderer.count_avar "schema" syntax > 0 then
        Xlist.fold valence l (fun l -> function
            Frame(attr,schema) ->
              (try
                 let selectors = define_valence_selectors attr in
                 let cats = apply_selectors cats selectors in
                 (cats,(bracket,quant,substitute_schema "schema" (render_schema schema) syntax,semantics),weight) :: l
               with Not_found -> l)
            | _ -> l)
      else (cats,(bracket,quant,syntax,semantics),weight) :: l) *)

let assign_valence valence rules =
  Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
      (* Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence); *)
      if ENIAM_LCGrenderer.count_avar "schema" syntax > 0 then
        Xlist.fold valence l (fun l (selectors,schema) ->
            try
              let cats = apply_selectors cats selectors in
              (cats,(bracket,quant,ENIAM_LCGrenderer.substitute_schema "schema" schema syntax,semantics),weight) :: l
            with Not_found -> l)
      else (cats,(bracket,quant,syntax,semantics),weight) :: l)

(* FIXME: ustawienie wartości symbol i arg_symbol *)
let make_node id lemma cat weight cat_list =
  let attrs = Xlist.fold cat_list(*Xlist.rev_map quant fst*) [] (fun attrs -> function
      | Lemma -> attrs
      | Number -> ("NUM",SubstVar "number") :: attrs
      | Case -> ("CASE",SubstVar "case") :: attrs
      | Gender -> ("GEND",SubstVar "gender") :: attrs
      | Person -> ("PERS",SubstVar "person") :: attrs
      | Grad -> ("GRAD",SubstVar "grad") :: attrs
      | Praep -> attrs
      | Acm -> ("ACM",SubstVar "acm") :: attrs
      | Aspect -> ("ASPECT", SubstVar "aspect") :: attrs
      | Negation -> ("NEGATION",SubstVar "negation") :: attrs
      | Mood -> ("MOOD", SubstVar "mood") :: attrs
      | Tense -> ("TENSE", SubstVar "tense") :: attrs
      | Nsyn -> ("NSYN", SubstVar "nsyn") :: attrs
      | Nsem -> ("NSEM", SubstVar "nsem") :: attrs
      | Ctype -> ("CTYPE", SubstVar "ctype") :: attrs
      (* | "lex" -> ("LEX",Val "+") :: attrs *)
      | s -> failwith ("make_node: " ^ (string_of_cat s))) in
  {ENIAM_LCGrenderer.empty_node with pred=lemma; cat=cat; weight=weight; id=id; attrs=List.rev attrs; args=Dot}

(* FIXME: przenieść niezależne od języka procedury do ENIAM_LCGrenderer *)
let variable_name_ref = ref []

let rec add_variable_name = function
    [] -> ["a"]
  | "z" :: l -> "a" :: add_variable_name l
  | s :: l -> String.make 1 (Char.chr (Char.code (String.get s 0) + 1)) :: l

let get_variable_name () =
  variable_name_ref := add_variable_name (!variable_name_ref);
  String.concat "" (List.rev (!variable_name_ref))

let rec make_term_arg = function
    Tensor l -> let v = get_variable_name () in v, Cut(Var v)
  | Plus l -> let v = get_variable_name () in v, Case(Var v,Xlist.map l make_term_arg)
  (* | Imp(s,d,t2) -> *)
  | One -> get_variable_name (), Dot
  | Maybe s ->
    let v,arg = make_term_arg s in
    let w = get_variable_name () in
    w, Fix(Var w,Lambda(v,arg))
  | _ -> failwith "make_term_arg"

let add_args node args =
  {node with args=Tuple(node.args :: args)}

let rec make_term_imp node = function
  | Imp(s,d,t2) ->
    let v,arg = make_term_arg t2 in
    Lambda(v,make_term_imp (add_args node [arg]) s)
  | ImpSet(s,l) ->
    let vars,args = List.split (Xlist.map l (fun (_,t) -> make_term_arg t)) in
    LambdaSet(vars,make_term_imp (add_args node args) s)
  | Tensor l -> Node node
  | _ -> failwith "make_term_imp"

let make_term id token rules =
  Xlist.map rules (fun (cats,(bracket,quant,syntax,semantics),weight) ->
      match semantics with
        BasicSem cat_list ->
        let node = make_node id cats.lemma cats.pos (weight+.token.ENIAMtokenizerTypes.weight) cat_list in
        cats,bracket,quant,syntax,make_term_imp node syntax
      | _ -> failwith "make_term: ni")
(*cats,bracket,quant,syntax,Dot*)

type labels = {
  number: string;
  case: string;
  gender: string;
  person: string;
  aspect: string;
}

let get_label e = function
    Number -> e.number
  | Case -> e.case
  | Gender -> e.gender
  | Person -> e.person
  | Aspect -> e.aspect
  | _ -> ENIAM_LCGreductions.get_variant_label ()

let get_labels () = {
  number=ENIAM_LCGreductions.get_variant_label ();
  case=ENIAM_LCGreductions.get_variant_label ();
  gender=ENIAM_LCGreductions.get_variant_label ();
  person=ENIAM_LCGreductions.get_variant_label ();
  aspect=ENIAM_LCGreductions.get_variant_label ();
}


let make_quantification e rules =
  Xlist.map rules (fun (cats,bracket,quant,syntax,semantics) ->
      let syntax,semantics = Xlist.fold (List.rev quant) (syntax,semantics) (fun (syntax,semantics) (cat,t) ->
          let t = if t = Top then ENIAM_LCGlexiconPL.make_symbol (match_selector cats cat) else t in
          let category = string_of_cat cat in
          ENIAM_LCGrenderer.simplify_withvar (WithVar(category,t,get_label e cat,syntax), VariantVar(category,semantics))) in
      if bracket then Bracket(true,true,syntax),semantics else Bracket(false,false,syntax),semantics)

(* FIXME: poprawić i dodać moduł testujący *)
(*
let create_entries id token lex_sem =
  Xlist.fold lex_sem.cats [] (fun l cats ->
      variable_name_ref := [];
      if cats.pos="interp" && cats.lemma="<clause>" then (BracketSet(Forward),Dot) :: l else
      if cats.pos="interp" && cats.lemma="</clause>" then (BracketSet(Backward),Dot) :: l else
        let e = get_labels () in
        (* print_endline "create_entries 1"; *)
        let rules = find_rules rules cats in
        (* print_endline "create_entries 2"; *)
        let rules = assign_valence lex_sem.very_simple_valence rules in
        (* print_endline "create_entries 3"; *)
        let rules = make_term id token rules in
        (* print_endline "create_entries 4"; *)
        let rules = make_quantification e rules in
        (* print_endline "create_entries 5"; *)
        rules @ l)

module OrderedIntInt = struct
  type t = int * int
  let compare = compare
end

module IntIntSet = Xset.Make(OrderedIntInt)


let create (paths,last) tokens lex_sems =
  (* uni_weight := 0.; *)
  let chart = LCGchart.make last in
  let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
      let token = ExtArray.get tokens id in
      let lex_sem = ExtArray.get lex_sems id in
      (*     if t.weight < -0.9 || Xlist.mem t.attrs "notvalidated proper" || Xlist.mem t.attrs "lemmatized as lowercase" then chart else *)
      let chart = LCGchart.add_inc chart lnode rnode (Tensor[Atom ("[" ^ token.ENIAMtokenizerTypes.orth ^ "]")], Dot) 0 in
      LCGchart.add_inc_list chart lnode rnode (create_entries (*tokens lex_sems*) id (token:ENIAMtokenizerTypes.token_record) lex_sem (*false*)) 0) in
  let set = Xlist.fold paths IntIntSet.empty (fun set (_,lnode,rnode) -> IntIntSet.add set (lnode,rnode)) in
  let chart = IntIntSet.fold set chart (fun chart (i,j) -> LCGchart.make_unique chart i j) in
  chart
*)