(* ENIAM_LCGlexicon.ml 13.7 KB *)
(*
 *  ENIAM_LCGlexicon is a library that provides an LCG lexicon for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAM_LCGtypes
open ENIAM_LCGlexiconTypes
open ENIAMcategoriesPL

(* [find_selector s selectors] searches a list of
   (selector, relation, values) triples for the first triple whose
   selector equals [s] and returns the head of its value list.
   Fails with "find_selector 1" when that triple has a relation other
   than [Eq] or an empty value list, and with "find_selector 2" when no
   triple carries [s]. *)
let rec find_selector s selectors =
  match selectors with
  | [] -> failwith "find_selector 2"
  | (t,rel,values) :: rest ->
    if t <> s then find_selector s rest
    else begin
      (* The first triple for [s] decides the outcome; later ones are
         never inspected. *)
      match rel,values with
      | Eq,value :: _ -> value
      | _ -> failwith "find_selector 1"
    end

(* [get_syntax rev rule] scans [rule] from the left for the first element
   of the form [Syntax syntax]. It returns the payload together with the
   rule from which that element has been removed, all other elements
   keeping their original order.
   [rev] accumulates the elements already skipped, in reverse order; the
   caller in this file passes [[]].
   Fails with "get_syntax" when the rule has no [Syntax] element. *)
let rec get_syntax rev rule =
  match rule with
  | Syntax syntax :: rest -> syntax, List.rev_append rev rest
  | other :: rest -> get_syntax (other :: rev) rest
  | [] -> failwith "get_syntax"

(* [get_quant rev rule] scans [rule] from the left for the first element
   of the form [Quant quant]. It returns the payload together with the
   rule from which that element has been removed, all other elements
   keeping their original order.
   [rev] accumulates the elements already skipped, in reverse order; the
   caller in this file passes [[]].
   When the rule has no [Quant] element the result is the empty list
   paired with the unchanged rule. *)
let rec get_quant rev rule =
  match rule with
  | Quant quant :: rest -> quant, List.rev_append rev rest
  | other :: rest -> get_quant (other :: rev) rest
  | [] -> [], List.rev rev

(* [get_bracket rev rule] reports whether [rule] contains a [Bracket]
   element. If so, it returns [true] and the rule with the first such
   element removed; otherwise [false] and the unchanged rule. The
   remaining elements keep their original order in both cases.
   [rev] accumulates the elements already skipped, in reverse order; the
   caller in this file passes [[]]. *)
let rec get_bracket rev rule =
  match rule with
  | Bracket :: rest -> true, List.rev_append rev rest
  | other :: rest -> get_bracket (other :: rev) rest
  | [] -> false, List.rev rev

(* [merge_quant pos_quants quants] combines two lists of (selector, value)
   pairs.
   The result starts with one pair per element of [pos_quants], in the
   same order: the value comes from [quants] when [quants] still has an
   unused binding for that selector, and from [pos_quants] otherwise.
   Bindings of [quants] that were not used this way follow, in the order
   in which [SelectorMap.fold] visits them. *)
let merge_quant pos_quants quants =
  (* Index the explicit quantifiers by selector. *)
  let overrides =
    Xlist.fold quants SelectorMap.empty (fun acc (selector,value) ->
        SelectorMap.add acc selector value) in
  (* One step of the walk over [pos_quants]: a used override is removed
     from [pending], so it is neither applied twice nor appended again
     at the end. *)
  let pick (merged_rev,pending) (selector,default) =
    if not (SelectorMap.mem pending selector) then
      (selector,default) :: merged_rev, pending
    else
      let value = SelectorMap.find pending selector in
      (selector,value) :: merged_rev, SelectorMap.remove pending selector in
  let merged_rev,leftover = Xlist.fold pos_quants ([],overrides) pick in
  (* Whatever is left was not mentioned in [pos_quants]; put it after the
     merged entries. *)
  let all_rev =
    SelectorMap.fold leftover merged_rev (fun acc selector value ->
        (selector,value) :: acc) in
  List.rev all_rev

(* [assign_quantifiers (selectors,rule,weight)] rearranges one lexicon
   entry into [selectors, (bracket,quant,syntax), (rest,weight)].
   - The [Pos] selector of the entry is looked up in [pos_categories] to
     get the selectors for that part of speech; each is paired with [Top]
     as its default value.
   - [syntax] is the payload of the first [Syntax] element of [rule].
   - [quant] merges those defaults with the payload of the first [Quant]
     element of [rule], if any (see [merge_quant]).
   - [bracket] tells whether [rule] contained a [Bracket] element.
   - [rest] is [rule] without the three extracted elements.
   Fails through [find_selector] when no usable [Pos] selector exists,
   with "assign_quantifiers: <pos>" when [pos_categories] has no entry
   for the part of speech, and with "get_syntax" when [rule] has no
   [Syntax] element. *)
let assign_quantifiers (selectors,rule,weight) =
  let pos = find_selector Pos selectors in
  let defaults =
    let names =
      try StringMap.find pos_categories pos
      with Not_found -> failwith ("assign_quantifiers: " ^ pos) in
    Xlist.map names (fun selector -> selector,Top) in
  (* Peel the three special elements off the rule one after another. *)
  let syntax,rest = get_syntax [] rule in
  let explicit,rest = get_quant [] rest in
  let bracket,rest = get_bracket [] rest in
  selectors, (bracket,merge_quant defaults explicit,syntax), (rest,weight)

(* Runs at module initialisation: loads the lexicon from the relative path
   "resources/lexicon-pl.dic" and applies [assign_quantifiers] to every
   entry, keeping the entry order. The resulting list is discarded, so the
   visible effect is limited to whatever the loader does and to any
   exception raised along the way. *)
let _ =
  let entries = ENIAM_LCGlexiconParser.load_lexicon "resources/lexicon-pl.dic" in
  let processed_rev = Xlist.rev_map entries assign_quantifiers in
  List.rev processed_rev

(***
type rule2 =
    Basic of string
  | Quant of (selector * string) list * string
  | Raised of (selector * string) list * string * selector list
  | Quot of (selector * string) list * string
  | Inclusion of string
  | Conj of (selector * string) list * string
  | Bracket of string



let parse_quants_range quant =
  Xlist.map quant (fun (cats,v) -> cats, parse_quant_range (cats,v))
(**
   let parse_rule categories = function
    Basic syntax ->
    let quant = parse_quants_range categories in
    false, quant, parse_syntax syntax, BasicSem(Xlist.map quant fst)
   | Quant(quant,syntax) ->
    let quant = parse_quants_range (merge_quant categories quant) in
    false, quant, parse_syntax syntax, BasicSem(Xlist.map quant fst)
   | Raised(quant,syntax,semantics) ->
    let quant = parse_quants_range (merge_quant categories quant) in
    false, quant, parse_syntax syntax, RaisedSem(Xlist.map quant fst,semantics)
   | Quot(quant,syntax) ->
    let quant = parse_quants_range (merge_quant categories quant) in
    false, quant, parse_syntax syntax, QuotSem(Xlist.map quant fst)
   | Inclusion syntax ->
    let quant = parse_quants_range categories in
    false, quant, parse_syntax syntax, InclusionSem(Xlist.map quant fst)
   | Conj(quant,syntax) ->
    let quant = parse_quants_range (merge_quant categories quant) in
    false, quant, parse_syntax syntax, ConjSem(Xlist.map quant fst)
   | Bracket syntax ->
    let quant = parse_quants_range categories in
    true, quant, parse_syntax syntax, BasicSem(Xlist.map quant fst)

   let parse_grammar grammar =
   List.rev (Xlist.fold grammar [] (fun grammar (selectors,rule,weight) ->
      let selectors = parse_selectors selectors in
      let pos = find_selector Pos selectors in
      let categories =
        try StringMap.find pos_categories pos
        with Not_found -> failwith ("parse_grammar: " ^ pos) in
      let rule = try parse_rule categories rule with Not_found -> failwith ("parse_grammar: " ) in
      (selectors,rule,weight) :: grammar))
 **)

let rec extract_category pat rev = function
    (cat,rel,v) :: l -> if cat = pat then rel,v,(List.rev rev @ l) else extract_category pat ((cat,rel,v) :: rev) l
  | [] -> raise Not_found

let dict_of_grammar grammar =
  (* print_endline "dict_of_grammar"; *)
  Xlist.fold grammar StringMap.empty (fun dict (selectors,(bracket,quant,syntax,semantics),weight) ->
      let pos_rel,poss,selectors = try extract_category Pos [] selectors with Not_found -> failwith "dict_of_grammar 1" in
      let lemma_rel,lemmas,selectors = try extract_category Lemma [] selectors with Not_found -> Eq,[],selectors in
      if pos_rel <> Eq || lemma_rel <> Eq then failwith "dict_of_grammar 2" else
        let rule = selectors,(bracket,quant,syntax,semantics),weight in
        Xlist.fold poss dict (fun dict pos ->
            let dict2,l = try StringMap.find dict pos with Not_found -> StringMap.empty,[] in
            let dict2,l =
              if lemmas = [] then dict2,rule :: l else
                Xlist.fold lemmas dict2 (fun dict2 lemma ->
                    StringMap.add_inc dict2 lemma [rule] (fun l -> rule :: l)),l in
            StringMap.add dict pos (dict2,l)))

(* let rules = dict_of_grammar ENIAM_LCGlexiconPL.grammar *)

(* let translate_negation = function
    (Negation:negation) -> ["neg"]
   | Aff -> ["aff"]
   | NegationUndef -> ["aff";"neg"]
   | NegationNA -> []

   let translate_aspect = function
    (Aspect s:aspect) -> [s]
   | AspectUndef -> ["imperf";"perf"]
   | AspectNA -> []

   let translate_case = function
    (Case s:case) -> [s]
   | CaseUndef -> all_cases
   | _ -> failwith "translate_case"

   let translate_nsem = function
    Common s -> [s]
   | Time -> ["time"]


   let define_valence_selectors = function
    DefaultAtrs(m,r,o,neg,p,a) -> failwith "apply_valence_selectors"
   | EmptyAtrs m -> []
   | NounAtrs(m,nsyn,nsem) -> [Nsyn,Eq,[nsyn];Nsem,Eq,translate_nsem nsem]
   | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> [Case,Eq,translate_case c]
   | PersAtrs(m,le,neg,mo,t,au,a) -> [Negation,Eq,translate_negation neg;Mood,Eq,[mo];Tense,Eq,[t];Aspect,Eq,translate_aspect a]
   | GerAtrs(m,le,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
   | NonPersAtrs(m,le,role,role_attr,neg,a) -> [Negation,Eq,translate_negation neg;Aspect,Eq,translate_aspect a]
   | ComprepAtrs _ -> failwith "apply_valence_selectors" *)

let find_rules rules cats =
  let lex_rules,rules = try StringMap.find rules cats.pos with Not_found -> failwith "find_rules 1" in
  let rules = try StringMap.find lex_rules cats.lemma @ rules with Not_found -> rules in
  Xlist.fold rules [] (fun rules (selectors,(bracket,quant,syntax,semantics),weight) ->
      try
        let cats = apply_selectors cats selectors in
        (cats,(bracket,quant,syntax,semantics),weight) :: rules
      with Not_found -> rules)

(* FIXME: argumenty X i raised i inne *)

(* let render_schema schema =
   Xlist.map schema (function
        {morfs=[Multi args]} as s -> LCGrenderer.dir_of_dir s.dir, Maybe(Plus(Xlist.map args LCGrenderer.make_arg_phrase))
      | s -> LCGrenderer.dir_of_dir s.dir, Plus(Xlist.map s.morfs (LCGrenderer.make_arg []))) *)

(* let assign_valence valence rules =
   Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
      Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence);
      if LCGrenderer.count_avar "schema" syntax > 0 then
        Xlist.fold valence l (fun l -> function
            Frame(attr,schema) ->
              (try
                 let selectors = define_valence_selectors attr in
                 let cats = apply_selectors cats selectors in
                 (cats,(bracket,quant,substitute_schema "schema" (render_schema schema) syntax,semantics),weight) :: l
               with Not_found -> l)
            | _ -> l)
      else (cats,(bracket,quant,syntax,semantics),weight) :: l) *)

let assign_valence valence rules =
  Xlist.fold rules [] (fun l (cats,(bracket,quant,syntax,semantics),weight) ->
      (* Printf.printf "%s |valence|=%d\n" cats.lemma (Xlist.size valence); *)
      if ENIAM_LCGrenderer.count_avar "schema" syntax > 0 then
        Xlist.fold valence l (fun l (selectors,schema) ->
            try
              let cats = apply_selectors cats selectors in
              (cats,(bracket,quant,ENIAM_LCGrenderer.substitute_schema "schema" schema syntax,semantics),weight) :: l
            with Not_found -> l)
      else (cats,(bracket,quant,syntax,semantics),weight) :: l)

(* FIXME: ustawienie wartości symbol *)
(* FIXME: problem z atrybutami przy zamianie kolejności rzędników *)
let make_node id orth lemma cat weight cat_list =
  let attrs = Xlist.fold cat_list(*Xlist.rev_map quant fst*) [] (fun attrs -> function
      | Lemma -> attrs
      | Cat -> ("CAT",SubstVar "cat") :: attrs
      | Number -> ("NUM",SubstVar "number") :: attrs
      | Case -> ("CASE",SubstVar "case") :: attrs
      | Gender -> ("GEND",SubstVar "gender") :: attrs
      | Person -> ("PERS",SubstVar "person") :: attrs
      | Grad -> ("GRAD",SubstVar "grad") :: attrs
      | Praep -> attrs
      | Acm -> ("ACM",SubstVar "acm") :: attrs
      | Aspect -> ("ASPECT", SubstVar "aspect") :: attrs
      | Negation -> ("NEGATION",SubstVar "negation") :: attrs
      | Mood -> ("MOOD", SubstVar "mood") :: attrs
      | Tense -> ("TENSE", SubstVar "tense") :: attrs
      | Nsyn -> ("NSYN", SubstVar "nsyn") :: attrs
      | Nsem -> ("NSEM", SubstVar "nsem") :: attrs
      | Ctype -> ("CTYPE", SubstVar "ctype") :: attrs
      | s -> (string_of_selector s, Dot) :: attrs) in
  (* | "lex" -> ("LEX",Val "+") :: attrs *)
  (* | s -> failwith ("make_node: " ^ (string_of_selector s))) in *)
  {ENIAM_LCGrenderer.empty_node with orth=orth; lemma=lemma; pos=cat; weight=weight; id=id; attrs=List.rev attrs; args=Dot}

type labels = {
  number: string;
  case: string;
  gender: string;
  person: string;
  aspect: string;
}

let get_label e = function
    Number -> e.number
  | Case -> e.case
  | Gender -> e.gender
  | Person -> e.person
  | Aspect -> e.aspect
  | _ -> ENIAM_LCGreductions.get_variant_label ()

let get_labels () = {
  number=ENIAM_LCGreductions.get_variant_label ();
  case=ENIAM_LCGreductions.get_variant_label ();
  gender=ENIAM_LCGreductions.get_variant_label ();
  person=ENIAM_LCGreductions.get_variant_label ();
  aspect=ENIAM_LCGreductions.get_variant_label ();
}

let make_quantification e rules =
  Xlist.map rules (fun (cats,(bracket,quant,syntax,semantics),weight) ->
      let syntax = Xlist.fold (List.rev quant) syntax (fun syntax (cat,t) ->
          let t = if t = Top then ENIAM_LCGrenderer.make_quant_restriction (match_selector cats cat) else t in
          let category = string_of_selector cat in
          WithVar(category,t,get_label e cat,syntax)) in
      let syntax = if bracket then Bracket(true,true,syntax) else Bracket(false,false,syntax) in
      cats,syntax,semantics,weight)

let make_term id orth rules =
  Xlist.map rules (fun (cats,syntax,semantics,weight) ->
      match semantics with
        BasicSem cat_list ->
        let node = make_node id orth cats.lemma cats.pos weight(*+.token.ENIAMtokenizerTypes.weight*) cat_list in
        let semantics = ENIAM_LCGrenderer.make_term node syntax in
        ENIAM_LCGrenderer.simplify (syntax,semantics)
      | _ -> failwith "make_term: ni")
(*cats,bracket,quant,syntax,Dot*)
(**
   let create_entries id orth cats valence =
   Xlist.fold cats [] (fun l cats ->
      (* variable_name_ref := []; *)
      if cats.pos="interp" && cats.lemma="<clause>" then (BracketSet(Forward),Dot) :: l else
      if cats.pos="interp" && cats.lemma="</clause>" then (BracketSet(Backward),Dot) :: l else
        let e = get_labels () in
        (* print_endline "create_entries 1"; *)
        let rules = find_rules rules cats in
        (* print_endline "create_entries 2"; *)
        let rules = assign_valence valence rules in
        (* print_endline "create_entries 3"; *)
        let rules = make_quantification e rules in
        (* print_endline "create_entries 4"; *)
        let rules = make_term id orth rules in
        (* print_endline "create_entries 5"; *)
        rules @ l)
 **)(*
(* FIXME: poprawić i dodać moduł testujący *)
module OrderedIntInt = struct
  type t = int * int
  let compare = compare
end

module IntIntSet = Xset.Make(OrderedIntInt)


let create (paths,last) tokens lex_sems =
  (* uni_weight := 0.; *)
  let chart = LCGchart.make last in
  let chart = Xlist.fold paths chart (fun chart (id,lnode,rnode) ->
      let token = ExtArray.get tokens id in
      let lex_sem = ExtArray.get lex_sems id in
      (*     if t.weight < -0.9 || Xlist.mem t.attrs "notvalidated proper" || Xlist.mem t.attrs "lemmatized as lowercase" then chart else *)
      let chart = LCGchart.add_inc chart lnode rnode (Tensor[Atom ("[" ^ token.ENIAMtokenizerTypes.orth ^ "]")], Dot) 0 in
      LCGchart.add_inc_list chart lnode rnode (create_entries (*tokens lex_sems*) id (token:ENIAMtokenizerTypes.token_record) lex_sem (*false*)) 0) in
  let set = Xlist.fold paths IntIntSet.empty (fun set (_,lnode,rnode) -> IntIntSet.add set (lnode,rnode)) in
  let chart = IntIntSet.fold set chart (fun chart (i,j) -> LCGchart.make_unique chart i j) in
  chart
*)
  ***)