ENIAMlexSemantics.ml 13.7 KB
(*
 *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
open ENIAMlexSemanticsTypes
open ENIAMwalTypes
open Xstd

let find_meaning m =
  try
    ENIAMplWordnet.find_meaning m.plwnluid
  with Not_found ->
    m.name ^ "-" ^ m.variant, [], unknown_meaning_weight

let lex_sie = LCG (ENIAMwalRenderer.render_morf (SimpleLexArg("się",QUB)))

let find_senses t s = (* FIXME: sensy zawierające 'się' *)
  let set = Xlist.fold s.frames StringSet.empty (fun set (selectors,meanings,positions) ->
    Xlist.fold meanings set (fun set (name,hipero,weight) ->
      StringSet.add set name)) in
  let senses = match t.token with
      Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
    | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
    | _ -> [] in
  let senses = Xlist.fold senses [] (fun senses (name,hipero,weight) ->
    if StringSet.mem set name then senses else (name,hipero,weight) :: senses) in
  let senses_sie = match t.token with
      Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses (lemma ^ " się") pos
    | Proper(_,_,_,senses) -> []
    | _ -> [] in
  let senses_sie = Xlist.fold senses_sie [] (fun senses_sie (name,hipero,weight) ->
    if StringSet.mem set name then senses_sie else (name,hipero,weight) :: senses_sie) in
  let frames = if senses = [] then s.frames else ([],senses,[]) :: s.frames in
  let frames = if senses_sie = [] then frames else ([],senses_sie,
    [{empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]}]) :: frames in
  {s with frames=frames}

let find_selprefs schema = (* FIXME: RelationRole *)
  Xlist.map schema (fun p ->
      let l = Xlist.fold p.sel_prefs [] (fun l -> function
          SynsetId id -> (try ENIAMplWordnet.synset_name id :: l with ENIAMplWordnet.SynsetNotFound -> l)
        | Predef s -> s :: l
        | SynsetName _ -> failwith "find_selprefs"
        | RelationRole _ -> l) in
      let l = if l = [] then ["ALL"] else l in
      {p with sel_prefs=Xlist.map l (fun s -> SynsetName s)})

let rec find a l i =
  if a.(i) = max_int then (
    a.(i) <- i;
    i) else
  if a.(i) = i then (
    Xlist.iter l (fun j -> a.(j) <- i);
    i) else
  find a (i :: l) a.(i)

let union a i j =
  if i = j then i else
  let x = min i j in
  let y = max i j in
  a.(y) <- x;
  x

let rec split_tokens_into_groups_sentence a = function
    RawSentence s -> ()
  | StructSentence([],_) -> ()
  | StructSentence((id,_,_) :: paths,_) ->
      ignore (Xlist.fold paths (find a [] id) (fun m (id,_,_) ->
        union a m (find a [] id)))
  | DepSentence(paths) ->
      if Array.length paths = 0 then () else
      let id,_,_ = paths.(0) in
      ignore (Int.fold 1 (Array.length paths - 1) (find a [] id) (fun m i ->
        let id,_,_ = paths.(i) in
        union a m (find a [] id)))
  | QuotedSentences sentences ->
      Xlist.iter sentences (fun p ->
        split_tokens_into_groups_sentence a p.sentence)
  | AltSentence l -> Xlist.iter l (fun (mode,sentence) ->
        split_tokens_into_groups_sentence a sentence)

let rec split_tokens_into_groups_paragraph a = function
    RawParagraph s -> ()
  | StructParagraph sentences ->
      Xlist.iter sentences (fun p -> split_tokens_into_groups_sentence a p.sentence)
  | AltParagraph l -> Xlist.iter l (fun (mode,paragraph) ->
      split_tokens_into_groups_paragraph a paragraph)

let rec split_tokens_into_groups_text a = function
    RawText s -> ()
  | StructText paragraphs ->
      Xlist.iter paragraphs (split_tokens_into_groups_paragraph a)
  | AltText l -> Xlist.iter l (fun (mode,text) ->
      split_tokens_into_groups_text a text)

let split_tokens_into_groups size text =
  let a = Array.make size max_int in
  split_tokens_into_groups_text a text;
  Int.iter 1 (Array.length a - 1) (fun i ->
    if a.(i) <> max_int then a.(i) <- a.(a.(i)));
  let map = Int.fold 1 (Array.length a - 1) IntMap.empty (fun map i ->
    if a.(i) = max_int then map else
    IntMap.add_inc map a.(i) [i] (fun l -> i :: l)) in
  IntMap.fold map [] (fun l _ v -> v :: l)

let get_preps tokens group = (* FIXME: To nie zadziała przy kilku wystąpieniach tego samego przyimka *)
  let preps,compars = Xlist.fold group (StringMap.empty,StringSet.empty) (fun (preps,compars) id ->
    let t = ExtArray.get tokens id in
    match t.token with
      Lemma(lemma,"prep",interp) ->
        let preps = if lemma = "po" then StringMap.add_inc preps "po" (StringSet.singleton "postp") (fun cases -> StringSet.add cases "postp") else preps in (* FIXME: to należałoby dodawać w morfology *)
        let preps = if lemma = "per" then StringMap.add_inc preps "per" (StringSet.singleton "voc") (fun cases -> StringSet.add cases "voc") else preps in (* FIXME: to należałoby dodawać w morfology *)
        if StringSet.mem ENIAMvalence.compars lemma then preps,StringSet.add compars lemma else
        Xlist.fold interp preps (fun map -> function
                [cases] -> Xlist.fold cases map (fun map case -> StringMap.add_inc map lemma (StringSet.singleton case) (fun cases -> StringSet.add cases case))
              | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add_inc map lemma (StringSet.singleton case) (fun cases -> StringSet.add cases case))
              | _ -> map),compars
      | _ -> preps,compars) in
  StringMap.fold preps [] (fun l prep v -> (prep, StringSet.to_list v) :: l), StringSet.to_list compars

let assign_valence tokens lex_sems group =
  let lexemes = Xlist.fold group StringSet.empty (fun lexemes id ->
      let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
      StringSet.add lexemes lemma) in
  let preps,compars = get_preps tokens group in
  let compreps = ENIAMwalReduce.select_comprep_adjuncts lexemes in
  let entries,schemata,connected = ENIAMwalReduce.select_entries lexemes in
  Xlist.iter group (fun id ->
      let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
      let pos = ENIAMtokens.get_pos (ExtArray.get tokens id).token in
      let pos2 = ENIAMvalence.simplify_pos pos in
      let schemata = Entries.find schemata pos2 lemma in
      let schemata = if schemata = [] then ENIAMvalence.get_default_valence pos2 else schemata in
      (* Printf.printf "A %s %s %s |schemata|=%d\n" lemma pos pos2 (Xlist.size schemata); *)
      let entries = Entries.find entries pos lemma in
      let connected = Entries.find connected pos2 lemma in
      let schemata = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) ->
          ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* FIXME: gubię opinię *)
      (* Printf.printf "B %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
      let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata in
      (* Printf.printf "C %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
      let schemata = Xlist.rev_map schemata (fun (selectors,schema) ->
          selectors,ENIAMwalRenderer.render_simple_schema schema) in
      let schemata = List.flatten (Xlist.rev_map schemata (ENIAMadjuncts.add_adjuncts preps compreps compars pos2)) in
      (* Printf.printf "D %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
      let entries = List.flatten (Xlist.rev_map entries (ENIAMvalence.transform_lex_entry pos lemma)) in
      let entries = Xlist.map entries (fun (selectors,entry) ->
          selectors,ENIAMwalRenderer.render_lex_entry entry) in
      let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema) ->
          Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema) (fun (selectors,schema) ->
              selectors,meanings,schema))) in (* FIXME: gubię opinię *)
      let connected = Xlist.fold connected [] (fun connected (selectors,meanings,schema) ->
          if ENIAMadjuncts.check_selector_lex_constraints lexemes pos selectors then (selectors,meanings,schema) :: connected else connected) in
      let connected = Xlist.rev_map connected (fun (selectors,meanings,schema) ->
          selectors,
          Xlist.map meanings find_meaning,
          find_selprefs (ENIAMwalRenderer.render_connected_schema schema)) in
      ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
                                schemata=schemata; lex_entries=entries; frames=connected})

(* TODO:
   slashe
   zgranie z LCGlexicon
   usuwanie lex_entries gdy nie spełnione są selektory i gdy nie ma pasującego id wśród innych tokenów
   possp jako adjunct dla noun
   - uwzględnienie cech morfoskładniowych - np usunięcie schematów wymagających negacji, gdy nie ma "nie"
   - leksykalizacje bez schema
   - scalanie frames
*)



(*
let assign_valence tokens lex_sems group =
  let lexemes = Xlist.fold group StringMap.empty (fun lexemes id ->
    match (ExtArray.get tokens id).token with
      Lemma(lemma,pos,_) ->
        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
    | Proper(lemma,pos,_,_) ->
        let pos = match pos with
          "subst" -> "psubst"
        | "depr" -> "pdepr"
        | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
    | _ -> lexemes) in
  let valence = ENIAMwalenty.find_frames lexemes in
  Xlist.iter group (fun id ->
    match (ExtArray.get tokens id).token with
      Lemma(lemma,pos,_) ->
        ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
          valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
    | Proper(lemma,pos,interp,_) ->
        ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
          valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
            (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))};
        ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)}
    | _ -> ())
*)


let disambiguate_senses lex_sems group =
  let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id ->
    Xlist.fold (ExtArray.get lex_sems id).frames prefs (fun prefs (_,_,schema) ->
      Xlist.fold schema prefs (fun prefs t ->
        Xlist.fold t.sel_prefs prefs (fun prefs -> function
          SynsetName s -> StringSet.add prefs s
        | _ -> failwith "disambiguate_senses")))) in
  (*let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
    Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) ->
      Xlist.fold l hipero StringSet.add)) in
  let senses = StringSet.intersection prefs hipero in
  let is_zero = StringSet.mem hipero "0" in
  let senses = if is_zero then StringSet.add senses "0" else senses in*)
  Xlist.iter group (fun id ->
    let t = ExtArray.get lex_sems id in
    ExtArray.set lex_sems id {t with frames=Xlist.map t.frames (fun (selectors,meanings,positions) ->
      let meanings = Xlist.map meanings (fun (name,hipero,weight) ->
        let hipero = Xlist.fold hipero ["ALL",0] (fun hipero (name,cost) ->
          if StringSet.mem prefs name then (name,cost) :: hipero else hipero) in
        name,hipero,weight) in
      selectors,meanings,positions)})

let remove_unused_tokens tokens groups =
  let set = Xlist.fold groups IntSet.empty (fun set group ->
    Xlist.fold group set IntSet.add) in
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    if IntSet.mem set i then () else
    ExtArray.set tokens i ENIAMtokenizerTypes.empty_token_env)

let assign tokens text =
  let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
  let _ = ExtArray.add lex_sems empty_lex_sem in
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    ignore (ExtArray.add lex_sems empty_lex_sem));
  let groups = split_tokens_into_groups (ExtArray.size tokens) text in
  (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *)
  remove_unused_tokens tokens groups;
  Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
  (* Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);*)
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    let token = ExtArray.get tokens i in
    let lex_sem = ExtArray.get lex_sems i in
    let lex_sem = find_senses token lex_sem in
    ExtArray.set lex_sems i lex_sem);
  Xlist.iter groups (fun group -> disambiguate_senses lex_sems group);
  (*Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *)
  lex_sems

let catch_assign tokens text =
  try
    assign tokens text,""
  with e ->
    ExtArray.make 0 empty_lex_sem,
    Printexc.to_string e

let initialize () =
  ENIAMsubsyntax.initialize ();
  ENIAMwalParser.initialize ();
  ENIAMwalReduce.initialize ();
  ENIAMplWordnet.initialize ();
  ()