ENIAMlexSemantics.ml 17.9 KB
(*
 *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
open ENIAMlexSemanticsTypes
open ENIAMwalTypes
open Xstd

let find_meaning m =
  try
    ENIAMplWordnet.find_meaning m.plwnluid
  with Not_found ->
    m.name ^ "-" ^ m.variant, [], unknown_meaning_weight

let find_prep_meaning lemma hipero =
  let hipero = match hipero with
      [Predef hipero] -> hipero
    | _ -> failwith "find_prep_meaning" in
  if hipero = "ALL" then lemma, [hipero,0], unknown_meaning_weight else
  let syn_id = StringMap.find !ENIAMplWordnet.predef hipero in
  let hipero = IntMap.fold (ENIAMplWordnet.get_hipero syn_id) [] (fun hipero syn_id cost -> (ENIAMplWordnet.synset_name syn_id, cost) :: hipero) in
  lemma, hipero, unknown_meaning_weight

let lex_sie = LCG (ENIAMwalRenderer.render_morf (SimpleLexArg("się",QUB)))

(* FIXME: naiwnie wierzymy, że jeśli leksem jest opisany semantycznie w walentym to zawiera ramy dla wszystkich sensów *)
let find_senses t s =
  (*let set = Xlist.fold s.frames StringSet.empty (fun set frame ->
    Xlist.fold frame.meanings set (fun set (name,hipero,weight) ->
      StringSet.add set name)) in*)
  let senses = match t.token with
      Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
    | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
    | _ -> [] in
  (* let senses =
    Xlist.fold senses [] (fun senses (name,hipero,weight) ->
      if StringSet.mem set name then senses else (name,hipero,weight) :: senses) in *)
  let senses_sie = match t.token with
      Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses (lemma ^ " się") pos
    | Proper(_,_,_,senses) -> []
    | _ -> [] in
(*  let senses_sie = Xlist.fold senses_sie [] (fun senses_sie (name,hipero,weight) ->
    if StringSet.mem set name then senses_sie else (name,hipero,weight) :: senses_sie) in
  let frames = if senses = [] then s.frames else {empty_frame with meanings=senses} :: s.frames in
  let frames = if senses_sie = [] then frames else {empty_frame with meanings=senses_sie;
    positions=[{empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]; is_necessary=Req}]} :: frames in*) (* FIXME: czy to nie usuwa elementów z ramy? *)
  let frames = Xlist.fold s.frames [] (fun frames f ->
    if f.meanings <> [] then f :: frames else
    if senses = [] && senses_sie = [] then {f with meanings=[ENIAMtokens.get_lemma t.token, [], unknown_meaning_weight]} :: frames else
    (if senses_sie = [] then [] else [{f with meanings=senses_sie; positions={empty_position with role="Lemma"; mode=["lemma"]; morfs=[lex_sie]; is_necessary=Req} :: f.positions}]) @
    [{f with meanings=senses}] @ frames) in
  {s with frames=frames}

let find_selprefs schema = (* FIXME: RelationRole *)
  Xlist.map schema (fun p ->
      let l = Xlist.fold p.sel_prefs [] (fun l -> function
          SynsetId id -> (try ENIAMplWordnet.synset_name id :: l with ENIAMplWordnet.SynsetNotFound -> l)
        | Predef s -> s :: l
        | SynsetName _ -> failwith "find_selprefs"
        | RelationRole _ -> l) in
      let l = if l = [] then ["ALL"] else l in
      {p with sel_prefs=Xlist.map l (fun s -> SynsetName s)})

let rec find a l i =
  if a.(i) = max_int then (
    a.(i) <- i;
    i) else
  if a.(i) = i then (
    Xlist.iter l (fun j -> a.(j) <- i);
    i) else
  find a (i :: l) a.(i)

let union a i j =
  if i = j then i else
  let x = min i j in
  let y = max i j in
  a.(y) <- x;
  x

let rec split_tokens_into_groups_sentence a = function
    RawSentence s -> ()
  | StructSentence([],_) -> ()
  | StructSentence((id,_,_) :: paths,_) ->
      ignore (Xlist.fold paths (find a [] id) (fun m (id,_,_) ->
        union a m (find a [] id)))
  | DepSentence(paths) ->
      if Array.length paths = 0 then () else
      let id,_,_ = paths.(0) in
      ignore (Int.fold 1 (Array.length paths - 1) (find a [] id) (fun m i ->
        let id,_,_ = paths.(i) in
        union a m (find a [] id)))
  | QuotedSentences sentences ->
      Xlist.iter sentences (fun p ->
        split_tokens_into_groups_sentence a p.sentence)
  | AltSentence l -> Xlist.iter l (fun (mode,sentence) ->
        split_tokens_into_groups_sentence a sentence)

let rec split_tokens_into_groups_paragraph a = function
    RawParagraph s -> ()
  | StructParagraph sentences ->
      Xlist.iter sentences (fun p -> split_tokens_into_groups_sentence a p.sentence)
  | AltParagraph l -> Xlist.iter l (fun (mode,paragraph) ->
      split_tokens_into_groups_paragraph a paragraph)

let rec split_tokens_into_groups_text a = function
    RawText s -> ()
  | StructText paragraphs ->
      Xlist.iter paragraphs (split_tokens_into_groups_paragraph a)
  | AltText l -> Xlist.iter l (fun (mode,text) ->
      split_tokens_into_groups_text a text)

let split_tokens_into_groups size text =
  let a = Array.make size max_int in
  split_tokens_into_groups_text a text;
  Int.iter 1 (Array.length a - 1) (fun i ->
    if a.(i) <> max_int then a.(i) <- a.(a.(i)));
  let map = Int.fold 1 (Array.length a - 1) IntMap.empty (fun map i ->
    if a.(i) = max_int then map else
    IntMap.add_inc map a.(i) [i] (fun l -> i :: l)) in
  IntMap.fold map [] (fun l _ v -> v :: l)

let get_preps tokens group = (* FIXME: To nie zadziała przy kilku wystąpieniach tego samego przyimka *)
  let preps,compars = Xlist.fold group (StringMap.empty,StringSet.empty) (fun (preps,compars) id ->
    let t = ExtArray.get tokens id in
    match t.token with
      Lemma(lemma,"prep",interp) ->
        let preps = if lemma = "po" then StringMap.add_inc preps "po" (StringSet.singleton "postp") (fun cases -> StringSet.add cases "postp") else preps in (* FIXME: to należałoby dodawać w morfology *)
        let preps = if lemma = "per" then StringMap.add_inc preps "per" (StringSet.singleton "voc") (fun cases -> StringSet.add cases "voc") else preps in (* FIXME: to należałoby dodawać w morfology *)
        if StringSet.mem ENIAMvalence.compars lemma then preps,StringSet.add compars lemma else
        Xlist.fold interp preps (fun map -> function
                [cases] -> Xlist.fold cases map (fun map case -> StringMap.add_inc map lemma (StringSet.singleton case) (fun cases -> StringSet.add cases case))
              | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add_inc map lemma (StringSet.singleton case) (fun cases -> StringSet.add cases case))
              | _ -> map),compars
      | _ -> preps,compars) in
  StringMap.fold preps [] (fun l prep v -> (prep, StringSet.to_list v) :: l), StringSet.to_list compars

let make_unique schemata =
  let map = Xlist.fold schemata StringMap.empty (fun map (selectors,schema) ->
    let s = "[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "}" in
    StringMap.add map s (selectors,schema)) in
  StringMap.fold map [] (fun l _ (selectors,schema) -> (selectors,schema) :: l)

let semantize lemma pos (selectors,schema) =
  let schema = Xlist.rev_map schema (fun p ->
    {p with role="Arg"; sel_prefs=[Predef "X"]}) in (* FIXME: zaślepka, żeby preferować znane argumenty *)
  Xlist.rev_map (ENIAMvalence.get_aroles schema lemma pos) (fun (sel,arole,arole_attr,arev) ->
    {empty_frame with selectors=sel @ selectors; positions=schema;
     arole=arole; arole_attr=arole_attr; arev=arev})

let assign_prep_semantics lemma =
  let roles = try StringMap.find ENIAMlexSemanticsData.prep_roles lemma with Not_found -> [] in
  (* Printf.printf "assign_prep_semantics: |roles|=%d\n%!" (Xlist.size roles); *)
  Xlist.map roles (function (case,arole,arole_attr,hipero,sel_prefs) ->
    (* Printf.printf "assign_prep_semantics: case=%s arole=%s arole_attr=%s\n%!" case arole arole_attr; *)
    let meaning = find_prep_meaning lemma hipero in (* FIXME: zaślepka dla meaning i weight *)
    (* print_endline "assign_prep_semantics 1"; *)
    let positions = [{empty_position with
      sel_prefs=sel_prefs; dir=if lemma="temu" then Backward_ else Forward_;
      morfs=ENIAMwalRenderer.assing_pref_morfs (lemma,case); is_necessary=Req}] in
    (* print_endline "assign_prep_semantics 2"; *)
    {empty_frame with selectors=[ENIAM_LCGlexiconTypes.Case,ENIAM_LCGlexiconTypes.Eq,[case]]; meanings=[meaning]; positions=find_selprefs positions;
     arole=arole; arole_attr=arole_attr; arev=false})

let assign_valence tokens lex_sems group =
  let lexemes = Xlist.fold group StringSet.empty (fun lexemes id ->
      let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
      StringSet.add lexemes lemma) in
  let preps,compars = get_preps tokens group in
  let compreps = ENIAMwalReduce.select_comprep_adjuncts lexemes in
  let entries,schemata,connected = ENIAMwalReduce.select_entries lexemes in
  Xlist.iter group (fun id ->
      let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
      let pos = ENIAMtokens.get_pos (ExtArray.get tokens id).token in
      let pos2 = ENIAMvalence.simplify_pos pos in
      let schemata = Entries.find schemata pos2 lemma in
      let schemata = if schemata = [] then ENIAMvalence.get_default_valence pos2 else schemata in
      (* Printf.printf "A %s %s %s |schemata|=%d\n" lemma pos pos2 (Xlist.size schemata); *)
      let entries = Entries.find entries pos lemma in
      let connected = Entries.find connected pos2 lemma in
      let schemata1 = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) ->
          ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* gubię opinię *)
      (* Printf.printf "B %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
      let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata1 in
      (* Printf.printf "C %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
      let schemata = Xlist.rev_map schemata (fun (selectors,schema) ->
          selectors,ENIAMwalRenderer.render_simple_schema schema) in
      let schemata = List.flatten (Xlist.rev_map schemata (ENIAMadjuncts.add_adjuncts preps compreps compars pos2)) in
      (* Printf.printf "D %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
      let entries = List.flatten (Xlist.rev_map entries (ENIAMvalence.transform_lex_entry pos lemma)) in
      let entries = Xlist.map entries (fun (selectors,entry) ->
          selectors,ENIAMwalRenderer.render_lex_entry entry) in
      let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema1) ->
          List.flatten (Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema1) (fun (selectors,schema) ->
              Xlist.rev_map (ENIAMvalence.get_aroles schema1 lemma pos) (fun (sel,arole,arole_attr,arev) ->
                  {selectors=sel @ selectors; meanings=Xlist.map meanings find_meaning; positions=schema;
                   arole=arole; arole_attr=arole_attr; arev=arev; sopinion=sopinion; fopinion=fopinion}))))) in
      (* Printf.printf "E %s |connected|=%d\n" lemma (Xlist.size connected); *)
      let connected = if connected = [] then List.flatten (Xlist.rev_map (make_unique schemata1) (semantize lemma pos)) else connected in
      (* Printf.printf "F %s |connected|=%d\n" lemma (Xlist.size connected); *)
      let connected = Xlist.fold connected [] (fun connected frame ->
          if ENIAMadjuncts.check_selector_lex_constraints lexemes pos frame.selectors then frame :: connected else connected) in
      (* Printf.printf "G %s |connected|=%d\n" lemma (Xlist.size connected); *)
      let connected = Xlist.rev_map connected (fun frame ->
          {frame with
            positions = find_selprefs (ENIAMwalRenderer.render_connected_schema (ENIAMwalReduce.set_necessary frame.positions))}) in
      (* Printf.printf "H %s |connected|=%d\n" lemma (Xlist.size connected); *)
      let connected = List.flatten (Xlist.rev_map connected (ENIAMadjuncts.add_connected_adjuncts preps compreps compars pos2)) in
      (* Printf.printf "I %s |connected|=%d\n" lemma (Xlist.size connected); *)
      let connected = if pos = "prep" then
        if connected <> [] then failwith "assign_valence" else
        assign_prep_semantics lemma else connected in
      (* Printf.printf "J %s |connected|=%d\n" lemma (Xlist.size connected); *)
      let connected = if connected = [] then
        Xlist.rev_map (ENIAMvalence.get_aroles [] lemma pos) (fun (sel,arole,arole_attr,arev) ->
          {empty_frame with selectors=sel; arole=arole; arole_attr=arole_attr; arev=arev}) else connected in
      (* Printf.printf "K %s |connected|=%d\n" lemma (Xlist.size connected); *)
      ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
                                schemata=schemata; lex_entries=entries; frames=connected})

(* TODO:
   slashe
   zgranie z LCGlexicon
   usuwanie lex_entries gdy nie spełnione są selektory i gdy nie ma pasującego id wśród innych tokenów
   possp jako adjunct dla noun
   - uwzględnienie cech morfoskładniowych - np usunięcie schematów wymagających negacji, gdy nie ma "nie"
   - leksykalizacje bez schema
   - scalanie frames
*)



(*
let assign_valence tokens lex_sems group =
  let lexemes = Xlist.fold group StringMap.empty (fun lexemes id ->
    match (ExtArray.get tokens id).token with
      Lemma(lemma,pos,_) ->
        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
    | Proper(lemma,pos,_,_) ->
        let pos = match pos with
          "subst" -> "psubst"
        | "depr" -> "pdepr"
        | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
    | _ -> lexemes) in
  let valence = ENIAMwalenty.find_frames lexemes in
  Xlist.iter group (fun id ->
    match (ExtArray.get tokens id).token with
      Lemma(lemma,pos,_) ->
        ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
          valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
    | Proper(lemma,pos,interp,_) ->
        ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
          valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
            (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))};
        ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)}
    | _ -> ())
*)


let disambiguate_senses lex_sems group =
  let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id ->
    Xlist.fold (ExtArray.get lex_sems id).frames prefs (fun prefs frame ->
      Xlist.fold frame.positions prefs (fun prefs t ->
        Xlist.fold t.sel_prefs prefs (fun prefs -> function
          SynsetName s -> StringSet.add prefs s
        | _ -> failwith "disambiguate_senses")))) in
  (*let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
    Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) ->
      Xlist.fold l hipero StringSet.add)) in
  let senses = StringSet.intersection prefs hipero in
  let is_zero = StringSet.mem hipero "0" in
  let senses = if is_zero then StringSet.add senses "0" else senses in*)
  Xlist.iter group (fun id ->
    let t = ExtArray.get lex_sems id in
    ExtArray.set lex_sems id {t with frames=Xlist.map t.frames (fun frame ->
      let meanings = Xlist.map frame.meanings (fun (name,hipero,weight) ->
        let hipero = Xlist.fold hipero ["ALL",0] (fun hipero (name,cost) ->
          if StringSet.mem prefs name then (name,cost) :: hipero else hipero) in
        name,hipero,weight) in
      {frame with meanings=meanings})})

let remove_unused_tokens tokens groups =
  let set = Xlist.fold groups IntSet.empty (fun set group ->
    Xlist.fold group set IntSet.add) in
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    if IntSet.mem set i then () else
    ExtArray.set tokens i ENIAMtokenizerTypes.empty_token_env)

let assign tokens text =
  let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
  let _ = ExtArray.add lex_sems empty_lex_sem in
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    ignore (ExtArray.add lex_sems empty_lex_sem));
  let groups = split_tokens_into_groups (ExtArray.size tokens) text in
  (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *)
  remove_unused_tokens tokens groups;
  Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    let token = ExtArray.get tokens i in
    let lex_sem = ExtArray.get lex_sems i in
    let lex_sem = find_senses token lex_sem in
    ExtArray.set lex_sems i lex_sem);
  Xlist.iter groups (fun group -> disambiguate_senses lex_sems group);
  (*Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *)
  lex_sems

let catch_assign tokens text =
  try
    assign tokens text,""
  with e ->
    ExtArray.make 0 empty_lex_sem,
    Printexc.to_string e

let initialize () =
  ENIAMsubsyntax.initialize ();
  ENIAMwalParser.initialize ();
  ENIAMwalReduce.initialize ();
  ENIAMplWordnet.initialize ();
  ENIAMcategoriesPL.initialize ();
  ()