ENIAMlexSemantics.ml
(*
 * ENIAMlexSemantics is a library that assigns lexicosemantic information to tokens.
* Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
open ENIAMlexSemanticsTypes
open ENIAMwalTypes
open Xstd
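(* Processing outline: the token ids of each sentence are first grouped together
   with a union-find structure (split_tokens_into_groups); assign_valence then
   collects the lexemes and prepositions of every group and attaches rendered
   valence schemata, lexical entries and connected frames to the corresponding
   entries of the lex_sems array. string_of_lex_sems renders the result for
   inspection. *)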
let string_of_lex_sems tokens lex_sems =
String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size lex_sems - 1) [] (fun l id ->
let t = ExtArray.get lex_sems id in
let t2 = ExtArray.get tokens id in
let orth = t2.ENIAMtokenizerTypes.orth in
let lemma = ENIAMtokens.string_of_token t2.ENIAMtokenizerTypes.token in
let core = Printf.sprintf "%3d %s %s" id orth lemma in
let lex_entries = Xlist.map t.lex_entries (fun (selectors,s) ->
"[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] " ^ ENIAM_LCGstringOf.grammar_symbol 0 s) in
let schemata = Xlist.map t.schemata (fun (selectors,l) ->
"[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ String.concat "," (Xlist.map l (fun (d,s) ->
ENIAM_LCGstringOf.direction d ^ ENIAM_LCGstringOf.grammar_symbol 0 s)) ^ "}") in
let frames = Xlist.map t.frames (fun (selectors,meanings,schema) ->
"[" ^ ENIAMcategoriesPL.string_of_selectors selectors ^ "] {" ^ ENIAMwalStringOf.schema schema ^ "} " ^
String.concat "," (Xlist.map meanings (fun m -> ENIAMwalStringOf.meaning m))) in
(String.concat "\n " ([core] @ schemata @ frames @ lex_entries)) :: l)))
(* let lroles = if snd t.lroles = "" then fst t.lroles else fst t.lroles ^ " " ^ snd t.lroles in
let core = Printf.sprintf "%3d %s %s %s" id orth lemma lroles in
let senses = Xlist.map t.senses (fun (sense,hipero,weight) ->
Printf.sprintf "%s[%s]%.2f" sense (String.concat "," hipero) weight) in
let valence = Xlist.map t.valence (ENIAMwalStringOf.fnum_frame "") in
let simple_valence = Xlist.map t.simple_valence (ENIAMwalStringOf.fnum_frame "") in
(* let semantics = *)
(String.concat "\n " ([core] @ senses @ valence @ simple_valence)) :: l))) *)
(*let find_senses t = (* FIXME: senses containing 'się' *)
match t.token with
Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
| Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
| _ -> []
*)
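(* Disjoint-set (union-find) over token ids stored in the array a:
   max_int marks an id that has not been seen yet, otherwise a.(i) points
   towards the representative of i's group. find compresses the path it
   traverses; union links the larger representative to the smaller one. *)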
let rec find a l i =
if a.(i) = max_int then (
a.(i) <- i;
i) else
if a.(i) = i then (
Xlist.iter l (fun j -> a.(j) <- i);
i) else
find a (i :: l) a.(i)
let union a i j =
if i = j then i else
let x = min i j in
let y = max i j in
a.(y) <- x;
x
let rec split_tokens_into_groups_sentence a = function
RawSentence s -> ()
| StructSentence([],_) -> ()
| StructSentence((id,_,_) :: paths,_) ->
ignore (Xlist.fold paths (find a [] id) (fun m (id,_,_) ->
union a m (find a [] id)))
| DepSentence(paths) ->
if Array.length paths = 0 then () else
let id,_,_ = paths.(0) in
ignore (Int.fold 1 (Array.length paths - 1) (find a [] id) (fun m i ->
let id,_,_ = paths.(i) in
union a m (find a [] id)))
| QuotedSentences sentences ->
Xlist.iter sentences (fun p ->
split_tokens_into_groups_sentence a p.sentence)
| AltSentence l -> Xlist.iter l (fun (mode,sentence) ->
split_tokens_into_groups_sentence a sentence)
let rec split_tokens_into_groups_paragraph a = function
RawParagraph s -> ()
| StructParagraph sentences ->
Xlist.iter sentences (fun p -> split_tokens_into_groups_sentence a p.sentence)
| AltParagraph l -> Xlist.iter l (fun (mode,paragraph) ->
split_tokens_into_groups_paragraph a paragraph)
let rec split_tokens_into_groups_text a = function
RawText s -> ()
| StructText paragraphs ->
Xlist.iter paragraphs (split_tokens_into_groups_paragraph a)
| AltText l -> Xlist.iter l (fun (mode,text) ->
split_tokens_into_groups_text a text)
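(* Returns the token-id groups of the text as a list of lists: token ids that
   occur in the same sentence end up in the same group, while ids that do not
   occur in any sentence (a.(i) = max_int) are skipped. *)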
let split_tokens_into_groups size text =
let a = Array.make size max_int in
split_tokens_into_groups_text a text;
Int.iter 1 (Array.length a - 1) (fun i ->
if a.(i) <> max_int then a.(i) <- a.(a.(i)));
let map = Int.fold 1 (Array.length a - 1) IntMap.empty (fun map i ->
if a.(i) = max_int then map else
IntMap.add_inc map a.(i) [i] (fun l -> i :: l)) in
IntMap.fold map [] (fun l _ v -> v :: l)
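(* Collects, for the given group, the prepositional lemmas together with the
   cases they occur with, plus the comparative prepositions listed in
   ENIAMvalence.compars; both are used below when adding adjuncts. *)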
let get_preps tokens group = (* FIXME: this will not work when the same preposition occurs more than once *)
let preps,compars = Xlist.fold group (StringMap.empty,StringSet.empty) (fun (preps,compars) id ->
let t = ExtArray.get tokens id in
match t.token with
Lemma(lemma,"prep",interp) ->
let preps = if lemma = "po" then StringMap.add_inc preps "po" (StringSet.singleton "postp") (fun cases -> StringSet.add cases "postp") else preps in (* FIXME: this should rather be added in the morphology stage *)
let preps = if lemma = "per" then StringMap.add_inc preps "per" (StringSet.singleton "voc") (fun cases -> StringSet.add cases "voc") else preps in (* FIXME: this should rather be added in the morphology stage *)
if StringSet.mem ENIAMvalence.compars lemma then preps,StringSet.add compars lemma else
Xlist.fold interp preps (fun map -> function
[cases] -> Xlist.fold cases map (fun map case -> StringMap.add_inc map lemma (StringSet.singleton case) (fun cases -> StringSet.add cases case))
| [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add_inc map lemma (StringSet.singleton case) (fun cases -> StringSet.add cases case))
| _ -> map),compars
| _ -> preps,compars) in
StringMap.fold preps [] (fun l prep v -> (prep, StringSet.to_list v) :: l), StringSet.to_list compars
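(* For every token of the group: selects valence schemata, lexical entries and
   connected frames for its lemma and part of speech, transforms and simplifies
   them, adds adjuncts based on the prepositions found in the group, renders
   them as grammar symbols and stores the result in lex_sems. *)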
let assign_valence tokens lex_sems group =
let lexemes = Xlist.fold group StringSet.empty (fun lexemes id ->
let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
StringSet.add lexemes lemma) in
let preps,compars = get_preps tokens group in
let compreps = ENIAMwalReduce.select_comprep_adjuncts lexemes in
let entries,schemata,connected = ENIAMwalReduce.select_entries lexemes in
Xlist.iter group (fun id ->
let lemma = ENIAMtokens.get_lemma (ExtArray.get tokens id).token in
let pos = ENIAMtokens.get_pos (ExtArray.get tokens id).token in
let pos2 = ENIAMvalence.simplify_pos pos in
let schemata = Entries.find schemata pos2 lemma in
let schemata = if schemata = [] then ENIAMvalence.get_default_valence pos2 else schemata in
(* Printf.printf "A %s %s %s |schemata|=%d\n" lemma pos pos2 (Xlist.size schemata); *)
let entries = Entries.find entries pos lemma in
let connected = Entries.find connected pos2 lemma in
let schemata = List.flatten (Xlist.map schemata (fun (opinion,neg,pred,aspect,schema) ->
ENIAMvalence.transform_entry pos lemma neg pred aspect schema)) in (* FIXME: the opinion value is dropped here *)
(* Printf.printf "B %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
let schemata = ENIAMadjuncts.simplify_schemata lexemes pos pos2 lemma schemata in
(* Printf.printf "C %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
let schemata = Xlist.rev_map schemata (fun (selectors,schema) ->
selectors,ENIAMwalRenderer.render_simple_schema schema) in
let schemata = List.flatten (Xlist.rev_map schemata (ENIAMadjuncts.add_adjuncts preps compreps compars pos2)) in
(* Printf.printf "D %s |schemata|=%d\n" lemma (Xlist.size schemata); *)
let entries = List.flatten (Xlist.rev_map entries (ENIAMvalence.transform_lex_entry pos lemma)) in
let entries = Xlist.map entries (fun (selectors,entry) ->
selectors,ENIAMwalRenderer.render_lex_entry entry) in
let connected = List.flatten (Xlist.map connected (fun (sopinion,fopinion,meanings,neg,pred,aspect,schema) ->
Xlist.rev_map (ENIAMvalence.transform_entry pos lemma neg pred aspect schema) (fun (selectors,schema) ->
selectors,meanings,schema))) in (* FIXME: the opinion value is dropped here *)
let connected = Xlist.fold connected [] (fun connected (selectors,meanings,schema) ->
if ENIAMadjuncts.check_selector_lex_constraints lexemes pos selectors then (selectors,meanings,schema) :: connected else connected) in
let connected = Xlist.rev_map connected (fun (selectors,meanings,schema) ->
selectors,meanings,ENIAMwalRenderer.render_connected_schema schema) in
ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
schemata=schemata; lex_entries=entries; frames=connected})
(* TODO:
slashes
alignment with LCGlexicon
removing lex_entries when the selectors are not satisfied and when there is no matching id among the other tokens
possp as an adjunct for nouns
- taking morphosyntactic features into account, e.g. removing schemata that require negation when "nie" is absent
- lexicalizations without a schema
- merging frames
*)
(*
let assign_valence tokens lex_sems group =
let lexemes = Xlist.fold group StringMap.empty (fun lexemes id ->
match (ExtArray.get tokens id).token with
Lemma(lemma,pos,_) ->
StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
| Proper(lemma,pos,_,_) ->
let pos = match pos with
"subst" -> "psubst"
| "depr" -> "pdepr"
| _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* proper names are assigned the default valence of nouns *)
| _ -> lexemes) in
let valence = ENIAMwalenty.find_frames lexemes in
Xlist.iter group (fun id ->
match (ExtArray.get tokens id).token with
Lemma(lemma,pos,_) ->
ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
| Proper(lemma,pos,interp,_) ->
ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
(if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))};
ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)}
| _ -> ())
let get_prefs_schema prefs schema =
Xlist.fold schema prefs (fun prefs t ->
Xlist.fold t.sel_prefs prefs StringSet.add)
let map_prefs_schema senses schema =
Xlist.map schema (fun t ->
if Xlist.mem t.morfs (Phrase Pro) || Xlist.mem t.morfs (Phrase ProNG) then t else
{t with sel_prefs = Xlist.fold t.sel_prefs [] (fun l s ->
if StringSet.mem senses s then s :: l else l)})
let disambiguate_senses lex_sems group =
let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id ->
Xlist.fold (ExtArray.get lex_sems id).valence prefs (fun prefs -> function
_,Frame(_,schema) -> get_prefs_schema prefs schema
| _,LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
| _,ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) ->
Xlist.fold l hipero StringSet.add)) in
let senses = StringSet.intersection prefs hipero in
let is_zero = StringSet.mem hipero "0" in
let senses = if is_zero then StringSet.add senses "0" else senses in
Xlist.iter group (fun id ->
let t = ExtArray.get lex_sems id in
ExtArray.set lex_sems id {t with valence = if is_zero then t.valence else
Xlist.map t.valence (function
n,Frame(a,schema) -> n,Frame(a,map_prefs_schema senses schema)
| n,LexFrame(s,p,r,schema) -> n,LexFrame(s,p,r,map_prefs_schema senses schema)
| n,ComprepFrame(s,p,r,schema) -> n,ComprepFrame(s,p,r,map_prefs_schema senses schema));
senses = Xlist.map t.senses (fun (s,l,w) ->
s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})
*)
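(* Entry point: builds a lex_sems array aligned with the tokens array
   (index 0 is filled with empty_lex_sem so that indices match), splits the
   token ids into per-sentence groups and assigns valence information to each
   group. *)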
let assign tokens text =
let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
let _ = ExtArray.add lex_sems empty_lex_sem in
Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
(* let token = ExtArray.get tokens i in
(* ExtArray.set tokens i token; *)
let senses = find_senses token in *)
let lex_sem = {empty_lex_sem with senses=[](*senses*)} in
let _ = ExtArray.add lex_sems lex_sem in
());
let groups = split_tokens_into_groups (ExtArray.size tokens) text in
(* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *)
Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
(* Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
Xlist.iter groups (fun group -> disambiguate_senses lex_sems group);
Xlist.iter groups (fun group -> assign_simplified_valence tokens lex_sems group);
Xlist.iter groups (fun group -> assign_very_simplified_valence tokens lex_sems group);
Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group); *)
lex_sems
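(* Usage sketch (a minimal example, assuming `tokens` and `text` come from the
   earlier ENIAM tokenization/subsyntax stages that produce the token ExtArray
   and the text structure expected above):
     let lex_sems = assign tokens text in
     print_endline (string_of_lex_sems tokens lex_sems)
*)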