ENIAMlexSemantics.ml 17.4 KB
(*
 *  ENIAMlexSemantics is a library that assigns tokens with lexicosemantic information.
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes
open ENIAMlexSemanticsTypes
open ENIAMwalTypes
open Xstd

let string_of_lex_sems tokens lex_sems =
  String.concat "\n" (List.rev (Int.fold 0 (ExtArray.size lex_sems - 1) [] (fun l id ->
    let t = ExtArray.get lex_sems id in
    let t2 = ExtArray.get tokens id in
    let orth = t2.ENIAMtokenizerTypes.orth in
    let lemma = ENIAMtokens.string_of_token t2.ENIAMtokenizerTypes.token in
    let lroles = if snd t.lroles = "" then fst t.lroles else fst t.lroles ^ " " ^ snd t.lroles in
    let core = Printf.sprintf "%3d %s %s %s" id orth lemma lroles in
    let senses = Xlist.map t.senses (fun (sense,hipero,weight) ->
      Printf.sprintf "%s[%s]%.2f" sense (String.concat "," hipero) weight) in
    let valence = Xlist.map t.valence (ENIAMwalStringOf.fnum_frame "") in
    let simple_valence = Xlist.map t.simple_valence (ENIAMwalStringOf.fnum_frame "") in
    (* let semantics =  *)
    (String.concat "\n    " ([core] @ senses @ valence @ simple_valence)) :: l)))

let load_proper_name proper = function
    [lemma; types] ->
         let types = Str.split (Str.regexp "|") types in
         StringMap.add_inc proper lemma types (fun types2 -> types @ types2)
  | l -> failwith ("proper_names: " ^ String.concat " " l)

let proper_names =
  let proper = File.fold_tab proper_names_filename StringMap.empty load_proper_name in
  let proper = File.fold_tab proper_names_filename2 proper load_proper_name in
  proper

let remove l s =
  Xlist.fold l [] (fun l t ->
    if s = t then l else t :: l)

let find_proper_names tokens i t =
  match t.token with
    Lemma(lemma,pos,interp) ->
        if StringMap.mem proper_names lemma then
          {t with token=Proper(lemma,pos,interp,StringMap.find proper_names lemma);
                  attrs=remove t.attrs "notvalidated proper"} else
        if Xlist.mem t.attrs "notvalidated proper" then
          {t with token=Proper(lemma,pos,interp,[])}
        else t
  | _ -> t

let find_senses t = (* FIXME: sensy zawierające 'się' *)
  match t.token with
    Lemma(lemma,pos,_) -> ENIAMplWordnet.find_senses lemma pos
  | Proper(_,_,_,senses) -> ENIAMplWordnet.find_proper_senses senses
  | _ -> []

let rec find a l i =
  if a.(i) = max_int then (
    a.(i) <- i;
    i) else
  if a.(i) = i then (
    Xlist.iter l (fun j -> a.(j) <- i);
    i) else
  find a (i :: l) a.(i)

let union a i j =
  if i = j then i else
  let x = min i j in
  let y = max i j in
  a.(y) <- x;
  x

let rec split_tokens_into_groups_sentence a = function
    RawSentence s -> ()
  | StructSentence([],_) -> ()
  | StructSentence((id,_,_) :: paths,_) ->
      ignore (Xlist.fold paths (find a [] id) (fun m (id,_,_) ->
        union a m (find a [] id)))
  | DepSentence(paths) ->
      if Array.length paths = 0 then () else
      let id,_,_ = paths.(0) in
      ignore (Int.fold 1 (Array.length paths - 1) (find a [] id) (fun m i ->
        let id,_,_ = paths.(i) in
        union a m (find a [] id)))
  | QuotedSentences sentences ->
      Xlist.iter sentences (fun p ->
        split_tokens_into_groups_sentence a p.psentence)
  | AltSentence l -> Xlist.iter l (fun (mode,sentence) ->
        split_tokens_into_groups_sentence a sentence)

let rec split_tokens_into_groups_paragraph a = function
    RawParagraph s -> ()
  | StructParagraph sentences ->
      Xlist.iter sentences (fun p -> split_tokens_into_groups_sentence a p.psentence)
  | AltParagraph l -> Xlist.iter l (fun (mode,paragraph) ->
      split_tokens_into_groups_paragraph a paragraph)

let rec split_tokens_into_groups_text a = function
    RawText s -> ()
  | StructText paragraphs ->
      Xlist.iter paragraphs (split_tokens_into_groups_paragraph a)
  | AltText l -> Xlist.iter l (fun (mode,text) ->
      split_tokens_into_groups_text a text)

let split_tokens_into_groups size text =
  let a = Array.make size max_int in
  split_tokens_into_groups_text a text;
  Int.iter 1 (Array.length a - 1) (fun i ->
    if a.(i) <> max_int then a.(i) <- a.(a.(i)));
  let map = Int.fold 1 (Array.length a - 1) IntMap.empty (fun map i ->
    if a.(i) = max_int then map else
    IntMap.add_inc map a.(i) [i] (fun l -> i :: l)) in
  IntMap.fold map [] (fun l _ v -> v :: l)

let assign_valence tokens lex_sems group =
  let lexemes = Xlist.fold group StringMap.empty (fun lexemes id ->
    match (ExtArray.get tokens id).token with
      Lemma(lemma,pos,_) ->
        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos)
    | Proper(lemma,pos,_,_) ->
        let pos = match pos with
          "subst" -> "psubst"
        | "depr" -> "pdepr"
        | _ -> pos (*failwith ("assign_valence: Proper " ^ pos ^ " " ^ lemma)*) in
        StringMap.add_inc lexemes lemma (StringSet.singleton pos) (fun set -> StringSet.add set pos) (* nazwy własne mają przypisywaną domyślną walencję rzeczowników *)
    | _ -> lexemes) in
  let valence = ENIAMwalenty.find_frames lexemes in
  Xlist.iter group (fun id ->
    match (ExtArray.get tokens id).token with
      Lemma(lemma,pos,_) ->
        ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
          valence=try Xlist.rev_map (StringMap.find (StringMap.find valence lemma) pos) (fun frame -> 0,frame) with Not_found -> []}
    | Proper(lemma,pos,interp,_) ->
        ExtArray.set lex_sems id {(ExtArray.get lex_sems id) with
          valence=(try Xlist.rev_map (StringMap.find (StringMap.find valence lemma)
            (if pos = "subst" || pos = "depr" then "p" ^ pos else pos)) (fun frame -> 0,frame) with Not_found -> [](*failwith ("assign_valence: Proper(" ^ lemma ^ "," ^ pos ^ ")")*))};
        ExtArray.set tokens id {(ExtArray.get tokens id) with token=Lemma(lemma,pos,interp)}
    | _ -> ())

let get_prefs_schema prefs schema =
  Xlist.fold schema prefs (fun prefs t ->
    Xlist.fold t.sel_prefs prefs StringSet.add)

let map_prefs_schema senses schema =
  Xlist.map schema (fun t ->
    if Xlist.mem t.morfs (Phrase Pro) || Xlist.mem t.morfs (Phrase ProNG) then t else
    {t with sel_prefs = Xlist.fold t.sel_prefs [] (fun l s ->
      if StringSet.mem senses s then s :: l else l)})

let disambiguate_senses lex_sems group =
  let prefs = Xlist.fold group (StringSet.singleton "ALL") (fun prefs id ->
    Xlist.fold (ExtArray.get lex_sems id).valence prefs (fun prefs -> function
      _,Frame(_,schema) -> get_prefs_schema prefs schema
    | _,LexFrame(_,_,_,schema) -> get_prefs_schema prefs schema
    | _,ComprepFrame(_,_,_,schema) -> get_prefs_schema prefs schema)) in
  let hipero = Xlist.fold group (StringSet.singleton "ALL") (fun hipero id ->
    Xlist.fold (ExtArray.get lex_sems id).senses hipero (fun hipero (_,l,_) ->
      Xlist.fold l hipero StringSet.add)) in
  let senses = StringSet.intersection prefs hipero in
  let is_zero = StringSet.mem hipero "0" in
  let senses = if is_zero then StringSet.add senses "0" else senses in
  Xlist.iter group (fun id ->
    let t = ExtArray.get lex_sems id in
    ExtArray.set lex_sems id {t with valence = if is_zero then t.valence else
        Xlist.map t.valence (function
          n,Frame(a,schema) -> n,Frame(a,map_prefs_schema senses schema)
        | n,LexFrame(s,p,r,schema) -> n,LexFrame(s,p,r,map_prefs_schema senses schema)
        | n,ComprepFrame(s,p,r,schema) -> n,ComprepFrame(s,p,r,map_prefs_schema senses schema));
      senses = Xlist.map t.senses (fun (s,l,w) ->
        s, List.rev (Xlist.fold l [] (fun l s -> if StringSet.mem senses s then s :: l else l)),w)})

let simplify_position_verb l = function (* FIXME: dodać czyszczenie E Pro *)
    Phrase(NP(Case "dat")) -> l
  | Phrase(NP(Case "inst")) -> l
  | Phrase(PrepNP _) -> l
  | Phrase(PrepAdjP _) -> l
  | Phrase(NumP (Case "dat")) -> l
  | Phrase(NumP (Case "inst")) -> l
  | Phrase(PrepNumP _) -> l
  | Phrase(ComprepNP _) -> l
  | Phrase(ComparNP _) -> l
  | Phrase(ComparPP _) -> l
  | Phrase(IP) -> l
  | Phrase(CP _) -> l
  | Phrase(NCP(Case "dat",_,_)) -> l
  | Phrase(NCP(Case "inst",_,_)) -> l
  | Phrase(PrepNCP _) -> l
(*   | Phrase(PadvP) -> l *)
  | Phrase(AdvP) -> l
  | Phrase(PrepP) -> l
  | Phrase(Or) -> l
  | Phrase(Qub) -> l
  | Phrase(Adja) -> l
  | Phrase(Inclusion) -> l
  | Phrase Pro -> Phrase Null :: l
  | t -> t :: l

let simplify_position_noun l = function
    Phrase(NP(Case "gen")) -> l
  | Phrase(NP(Case "nom")) -> l
  | Phrase(NP(CaseAgr)) -> l
  | Phrase(PrepNP _) -> l
  | Phrase(AdjP AllAgr) -> l
  | Phrase(NumP (Case "gen")) -> l
  | Phrase(NumP (Case "nom")) -> l
  | Phrase(NumP (CaseAgr)) -> l
  | Phrase(PrepNumP _) -> l
  | Phrase(ComprepNP _) -> l
  | Phrase(ComparNP _) -> l
  | Phrase(ComparPP _) -> l
  | Phrase(IP) -> l
  | Phrase(NCP(Case "gen",_,_)) -> l
  | Phrase(PrepNCP _) -> l
  | Phrase(PrepP) -> l
  | Phrase(Qub) -> l
  | Phrase(Adja) -> l
  | Phrase(Inclusion) -> l
  | Phrase Pro -> Phrase Null :: l
  | t -> t :: l

let simplify_position_adj l = function
    Phrase(AdvP) -> l
  | t -> t :: l

let simplify_position_adv l = function
    Phrase(AdvP) -> l
  | t -> t :: l


let simplify_position pos l s =
  let morfs = match pos with
      "verb" -> List.rev (Xlist.fold s.morfs [] simplify_position_verb)
    | "noun" -> List.rev (Xlist.fold s.morfs [] simplify_position_noun)
    | "adj" -> List.rev (Xlist.fold s.morfs [] simplify_position_adj)
    | "adv" -> List.rev (Xlist.fold s.morfs [] simplify_position_adv)
    | _ -> s.morfs in
  match morfs with
    [] -> l
  | [Phrase Null] -> l
  | _ -> {s with morfs=morfs} :: l

let simplify_schemata pos schemata =
  let schemata = Xlist.fold schemata StringMap.empty (fun schemata (schema,frame) ->
    let schema = List.sort compare (Xlist.fold schema [] (fun l s ->
      let s = {s with role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; morfs=List.sort compare s.morfs} in
      if s.gf <> ARG && s.gf <> ADJUNCT then s :: l else
(*       if s.cr <> [] || s.ce <> [] then s :: l else  *)
      simplify_position pos l s)) in
    StringMap.add_inc schemata (ENIAMwalStringOf.schema schema) (schema,[frame]) (fun (_,frames) -> schema, frame :: frames)) in
  StringMap.fold schemata [] (fun l _ s -> s :: l)

(* FIXME: problem ComprepNP i PrepNCP *)
(* FIXME: problem gdy ten sam token występuje w  kilku ścieżkach *)
let generate_verb_prep_adjuncts preps =
  Xlist.map preps (fun (lemma,case) -> ENIAMwalFrames.verb_prep_adjunct_schema_field lemma case)

let generate_verb_comprep_adjuncts compreps =
  Xlist.map compreps (fun lemma -> ENIAMwalFrames.verb_comprep_adjunct_schema_field lemma)

let generate_verb_compar_adjuncts compars =
  Xlist.map compars (fun lemma -> ENIAMwalFrames.verb_compar_adjunct_schema_field lemma)

let generate_noun_prep_adjuncts preps =
  ENIAMwalFrames.noun_prep_adjunct_schema_field preps

let generate_noun_compar_adjuncts compars =
  ENIAMwalFrames.noun_compar_adjunct_schema_field compars

let generate_adj_compar_adjuncts compars =
  ENIAMwalFrames.noun_compar_adjunct_schema_field compars

let compars = StringSet.of_list ["jak";"jako";"niż";"niczym";"niby";"co"]

let generate_prep_adjunct_tokens tokens group =
  let map = Xlist.fold group StringMap.empty (fun map id ->
    let t = ExtArray.get tokens id in
    match t.token with
      Lemma(lemma,"prep",interp) ->
        let map = if lemma = "po" then StringMap.add map "po:postp" ("po","postp") else map in
        if StringSet.mem compars lemma then map else
        Xlist.fold interp map (fun map -> function
          [cases] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
        | [cases;_] -> Xlist.fold cases map (fun map case -> StringMap.add map (lemma ^ ":" ^ case) (lemma,case))
        | _ -> map)
    | _ -> map) in
  StringMap.fold map [] (fun l _ v -> v :: l)

let generate_comprep_adjunct_tokens tokens group =
  let lemmas = Xlist.fold group StringSet.empty (fun lemmas id ->
    let t = ExtArray.get tokens id in
    match t.token with
      Lemma(lemma,_,_) -> StringSet.add lemmas lemma
    | _ -> lemmas) in
  StringMap.fold ENIAMwalFrames.comprep_reqs [] (fun compreps comprep reqs ->
    let b = Xlist.fold reqs true (fun b s -> b && StringSet.mem lemmas s) in
    if b then comprep :: compreps else compreps)

let generate_compar_adjunct_tokens tokens group =
  let set = Xlist.fold group StringSet.empty (fun set id ->
    let t = ExtArray.get tokens id in
    match t.token with
      Lemma(lemma,"prep",interp) ->
        if not (StringSet.mem compars lemma) then set else
        StringSet.add set lemma
    | _ -> set) in
  StringSet.to_list set

let is_measure = function
    NounAtrs(_,_,Common "measure") -> true
  | _ -> false

let remove_meaning = function
    DefaultAtrs(m,r,o,neg,p,a) -> DefaultAtrs([],r,o,neg,p,a)
  | EmptyAtrs m -> EmptyAtrs []
  | NounAtrs(m,nsyn,s(*,typ*)) -> NounAtrs([],nsyn,s(*,typ*))
  | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> AdjAtrs([],c,adjsyn(*,adjsem,typ*))
  | PersAtrs(m,le,neg,mo,t,au,a) -> PersAtrs([],le,neg,mo,t,au,a)
  | GerAtrs(m,le,neg,a) -> GerAtrs([],le,neg,a)
  | NonPersAtrs(m,le,role,role_attr,neg,a) -> NonPersAtrs([],le,role,role_attr,neg,a)
  | _ -> failwith "remove_meaning"

let assign_simplified_valence tokens lex_sems group =
  let preps = generate_prep_adjunct_tokens tokens group in
  let compreps = generate_comprep_adjunct_tokens tokens group in
  let compars = generate_compar_adjunct_tokens tokens group in
  let verb_prep_adjuncts = generate_verb_prep_adjuncts preps in
  let verb_comprep_adjuncts = generate_verb_comprep_adjuncts compreps in
  let verb_compar_adjuncts = generate_verb_compar_adjuncts compars in
  let noun_prep_adjuncts = generate_noun_prep_adjuncts preps compreps in
  let noun_compar_adjuncts = generate_noun_compar_adjuncts compars in
  let adj_compar_adjuncts = generate_adj_compar_adjuncts compars in
  let verb_adjuncts = ENIAMwalFrames.verb_adjuncts_simp @ verb_prep_adjuncts @ verb_comprep_adjuncts @ verb_compar_adjuncts in
  let noun_adjuncts = ENIAMwalFrames.noun_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  let noun_measure_adjuncts = ENIAMwalFrames.noun_measure_adjuncts_simp @ [noun_prep_adjuncts] @ [noun_compar_adjuncts] in
  let adj_adjuncts = ENIAMwalFrames.adj_adjuncts_simp @ [adj_compar_adjuncts] in
  let adv_adjuncts = ENIAMwalFrames.adv_adjuncts_simp @ [adj_compar_adjuncts] in
  Xlist.iter group (fun id ->
    let t = ExtArray.get lex_sems id in
    let pos = match (ExtArray.get tokens id).token with
        Lemma(_,pos,_) -> ENIAMwalFrames.simplify_pos pos
      | _ -> "" in
    let lex_frames,frames = Xlist.fold t.valence ([],StringMap.empty) (fun (lex_frames,frames) -> function
        _,(Frame(attrs,schema) as frame) ->
          let attrs = remove_meaning attrs in
          lex_frames, StringMap.add_inc frames (ENIAMwalStringOf.frame_atrs attrs) (attrs,[schema,frame]) (fun (_,l) -> attrs, (schema,frame) :: l)
      | _,frame -> frame :: lex_frames, frames) in
    let simp_frames,full_frames,n = Xlist.fold lex_frames ([],[],1) (fun (simp_frames,full_frames,n) frame ->
      (n,frame) :: simp_frames, (n,frame) :: full_frames, n+1) in
    let simp_frames,full_frames,_ = StringMap.fold frames (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) _ (attrs,schemata) ->
      Xlist.fold (simplify_schemata pos schemata) (simp_frames,full_frames,n) (fun (simp_frames,full_frames,n) (schema,frames) ->
        let schema = match pos with
            "verb" -> schema @ verb_adjuncts
          | "noun" -> schema @ (if is_measure attrs then noun_measure_adjuncts else noun_adjuncts)
          | "adj" -> schema @ adj_adjuncts
          | "adv" -> schema @ adv_adjuncts
          | _ -> schema in
        (n,Frame(attrs,schema)) :: simp_frames,
        Xlist.fold frames full_frames (fun full_frames frame -> (n,frame) :: full_frames),
        n+1)) in
    ExtArray.set lex_sems id {t with simple_valence=simp_frames; valence=full_frames})


let assign tokens text =
  let lex_sems = ExtArray.make (ExtArray.size tokens) empty_lex_sem in
  let _ = ExtArray.add lex_sems empty_lex_sem in
  Int.iter 1 (ExtArray.size tokens - 1) (fun i ->
    let token = ExtArray.get tokens i in
    let token = find_proper_names tokens i token in
    ExtArray.set tokens i token;
    let senses = find_senses token in
    let cats = ENIAMcategories.assign token in
    let lex_sem = {empty_lex_sem with senses=senses; cats=cats} in
    let _ = ExtArray.add lex_sems lex_sem in
    ());
  let groups = split_tokens_into_groups (ExtArray.size tokens) text in
  (* Xlist.iter groups (fun group -> print_endline (String.concat " " (Xlist.map group string_of_int))); *)
  Xlist.iter groups (fun group -> assign_valence tokens lex_sems group);
  Xlist.iter groups (fun group -> disambiguate_senses lex_sems group);
  Xlist.iter groups (fun group -> assign_simplified_valence tokens lex_sems group);
  Xlist.iter groups (fun group -> ENIAMlexSemanticsData.assign_semantics tokens lex_sems group);
  lex_sems