LCGvalence.ml 16.9 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMwalTypes
open LCGtypes
open Printf
open Xstd

let rec list_assoc2 x = function
    (s,a,b) :: l -> if x = s then a,b else list_assoc2 x l
  | [] -> raise Not_found

let meaning_weight = -1.

let prepare_senses lemma meanings senses =
  match meanings,senses with
    [],[] -> [lemma, ["ALL"],0.] (* FIXME *)
  | [],_ ->
     Xlist.map senses (fun (sense,hipero,weight) ->
       if hipero = ["0"] then sense,["0"],weight else
       sense,(if hipero = [] then ["ALL"] else hipero),weight)
  | _,[] -> Xlist.map meanings (fun meaning -> meaning, ["ALL"],meaning_weight)
  | _,_ ->
     Xlist.map meanings (fun meaning ->
       let hipero,weight = try list_assoc2 meaning senses with Not_found -> [],meaning_weight in
       if hipero = ["0"] then meaning,["0"],weight else meaning,(if hipero = [] then ["ALL"] else hipero),weight)

let extract_meaning lemma = function
    DefaultAtrs(m,r,o,neg,p,a) -> m,lemma,DefaultAtrs([],r,o,neg,p,a)
  | EmptyAtrs m -> m,lemma,EmptyAtrs []
  | NounAtrs(m,nsyn,s(*,typ*)) -> m,lemma,NounAtrs([],nsyn,s(*,typ*))
  | AdjAtrs(m,c,adjsyn(*,adjsem,typ*)) -> m,lemma,AdjAtrs([],c,adjsyn(*,adjsem,typ*))
  | PersAtrs(m,le,neg,mo,t,au,a) -> m,le,PersAtrs([],le,neg,mo,t,au,a)
  | GerAtrs(m,le,neg,a) -> m,le,GerAtrs([],le,neg,a)
  | NonPersAtrs(m,le,role,role_attr,neg,a) -> m,le,NonPersAtrs([],le,role,role_attr,neg,a)
  | _ -> failwith "extract_meaning"

let extract_roles = function
    NonPersAtrs(m,le,role,role_attr,neg,a) -> role,role_attr
  | _ -> failwith "extract_roles"

let get_lemma = function
    ENIAMtokenizerTypes.Lemma(lemma,cat,_) -> lemma,cat
  | ENIAMtokenizerTypes.Interp lemma -> lemma,"interp"
  | _ -> "",""

let prepare_valence tokens lex_sems =
  let valence = Array.make (ExtArray.size tokens) [] in
  Int.iter 1 (ExtArray.size tokens - 1) (fun id ->
    let c = ExtArray.get tokens id in
    let d = ExtArray.get lex_sems id in
    let lemma,cat = get_lemma c.ENIAMtokenizerTypes.token in
    let lemma = if lemma = "<or-sentence>" (*|| lemma = ":s" || lemma = "„s"*) then "pro-komunikować" else lemma in
    if lemma = "" then () else
    let prep_valence =
      if cat = "prep" then
(*         (0,lemma,StringSet.empty,0.,"NOSEM","",Frame(EmptyAtrs[],[])) :: *)
        match d.ENIAMlexSemanticsTypes.semantics with
          ENIAMlexSemanticsTypes.Normal -> []
        | ENIAMlexSemanticsTypes.PrepSemantics l -> (* FIXME: uzgadnianie cases *)
            Xlist.rev_map l (fun (case,lrole,lrole_attr,hipero,sel_prefs) ->
              0,lemma,hipero,0.,lrole,lrole_attr,Frame(EmptyAtrs[],[]))
        | _ -> failwith "prepare_valence"
      else [] in
    let valence2 = if d.ENIAMlexSemanticsTypes.valence = [] then [0,Frame(EmptyAtrs[],[])] else d.ENIAMlexSemanticsTypes.valence in
    let lrole,lrole_attr = d.ENIAMlexSemanticsTypes.lroles in
    valence.(id) <- prep_valence @ List.flatten (Xlist.map valence2 (function
        fnum,Frame(attrs,schema) ->
          let meanings,lemma,attrs = extract_meaning lemma attrs in
          let lrole,lrole_attr =
            if cat = "pact" || cat = "ppas" then extract_roles attrs else
            if cat = "pcon" then "Con","" else
            if cat = "pant" then "Ant","" else
            d.ENIAMlexSemanticsTypes.lroles in
          Xlist.map (prepare_senses lemma meanings d.ENIAMlexSemanticsTypes.senses) (fun (meaning,hipero,weight) ->
            let hipero = if cat = "conj" then ["0"] else hipero in
            fnum,meaning,StringSet.of_list hipero,weight,lrole,lrole_attr,
            Frame(attrs,Xlist.map schema (fun s ->
(*               let s = if s.sel_prefs=[] then (print_endline ("prepare_valence empty sel_prefs: " ^ lemma ^ " " ^ cat); {s with sel_prefs=["ALL"]}) else s in *)
              if s.role="" && s.gf <> ADJUNCT && s.gf <> NOSEM then (
                printf "%d: %s\n%!" fnum (ENIAMwalStringOf.frame lemma (Frame(attrs,schema)));
                failwith ("prepare_valence empty role: " ^ lemma ^ " " ^ cat)) else
              {s with morfs=List.sort compare s.morfs})))
      | fnum,(LexFrame _ as frame) -> [fnum,"lex",StringSet.empty,0.,lrole,lrole_attr,frame]
      | fnum,(ComprepFrame _ as frame) -> [fnum,"comprep",StringSet.empty,0.,lrole,lrole_attr,frame])));
  valence

(*let create_pro_frames t =
  [0,t.pred,StringSet.singleton "0",0.,"","",Frame(EmptyAtrs[],[])]*)

let get_fnum t =
  let x = try Xlist.assoc t.attrs "FNUM" with Not_found -> Val "0" in
  (match x with
        Val s -> (try int_of_string s with _ -> failwith "get_fnum 1")
      | _ -> failwith "get_fnum 2")

let select_frames l t =
(*   printf "a1 pred=%s\n" t.pred; *)
  let fnum = get_fnum t in
  let l = Xlist.fold l [] (fun l (n,meaning,hipero,weight,lrole,lrole_attr,frame) ->
    if n = fnum then (meaning,hipero,weight,lrole,lrole_attr,frame) :: l else l) in
(*   printf "a2 pred=%s\n" t.pred; *)
  match l with
    [] -> "",[]
  | [_,_,_,_,_,LexFrame _] -> "",[]
  | [_,_,_,_,_,ComprepFrame _] -> "",[]
  | _ -> LCGreductions.get_variant_label (),
         fst (Xlist.fold l ([],1) (fun (l,i) t -> (string_of_int i, t) :: l, i+1))

let rec get_arg_refs found = function
   Variant(_,l) -> Xlist.fold l found (fun found (_,t) -> get_arg_refs found t)
 | Ref i -> i :: found
 | t -> failwith ("get_arg_refs: " ^ LCGstringOf.linear_term 0 t)

let rec match_position = function
    a :: la, b :: lb ->
      if a = b then match_position (la,b :: lb) else
      if a > b then match_position (a :: la,lb)
      else false
  | [],_ -> true
  | _,[] -> false

let mark_sem_morfs morfs =
  Xlist.map morfs (function
    | Phrase(PrepNP(_,prep,c)) -> Phrase(PrepNP(Sem,prep,c))
    | Phrase(PrepAdjP(_,prep,c)) -> Phrase(PrepAdjP(Sem,prep,c))
    | Phrase(PrepNumP(_,prep,c)) -> Phrase(PrepNumP(Sem,prep,c))
    | Phrase(ComprepNP(_,prep)) -> Phrase(ComprepNP(Sem,prep))
    | Phrase(ComparNP(_,prep,c)) -> Phrase(ComparNP(Sem,prep,c))
    | Phrase(ComparPP(_,prep)) -> Phrase(ComparPP(Sem,prep))
    | Phrase(PrepNCP(_,prep,c,ct,co)) -> Phrase(PrepNCP(Sem,prep,c,ct,co))
    | t -> t)

let mark_nosem_morf = function
      Phrase(PrepNP(_,prep,c)) -> Phrase(PrepNP(NoSem,prep,c))
    | Phrase(PrepAdjP(_,prep,c)) -> Phrase(PrepAdjP(NoSem,prep,c))
    | Phrase(PrepNumP(_,prep,c)) -> Phrase(PrepNumP(NoSem,prep,c))
    | Phrase(ComprepNP(_,prep)) -> Phrase(ComprepNP(NoSem,prep))
    | Phrase(ComparNP(_,prep,c)) -> Phrase(ComparNP(NoSem,prep,c))
    | Phrase(ComparPP(_,prep)) -> Phrase(ComparPP(NoSem,prep))
    | Phrase(PrepNCP(_,prep,c,ct,co)) -> Phrase(PrepNCP(NoSem,prep,c,ct,co))
    | t -> failwith "mark_nosem_morf"

let rec is_nosem_morf = function
    Phrase(PrepNP(NoSem,prep,c)) -> true
  | Phrase(PrepAdjP(NoSem,prep,c)) -> true
  | Phrase(PrepNumP(NoSem,prep,c)) -> true
  | Phrase(ComprepNP(NoSem,prep)) -> true
  | Phrase(ComparNP(NoSem,prep,c)) -> true
  | Phrase(ComparPP(NoSem,prep)) -> true
  | Phrase(PrepNCP(NoSem,prep,c,ct,co)) -> true
  | _ -> false

let rec is_sem_morf = function
    Phrase(PrepNP(Sem,prep,c)) -> true
  | Phrase(PrepAdjP(Sem,prep,c)) -> true
  | Phrase(PrepNumP(Sem,prep,c)) -> true
  | Phrase(ComprepNP(Sem,prep)) -> true
  | Phrase(ComparNP(Sem,prep,c)) -> true
  | Phrase(ComparPP(Sem,prep)) -> true
  | Phrase(PrepNCP(Sem,prep,c,ct,co)) -> true
  | _ -> false

let rec exclude_sem_morfs = function
    [] -> []
  | morf :: morfs -> if is_sem_morf morf then exclude_sem_morfs morfs else morf :: exclude_sem_morfs morfs

(* UWAGA: dopasowywane ramy są preselekcjonowane, więc wszystkie argumenty muszą się maczować *)
let match_args_pos modifications nodes e i schema t =
(*    printf "match_args_pos\n";  *)
(*   if schema = [] then schema else *)
  let refs = get_arg_refs [] t in
  let gfl,morfs = Xlist.fold refs ([],[]) (fun (gfl,morfs) i -> nodes.(i).agf ::  gfl, nodes.(i).amorf :: morfs) in
  if gfl = [] then failwith "match_args_pos: empty gfl" else
  let gf = Xlist.fold (List.tl gfl) (List.hd gfl) (fun gf gf2 -> if gf = gf2 then gf else failwith "match_args_pos 2") in
(*  if gf = NOSEM || gf = NOGF then schema else
  if gf = CORE then schema else (* FIXME: semantyka dla core *)*)
  let morfs = exclude_sem_morfs morfs in
  let schema,selected =
    if morfs = [] then schema,[] else
    let morfs = List.sort compare morfs in
(*     printf "gf=%s morfs=%s\n%!" (ENIAMwalStringOf.gf gf) (String.concat ";" (Xlist.map morfs ENIAMwalStringOf.morf));   *)
    Xlist.fold schema ([],[]) (fun (schema,selected) pos ->
(*      printf "pos.gf=%s pos.morfs=%s\n%!" (ENIAMwalStringOf.gf pos.gf) (String.concat ";" (Xlist.map pos.morfs ENIAMwalStringOf.morf));  *)
      if gf = pos.gf || (gf = ADJUNCT && pos.gf=ARG) then
        if match_position (morfs,(*mark_sem_morfs*) pos.morfs) then schema, pos :: selected else pos :: schema, selected
      else pos :: schema, selected) in
  (match selected with
    [] -> (*if gf = ARG then failwith "match_args_pos 3" else*)
      Xlist.iter refs (fun r ->
        modifications.(r) <- StringMap.add modifications.(r) (e ^ i)
          LCGrenderer.empty_schema_field(*{gf=ADJUNCT; role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; dir=Both; morfs=[]}*) )
(* FIXME: przy kilku pasujących pozycjach wybieram pierwszą a nich, do poprawienia przy okazji porządków z walencją
np walencja leksemu "godzina":
1: : : common: time: Poss,T|{null;np(gen);nump(gen)}+Arg,T|{null;np(gen);nump(gen)};
1: : : common: time: Temp,T|{null;np(gen);nump(gen)}]; *)
  | pos :: _ -> Xlist.iter refs (fun r -> (* FIXME: gdzieś tu trzeba wstawić uzupełnianie brakujących ról dla argumentów i adjunctów *) (* FIXME: tu pewnie trzeba będzie wstawić rolę dla adjunctów, które pojawią się w wyniku niespełniania SEL-PREFS *)
      modifications.(r) <- StringMap.add modifications.(r) (e ^ i) pos)
  (*| _ -> failwith "match_args_pos 4"*));
  schema

let rec match_args_tuple modifications nodes e i schema = function
    Tuple l ->
      Xlist.fold l schema (fun schema t ->
        match_args_tuple modifications nodes e i schema t)
  | t -> match_args_pos modifications nodes e i schema t

let match_args modifications nodes e i t = function
    Frame(_,schema) -> ignore (match_args_tuple modifications nodes e i schema t.args)
  | LexFrame _ -> failwith "match_args"
  | ComprepFrame _ -> failwith "match_args"

let rec assign_frames_and_senses_rec modifications valence nodes t =
(*        printf "pred=%s id=%d\n" t.pred t.id;   *)
      (* if t.id = 0 then failwith ("assign_frames_and_senses_rec: t.id=0 pred=" ^ t.pred) else *) (* FIXME: zbadać po co było to zabezpieczenie *)
      let e,node_valence = select_frames ((*if t.id >= Array.length valence then create_pro_frames t else*) valence.(t.id)) t in
      let node_valence = if node_valence <> [] then node_valence else ["1",(t.pred,StringSet.empty,0.,"","",Frame(EmptyAtrs[],[]))] in
      let l = Xlist.map node_valence (fun (i,(meaning,hipero,weight,lrole,lrole_attr,frame)) ->
        let t = if lrole = "" then t else {t with arole=lrole;arole_attr=lrole_attr} in (* FIXME: pomijam to, że role dla rzeczowników dotyczą tylko inst *)
(*         let t = if lrole = "NOSEM" then {t with amorf=mark_nosem_morf t.amorf} else t in *)
        if t.args <> Dot then match_args modifications nodes e i t frame;
(*          printf "meaning=%s\n" meaning;   *)
        i,Node{t with meaning=meaning;
                    hipero=hipero;
                    meaning_weight=weight}) in
      Variant(e,l)

let rec is_nosem_morfs morfs =
  let sem = Xlist.fold morfs false (fun b m -> b || is_sem_morf m) in
  let nosem = Xlist.fold morfs false (fun b m -> b || is_nosem_morf m) in
  nosem && not sem (* FIXME: pewne uproszczenie, ale liczę, że nie ma wielu ramek z xp koordynowanym z prep *)

let has_tuple = function
    Tuple _ -> true
  | _ -> false

let rec apply_modifications_rec (*tokens*) pos = function
    Variant(e,l) -> Variant(e,List.rev (Xlist.fold l [] (fun l (i,t) ->
      let t = apply_modifications_rec (*tokens*) pos t in
      (*if t = Dot then l else*) (i, (*apply_modifications_rec (*tokens*) pos*) t) :: l)))
  | Node t ->
      if is_sem_morf t.amorf then Node t else  (* FIXME: czy to jest poprawne? *)
      let t = if is_nosem_morf t.amorf then {t with agf=ARG} else t in
      Node{t with position=pos}
  | _ -> failwith "apply_modifications_rec"

(* let empty_pos = {gf=ADJUNCT; role=""; role_attr=""; sel_prefs=[]; cr=[]; ce=[]; dir=Both; morfs=[]} (* FIXME: jaka GF? *) *)

let apply_modifications (*tokens*) modifications nodes references =
  Int.iter 1 (Array.length references - 1) (fun r ->
(*     if StringMap.is_empty modifications.(r) then failwith ("apply_modifications: " ^ nodes.(r).pred) else *)
    references.(r) <- Choice(StringMap.map modifications.(r) (fun pos ->
      apply_modifications_rec (*tokens*) pos references.(r))))

let rec extract_nosem rev = function
    [] -> List.rev rev, false
  | ("NOSEM",Val "+") :: l -> (List.rev rev) @ l, true
  | x :: l -> extract_nosem (x :: rev) l

let get_nodes = function
    Node t ->
       let attrs,b = extract_nosem [] t.attrs in
       (* let t = if t.pred = "<query1>" || t.pred = "<query2>" || t.pred = "<query3>" || t.pred = "<query4>" || t.pred = "<query5>" || t.pred = "<query6>" then {t with agf=CORE} else t in *)
       let t = if t.pred = "<conll_root>" || t.pred = "<sentence>" || t.pred = "pro-komunikować" then {t with agf=CORE} else t in (* FIXME: przetestować na mowie niezależnej *)
       if t.agf = NOGF then failwith ("get_nodes agf=NOGF: " ^ t.pred) else
       if b then {t with amorf=mark_nosem_morf t.amorf; attrs=attrs} else t
  | _ -> failwith "get_nodes"

let rec propagate_nosem_selprefs modifications ei = function
    Choice choice -> Choice(StringMap.map choice (propagate_nosem_selprefs modifications ""))
  | Variant(e,l) -> Variant(e,Xlist.map l (fun (i,t) -> i, propagate_nosem_selprefs modifications (e ^ i) t))
  | Node t ->
        if (t.cat = "prep" && t.arole = "NOSEM") || t.cat = "num" then
          let refs = IntSet.of_list (get_arg_refs [] t.args) in
          IntSet.iter refs (fun r ->
            modifications.(r) <- StringMap.add_inc modifications.(r) ei t.position.ENIAMwalTypes.sel_prefs (fun l ->
              if l = t.position.ENIAMwalTypes.sel_prefs then l else failwith ("propagate_nosem_selprefs 1: [" ^ String.concat ";" l ^ "] [" ^ String.concat ";" t.position.ENIAMwalTypes.sel_prefs ^ "]")));
          Node{t with position= {t.position with ENIAMwalTypes.sel_prefs = []}}
        else Node t
  | _ -> failwith "propagate_nosem_selprefs 2"

let rec apply_modifications2_rec mods = function
    Variant(e,l) -> Variant(e,Xlist.map l (fun (i,t) -> i, apply_modifications2_rec mods t))
  | Node t ->
      if t.position.ENIAMwalTypes.sel_prefs <> [] then failwith "apply_modifications2_rec" else
      Node{t with position={t.position with ENIAMwalTypes.sel_prefs=mods}}
  | _ -> failwith "apply_modifications2_rec"

let apply_modifications2 modifications references =
  Int.iter 1 (Array.length references - 1) (fun r ->
    if not (StringMap.is_empty modifications.(r)) then
    match references.(r) with
      Choice choice ->
        references.(r) <- Choice(StringMap.mapi choice (fun ei t ->
          try apply_modifications2_rec (StringMap.find modifications.(r) ei) t with Not_found -> t))
    | _ -> failwith "apply_modifications2")

let assign_frames_and_senses tokens lex_sems references =
  let modifications = Array.make (Array.length references) StringMap.empty in
  let valence = prepare_valence tokens lex_sems in
  let nodes = Array.map get_nodes references in
  let references = Array.map (assign_frames_and_senses_rec modifications valence nodes) nodes in
  apply_modifications (*tokens*) modifications nodes references;
(*  let modifications = Array.make (Array.length references) StringMap.empty in
  Int.iter 0 (Array.length references - 1) (fun r -> references.(r) <- propagate_nosem_selprefs modifications "" references.(r)); (* FIXME: propagowanie preferencji selekcyjnych więcej niż jeden poziom w głąb nie działa *)
  apply_modifications2 modifications references;
  Int.iter 0 (Array.length references - 1) (fun r -> references.(r) <- propagate_nosem_selprefs modifications "" references.(r));
  apply_modifications2 modifications references;
  Int.iter 0 (Array.length references - 1) (fun r -> references.(r) <- propagate_nosem_selprefs modifications "" references.(r));
  apply_modifications2 modifications references;   *)
  references