ENIAMwalLex.ml 14.1 KB
(*
 *  ENIAMwalenty, an interface for Polish Valence Dictionary "Walenty".
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMwalTypes
open Xstd

let prep_arg_schema_field morfs =
  (* {gf=CORE; role="Ref"; role_attr=""; sel_prefs=["ALL"]; cr=[]; ce=[]; dir=Forward; morfs=morfs} (* FIXME: uporządkować sensy *)

let prep_arg_schema_field2 morfs = *)
  {psn_id=(-1); gf=ARG(*CORE*); role=""(*"Ref"*); role_attr=""; sel_prefs=[(*"ALL"*)]; cr=[]; ce=[]; (*dir=Forward;*) morfs=morfs} (* FIXME: uporządkować sensy *)

let rec split_elexeme = function
    Lexeme s -> [],[Lexeme s]
  | XOR l ->
    let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex ->
        let gender,lex = split_elexeme lex in
        gender @ genders, lex @ lexs) in
    genders,[XOR(List.rev l)]
  | ORconcat l ->
    let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex ->
        let gender,lex = split_elexeme lex in
        gender @ genders, lex @ lexs) in
    genders,[ORconcat(List.rev l)]
  | ORcoord l ->
    let genders,l = Xlist.fold l ([],[]) (fun (genders,lexs) lex ->
        let gender,lex = split_elexeme lex in
        gender @ genders, lex @ lexs) in
    genders,[ORcoord(List.rev l)]
  | Elexeme gender -> [gender],[]

let rec expand_lexicalizations_schema schema =
  Xlist.map schema (fun s ->
      {s with morfs=expand_lexicalizations_morfs s.morfs})

(* FIXME: LexPhraseMode *)
and expand_lexicalizations_morfs morfs = (* uproszczenie polegające na zezwoleniu na koordynację przy zwiększaniu ilości LexPhrase *)
  List.flatten (Xlist.map morfs (fun morf ->
      let morf = match morf with
          LexPhrase(pos_lex,(restr,schema)) -> LexPhrase(pos_lex,(restr,expand_lexicalizations_schema schema))
        | morf -> morf in
      match morf with
      (*         LexPhrase([ADV _,_],(_,_::_)) -> print_endline (ENIAMwalStringOf.morf morf); [morf] *)
      (*       | LexPhrase([PREP _,_;SUBST _,_],(_,schema)) -> if remove_trivial_args schema <> [] then print_endline (ENIAMwalStringOf.morf morf); [morf] *)
      (*       | LexPhrase([PREP _,_;GER _,_],(_,schema)) -> if remove_trivial_args schema <> [] then print_endline (ENIAMwalStringOf.morf morf); [morf] *)
      (*       | LexPhrase([NUM _,_;_],(_,schema)) -> if remove_trivial_args schema <> [] then print_endline (ENIAMwalStringOf.morf morf); [morf]  *)
      (*       | LexPhrase([PREP _,_;NUM _,_;_],(_,schema)) -> if remove_trivial_args schema <> [] then print_endline (ENIAMwalStringOf.morf morf); [morf]  *)
      (*      | LexPhrase([PREP _,_;ADJ _,_],(_,_::_)) -> print_endline (ENIAMwalStringOf.morf morf); [morf]
              | LexPhrase([PREP _,_;PPAS _,_],(_,_::_)) -> print_endline (ENIAMwalStringOf.morf morf); [morf]
              | LexPhrase([PREP _,_;PACT _,_],(_,_::_)) -> print_endline (ENIAMwalStringOf.morf morf); [morf] *)
      | Phrase(PrepNumP(prep,case)) -> [LexPhrase([PREP case,Lexeme prep],(Ratrs,[prep_arg_schema_field(*2*) [Phrase(NumP(case))]]))]
      | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Atr1,[{morfs=[LexPhrase([QUB,_],_)]} as s])) ->
        (*            print_endline (ENIAMwalStringOf.morf morf);  *)
        [LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([SUBST(n,c),slex],(Natr,[]))]]));
         LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))]
      | LexPhrase([PREP(pcase),plex;SUBST(n,c),slex],(Atr1,[{morfs=[LexPhrase([ADV _,_],_)]} as s])) ->
        (*            print_endline (ENIAMwalStringOf.morf morf);  *)
        [LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([SUBST(n,c),slex],(Natr,[]))]]));
         LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))]
      | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Ratr1,[{morfs=[LexPhrase([ADV _,_],_)]} as s])) ->
        (*            print_endline (ENIAMwalStringOf.morf morf);  *)
        [LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([SUBST(n,c),slex],(Natr,[]))];s(*{s with dir=Backward}*)]))]
      | LexPhrase([PREP pcase,plex;pos,lex],restr) ->
        [LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([pos,lex],restr)]]))]
      | LexPhrase([PREP pcase,plex;NUM(c,g,a),nlex;pos,lex],restr) ->
        let genders,lexs = split_elexeme lex in
        Xlist.map genders (fun gender ->
            LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([NUM(c,gender,a),nlex],(Ratrs,[(*num*)prep_arg_schema_field [Phrase Pro]]))]]))) @
        Xlist.map lexs (fun lex ->
            LexPhrase([PREP pcase,plex],(Ratrs,[prep_arg_schema_field [LexPhrase([NUM(c,g,a),nlex],(Ratrs,[(*num*)prep_arg_schema_field [LexPhrase([pos,lex],restr)]]))]])))
      | LexPhrase([NUM(c,g,a),nlex;pos,lex],restr) ->
        let genders,lexs = split_elexeme lex in
        Xlist.map genders (fun gender ->
            LexPhrase([NUM(c,gender,a),nlex],(Ratrs,[(*num*)prep_arg_schema_field [Phrase Pro]]))) @
        Xlist.map lexs (fun lex ->
            LexPhrase([NUM(c,g,a),nlex],(Ratrs,[(*num*)prep_arg_schema_field [LexPhrase([pos,lex],restr)]])))
      | LexPhrase([COMP ctype,clex;pos,lex],restr) ->
        [LexPhrase([COMP ctype,clex],(Ratrs,[(*std*)prep_arg_schema_field (*Forward*) [LexPhrase([pos,lex],restr)]]))]
      | LexPhrase([SUBST(n,c),slex;COMP ctype,clex;pos,lex],restr) ->
        [LexPhrase([SUBST(n,c),slex],(Ratrs,[(*std*)prep_arg_schema_field (*Forward*) [LexPhrase([COMP ctype,clex],(Ratrs,[(*std*)prep_arg_schema_field (*Forward*) [LexPhrase([pos,lex],restr)]]))]]))] (* FIXME: poprawić po zrobieniu NCP *)
      | LexPhrase(_::_::_,_) -> failwith ("expand_lexicalizations_morfs: " ^ ENIAMwalStringOf.morf morf)
      (*      | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Atr1,[gf,cr,ce,[LexPhrase([QUB,lex],arestr)]])) ->
              (*            print_endline (ENIAMwalStringOf.morf morf);  *)
                 [LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([SUBST(n,c),slex],(Natr,[]))]]));
                  LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([SUBST(n,c),slex],(Natr,[]))];gf,cr,ce,[LexPhrase([QUB,lex],arestr)]]))]
              | LexPhrase([PREP(pcase),plex;SUBST(n,c),slex],(Atr1,[gf,cr,ce,[LexPhrase([ADV gr,lex],arestr)]])) ->
              (*            print_endline (ENIAMwalStringOf.morf morf);  *)
                 [LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([SUBST(n,c),slex],(Natr,[]))]]));
                  LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([SUBST(n,c),slex],(Natr,[]))];gf,cr,ce,[LexPhrase([ADV gr,lex],arestr)]]))]
              | LexPhrase([PREP pcase,plex;SUBST(n,c),slex],(Ratr1,[gf,cr,ce,[LexPhrase([ADV gr,lex],arestr)]])) ->
              (*            print_endline (ENIAMwalStringOf.morf morf);  *)
                 [LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([SUBST(n,c),slex],(Natr,[]))];gf,cr,ce,[LexPhrase([ADV gr,lex],arestr)]]))]
              | LexPhrase([PREP pcase,plex;pos,lex],restr) ->
                 [LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([pos,lex],restr)]]))]
              | LexPhrase([PREP pcase,plex;NUM(c,g,a),nlex;pos,lex],restr) ->
                 let genders,lexs = split_elexeme lex in
                 Xlist.map genders (fun gender ->
                   LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([NUM(c,gender,a),nlex],(Ratrs,[("OBJ","QUANT-ARG",["T"]),[],[],[Phrase Pro]]))]]))) @
                 Xlist.map lexs (fun lex ->
                   LexPhrase([PREP pcase,plex],(Ratrs,[("OBJ","Ref",["T"]),[],[],[LexPhrase([NUM(c,g,a),nlex],(Ratrs,[("OBJ","QUANT-ARG",["T"]),[],[],[LexPhrase([pos,lex],restr)]]))]])))
              | LexPhrase([NUM(c,g,a),nlex;pos,lex],restr) ->
                 let genders,lexs = split_elexeme lex in
                 Xlist.map genders (fun gender ->
                   LexPhrase([NUM(c,gender,a),nlex],(Ratrs,[("OBJ","QUANT-ARG",["T"]),[],[],[Phrase Pro]]))) @
                 Xlist.map lexs (fun lex ->
                   LexPhrase([NUM(c,g,a),nlex],(Ratrs,[("OBJ","QUANT-ARG",["T"]),[],[],[LexPhrase([pos,lex],restr)]])))
              | LexPhrase([COMP ctype,clex;pos,lex],restr) ->
                 [LexPhrase([COMP ctype,clex],(Ratrs,[("C","",["T"]),[],[],[LexPhrase([pos,lex],restr)]]))]
              | LexPhrase([SUBST(n,c),slex;COMP ctype,clex;pos,lex],restr) ->
                 [LexPhrase([SUBST(n,c),slex],(Ratrs,[("OBJ","",["T"]),[],[],[LexPhrase([COMP ctype,clex],(Ratrs,[("C","",["T"]),[],[],[LexPhrase([pos,lex],restr)]]))]]))]
              | LexPhrase(_::_::_,_) -> failwith ("expand_lexicalizations_morfs: " ^ ENIAMwalStringOf.morf morf)*)
      | morf -> [morf]))

let lex_id_counter = ref 0

let get_lex_id () =
  incr lex_id_counter;
  string_of_int (!lex_id_counter)

let get_pos lex = function
    SUBST _ ->
    (match lex with
       "ja" -> ["ppron12"]
     | "my" -> ["ppron12"]
     | "ty" -> ["ppron12"]
     | "wy" -> ["ppron12"]
     | "on" -> ["ppron3"]
     | "siebie" -> ["siebie"]
     | "się" -> ["qub"]
     | _ -> ["subst"])
  | PREP _ -> ["prep"]
  | NUM _ -> ["num"]
  | ADV _ -> ["adv"]
  | ADJ _ -> ["adj"]
  | GER _ -> ["ger"]
  | PPAS _ -> ["ppas"]
  | PACT _ -> ["pact"]
  | PERS _ -> ["fin";"praet";"winien"(*;"impt";"imps"*);"pred"]
  | INF _ -> ["inf"]
  | QUB -> ["qub"]
  | COMPAR -> ["compar"]
  | COMP _ -> ["comp"]
  | FIXED -> ["fixed"]

let rec extract_lex_frames lexeme p frames = function
    Frame(atrs,schema) ->
    let schema,frames = Xlist.fold schema ([],frames) (fun (schema,frames) s ->
        let morfs,frames = Xlist.fold s.morfs ([],frames) extract_lex_morf in
        {s with morfs=List.rev morfs} :: schema, frames) in
    (lexeme,p,Frame(atrs,List.rev schema)) :: frames
  | LexFrame(id,pos,restr,schema) ->
    let schema,frames = Xlist.fold schema ([],frames) (fun (schema,frames) s ->
        let morfs,frames = Xlist.fold s.morfs ([],frames) extract_lex_morf in
        {s with morfs=List.rev morfs} :: schema, frames) in
    (lexeme,p,LexFrame(id,pos,restr,List.rev schema)) :: frames
  | ComprepFrame(s,pos,restr,schema) ->
    let schema,frames = Xlist.fold schema ([],frames) (fun (schema,frames) s ->
        let morfs,frames = Xlist.fold s.morfs ([],frames) extract_lex_morf in
        {s with morfs=List.rev morfs} :: schema, frames) in
    (lexeme,p,ComprepFrame(s,pos,restr,List.rev schema)) :: frames
(*   | _ -> failwith "extract_lex_frames" *)

and extract_lex_morf (morfs,frames) = function
    LexPhrase([pos,lex],(restr,schema)) ->
    let id = get_lex_id () in
    let lexemes = ENIAMwalRealizations.get_lexemes lex in
    let frames = Xlist.fold lexemes frames (fun frames lexeme ->
        let poss = get_pos lexeme pos in
        Xlist.fold poss frames (fun frames p ->
            extract_lex_frames lexeme p frames (LexFrame(id,pos,restr,schema)))) in
    LexPhraseId(id,pos,lex) :: morfs, frames (* FIXME: Czy potrzebne jest tworzenie LexFrame, gdy schema=[] ? *)
  | LexPhrase _ -> failwith "extract_lex_morf"
  | morf -> morf :: morfs, frames

let split_xor schema =
  Xlist.multiply_list (Xlist.map schema (fun s ->
      Xlist.map (Xlist.multiply_list (Xlist.map s.morfs (function
            LexPhraseId(id,pos,XOR l) -> Xlist.map l (fun lex -> LexPhraseId(id,pos,lex))
          | LexPhraseId(id,pos,lex) -> [LexPhraseId(id,pos,lex)]
          | morf -> [morf]))) (fun morfs -> {s with morfs=morfs})))

let split_or_coord schema =
  Xlist.map schema (fun s ->
      {s with morfs=List.flatten (Xlist.map s.morfs (function
             LexPhraseId(id,pos,ORcoord l) -> Xlist.map l (fun lex -> LexPhraseId(id,pos,lex))
           | LexPhraseId(id,pos,ORconcat l) -> Xlist.map l (fun lex -> LexPhraseId(id,pos,lex)) (* FIXME: koordynacja zamiast konkatenacji *)
           | LexPhraseId(id,pos,lex) -> [LexPhraseId(id,pos,lex)]
           | morf -> [morf]))})

let simplify_lex schemas =
  Xlist.map schemas (fun schema ->
      Xlist.map schema (fun s ->
          {s with morfs=Xlist.map s.morfs (function
                 LexPhraseId(id,pos,Lexeme lex) -> LexArg(id,pos,lex)
               | LexPhraseId _ as morf -> failwith ("simplify_lex: " ^ ENIAMwalStringOf.morf morf)
               | morf -> morf)}))

let convert morfs =
  let morfs = expand_lexicalizations_morfs morfs in
  let morfs,frames = Xlist.fold morfs ([],[]) extract_lex_morf in
  Xlist.fold frames(*extract_lex_frames lexeme pos [] frame*) valence (fun valence -> function
        lexeme,pos,Frame(atrs,schema) ->
        let schemas = simplify_lex (split_xor (split_or_coord schema)) in
        Xlist.fold schemas valence (fun valence schema ->
            let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
            let poss = StringMap.add_inc poss pos [Frame(atrs,schema)] (fun l -> Frame(atrs,schema) :: l) in
            StringMap.add valence lexeme poss)
      | lexeme,pos,LexFrame(id,pos2,restr,schema) ->
        let schemas = simplify_lex (split_xor (split_or_coord schema)) in
        Xlist.fold schemas valence (fun valence schema ->
            let poss = try StringMap.find valence lexeme with Not_found -> StringMap.empty in
            let poss = StringMap.add_inc poss pos [LexFrame(id,pos2,restr,schema)] (fun l -> LexFrame(id,pos2,restr,schema) :: l) in
            StringMap.add valence lexeme poss)
      | _ -> failwith "convert_frame") *)()


let phrases = IntMap.map ENIAMwalRealizations.phrases convert