ENIAMwalRealizations.ml 13.2 KB
(*
 *  ENIAMwalenty, an interface for Polish Valence Dictionary "Walenty".
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMwalTypes
open Xstd

let rec expand_schema_morf expands = function
    PhraseAbbr(Advp "misc",[]) -> PhraseAbbr(Advp "misc",[])
  | PhraseAbbr(Advp "mod",[]) -> PhraseAbbr(Advp "mod",[])
  (* | PhraseAbbr(ComparP s,[]) -> PhraseAbbr(ComparP s,[Phrase(ComparNP(s,Str));Phrase(ComparPP(s))]) (* FIXME: albo do walTEI albo usunąć *) *)
  | PhraseAbbr(abbr,[]) -> (try PhraseAbbr(abbr,AbbrMap.find expands abbr) with Not_found -> failwith "expand_schema_morf")
  | PhraseAbbr(abbr,morfs) -> PhraseAbbr(abbr,Xlist.map morfs (expand_schema_morf expands))
  | LexPhrase(pos_lex,(restr,schema)) -> LexPhrase(pos_lex,(restr,expand_schema expands schema))
  (* | LexPhraseMode(mode,pos_lex,(restr,schema)) -> LexPhraseMode(mode,pos_lex,(restr,expand_schema expands schema)) *)
  | morf -> morf

and expand_schema expands schema =
  Xlist.map schema (fun s ->
      {s with morfs=Xlist.map s.morfs (expand_schema_morf expands)})

(* let get_mode = function
    Xp(m) -> m
  | Advp(m) -> m
  (* | ComparP prep -> "compar" *)
  | Nonch -> "nonch"
  | Distrp -> "distrp"
  | Possp -> "possp" *)

let rec expand_subtypes_morf subtypes = function
    PhraseComp(comp_morf,(ctype,comps)) ->
      let comps = if comps = [] then (try CompMap.find subtypes ctype with Not_found -> failwith "expand_subtypes_schema") else comps in
      Xlist.map comps (fun comp -> Phrase(match comp_morf with
          Cp -> CP(ctype,comp)
        | Ncp case -> NCP(case,ctype,comp)
        | Prepncp(prep,case) -> PrepNCP(prep,case,ctype,comp)))
  | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,expand_subtypes subtypes schema))]
  (* | LexPhraseMode(mode,pos_lex,(restr,schema)) -> [LexPhraseMode(mode,pos_lex,(restr,expand_subtypes subtypes schema))] *)
  | PhraseAbbr(abbr,morfs) ->
    (* let mode = get_mode abbr in *)
    List.flatten (Xlist.map morfs (expand_subtypes_morf subtypes))
    (* Xlist.map morfs (function
          Phrase p -> PhraseMode(mode,p)
        | PhraseMode(_,p) -> PhraseMode(mode,p)
        | LexPhrase(pos_lex,r) -> LexPhraseMode(mode,pos_lex,r)
        | LexRPhrase(pos_lex,r) -> LexRPhraseMode(mode,pos_lex,r)
        | LexPhraseMode(m,pos_lex,r) -> LexPhraseMode(mode,pos_lex,r)
        | LexRPhraseMode(m,pos_lex,r) -> LexRPhraseMode(mode,pos_lex,r)
        | _ -> failwith "expand_subtypes_morf") *)
  | E Null -> [E(NP(Str));E(NCP(Str,CompTypeUndef,CompUndef));E(CP(CompTypeUndef,CompUndef)); E(Or)]
  | morf -> [morf]

and expand_subtypes subtypes schema =
  Xlist.map schema (fun s ->
    {s with morfs=List.flatten (Xlist.map s.morfs (expand_subtypes_morf subtypes))})

let expand_equivs_phrase equivs = function
  | PrepNP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepNP(prep,case))
  | PrepAdjP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepAdjP(prep,case))
  | PrepNumP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepNumP(prep,case))
  | ComprepNP(prep)  -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComprepNP(prep))
  (* | ComparNP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComparNP(prep,case)) *)
  | ComparP(prep)  -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComparP(prep))
  | CP(ctype,Comp comp) -> Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp -> CP(ctype,Comp comp))
  | NCP(case,ctype,Comp comp) -> Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp -> NCP(case,ctype,Comp comp))
  | PrepNCP(prep,case,ctype,Comp comp) -> List.flatten (
      Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp ->
        Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep ->
          PrepNCP(prep,case,ctype,Comp comp))))
  | phrase -> [phrase]

let rec expand_equivs_lex equivs = function
    Lexeme s -> (try XOR(Xlist.map (StringMap.find equivs s) (fun s -> Lexeme s)) with Not_found -> Lexeme s)
  | ORconcat l -> ORconcat(Xlist.map l (expand_equivs_lex equivs))
  | ORcoord l -> ORcoord(Xlist.map l (expand_equivs_lex equivs))
  | XOR l -> XOR(Xlist.map l (expand_equivs_lex equivs))
  | Elexeme gender -> Elexeme gender

let rec expand_equivs_morf equivs = function
    Phrase phrase -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> Phrase phrase)
  (* | PhraseMode(mode,phrase) -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> PhraseMode(mode,phrase)) *)
  | E phrase -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> E phrase)
  | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
  (* | LexRPhrase(pos_lex,(restr,schema)) -> [LexRPhrase(Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
  | LexPhraseMode(mode,pos_lex,(restr,schema)) -> [LexPhraseMode(mode,Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
  | LexRPhraseMode(mode,pos_lex,(restr,schema)) -> [LexRPhraseMode(mode,Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))] *)
(* | PhraseAbbr(abbr,morfs) -> [PhraseAbbr(abbr,List.flatten (Xlist.map morfs (expand_equivs_morf equivs)))] *)
  | morf -> failwith ("expand_equivs_morf: " ^ ENIAMwalStringOf.morf morf)

and expand_equivs_schema equivs schema =
  Xlist.map schema (fun s ->
    {s with morfs=List.flatten (Xlist.map s.morfs (expand_equivs_morf equivs))})


let rec load_realizations_rec (expands,subtypes,equivs) found rev = function
    [] -> if rev <> [] || found <> [] then failwith "load_realizations_rec" else expands,subtypes,equivs
  | [Str.Text s; Str.Delim "-->"] :: l -> load_realizations_rec (expands,subtypes,equivs) ((s,rev) :: found) [] l
  | [Str.Delim "    "; Str.Text s; Str.Delim "\t"; Str.Text t] :: l ->
       load_realizations_rec (expands,subtypes,equivs) found ((s,t) :: rev) l
  | [Str.Delim "    "; Str.Text s] :: l ->
       load_realizations_rec (expands,subtypes,equivs) found ((s,"") :: rev) l
  | [Str.Delim "% "; Str.Text "Phrase types expand:"] :: l -> load_realizations_rec (found,subtypes,equivs) [] rev l
  | [Str.Delim "% "; Str.Text "Attributes subtypes:"] :: l -> load_realizations_rec (expands,found,equivs) [] rev l
  | [Str.Delim "% "; Str.Text "Attributes equivalents:"] :: l -> load_realizations_rec (expands,subtypes,found) [] rev l
(*   | [Str.Delim "% "; Str.Text s] :: l -> print_endline s; load_realizations_rec found rev l *)
  | [] :: l -> load_realizations_rec (expands,subtypes,equivs) found rev l
  | _ -> failwith "load_realizations_rec"

let rec get_lexemes = function
    Lexeme s -> [s]
  | ORconcat l -> List.flatten (Xlist.map l get_lexemes)
  | ORcoord l -> List.flatten (Xlist.map l get_lexemes)
  | XOR l -> List.flatten (Xlist.map l get_lexemes)
  | Elexeme gender -> failwith "get_lexemes"

let find_comprep_reqs compreps =
  Xlist.fold compreps StringMap.empty (fun comprep_reqs (s,l) ->
    let l = Xlist.map l (function
        LexPhrase(pos_lex,_) -> Xlist.fold pos_lex StringSet.empty (fun set -> function
            _,Lexeme s -> StringSet.add set s
          | _ -> set)
      (* | LexPhraseMode(_,pos_lex,_) -> Xlist.fold pos_lex StringSet.empty (fun set -> function
            _,Lexeme s -> StringSet.add set s
          | _ -> set) *)
      | morf -> failwith ("find_compreps_reqs: " ^ ENIAMwalStringOf.morf morf)) in
    if l = [] then failwith "find_compreps_reqs";
    StringMap.add comprep_reqs s (StringSet.to_list (Xlist.fold (List.tl l) (List.hd l) StringSet.union)))

let create_comprep_dict compreps =
  Xlist.fold compreps StringMap.empty (fun compreps (s,l) ->
    Xlist.fold l compreps (fun compreps -> function
        LexPhrase([PREP _,_;SUBST _,lex],_) as morf ->
          let lexemes = get_lexemes lex in
          Xlist.fold lexemes compreps (fun compreps lexeme ->
            StringMap.add_inc compreps lexeme ["subst",(s,morf)] (fun l -> ("subst",(s,morf)) :: l))
      (* | LexPhraseMode("misc",[ADV grad,lex],restr) ->
          let morf = LexPhrase([ADV grad,lex],restr) in
          let lexemes = get_lexemes lex in
          Xlist.fold lexemes compreps (fun compreps lexeme ->
            StringMap.add_inc compreps lexeme ["adv",(s,morf)] (fun l -> ("adv",(s,morf)) :: l)) *)
      | LexPhrase([PREP _,_;NUM _,_;SUBST _,lex],_) as morf ->
          let lexemes = get_lexemes lex in
          Xlist.fold lexemes compreps (fun compreps lexeme ->
            StringMap.add_inc compreps lexeme ["subst",(s,morf)] (fun l -> ("subst",(s,morf)) :: l))
      | morf -> failwith ("create_comprep_dict: " ^ ENIAMwalStringOf.morf morf)))

let load_realizations () =
  (* let lines = Str.split (Str.regexp "\n") (File.load_file realizations_filename) in
  let lines = Xlist.rev_map lines (fun line -> Str.full_split (Str.regexp "% \\|-->\\|    \\|\t") line) in
  let expands,subtypes,equivs = load_realizations_rec ([],[],[]) [] [] lines in *)
  let subtypes = Xlist.fold ENIAMwalTEI.subtypes CompMap.empty (fun subtypes -> function
      "int",l -> CompMap.add subtypes Int (List.flatten (Xlist.map l (fun v -> snd(ENIAMwalTEI.parse_comp v))))
    | "rel",l -> CompMap.add subtypes Rel (List.flatten (Xlist.map l (fun v -> snd(ENIAMwalTEI.parse_comp v))))
    | _ -> failwith "load_realizations 1") in
  let equivs = Xlist.fold ENIAMwalTEI.equivs StringMap.empty (fun equivs (k,l) -> StringMap.add equivs k (k :: l)) in
  let expands,compreps = Xlist.fold ENIAMwalTEI.expands (AbbrMap.empty,[]) (fun (expands, compreps) (id,k,l) ->
    match k with
        PhraseAbbr(Advp m,[]) -> AbbrMap.add expands (Advp m) l, compreps
      | PhraseAbbr(Nonch,[]) -> AbbrMap.add expands Nonch l, compreps
      | PhraseAbbr(Xp m,[]) -> AbbrMap.add expands (Xp m) (List.flatten (Xlist.map l (function
             PhraseAbbr(Advp m,[]) -> (try AbbrMap.find expands (Advp m) with Not_found -> [PhraseAbbr(Advp m,[])]) (* FIXME: zakładam, że advp się nie rozmnoży *)
          | morf -> [morf]))), compreps
      | Phrase(ComprepNP s) -> expands, (s, l) :: compreps
      | PhraseAbbr(Distrp,[]) -> AbbrMap.add expands Distrp l, compreps
      | PhraseAbbr(Possp,[]) -> AbbrMap.add expands Possp l, compreps
      | _ -> failwith "load_realizations 2") in
  let compreps = Xlist.map compreps (fun (s,morfs) ->
    s, List.flatten (List.flatten (Xlist.map morfs (fun morf -> Xlist.map (expand_subtypes_morf subtypes (expand_schema_morf expands morf)) (expand_equivs_morf equivs))))) in
  let comprep_reqs = find_comprep_reqs compreps in
  (* let compreps = create_comprep_dict compreps in *)
  expands(*,compreps*),comprep_reqs,subtypes,equivs

let expands(*,compreps*),comprep_reqs,subtypes,equivs = load_realizations ()


(* Wypisanie realizacji *)
(* let _ =
  Xlist.iter ENIAMwalTEI.expands (fun (id,morf,l) ->
      Printf.printf "%d %s:\n" id (ENIAMwalStringOf.morf morf);
      Xlist.iter l (fun morf -> Printf.printf "    %s\n" (ENIAMwalStringOf.morf morf))) *)

(* Wypisanie realizacji po przetworzeniu *)
(* let _ =
  AbbrMap.iter expands (fun morf l ->
      Printf.printf "%s:\n" (ENIAMwalStringOf.phrase_abbr morf);
      Xlist.iter l (fun morf -> Printf.printf "    %s\n" (ENIAMwalStringOf.morf morf))) *)

let has_realization = function
    PhraseAbbr _ -> true
  | PhraseComp _ -> true
  | _ -> false

(* Wypisanie fraz, które podlegają rozwijaniu *)
(*let _ =
  IntMap.iter ENIAMwalTEI.phrases (fun i morf ->
      if has_realization morf then
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)

let phrases =
  IntMap.map ENIAMwalTEI.phrases (fun morf ->
      let morf = expand_schema_morf expands morf in
      let morfs = expand_subtypes_morf subtypes morf in
      let morf = List.flatten (Xlist.map morfs (expand_equivs_morf equivs)) in
      morf)

(* Wypisanie fraz, które podlegają rozwijaniu *)
(* let _ =
  IntMap.iter phrases (fun i morf ->
      if has_realization morf then
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)

(* let test_phrases = [17088; 17133; 1642]
let _ =
  Xlist.iter test_phrases (fun i ->
      let m1 = IntMap.find ENIAMwalTEI.phrases i in
      let m2 = IntMap.find phrases i in
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m1);
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m2)) *)