ENIAMrealizations.ml 11 KB
(*
 *  ENIAMwalenty, an interface for Polish Valence Dictionary "Walenty".
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMwalTypes

let rec expand_schema_morf expands = function
    PhraseAbbr(Advp "misc",[]) -> PhraseAbbr(Advp "misc",[])
  | PhraseAbbr(Advp "mod",[]) -> PhraseAbbr(Advp "mod",[])
  | PhraseAbbr(ComparP s,[]) -> PhraseAbbr(ComparP s,[Phrase(ComparNP(s,Str));Phrase(ComparPP(s))])
  | PhraseAbbr(abbr,[]) -> (try PhraseAbbr(abbr,AbbrMap.find expands abbr) with Not_found -> failwith "expand_schema_morf")
  | PhraseAbbr(abbr,morfs) -> PhraseAbbr(abbr,Xlist.map morfs (expand_schema_morf expands))
  | LexPhrase(pos_lex,(restr,schema)) -> LexPhrase(pos_lex,(restr,expand_schema expands schema))
  | LexPhraseMode(mode,pos_lex,(restr,schema)) -> LexPhraseMode(mode,pos_lex,(restr,expand_schema expands schema))
  | morf -> morf

and expand_schema expands schema =
  Xlist.map schema (fun s ->
      {s with morfs=Xlist.map s.morfs (fun (id,morf) -> id,expand_schema_morf expands morf)})

let rec expand_subtypes_morf subtypes = function
    PhraseComp(comp_morf,(ctype,comps)) ->
      let comps = if comps = [] then (try CompMap.find subtypes ctype with Not_found -> failwith "expand_subtypes_schema") else comps in
      Xlist.map comps (fun comp -> Phrase(match comp_morf with
          Cp -> CP(ctype,comp)
        | Ncp case -> NCP(case,ctype,comp)
        | Prepncp(prep,case) -> PrepNCP(prep,case,ctype,comp)))
  | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(pos_lex,(restr,expand_subtypes subtypes schema))]
  | LexPhraseMode(mode,pos_lex,(restr,schema)) -> [LexPhraseMode(mode,pos_lex,(restr,expand_subtypes subtypes schema))]
  | PhraseAbbr(abbr,morfs) -> [PhraseAbbr(abbr,List.flatten (Xlist.map morfs (expand_subtypes_morf subtypes)))]
  | E Null -> [E(NP(Str));E(NCP(Str,CompTypeUndef,CompUndef));E(CP(CompTypeUndef,CompUndef)); E(Or)]
  | morf -> [morf]

and expand_subtypes subtypes schema =
  Xlist.map schema (fun s ->
    {s with morfs=List.flatten (Xlist.map s.morfs (fun (id,morf) -> id,expand_subtypes_morf subtypes morf))})

let expand_equivs_phrase equivs = function
  | PrepNP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepNP(prep,case))
  | PrepAdjP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepAdjP(prep,case))
  | PrepNumP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> PrepNumP(prep,case))
  | ComprepNP(prep)  -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComprepNP(prep))
  | ComparNP(prep,case) -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComparNP(prep,case))
  | ComparPP(prep)  -> Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep -> ComparPP(prep))
  | CP(ctype,Comp comp) -> Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp -> CP(ctype,Comp comp))
  | NCP(case,ctype,Comp comp) -> Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp -> NCP(case,ctype,Comp comp))
  | PrepNCP(prep,case,ctype,Comp comp) -> List.flatten (
      Xlist.map (try StringMap.find equivs comp with Not_found -> [comp]) (fun comp ->
        Xlist.map (try StringMap.find equivs prep with Not_found -> [prep]) (fun prep ->
          PrepNCP(prep,case,ctype,Comp comp))))
  | phrase -> [phrase]

let rec expand_equivs_lex equivs = function
    Lexeme s -> (try XOR(Xlist.map (StringMap.find equivs s) (fun s -> Lexeme s)) with Not_found -> Lexeme s)
  | ORconcat l -> ORconcat(Xlist.map l (expand_equivs_lex equivs))
  | ORcoord l -> ORcoord(Xlist.map l (expand_equivs_lex equivs))
  | XOR l -> XOR(Xlist.map l (expand_equivs_lex equivs))
  | Elexeme gender -> Elexeme gender

let rec expand_equivs_morf equivs = function
    Phrase phrase -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> Phrase phrase)
  | E phrase -> Xlist.map (expand_equivs_phrase equivs phrase) (fun phrase -> E phrase)
  | LexPhrase(pos_lex,(restr,schema)) -> [LexPhrase(Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
  | LexPhraseMode(mode,pos_lex,(restr,schema)) -> [LexPhraseMode(mode,Xlist.map pos_lex (fun (pos,lex) -> pos, expand_equivs_lex equivs lex),(restr,expand_equivs_schema equivs schema))]
  | PhraseAbbr(abbr,morfs) -> [PhraseAbbr(abbr,List.flatten (Xlist.map morfs (expand_equivs_morf equivs)))]
  | morf -> failwith ("expand_equivs_morf: " ^ ENIAMwalStringOf.morf morf)

and expand_equivs_schema equivs schema =
  Xlist.map schema (fun s ->
    {s with morfs=List.flatten (Xlist.map s.morfs (expand_equivs_morf equivs))})


let rec load_realizations_rec (expands,subtypes,equivs) found rev = function
    [] -> if rev <> [] || found <> [] then failwith "load_realizations_rec" else expands,subtypes,equivs
  | [Str.Text s; Str.Delim "-->"] :: l -> load_realizations_rec (expands,subtypes,equivs) ((s,rev) :: found) [] l
  | [Str.Delim "    "; Str.Text s; Str.Delim "\t"; Str.Text t] :: l ->
       load_realizations_rec (expands,subtypes,equivs) found ((s,t) :: rev) l
  | [Str.Delim "    "; Str.Text s] :: l ->
       load_realizations_rec (expands,subtypes,equivs) found ((s,"") :: rev) l
  | [Str.Delim "% "; Str.Text "Phrase types expand:"] :: l -> load_realizations_rec (found,subtypes,equivs) [] rev l
  | [Str.Delim "% "; Str.Text "Attributes subtypes:"] :: l -> load_realizations_rec (expands,found,equivs) [] rev l
  | [Str.Delim "% "; Str.Text "Attributes equivalents:"] :: l -> load_realizations_rec (expands,subtypes,found) [] rev l
(*   | [Str.Delim "% "; Str.Text s] :: l -> print_endline s; load_realizations_rec found rev l *)
  | [] :: l -> load_realizations_rec (expands,subtypes,equivs) found rev l
  | _ -> failwith "load_realizations_rec"

let rec get_lexemes = function
    Lexeme s -> [s]
  | ORconcat l -> List.flatten (Xlist.map l get_lexemes)
  | ORcoord l -> List.flatten (Xlist.map l get_lexemes)
  | XOR l -> List.flatten (Xlist.map l get_lexemes)
  | Elexeme gender -> failwith "get_lexemes"

let find_comprep_reqs compreps =
  Xlist.fold compreps StringMap.empty (fun comprep_reqs (s,l) ->
    let l = Xlist.map l (function
        LexPhrase(pos_lex,_) -> Xlist.fold pos_lex StringSet.empty (fun set -> function
            _,Lexeme s -> StringSet.add set s
          | _ -> set)
      | LexPhraseMode(_,pos_lex,_) -> Xlist.fold pos_lex StringSet.empty (fun set -> function
            _,Lexeme s -> StringSet.add set s
          | _ -> set)
      | morf -> failwith ("find_compreps_reqs: " ^ ENIAMwalStringOf.morf morf)) in
    if l = [] then failwith "find_compreps_reqs";
    StringMap.add comprep_reqs s (StringSet.to_list (Xlist.fold (List.tl l) (List.hd l) StringSet.union)))

let create_comprep_dict compreps =
  Xlist.fold compreps StringMap.empty (fun compreps (s,l) ->
    Xlist.fold l compreps (fun compreps -> function
        LexPhrase([PREP _,_;SUBST _,lex],_) as morf ->
          let lexemes = get_lexemes lex in
          Xlist.fold lexemes compreps (fun compreps lexeme ->
            StringMap.add_inc compreps lexeme ["subst",(s,morf)] (fun l -> ("subst",(s,morf)) :: l))
      | LexPhraseMode("misc",[ADV grad,lex],restr) ->
          let morf = LexPhrase([ADV grad,lex],restr) in
          let lexemes = get_lexemes lex in
          Xlist.fold lexemes compreps (fun compreps lexeme ->
            StringMap.add_inc compreps lexeme ["adv",(s,morf)] (fun l -> ("adv",(s,morf)) :: l))
      | LexPhrase([PREP _,_;NUM _,_;SUBST _,lex],_) as morf ->
          let lexemes = get_lexemes lex in
          Xlist.fold lexemes compreps (fun compreps lexeme ->
            StringMap.add_inc compreps lexeme ["subst",(s,morf)] (fun l -> ("subst",(s,morf)) :: l))
      | morf -> failwith ("create_comprep_dict: " ^ ENIAMwalStringOf.morf morf)))

let load_realizations () =
  (* let lines = Str.split (Str.regexp "\n") (File.load_file realizations_filename) in
  let lines = Xlist.rev_map lines (fun line -> Str.full_split (Str.regexp "% \\|-->\\|    \\|\t") line) in
  let expands,subtypes,equivs = load_realizations_rec ([],[],[]) [] [] lines in *)
  let subtypes = Xlist.fold ENIAMwalTEI.subtypes CompMap.empty (fun subtypes -> function
      "int",l -> CompMap.add subtypes Int (List.flatten (Xlist.map l (fun v -> snd(ENIAMwalTEI.parse_comp v))))
    | "rel",l -> CompMap.add subtypes Rel (List.flatten (Xlist.map l (fun v -> snd(ENIAMwalTEI.parse_comp v))))
    | _ -> failwith "load_realizations 1") in
  let equivs = Xlist.fold ENIAMwalTEI.equivs StringMap.empty (fun equivs (k,l) -> StringMap.add equivs k (k :: l)) in
  let expands,compreps = Xlist.fold ENIAMwalTEI.expands (AbbrMap.empty,[]) (fun (expands, compreps) (k,l) ->
    match parse_morf_single (split_schema k) with
(*        PhraseAbbr(Advp m,[]) -> AbbrMap.add expands (Advp m) (Xlist.map l (fun (v,_) ->
             LexPhraseMode(m,[ADV GradUndef,Lexeme v],(Natr,[])))), compreps
      | PhraseAbbr(Nonch,[]) -> AbbrMap.add expands Nonch (Xlist.map l (fun (v,_) ->
             LexPhrase([SUBST(NumberUndef,Str),Lexeme v],(Natr,[])))), compreps
      | PhraseAbbr(Xp m,[]) -> AbbrMap.add expands (Xp m) (List.flatten (Xlist.map l (fun (v,_) ->
           match parse_morfs (split_schema v) with
             [PhraseAbbr(Advp m,[])] -> (try AbbrMap.find expands (Advp m) with Not_found -> [PhraseAbbr(Advp m,[])]) (* FIXME: zakładam, że advp się nie rozmnoży *)
           | morfs -> morfs))), compreps
      | Phrase(ComprepNP(_,s)) -> expands, (s, Xlist.map l (fun (v,_) -> parse_morf_single (split_schema v))) :: compreps
      | PhraseAbbr(Distrp,[]) -> AbbrMap.add expands Distrp (Xlist.map l (fun (v,_) -> parse_morf_single (split_schema v))), compreps
          | PhraseAbbr(Possp,[]) -> AbbrMap.add expands Possp (Xlist.map l (fun (v,_) -> parse_morf_single (split_schema v))), compreps*)
      | _ -> failwith "load_realizations 2") in
  let compreps = Xlist.map compreps (fun (s,morfs) ->
    s, List.flatten (List.flatten (Xlist.map morfs (fun morf -> Xlist.map (expand_subtypes_morf subtypes (expand_schema_morf expands morf)) (expand_equivs_morf equivs))))) in
  let comprep_reqs = find_comprep_reqs compreps in
  let compreps = create_comprep_dict compreps in
  expands,compreps,comprep_reqs,subtypes,equivs

let expands,compreps,comprep_reqs,subtypes,equivs = load_realizations ()