ENIAMwalGenerate.ml 16.3 KB
(*
 *  ENIAMwalenty, a converter for Polish Valence Dictionary "Walenty".
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMwalTypes
open Xstd

let correct_walenty entry =
  if entry.form_orth = "podobać" then
    {entry with schemata=Xlist.map entry.schemata (fun s ->
         {s with positions=Xlist.map s.positions (fun p ->
              if p.gf=SUBJ then {p with morfs=List.flatten (Xlist.map p.morfs (function
                    MorfId 126 -> []
                  | m -> [m]))}
              else p)})}
  else entry

let load_walenty walenty_filename expands_filename =
  let walenty,phrases = ENIAMwalTEI.load_walenty walenty_filename in
  let walenty = Xlist.rev_map walenty correct_walenty in
  let expands = ENIAMwalTEI.load_expands expands_filename in
  let meanings =
    Xlist.fold walenty IntMap.empty (fun meanings entry ->
      Xlist.fold entry.meanings meanings (fun meanings meaning ->
        IntMap.add meanings meaning.mng_id meaning)) in
  let connected_walenty =
    Xlist.fold walenty Entries.empty (fun connected_walenty e ->
        let entries = ENIAMwalConnect.connect e in
        Entries.add_inc_list connected_walenty e.form_pos e.form_orth entries) in
  let schemata_walenty =
    Xlist.fold walenty Entries.empty (fun schemata_walenty e ->
        let entries = ENIAMwalConnect.schemata e in
        Entries.add_inc_list schemata_walenty e.form_pos e.form_orth entries) in
  let expands,compreps,subtypes,equivs,adv_types =
    ENIAMwalRealizations.load_realizations (expands,ENIAMwalTEI.subtypes,ENIAMwalTEI.equivs) in
  let phrases =
    IntMap.map phrases (fun morf ->
        let morf = ENIAMwalRealizations.expand_schema_morf expands morf in
        let morfs = ENIAMwalRealizations.expand_subtypes_morf subtypes morf in
        let morf = List.flatten (Xlist.map morfs (ENIAMwalRealizations.expand_equivs_morf equivs)) in
        morf) in
  let compreps = Xlist.map compreps (fun (lemma,morfs) ->
      lemma, ENIAMwalLex.expand_lexicalizations_morfs morfs) in
  let entries = ENIAMwalLex.extract_lex_entries_comprepnp [] compreps in
  let phrases,entries =
    IntMap.fold phrases (IntMap.empty,entries) (fun (phrases,entries) id morfs ->
        let morfs = ENIAMwalLex.expand_lexicalizations_morfs morfs in
        let morfs,entries = Xlist.fold morfs ([],entries) ENIAMwalLex.extract_lex_entries in
        IntMap.add phrases id morfs, entries) in
  let entries = Xlist.fold entries Entries.empty (fun entries (pos,lemma,entry) ->
      Entries.add_inc entries pos lemma entry) in
  let entries = Entries.map2 entries (fun pos lemma entries -> EntrySet.to_list (EntrySet.of_list entries)) in
  let entries = Entries.flatten_map entries (fun pos lemma entry ->
      ENIAMwalLex.expand_restr [] lemma pos entry) in
    (* let entries =
      StringMap.mapi entries (fun pos entries2 ->
        StringMap.mapi entries2 (fun lemma entries3 ->
            EntrySet.fold entries3 [] (fun entries3 entry ->
                (ENIAMwalLex.expand_restr [] lemma pos entry) @ entries3))) in *)
  connected_walenty, schemata_walenty, phrases, entries, meanings, adv_types

let print_entries filename entries =
  File.file_out filename (fun file ->
      Entries.iter entries (fun pos lemma entry ->
          Printf.fprintf file "%s\t%s\t%s\n" pos lemma (ENIAMwalStringOf.lex_entry entry)))

let print_phrases filename phrases =
  File.file_out filename (fun file ->
      IntMap.iter phrases (fun id morfs ->
          let morfs = Xlist.map morfs ENIAMwalStringOf.morf in
          Printf.fprintf file "%d\t%s\n" id (String.concat "\t" morfs)))

let print_schemata filename schemata =
  File.file_out filename (fun file ->
      Entries.iter schemata (fun pos lemma (opinion,(n,p,a),schema) ->
          Printf.fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" pos lemma
            (ENIAMwalStringOf.opinion opinion)
            (ENIAMwalStringOf.negation n)
            (ENIAMwalStringOf.pred p)
            (ENIAMwalStringOf.aspect a)
            (ENIAMwalStringOf.simple_schema schema)))

let print_connected filename connected =
  File.file_out filename (fun file ->
      Entries.iter connected (fun pos lemma (sopinion,fopinion,meanings,(n,p,a),schema) ->
          Printf.fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" pos lemma
            (ENIAMwalStringOf.opinion sopinion)
            (ENIAMwalStringOf.opinion fopinion)
            (String.concat "," (Xlist.map meanings string_of_int))
            (ENIAMwalStringOf.negation n)
            (ENIAMwalStringOf.pred p)
            (ENIAMwalStringOf.aspect a)
            (ENIAMwalStringOf.connected_schema schema)))

let split_tokens s =
  let l = List.flatten (Xlist.map (Str.full_split (Str.regexp " \\|,\\|-") s) (function
        Str.Delim " " -> []
      | Str.Delim s -> [s]
      | Str.Text s -> [s])) in
  String.concat " " l

let print_fixed filename fixed =
  File.file_out filename (fun file ->
      StringSet.iter fixed (fun s ->
          let t = split_tokens s in
          Printf.fprintf file "%s\t%s\tfixed\n" t s))

let print_adv_types filename adv_types =
  File.file_out filename (fun file ->
      Xlist.iter adv_types (fun (m,l) ->
          Xlist.iter l (fun s ->
              Printf.fprintf file "%s\t%s\n" s m)))

let add_fixed fixed = function
    Phrase (FixedP s) -> StringSet.add fixed s
  | SimpleLexArg(s,FIXED) -> StringSet.add fixed s
  | LexArg(_,s,FIXED) -> StringSet.add fixed s
  | _ -> fixed

let find_fixed_schema fixed schema =
  Xlist.fold schema fixed (fun schema p ->
      Xlist.fold p.morfs fixed add_fixed)

let find_fixed phrases entries =
  let fixed = IntMap.fold phrases StringSet.empty (fun fixed _ morfs ->
      Xlist.fold morfs fixed add_fixed) in
  Entries.fold entries fixed (fun fixed pos lemma -> function
        SimpleLexEntry(s,"fixed") -> StringSet.add fixed s
      | SimpleLexEntry(s,_) -> fixed
      | LexEntry(_,s,"fixed",_,schema) -> find_fixed_schema (StringSet.add fixed s) schema
      | LexEntry(_,_,_,_,schema) -> find_fixed_schema fixed schema
      | ComprepNPEntry(_,_,schema) -> find_fixed_schema fixed schema)

let print_meanings filename meanings =
  File.file_out filename (fun file ->
      IntMap.iter meanings (fun _ m ->
          Printf.fprintf file "%d\t%s\t%s\t%d\t%s\n" m.mng_id m.name m.variant m.plwnluid m.gloss))

(* let connected_walenty, schemata_walenty, phrases, entries, meanings = load_walenty
    "/home/yacheu/Dokumenty/NLP resources/Walenty/walenty_20170311.xml"
    "/home/yacheu/Dokumenty/NLP resources/Walenty/phrase_types_expand_20170311.xml" *)

(* Generowanie zasobów *)
let _ =
  if Array.length Sys.argv < 3 then print_endline "missing argument" else (
    let connected_walenty, schemata_walenty, phrases, entries, meanings, adv_types = load_walenty Sys.argv.(1) Sys.argv.(2) in
    print_entries "resources/entries.tab" entries;
    print_phrases "resources/phrases.tab" phrases;
    print_schemata "resources/schemata.tab" schemata_walenty;
    print_connected "resources/connected.tab" connected_walenty;
    print_fixed "resources/fixed.tab" (find_fixed phrases entries);
    print_meanings "resources/meanings.tab" meanings;
    print_adv_types "resources/adv_modes.tab" adv_types;
    ())

(* Test wczytywania Walentego TEI *)
(* let _ =
   let walenty,phrases = ENIAMwalTEI.load_walenty "/home/yacheu/Dokumenty/NLP resources/Walenty/walenty_20170311.xml" in
   let n = Xlist.fold ENIAMwalTEI.walenty 0 (fun n e -> let l = connect e in n + Xlist.size l) in
   let m = Xlist.fold ENIAMwalTEI.walenty 0 (fun n e -> let l = schemata e in n + Xlist.size l) in
   Printf.printf "%d connected\n%d schemata\n|phrases|=%d\n" n m (IntMap.size phrases);
   () *)

(* Test unikalności indeksów sensów *)
(* let _ =
  let walenty,phrases = ENIAMwalTEI.load_walenty "/home/yacheu/Dokumenty/NLP resources/Walenty/walenty_20170311.xml" in
  Xlist.fold walenty IntMap.empty (fun map e ->
    Xlist.fold e.meanings map (fun map m ->
      IntMap.add_inc map m.mng_id m (fun m1 -> if m1 = m then m else failwith "meaning"))) *)

(*
(* let insert_phrases phrases = function
    Frame(atrs,s) -> Frame(atrs,Xlist.map s (fun p ->
      {p with morfs=Xlist.map p.morfs (function
             MorfId id -> (try IntMap.find phrases id with Not_found -> failwith "insert_phrases")
           | _ -> failwith "insert_phrases")}))
  | _ -> failwith "insert_phrases: ni"

let print_entry pos_map pos orth =
  let orth_map = try StringMap.find pos_map pos with Not_found -> StringMap.empty in
  let frames = try StringMap.find orth_map orth with Not_found -> [] in
  Xlist.iter frames (fun frame ->
      let frame = insert_phrases ENIAMwalTEI.phrases frame in
      print_endline (ENIAMwalStringOf.frame orth frame)) *)

(* Wypisanie hasła *)
(* let _ =
   print_entry connected_walenty "verb" "brudzić";
   () *)

(* let has_nontrivial_lex = function
    Frame(atrs,s) -> Xlist.fold s false (fun b p ->
      if p.role = "Lemma" && p.role_attr = "" then b else
        Xlist.fold p.morfs b (fun b -> function
              MorfId id -> failwith "has_nontrivial_lex"
            | LexPhrase _ -> true
            (* | LexRPhrase _ -> true
               | LexPhraseMode _ -> true *)
            | _ -> b))
  | _ -> failwith "has_nontrivial_lex: ni" *)

(* Leksykalizacje nie wchodzące do lematu *)
(* let _ =
   StringMap.iter connected_walenty (fun _ orth_map ->
      StringMap.iter orth_map (fun orth frames ->
          Xlist.iter frames (fun frame ->
              let frame = insert_phrases ENIAMwalTEI.phrases frame in
              if has_nontrivial_lex frame then
                print_endline (ENIAMwalStringOf.frame orth frame)))) *)

let simplify_frame_verb = function
    Phrase(NP(Case "dat")) -> []
  | Phrase(NP(Case "inst")) -> []
  | Phrase(PrepNP _) -> []
  | Phrase(ComprepNP _) -> []
  | Phrase(AdvP) -> []
  | t -> [t]

let simplify_frame_noun = function
    Phrase(NP(Case "gen")) -> []
  | Phrase(NP(Case "nom")) -> []
  | Phrase(NP(CaseAgr)) -> []
  | Phrase(PrepNP _) -> []
  | Phrase(ComprepNP _) -> []
  | Phrase(AdjP CaseAgr) -> []
  | PhraseComp(Ncp(Case "gen"),_)
  | PhraseComp(Prepncp(_,_),_) -> []
  | PhraseAbbr(Possp,[]) -> []
  | t -> [t]

let simplify_frame_adj = function
  | t -> [t]

let simplify_frame_adv = function
  | t -> [t]


(* let simplify_frame pos = function
    Frame(atrs,s) ->
    let schema = Xlist.fold s [] (fun schema p ->
        let morfs = Xlist.fold p.morfs [] (fun morfs morf ->
            match pos with
              "verb" -> simplify_frame_verb morf @ morfs
            | "noun" -> simplify_frame_noun morf @ morfs
            | "adj" -> simplify_frame_adj morf @ morfs
            | "adv" -> simplify_frame_adv morf @ morfs
            | _ -> failwith "simplify_frame") in
        if morfs = [] then schema else
          {p with ce=[]; cr=[]; morfs=morfs} :: schema) in
    if schema = [] then [] else [Frame(atrs,schema)]
  | _ -> failwith "simplify_frame: ni" *)


(* Uproszczone schematy *)
(* let _ =
   StringMap.iter schemata_walenty (fun pos orth_map ->
      if pos = "noun" then
      StringMap.iter orth_map (fun orth frames ->
          Xlist.iter frames (fun frame ->
              let frame = insert_phrases ENIAMwalTEI.phrases frame in
              let frames = simplify_frame pos frame in
              Xlist.iter frames (fun frame -> print_endline (ENIAMwalStringOf.frame orth frame))))) *)

(* let has_mode_coordination = function
    Frame(atrs,s) -> Xlist.fold s false (fun b p ->
      let n = Xlist.fold p.morfs 0 (fun n -> function
            MorfId id -> failwith "has_nontrivial_lex"
          | PhraseAbbr(Advp _,_) -> n+1
          | PhraseAbbr(Xp _,_) -> n+1
          (* | LexPhraseMode _ -> n+1 FIXME*)
          | _ -> n) in
      if n>1 then true else b)
  | _ -> failwith "has_nontrivial_lex: ni" *)

(* Koordynacja z mode *)
(* let _ =
   StringMap.iter schemata_walenty(*connected_walenty*) (fun _ orth_map ->
      StringMap.iter orth_map (fun orth frames ->
          Xlist.iter frames (fun frame ->
              let frame = insert_phrases ENIAMwalTEI.phrases frame in
              if has_mode_coordination frame then
                print_endline (ENIAMwalStringOf.frame orth frame)))) *)


(* let get_entry orth pos *)
     (*
let load_walenty2 () =
  let walenty = load_walenty walenty_filename in
  Xlist.fold walenty StringMap.empty (fun walenty entry ->
    if entry.frames = [] then Xlist.fold (connect2 entry) walenty (fun walenty (lemma,pos,frame) ->
      let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
      let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
      StringMap.add walenty pos map)
    else Xlist.fold (connect entry) walenty (fun walenty (lemma,pos,frame) ->
      let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
      let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
      StringMap.add walenty pos map))


let print_stringqmap filename qmap =
  let l = StringQMap.fold qmap [] (fun l k v -> (v,k) :: l) in
  File.file_out filename (fun file ->
    Xlist.iter (Xlist.sort l compare) (fun (v,k) ->
      Printf.fprintf file "%5d %s\n" v k))

let sel_prefs_quantities walenty =
  Xlist.fold walenty StringQMap.empty (fun quant e ->
    Xlist.fold e.frames quant (fun quant f ->
      Xlist.fold f.arguments quant (fun quant a ->
        Xlist.fold a.sel_prefs quant (fun quant l ->
          Xlist.fold l quant (fun quant -> function
              Numeric s ->
                let name = try ENIAMplWordnet.synset_name s with Not_found -> "unknown" in
                StringQMap.add quant ("N " ^ s ^ " " ^ name)
            | Symbol s -> StringQMap.add quant ("S " ^ s)
            | Relation(s,t) -> StringQMap.add quant ("R " ^ s ^ " | " ^ t))))))
*)
(*let _ =
  let walenty = load_walenty walenty_filename in
  let quant = sel_prefs_quantities walenty in
  print_stringqmap "results/quant_sel_prefs.txt" quant*)

(*let _ =
  let walenty = load_walenty2 () in
  let frames_sem = try StringMap.find (StringMap.find walenty "verb") "bębnić" with Not_found -> failwith "walTEI" in
  Xlist.iter frames_sem (fun frame ->
    print_endline (WalStringOf.frame "bębnić" frame))*)


(* Wypisanie realizacji *)
(* let _ =
   Xlist.iter ENIAMwalTEI.expands (fun (id,morf,l) ->
      Printf.printf "%d %s:\n" id (ENIAMwalStringOf.morf morf);
      Xlist.iter l (fun morf -> Printf.printf "    %s\n" (ENIAMwalStringOf.morf morf))) *)

(* Wypisanie realizacji po przetworzeniu *)
(* let _ =
   AbbrMap.iter expands (fun morf l ->
      Printf.printf "%s:\n" (ENIAMwalStringOf.phrase_abbr morf);
      Xlist.iter l (fun morf -> Printf.printf "    %s\n" (ENIAMwalStringOf.morf morf))) *)

let has_realization = function
    PhraseAbbr _ -> true
  | PhraseComp _ -> true
  | _ -> false

(* Wypisanie fraz, które podlegają rozwijaniu *)
(*let _ =
  IntMap.iter ENIAMwalTEI.phrases (fun i morf ->
      if has_realization morf then
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)

(* Wypisanie fraz, które podlegają rozwijaniu *)
(* let _ =
   IntMap.iter phrases (fun i morf ->
      if has_realization morf then
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf morf)) *)

(* let test_phrases = [17088; 17133; 1642]
   let _ =
   Xlist.iter test_phrases (fun i ->
      let m1 = IntMap.find ENIAMwalTEI.phrases i in
      let m2 = IntMap.find phrases i in
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m1);
      Printf.printf "%4d %s\n" i (ENIAMwalStringOf.morf m2)) *)

(* let print_entries entries =
  StringMap.iter entries (fun pos entries2 ->
      StringMap.iter entries2 (fun lemma entries3 ->
          EntrySet.iter entries3 (fun entry ->
              Printf.printf "%s: %s: %s\n" pos lemma (ENIAMwalStringOf.entry entry)))) *)

(* let _ = print_entries entries *)
*)