ENIAMwalConnect.ml 13.3 KB
(*
 *  ENIAMwalenty, an interface for Polish Valence Dictionary "Walenty".
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd

let rec parse_comp = function
    "int",[] -> ENIAMwalTypes.Int,[]
  | "rel",[] -> ENIAMwalTypes.Rel,[]
  | "int",l -> ENIAMwalTypes.Int, Xlist.map l (fun s -> ENIAMwalTypes.Comp s)
  | "rel",l -> ENIAMwalTypes.Rel, Xlist.map l (fun s -> ENIAMwalTypes.Comp s)
  | s,[] -> ENIAMwalTypes.CompTypeUndef,[ENIAMwalTypes.Comp s]
  | _ -> failwith "parse_comp"


let rec morf_of_phrase = function
    NP c -> ENIAMwalTypes.Phrase (ENIAMwalTypes.NP(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
  | PrepNP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.PrepNP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
  | AdjP c -> ENIAMwalTypes.Phrase (ENIAMwalTypes.AdjP(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
  | PrepAdjP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.PrepAdjP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
  | ComprepNP prep -> ENIAMwalTypes.Phrase (ENIAMwalTypes.ComprepNP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep]))
  | CP(co) ->  ENIAMwalTypes.PhraseComp(ENIAMwalTypes.Cp,parse_comp co)
  | NCP(c,co) -> ENIAMwalTypes.PhraseComp(ENIAMwalTypes.Ncp(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]),parse_comp co)
  | PrepNCP(prep,c,co) -> ENIAMwalTypes.PhraseComp(ENIAMwalTypes.Prepncp(ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]),parse_comp co)
  | InfP(a) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.InfP(ENIAMwalParser.parse_aspect [ENIAMwalTypes.Text a]))
  | XP(mode,phrases) -> ENIAMwalTypes.PhraseAbbr(ENIAMwalTypes.Xp(fst (ENIAMwalParser.parse_mode [ENIAMwalTypes.Text mode])), Xlist.map phrases morf_of_phrase)
  | AdvP mode -> ENIAMwalTypes.PhraseAbbr(ENIAMwalTypes.Advp(fst (ENIAMwalParser.parse_mode [ENIAMwalTypes.Text mode])), [])
  | NumP(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.NumP(ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
  | PrepNumP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.PrepNumP(ENIAMwalTypes.Sem,ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep],ENIAMwalParser.parse_case [ENIAMwalTypes.Text c]))
  | ComparP prep -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.ComparP(ENIAMwalParser.parse_prep [ENIAMwalTypes.Text prep]),[])
  | Nonch -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.Nonch,[])
  | Or -> ENIAMwalTypes.Phrase ENIAMwalTypes.Or
  | Refl  -> ENIAMwalTypes.Phrase (ENIAMwalTypes.Lex "się")
  | Recip  -> ENIAMwalTypes.Phrase (ENIAMwalTypes.Lex "się")
  | E -> ENIAMwalTypes.E ENIAMwalTypes.Null
  | DistrP -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.Distrp,[])
  | PossP -> ENIAMwalTypes.PhraseAbbr (ENIAMwalTypes.Possp,[])
  | FixedP(_,s) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.FixedP s)
  | Lex lex -> (*print_endline "lex";*) ENIAMwalTypes.Phrase (ENIAMwalTypes.Null) (* FIXME: ni *)
  | Null -> ENIAMwalTypes.Phrase (ENIAMwalTypes.Null)
  | _ -> failwith "morf_of_phrase"

(*  | GerP(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
  | PrepGerP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
  | PpasP(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
  | PrepPpasP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
  | PPact(c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
  | PrepPactP(prep,c) -> ENIAMwalTypes.Phrase (ENIAMwalTypes.
  | Qub -> ENIAMwalTypes.Phrase (ENIAMwalTypes.*)


open ENIAMwalTypes

let process_phrases phrases =
  Xlist.fold phrases StringMap.empty (fun phrases (id,phrase) ->
    let id =
      match id with
        {hash=false; numbers=[(*_;_;_;*)id]; suffix="phr"} -> id
      | _ -> failwith "process_phrases" in
    StringMap.add phrases id phrase)

let process_positions positions =
  Xlist.fold positions StringMap.empty (fun positions position ->
    let id =
      match position.psn_id with
        {hash=false; numbers=[(*_;_;*)id]; suffix="psn"} -> id
      | _ -> failwith "process_positions" in
    let r,cr,ce = ENIAMwalParser.parse_roles (position.gf :: position.control) in
    let phrases = process_phrases position.phrases in
    StringMap.add positions id (r,cr,ce,phrases))

let process_schemata schemata =
  Xlist.fold schemata StringMap.empty (fun schemata schema ->
    let id =
      match schema.sch_id with
        {hash=false; numbers=[(*_;*)id]; suffix="sch"} -> id
      | _ -> failwith "process_schemata" in
    let schema_atrs = DefaultAtrs([],
      ENIAMwalParser.parse_refl [Text schema.reflexiveMark],
      ENIAMwalParser.parse_opinion schema.opinion,
      ENIAMwalParser.parse_negation [Text schema.negativity],
      ENIAMwalParser.parse_pred [Text schema.predicativity],
      ENIAMwalParser.parse_aspect [Text schema.aspect]) in
    let positions = process_positions schema.positions in
    StringMap.add schemata id (schema_atrs,positions))

let add_meanings meanings = function
    DefaultAtrs(_,r,o,n,p,a) -> DefaultAtrs(meanings,r,o,n,p,a)
  | _ -> failwith "add_meanings"

let process_arguments arguments =
  Xlist.fold arguments StringMap.empty (fun arguments argument ->
    let id =
      match argument.arg_id with
        {hash=false; numbers=[(*_;_;*)id]; suffix="arg"} -> id
      | _ -> failwith "process_arguments" in
    StringMap.add arguments id (argument.role,argument.role_attribute,argument.sel_prefs))

let get_meaning_id meaning =
  match parse_full_id meaning with
    {hash=true; numbers=[_;id]; suffix="mng"} -> id
  | _ -> failwith "get_meaning_id"

let get_schema_id alt =
  try
    match parse_full_id (List.hd ((List.hd alt.connections).phrases)) with
      {hash=true; numbers=[_;id;_;_]; suffix="phr"} -> id
    | _ -> failwith "get_schema_id 1"
  with _ -> failwith "get_schema_id 2"

let get_frame_id alt =
  try
    match parse_full_id ((List.hd alt.connections).argument) with
      {hash=true; numbers=[_;id;_]; suffix="arg"} -> id
    | _ -> failwith "get_frame_id"
  with _ -> failwith "get_frame_id"

let get_argument_id arg =
  match parse_full_id arg with
    {hash=true; numbers=[_;_;id]; suffix="arg"} -> id
  | _ -> failwith "get_argument_id"

let get_position_id phrases =
  try
    match parse_full_id (List.hd phrases) with
      {hash=true; numbers=[_;_;id;_]; suffix="phr"} -> id
    | _ -> failwith "get_position_id"
  with _ -> failwith "get_position_id"

let get_phrase_id arg =
  match parse_full_id arg with
    {hash=true; numbers=[_;_;_;id]; suffix="phr"} -> id
  | _ -> failwith "get_phrase_id"

let process_frames frames =
  Xlist.fold frames StringMap.empty (fun frames frame ->
    let id =
      match frame.frm_id with
        {hash=false; numbers=[(*_;*)id]; suffix="frm"} -> id
      | _ -> failwith "process_frames" in
      let arguments = process_arguments frame.arguments in
      let meaning_ids = Xlist.map frame.meanings get_meaning_id in
      StringMap.add frames id (meaning_ids,arguments))

let process_meanings meanings =
  Xlist.fold meanings StringMap.empty (fun meanings meaning ->
    let id =
      match meaning.mng_id with
        {hash=false; numbers=[(*_;*)id]; suffix="mng"} -> id
      | _ -> failwith "process_meanings" in
      StringMap.add meanings id (meaning.name ^ " " ^ meaning.variant))

let process_sel_pref = function
    Numeric s -> (try ENIAMplWordnet.synset_name s with Not_found -> "unknown")
  | Symbol s -> s
  | Relation(s,t) -> "REL" (* FIXME *)

let connect entry =
  let schemata = process_schemata entry.schemata in
  let frames = process_frames entry.frames in
  let meanings = process_meanings entry.meanings in
  Xlist.fold entry.alternations [] (fun found alt ->
    if alt.connections = [] then found else
    let schema_id = get_schema_id alt in
    let frame_id = get_frame_id alt in
    let schema_atrs,positions = StringMap.find schemata schema_id in
    let meaning_ids,arguments = StringMap.find frames frame_id in
    let positions = Xlist.fold alt.connections [] (fun positions2 conn ->
      let argument_id = get_argument_id conn.argument in
      let position_id = get_position_id conn.phrases in
      let r,cr,ce,phrases = StringMap.find positions position_id in
      let phrases = Xlist.fold conn.phrases [] (fun phrases2 id ->
        let phrase_id = get_phrase_id id in
        try StringMap.find phrases phrase_id :: phrases2
        with Not_found -> (*Printf.printf "%s\n%!" entry.form_orth;*)phrases2) in
      let role,role_attribute,sel_prefs = StringMap.find arguments argument_id in
      let sel_prefs = Xlist.map (List.flatten sel_prefs) process_sel_pref in
      {gf=r; role=role; role_attr=role_attribute; sel_prefs=sel_prefs;
       cr=cr; ce=ce; dir=Both; morfs=Xlist.map phrases morf_of_phrase} :: positions2) in
    let meanings = List.rev (Xlist.fold meaning_ids [] (fun l id ->
      (StringMap.find meanings id) :: l)) in
    let schema_atrs = add_meanings meanings schema_atrs in
    (entry.form_orth,entry.form_pos,Frame(schema_atrs,positions)) :: found)

let connect2 entry =
  let schemata = process_schemata entry.schemata in
  StringMap.fold schemata [] (fun found _ (schema_atrs,positions) ->
    let positions = StringMap.fold positions [] (fun positions2 _ (r,cr,ce,phrases) ->
      let phrases = StringMap.fold phrases [] (fun phrases2 _ phrase -> phrase :: phrases2) in
      {gf=r; role=""; role_attr=""; sel_prefs=[];
       cr=cr; ce=ce; dir=Both; morfs=Xlist.map phrases morf_of_phrase} :: positions2) in
    (entry.form_orth,entry.form_pos,Frame(schema_atrs,positions)) :: found)

let load_walenty2 () =
  let walenty = load_walenty walenty_filename in
  Xlist.fold walenty StringMap.empty (fun walenty entry ->
    if entry.frames = [] then Xlist.fold (connect2 entry) walenty (fun walenty (lemma,pos,frame) ->
      let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
      let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
      StringMap.add walenty pos map)
    else Xlist.fold (connect entry) walenty (fun walenty (lemma,pos,frame) ->
      let map = try StringMap.find walenty pos with Not_found -> StringMap.empty in
      let map = StringMap.add_inc map lemma [frame] (fun l -> frame :: l) in
      StringMap.add walenty pos map))


let print_stringqmap filename qmap =
  let l = StringQMap.fold qmap [] (fun l k v -> (v,k) :: l) in
  File.file_out filename (fun file ->
    Xlist.iter (Xlist.sort l compare) (fun (v,k) ->
      Printf.fprintf file "%5d %s\n" v k))

let sel_prefs_quantities walenty =
  Xlist.fold walenty StringQMap.empty (fun quant e ->
    Xlist.fold e.frames quant (fun quant f ->
      Xlist.fold f.arguments quant (fun quant a ->
        Xlist.fold a.sel_prefs quant (fun quant l ->
          Xlist.fold l quant (fun quant -> function
              Numeric s ->
                let name = try ENIAMplWordnet.synset_name s with Not_found -> "unknown" in
                StringQMap.add quant ("N " ^ s ^ " " ^ name)
            | Symbol s -> StringQMap.add quant ("S " ^ s)
            | Relation(s,t) -> StringQMap.add quant ("R " ^ s ^ " | " ^ t))))))

(*let _ =
  let walenty = load_walenty walenty_filename in
  let quant = sel_prefs_quantities walenty in
  print_stringqmap "results/quant_sel_prefs.txt" quant*)

let print_entry filename lex =
  match Xml.parse_file filename with
    Xml.Element("TEI", _,
      [Xml.Element("teiHeader",_,_) ;
       Xml.Element("text",[],[Xml.Element("body",[],entries)])]) ->
         Xlist.iter entries (function
           Xml.Element("entry",_,Xml.Element("form", [], [Xml.Element("orth",[],[Xml.PCData orth]);_]) :: xml :: _) ->
             if orth = lex then print_endline (Xml.to_string_fmt xml)
         | _ -> failwith "print_entry")
  | _ -> failwith "print_entry"


(*let _ =
  print_entry walenty_filename "bębnić"*)

let print_full_entry filename lex =
  match Xml.parse_file filename with
    Xml.Element("TEI", _,
      [Xml.Element("teiHeader",_,_) ;
       Xml.Element("text",[],[Xml.Element("body",[],entries)])]) ->
         Xlist.iter entries (function
           Xml.Element("entry",e,Xml.Element("form",[],[Xml.Element("orth",[],[Xml.PCData orth]);p]) :: syn :: _ :: l) ->
             let xml = Xml.Element("entry",e,Xml.Element("form",[],[Xml.Element("orth",[],[Xml.PCData orth]);p]) :: syn :: l) in
             if orth = lex then print_endline (Xml.to_string_fmt xml)
         | _ -> failwith "print_full_entry")
  | _ -> failwith "print_full_entry"

(*let _ =
  print_full_entry walenty_filename "bębnić"*)

(*let _ =
  let walenty = load_walenty2 () in
  let frames_sem = try StringMap.find (StringMap.find walenty "verb") "bębnić" with Not_found -> failwith "walTEI" in
  Xlist.iter frames_sem (fun frame ->
    print_endline (WalStringOf.frame "bębnić" frame))*)