ENIAMplWordnet.ml 8.46 KB
(*
 *  ENIAMplWordnet, a converter for Polish Wordnet "Słowosieć".
 *  Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAMplWordnetTypes

let process_unit = function
    Xml.Element("unit-id",[],[Xml.PCData s]) -> int_of_string s, empty_lu
  | node -> failwith ("process_unit " ^ (Xml.to_string node))

let process_tests = function
    Xml.Element("test",["text",text;"pos",pos],[]) -> text,pos
  | node -> failwith ("process_tests " ^ (Xml.to_string node))


(* funkcja zwraca:
lexical-unit map - wiąże leksemy z identyfikatorami
synset map
lexicalrelations
synsetrelations
relationtypes map
*)
let process_entry (lumap,synmap,lr,sr,rtmap) = function
    Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"workstate",workstate;
                                "source",source;"variant",variant],[]) ->
        let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc="";
          lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 2") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"desc",desc;"workstate",workstate;
                                "source",source;"variant",variant],[]) ->
        let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc=desc;
          lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 3") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"definition",definition;"desc",desc;
                                "abstract",abstract],units) ->
        let units = Xlist.map units process_unit in
        let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition=definition;
          syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"desc",desc;
                                "abstract",abstract],units) ->
        let units = Xlist.map units process_unit in
        let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition="";
          syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("lexicalrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
        let lr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: lr in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("synsetrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
        let sr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: sr in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"reverse",reverse;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"reverse",reverse;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | node -> print_endline (Xml.to_string node); failwith "process_entry 1"

let load_data filename =
  match try Xml.parse_file filename with Xml.Error e -> failwith ("load_data Xml.Error " ^ Xml.error e) with
    Xml.Element("array-list",_,entries) ->
      Xlist.fold entries (IntMap.empty,IntMap.empty,[],[],IntMap.empty) process_entry
  | node -> failwith ("load_data " ^ (Xml.to_string node))

let check_lu_syn_consistency lumap synmap =
  let set = IntMap.fold lumap IntSet.empty (fun set id _ ->
    if IntSet.mem set id then failwith "check_lu_syn_consistency 1" else
    IntSet.add set id) in
  let set = IntMap.fold synmap set (fun set _ syn ->
    Xlist.fold syn.syn_units set (fun set (id,_) ->
      if not (IntSet.mem set id) then failwith "check_lu_syn_consistency 2" else
      IntSet.remove set id)) in
  if not (IntSet.is_empty set) then failwith "check_lu_syn_consistency 3" else
  ()

let merge_lu_syn lumap synmap =
  IntMap.map synmap (fun syn ->
    let units = Xlist.map syn.syn_units (fun (id,_) -> id, IntMap.find lumap id) in
    let pos = match StringSet.to_list (Xlist.fold units StringSet.empty (fun set (_,lu) ->
                 StringSet.add set lu.lu_pos)) with
        [] -> failwith "merge_lu_syn: empty synset"
      | [pos] -> pos
      | _ -> failwith "merge_lu_syn: inconsistent pos" in
    {syn with syn_units=units; syn_pos=pos})

let set_lu_syn lumap synmap =
  IntMap.fold synmap lumap (fun lumap syn_id syn ->
    Xlist.fold syn.syn_units lumap (fun lumap (id,_) ->
      let lu = try IntMap.find lumap id with Not_found -> failwith "set_lu_syn" in
      if lu.lu_syn <> -1 then failwith "set_lu_syn" else
      IntMap.add lumap id {lu with lu_syn=syn_id}))