# import_plwn.py
import sys, os, codecs

from django.core.management.base import BaseCommand
from lxml import etree
from xml.sax import saxutils, handler, make_parser

from wordnet.models import LexicalUnit, LexicalUnitLink, SynsetLink, Relation, Synset
from settings import PROJECT_PATH


# Number of objects handed to each bulk_create() call when the parsed
# wordnet objects are stored.
BULK = 250


class PlWNHandler(handler.ContentHandler):
    """SAX handler collecting unsaved model objects from a wordnet XML file.

    Nothing is written to the database by the handler itself; it only builds
    the lists below, which the caller is expected to store afterwards:

    * ``_synsets_to_base``       -- ``Synset`` objects, one per ``synset`` element
    * ``_lexical_units_to_base`` -- ``LexicalUnit`` objects, one per ``unit-id``
      that refers to a known ``lexical-unit``
    * ``_synset_links_to_base``  -- ``SynsetLink`` objects from ``synsetrelations``
    * ``_lus_links_to_base``     -- ``LexicalUnitLink`` objects from
      ``lexicalrelations``

    ``Relation`` rows are read from the database while parsing, so they must
    exist before the parser is run.
    """

    def __init__(self, out=sys.stdout):
        # Explicit base-class call (not super()) kept from the original.
        handler.ContentHandler.__init__(self)
        self._out = out
        # luid -> (base, sense, pos, desc) taken from 'lexical-unit' elements.
        self._lexical_units = {}
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        # synset id -> Synset object.
        self._synsets = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        # Id of the synset currently being parsed, -1 when outside a synset.
        self._defined_synset = -1
        # True while inside a 'unit-id' element.
        self._unit = False
        # Text accumulated for the current 'unit-id' element.
        self._content = ''
        self._other_synset_relations = set()
        self._lexical_relations = set()
        self._synsets_to_base = []
        self._lexical_units_to_base = []
        # luid -> LexicalUnit object.
        self._lexical_units_objs = {}
        self._synset_links_to_base = []
        self._lus_links_to_base = []
        # relation id -> Relation object, so each relation type is fetched
        # from the database once instead of once per link element.
        self._relations = {}

    def _get_relation(self, relation_id):
        """Return the ``Relation`` with the given id, querying it only once."""
        try:
            return self._relations[relation_id]
        except KeyError:
            relation = Relation.objects.get(id=relation_id)
            self._relations[relation_id] = relation
            return relation

    def startElement(self, name, attrs):
        """Record the data carried by the attributes of an opening tag."""
        if name == 'lexical-unit':
            # NOTE: a former filter on 'pwn' in attrs['pos'] is disabled;
            # every lexical unit is recorded.
            luid = int(attrs['id'])
            lubase = attrs['name']
            lusense = int(attrs['variant'])
            pos = attrs['pos']
            desc = attrs['desc']
            self._lexical_units[luid] = (lubase, lusense, pos, desc)
        elif name == 'synset':
            sid = int(attrs['id'])
            self._defined_synset = sid
            synset = Synset(id=sid)
            self._synsets_to_base.append(synset)
            self._synsets[sid] = synset
        elif name == 'unit-id':
            self._unit = True
        elif name == 'synsetrelations':
            if attrs['valid'] == 'true':
                parent = int(attrs['parent'])
                child = int(attrs['child'])
                # Links touching an unknown synset are dropped; the relation
                # is looked up only for links that are kept.
                if child in self._synsets and parent in self._synsets:
                    link = SynsetLink(
                        parent=self._synsets[parent],
                        child=self._synsets[child],
                        relation=self._get_relation(int(attrs['relation'])))
                    self._synset_links_to_base.append(link)
        elif name == 'lexicalrelations':
            if attrs['valid'] == 'true':
                parent = int(attrs['parent'])
                child = int(attrs['child'])
                units = self._lexical_units_objs
                if child in units and parent in units:
                    link = LexicalUnitLink(
                        parent=units[parent],
                        child=units[child],
                        relation=self._get_relation(int(attrs['relation'])))
                    self._lus_links_to_base.append(link)
                else:
                    # Report the skipped link.  An endpoint may be missing
                    # from _lexical_units as well, so fall back to the raw
                    # id instead of raising KeyError.
                    print (self._lexical_units.get(child, child), '-->',
                           self._lexical_units.get(parent, parent))

    def endElement(self, name):
        """Finish a synset, or attach the just-read unit id to its synset."""
        if name == 'synset':
            self._defined_synset = -1
        elif name == 'unit-id':
            text = self._content.strip()
            self._unit = False
            self._content = ''
            if not text:
                # Empty 'unit-id', or one outside a synset (characters()
                # ignores its text): nothing to attach.
                return
            luid = int(text)
            if luid in self._lexical_units:
                synset = self._synsets[self._defined_synset]
                lubase, lusense, pos, desc = self._lexical_units[luid]
                lu = LexicalUnit(luid=luid, base=lubase, sense=lusense,
                                 pos=pos, synset=synset, definition=desc)
                self._lexical_units_to_base.append(lu)
                self._lexical_units_objs[luid] = lu

    def characters(self, content):
        """Accumulate non-blank text of a 'unit-id' that lies inside a synset."""
        if self._unit and self._defined_synset >= 0 and content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to do; the collected objects are stored by the caller."""
        pass


def import_relations(plwn_path):
    """Create ``Relation`` rows from the ``relationtypes`` elements of the file.

    The file at ``plwn_path`` is parsed once.  Every ``relationtypes`` element
    yields a ``Relation`` with the element's ``id`` and ``name``.  Elements
    carrying a ``parent`` attribute are remembered and linked to their parent
    relation after the pass, when all relations exist.
    """
    print ('Importing relation types')
    # (child id, parent id) pairs; resolved after the pass because a parent
    # may appear later in the file than its child.
    parent_links = []
    for _, element in etree.iterparse(plwn_path):
        if element.tag == 'relationtypes':
            relation_id = int(element.attrib['id'])
            Relation.objects.create(id=relation_id,
                                    name=element.attrib['name'])
            if 'parent' in element.attrib:
                parent_links.append(
                    (relation_id, int(element.attrib['parent'])))
        elif element.getparent() is not None:
            # Detach every other finished element from the tree being built.
            element.getparent().remove(element)

    print ('Adding parent relations')
    for child_id, parent_id in parent_links:
        child = Relation.objects.get(id=child_id)
        child.parent = Relation.objects.get(id=parent_id)
        child.save()


# ==========================================================#
class Command(BaseCommand):
    """Management command that runs the wordnet import."""

    help = ''
    args = 'none'

    def handle(self, **options):
        """Run the import; ``options`` are accepted but not used."""
        import_plWordnet()


def _store_in_batches(model, objects):
    """Bulk-create ``objects`` through ``model.objects`` in slices of ``BULK``.

    After each slice the number of objects stored so far is printed.
    """
    total = len(objects)
    for start in range(0, total, BULK):
        end = min(start + BULK, total)
        model.objects.bulk_create(objects[start:end])
        # Report the real count; the last slice may be shorter than BULK.
        print (str(end) + "...")


def import_plWordnet():
    """Import ``data/plwordnet-3.0.xml`` (under ``PROJECT_PATH``) into the database.

    Relation types are imported first, then the file is parsed with
    ``PlWNHandler`` and the collected objects are stored in batches.
    """
    plwn_path = os.path.join(PROJECT_PATH, 'data', 'plwordnet-3.0.xml')

    import_relations(plwn_path)

    wordnet_handler = PlWNHandler()
    parser = make_parser()
    parser.setContentHandler(wordnet_handler)
    print ("Parsing Wordnet...")
    parser.parse(plwn_path)
    print ("...DONE")

    # Order kept from the original: lexical units are built with a synset,
    # and links are built with synsets / lexical units as parent and child.
    # NOTE(review): the links were created from not-yet-saved objects; this
    # presumably relies on their primary keys being set before bulk_create --
    # confirm against the models.
    stages = (
        ("Storing synsets...", Synset,
         wordnet_handler._synsets_to_base),
        ("Storing lexical units...", LexicalUnit,
         wordnet_handler._lexical_units_to_base),
        ("Storing synset links...", SynsetLink,
         wordnet_handler._synset_links_to_base),
        ("Storing lexical unit links...", LexicalUnitLink,
         wordnet_handler._lus_links_to_base),
    )
    for message, model, objects in stages:
        print ("")
        print (message)
        _store_in_batches(model, objects)
        print ("...DONE")