# import_plWordnet.py 4.88 KB
#! /usr/bin/python
# -*- coding: utf-8 -*-

from django.core.management.base import BaseCommand

import sys, os, codecs
from xml.sax import saxutils, handler, make_parser
from collections import defaultdict

from meanings.models import Synset, LexicalUnit
from shellvalier.settings import BASE_DIR
    
# Number of model instances handed to each bulk_create() call during import.
BULK = 250

# Translates the `pos` attribute of a plWordNet lexical unit into the short
# tag that is stored on LexicalUnit.pos.
POS_MAP = {
    'czasownik': 'verb',
    'rzeczownik': 'noun',
    'przymiotnik': 'adj',
    'przysłówek': 'adv',
}
    
class PlWNHandler(handler.ContentHandler):
    """SAX content handler that gathers import data from a plWordNet XML file.

    While the document is parsed the handler fills three collections that the
    importer reads afterwards:

    * ``_synsets_to_base`` -- unsaved ``Synset`` instances, in document order;
    * ``_lexical_units_to_base`` -- unsaved ``LexicalUnit`` instances;
    * ``_hypernymy_to_base`` -- ``{child synset id: [parent synset ids]}``.
    """

    def __init__(self, out=sys.stdout):
        """Create an empty handler; ``out`` is only stored on the instance."""
        handler.ContentHandler.__init__(self)
        self._out = out

        # Parsing state.
        self._defined_synset = -1   # id of the <synset> currently open, -1 if none
        self._unit = False          # True while inside a <unit-id> element
        self._content = ''          # text accumulated for the current <unit-id>

        # Lookup tables built during parsing.
        self._lexical_units = {}    # luid -> (base, sense, pos, desc)
        self._synsets = {}          # synset id -> Synset instance

        # Results consumed after parsing.
        self._synsets_to_base = []
        self._lexical_units_to_base = []
        self._hypernymy_to_base = defaultdict(list)

        # Initialised here but not written by any method of this class.
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        self._other_synset_relations = set()
        self._lexical_relations = set()
        self._synonymy_to_base = []

    def startElement(self, name, attrs):
        """Record the attributes of the elements relevant to the import."""
        if name == 'lexical-unit':
            part_of_speech = attrs['pos']
            # Units whose part of speech mentions 'pwn' are not imported.
            if 'pwn' in part_of_speech:
                return
            unit_id = int(attrs['id'])
            self._lexical_units[unit_id] = (
                attrs['name'],
                int(attrs['variant']),
                part_of_speech,
                attrs['desc'],
            )
            return

        if name == 'synset':
            synset_id = int(attrs['id'])
            self._defined_synset = synset_id
            definition = attrs['definition']
            # 'brak danych' (Polish for "no data") is stored as an empty string.
            if definition == 'brak danych':
                definition = ''
            synset = Synset(id=synset_id, definition=definition)
            self._synsets_to_base.append(synset)
            self._synsets[synset_id] = synset
            return

        if name == 'unit-id':
            self._unit = True
            return

        # Relation code '11' is the one collected as hypernymy.
        if name == 'synsetrelations' and attrs['relation'] == '11':
            parent_ids = self._hypernymy_to_base[int(attrs['child'])]
            parent_ids.append(int(attrs['parent']))

    def endElement(self, name):
        """Close a synset, or turn a finished <unit-id> into a LexicalUnit."""
        if name == 'synset':
            self._defined_synset = -1
            return
        if name != 'unit-id':
            return

        unit_id = int(self._content)
        unit_data = self._lexical_units.get(unit_id)
        # Ids that were never registered (e.g. skipped 'pwn' units) are ignored.
        if unit_data is not None:
            synset = self._synsets[self._defined_synset]
            base, sense, part_of_speech, description = unit_data
            if description == 'brak danych':
                description = ''
            part_of_speech = POS_MAP[part_of_speech]
            self._lexical_units_to_base.append(
                LexicalUnit(
                    luid=unit_id,
                    base=base,
                    sense=sense,
                    pos=part_of_speech,
                    synset=synset,
                    gloss='',
                    definition=description,
                    text_rep='{}-{}'.format(base, sense),
                )
            )
        self._unit = False
        self._content = ''

    def characters(self, content):
        """Accumulate the text of a <unit-id> that sits inside a <synset>."""
        inside_synset_unit = self._unit and self._defined_synset >= 0
        # Whitespace-only chunks are dropped; other chunks are kept verbatim.
        if inside_synset_unit and content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to finalise at the end of the document."""
        
#==========================================================#
class Command(BaseCommand):
    """Management command whose only job is to run the plWordNet import."""

    help = ''
    args = 'none'

    def handle(self, **options):
        """Run the import; ``options`` are accepted but not used."""
        import_plWordnet()
        
def _store_in_batches(model, objects, report_every):
    """Insert ``objects`` via ``model.objects.bulk_create`` in chunks of BULK.

    After every ``report_every`` chunks the number of objects covered so far
    (chunk count times BULK) is printed as a progress marker.
    """
    for batch_no, start in enumerate(range(0, len(objects), BULK), 1):
        model.objects.bulk_create(objects[start:start + BULK])
        if batch_no % report_every == 0:
            print(str(batch_no * BULK) + "...")


def import_plWordnet():
    """Parse the plWordNet XML dump and store its content in the database.

    Steps, in order:

    1. parse ``data/plwordnet/plwordnet_2_1.xml`` with ``PlWNHandler``;
    2. bulk-insert the collected synsets;
    3. bulk-insert the collected lexical units;
    4. attach hypernyms to every child synset.

    A hypernymy entry that refers to a synset missing from the database is
    reported on stdout and skipped. Any other error propagates.
    """
    xml_file = os.path.join(BASE_DIR, 'data', 'plwordnet', 'plwordnet_2_1.xml')
    # os.path.join drops the leading components when a later one is absolute,
    # so the directory of this file only matters if BASE_DIR is relative.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    # Keep a direct reference to the handler instead of asking the parser
    # for it again before every storing step.
    wordnet = PlWNHandler()
    parser = make_parser()
    parser.setContentHandler(wordnet)
    print("Parsing Wordnet...")
    parser.parse(xml_path)
    print("...DONE")

    print()

    print("Storing synsets...")
    _store_in_batches(Synset, wordnet._synsets_to_base, 50)
    print("...DONE")

    print()

    print("Storing lexical units...")
    _store_in_batches(LexicalUnit, wordnet._lexical_units_to_base, 20)
    print("...DONE")

    print()

    print("Storing hypernyms...")
    hypernyms = wordnet._hypernymy_to_base
    print(len(hypernyms))
    for i, (child_id, parent_ids) in enumerate(hypernyms.items(), 1):
        try:
            child = Synset.objects.get(id=child_id)
            parents = [Synset.objects.get(id=parent_id) for parent_id in parent_ids]
        except Synset.DoesNotExist:
            # Only a missing synset is an expected, skippable condition; a
            # bare ``except`` here would also swallow Ctrl-C and DB failures.
            print('************', child_id, parent_ids)
            continue
        if i % 2000 == 0:
            print(i, child, parents)
        child.hypernyms.add(*parents)
        child.save()
    print("...DONE")