#! /usr/bin/python
# -*- coding: utf-8 -*-
"""Django management command that imports plWordnet XML into the database.

The XML file is parsed with a SAX handler that collects unsaved model
instances (synsets, lexical units, hypernymy and synonymy relations);
the collected instances are then written with ``bulk_create`` in batches
of ``BULK`` objects.
"""

from django.core.management.base import BaseCommand

import sys
import os
import codecs
from xml.sax import saxutils, handler, make_parser

from wordnet.models import Synset, LexicalUnit, Hypernymy, Synonymy
from settings import PROJECT_PATH

# Number of objects passed to a single bulk_create() call.
BULK = 250

# Values of the 'relation' attribute of <synsetrelations> handled here.
RELATION_HYPERNYMY = 11  # hypernymy
RELATION_SYNONYMY = 60   # near-synonymy


class PlWNHandler(handler.ContentHandler):
    """SAX content handler collecting model instances from plWordnet XML.

    Nothing is saved by the handler itself.  After parsing, the unsaved
    instances are available in ``_synsets_to_base``,
    ``_lexical_units_to_base``, ``_hypernymy_to_base`` and
    ``_synonymy_to_base``.
    """

    def __init__(self, out = sys.stdout):
        handler.ContentHandler.__init__(self)
        self._out = out
        # luid -> (base, sense, pos, desc), filled from <lexical-unit>.
        self._lexical_units = {}
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        # synset id -> Synset instance, filled from <synset>.
        self._synsets = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        # Id of the <synset> currently being read; -1 outside of a synset.
        self._defined_synset = -1
        # True while inside a <unit-id> element.
        self._unit = False
        # Text accumulated for the current <unit-id>.
        self._content = ''
        self._other_synset_relations = set()
        self._lexical_relations = set()
        # Unsaved instances, in document order, waiting to be stored.
        self._synsets_to_base = []
        self._lexical_units_to_base = []
        self._hypernymy_to_base = []
        self._synonymy_to_base = []

    def startElement(self, name, attrs):
        """Record data carried by the attributes of an opening tag."""
        if name == 'lexical-unit':
            # Units whose part of speech contains 'pwn' are skipped.
            if 'pwn' not in attrs['pos']:
                luid = int(attrs['id'])
                lubase = attrs['name']
                lusense = int(attrs['variant'])
                pos = attrs['pos']
                desc = attrs['desc']
                self._lexical_units[luid] = (lubase, lusense, pos, desc)
        elif name == 'synset':
            sid = int(attrs['id'])
            self._defined_synset = sid
            synset = Synset(id=sid)
            self._synsets_to_base.append(synset)
            self._synsets[sid] = synset
        elif name == 'unit-id':
            self._unit = True
        elif name == 'synsetrelations':
            if attrs['valid'] == 'true':
                parent = int(attrs['parent'])
                child = int(attrs['child'])
                relation = int(attrs['relation'])
                # Only relations between two already seen synsets are kept.
                if child in self._synsets and parent in self._synsets:
                    if relation == RELATION_HYPERNYMY:
                        p = self._synsets[parent]
                        c = self._synsets[child]
                        self._hypernymy_to_base.append(
                            Hypernymy(parent=p, child=c))
                    elif relation == RELATION_SYNONYMY:
                        p = self._synsets[parent]
                        c = self._synsets[child]
                        self._synonymy_to_base.append(
                            Synonymy(parent=p, child=c))

    def endElement(self, name):
        """Finish a synset, or attach a finished unit id to its synset."""
        if name == 'synset':
            self._defined_synset = -1
        elif name == 'unit-id':
            luid = int(self._content)
            # Ids without a recorded <lexical-unit> are ignored.
            if luid in self._lexical_units:
                synset = self._synsets[self._defined_synset]
                lubase, lusense, pos, desc = self._lexical_units[luid]
                lu = LexicalUnit(luid=luid, base=lubase, sense=lusense,
                                 pos=pos, synset=synset, glossa='',
                                 definition=desc)
                self._lexical_units_to_base.append(lu)
            # Reset the text buffer for the next <unit-id>.
            self._unit = False
            self._content = ''

    def characters(self, content):
        """Accumulate the text of a <unit-id> located inside a <synset>."""
        if self._unit and self._defined_synset >= 0 and content.strip():
            self._content += content

    def endDocument(self):
        pass


#==========================================================#
class Command(BaseCommand):
    """Management command entry point running the plWordnet import."""

    args = 'none'
    help = ''

    def handle(self, **options):
        import_plWordnet()


def _store_in_batches(model, objects, label, batch_size=BULK):
    """Store ``objects`` with ``model.objects.bulk_create`` in batches.

    Prints a "Storing <label>..." header, a running counter after each
    batch (a multiple of ``batch_size``, so the last value may exceed
    ``len(objects)``) and a closing "...DONE" line.
    """
    print("Storing " + label + "...")
    for start in range(0, len(objects), batch_size):
        model.objects.bulk_create(objects[start:start + batch_size])
        print(str(start + batch_size) + "...")
    print("...DONE")


def import_plWordnet():
    """Parse data/semantics/plwordnet_2_1.xml and store its content.

    Synsets are stored first, then lexical units, then the hypernymy and
    synonymy relations collected by the handler.
    """
    f = os.path.join(PROJECT_PATH, 'data', 'semantics', 'plwordnet_2_1.xml')
    content_handler = PlWNHandler()
    parser = make_parser()
    parser.setContentHandler(content_handler)

    print("Parsing Wordnet...")
    parser.parse(f)
    print("...DONE")
    print("")
    _store_in_batches(Synset, content_handler._synsets_to_base, "synsets")
    print("")
    _store_in_batches(LexicalUnit, content_handler._lexical_units_to_base,
                      "lexical units")
    print("")
    _store_in_batches(Hypernymy, content_handler._hypernymy_to_base,
                      "hypernymy")
    print("")
    # The synonymy list is used here; the previous code passed the
    # hypernymy list to Synonymy.objects.bulk_create by mistake.
    _store_in_batches(Synonymy, content_handler._synonymy_to_base,
                      "synonymy")