Commit 91fb35c632c018a228d91c3518eac0a818d6a78b

Authored by Tomasz Bartosiak
1 parent 02015f19

Skrypt do importu Słowosieci

meanings/management/__init__.py 0 → 100644
meanings/management/commands/__init__.py 0 → 100644
meanings/management/commands/import_plWordnet.py 0 → 100644
  1 +#! /usr/bin/python
  2 +# -*- coding: utf-8 -*-
  3 +
  4 +from django.core.management.base import BaseCommand
  5 +
  6 +import sys, os, codecs
  7 +from xml.sax import saxutils, handler, make_parser
  8 +from meanings.models import Synset, LexicalUnit
  9 +from shellvalier.settings import BASE_DIR
  10 +
# Number of objects passed to each bulk_create() call in import_plWordnet().
BULK = 250
  12 +
class PlWNHandler(handler.ContentHandler):
    """SAX content handler that collects plWordnet synsets and lexical units.

    The handler does not write anything to the database itself. It builds
    unsaved ``Synset`` and ``LexicalUnit`` instances and accumulates them in
    ``_synsets_to_base`` and ``_lexical_units_to_base`` so that the caller can
    store them after parsing has finished.
    """

    def __init__(self, out = sys.stdout):
        """Initialise empty collection state.

        ``out`` is only stored on the instance (``_out``); nothing in this
        class writes to it.
        """
        handler.ContentHandler.__init__(self)
        self._out = out
        # luid -> (base, sense, pos, desc) for every accepted <lexical-unit>.
        self._lexical_units = {}
        self._reflexive_lexical_units = {}
        self._mutual_lexical_units = {}
        # synset id -> unsaved Synset instance.
        self._synsets = {}
        self._root_synsets_ids = set()
        self._root_synsets = []
        # Id of the <synset> currently being parsed, -1 when outside one.
        self._defined_synset = -1
        # True while the parser is inside a <unit-id> element.
        self._unit = False
        # Text collected for the current <unit-id>.
        self._content = ''
        self._other_synset_relations = set()
        self._lexical_relations = set()
        # Unsaved model instances, in document order, ready for bulk storage.
        self._synsets_to_base = []
        self._lexical_units_to_base = []
        self._hypernymy_to_base = []
        self._synonymy_to_base = []

    def startElement(self, name, attrs):
        """Record lexical units, open synsets and mark the start of a unit id."""
        if name == 'lexical-unit':
            # Units whose part of speech mentions 'pwn' are skipped
            # (presumably Princeton WordNet entries -- TODO confirm).
            if 'pwn' not in attrs['pos']:
                luid = int(attrs['id'])
                lubase = attrs['name']
                lusense = int(attrs['variant'])
                pos = attrs['pos']
                desc = attrs['desc']
                self._lexical_units[luid] = (lubase, lusense, pos, desc)
        elif name == 'synset':
            sid = int(attrs['id'])
            self._defined_synset = sid
            s = Synset(id=sid)
            self._synsets_to_base.append(s)
            self._synsets[sid] = s
        elif name == 'unit-id':
            self._unit = True

    def endElement(self, name):
        """Close a synset, or turn a finished unit id into a LexicalUnit."""
        if name == 'synset':
            self._defined_synset = -1
        elif name == 'unit-id':
            text = self._content.strip()
            # characters() collects nothing outside a <synset> or for an
            # empty element; skip such unit ids instead of failing in int('').
            if text:
                luid = int(text)
                # Unit ids that were never registered (e.g. skipped 'pwn'
                # units) are ignored.
                if luid in self._lexical_units:
                    s = self._synsets[self._defined_synset]
                    lubase, lusense, pos, desc = self._lexical_units[luid]
                    lu = LexicalUnit(luid=luid, base=lubase, sense=lusense,
                                     pos=pos, synset=s, glossa='',
                                     definition=desc)
                    self._lexical_units_to_base.append(lu)
            self._unit = False
            self._content = ''

    def characters(self, content):
        """Accumulate the text of a <unit-id> that lies inside a <synset>."""
        # SAX may deliver the text in several chunks, hence the concatenation.
        if self._unit and self._defined_synset >= 0 and content.strip():
            self._content += content

    def endDocument(self):
        """Nothing to finalise; the collected lists are read by the caller."""
        pass
  72 +
  73 +#==========================================================#
class Command(BaseCommand):
    """Management command that runs the plWordnet import."""
    args = 'none'
    help = 'Imports plWordnet synsets and lexical units from data/plwordnet/plwordnet_2_1.xml.'

    def handle(self, *args, **options):
        """Run the import; positional arguments and options are ignored."""
        # *args mirrors BaseCommand.handle so that positional arguments
        # forwarded by Django do not raise a TypeError.
        import_plWordnet()
  80 +
def _store_in_batches(model, objects):
    """Bulk-create ``objects`` via ``model.objects`` in chunks of ``BULK``.

    After each chunk the running number of stored objects is printed.
    """
    for start in range(0, len(objects), BULK):
        # Slicing clamps at the end of the list, so the last chunk may be short.
        batch = objects[start:start + BULK]
        model.objects.bulk_create(batch)
        # Report what was actually stored, not just a multiple of BULK.
        print(str(start + len(batch)) + "...")


def import_plWordnet():
    """Parse the plWordnet XML file and store its synsets and lexical units.

    The file is read from ``BASE_DIR/data/plwordnet/plwordnet_2_1.xml``.
    Synsets are stored before lexical units, which reference them.
    Progress is printed to standard output.
    """
    xml_file = os.path.join(BASE_DIR, 'data', 'plwordnet', 'plwordnet_2_1.xml')
    # os.path.join discards the preceding components when xml_file is
    # absolute, so the directory of this module only matters for a relative
    # BASE_DIR.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    parser = make_parser()
    content_handler = PlWNHandler()
    parser.setContentHandler(content_handler)
    print("Parsing Wordnet...")
    parser.parse(xml_path)
    print("...DONE")

    print("")

    print("Storing synsets...")
    _store_in_batches(Synset, content_handler._synsets_to_base)
    print("...DONE")

    print("")

    print("Storing lexical units...")
    _store_in_batches(LexicalUnit, content_handler._lexical_units_to_base)
    print("...DONE")
... ...