# WalentyPreprocessXML.py (3.56 KB)
#! /usr/bin/python
# -*- coding: utf-8 -*-

from xml.sax import handler
from importer.PreprocessEntry import Entry
from meanings.models import LexicalUnit


class XMLNode:
    """Lightweight in-memory node of an XML subtree.

    Keeps the element name, its attributes, a reference to the parent node,
    the child nodes in the order they were added and the text content.
    """

    def __init__(self, name, attrs, parent):
        """Create a childless node with empty content.

        name   -- element name
        attrs  -- mapping-like attribute collection (must offer ``items()``)
        parent -- enclosing XMLNode, or None for the root of the subtree
        """
        self._name = name
        self._attrs = attrs
        self._children = []
        self._parent = parent
        self._content = ""

    def addChild(self, child):
        """Append *child* to this node's list of children."""
        self._children.append(child)

    def setContent(self, content):
        """Replace the text content of this node with *content*."""
        self._content = content

    def __str__(self):
        """Return ``name[attribute pairs](child;child;...)`` for debugging."""
        # The pairs have to be materialised: str() of a lazy zip object only
        # yields "<zip object at 0x...>" on Python 3.
        att = list(self._attrs.items())
        return self._name + '[' + str(att) + '](' + ';'.join(str(child) for child in self._children) + ')'

class WalentyPreprocessTeiHandler(handler.ContentHandler):
    """SAX handler that collects meanings and frames from TEI ``entry`` elements.

    Each ``entry`` element is rebuilt as an ``XMLNode`` subtree and passed to
    ``Entry``. For entries that carry semantics, their meanings and frames
    are accumulated in ``entry_meanings``, ``meanings`` and ``frames``. When
    the document ends, the collected meanings are synchronised with the
    ``LexicalUnit`` table.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        self._subtree = None        # root XMLNode of the entry being built
        self._current = None        # innermost open XMLNode; None outside an entry
        self._constructing = False  # True while inside an 'entry' element
        self._content = ""          # character data gathered since the last reset
        self.entry_meanings = {}    # (base, pos) -> list of meaning ids
        self.meanings = {}          # meaning id -> ((base, pos), meaning)
        self.frames = {}            # frame id -> frame

    def startElement(self, name, attrs):
        """Open a node for *name* when inside (or starting) an ``entry``."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if self._constructing:
            node = XMLNode(name, attrs, self._current)
            if self._current is not None:
                self._current.addChild(node)
            else:
                self._subtree = node
            self._current = node

    def endElement(self, name):
        """Close the current node; process the entry when ``entry`` closes.

        Outside an entry, only the text buffer is maintained: it is cleared
        after ``title``, ``publisher`` and ``licence`` and gets a ``'\\n% '``
        separator appended after ``p``.
        """
        if self._current is not None:
            self._current.setContent(self._content.strip())
            self._current = self._current._parent
            if name == 'entry':
                if self._current is not None:
                    # NOTE(review): TEIStructureError is neither defined nor
                    # imported in the visible part of this file -- confirm
                    # where it is supposed to come from.
                    raise TEIStructureError()
                # The entry is complete: stop building nodes until the next
                # 'entry' starts, so elements between entries are ignored.
                self._constructing = False
                entry = Entry(self._subtree)
                if entry._semantics is not None:
                    self.extend(entry._base, entry._pos, entry._meanings, entry._semantics._frames)
            self._content = ''
        elif name in ('title', 'publisher', 'licence'):
            self._content = ''
        elif name == 'p':
            self._content += '\n% '

    def characters(self, content):
        """Append a chunk of character data to the text buffer."""
        self._content += content

    def endDocument(self):
        """Create missing lexical units and update changed glosses."""
        print("Storing new lexical units")
        for (_, pos), meaning in self.meanings.values():
            lus = LexicalUnit.objects.filter(base=meaning._name, sense=meaning._variant, pos=pos)
            if lus:
                lu = lus[0]
                # Sanity checks: the stored unit must be the same unit the
                # XML describes.
                assert(lu.luid == meaning._luid)
                assert(lu.synset.id == meaning._sid)
                if lu.gloss != meaning._gloss:
                    print('    updating gloss for :', lu, ' --- ', repr(lu.gloss), '->', repr(meaning._gloss))
                    lu.gloss = meaning._gloss
                    # Persist the new gloss; without the save the update
                    # would only be reported, never stored.
                    lu.save()
            else:
                print('    new lu: {}-{}'.format(meaning._name, meaning._variant))
                meaning.save(pos)
        print("Stored")

    def extend(self, base, pos, meanings, frames):
        """Record the meanings and frames of one entry.

        base, pos -- identify the entry; used together as the key
        meanings  -- object whose ``_meanings`` maps meaning id -> meaning
        frames    -- iterable of frames; those whose ``_base`` is None are skipped
        """
        self.entry_meanings[(base, pos)] = list(meanings._meanings)
        for meaning_id, meaning in meanings._meanings.items():
            self.meanings[meaning_id] = ((base, pos), meaning)
        for frame in frames:
            if frame._base is not None:
                self.frames[frame._id] = frame