# WalentyPreprocessXML.py 2.66 KB
#! /usr/bin/python
# -*- coding: utf-8 -*-

from xml.sax import handler
from PreprocessEntry import Entry


class XMLNode:
    """In-memory node describing one XML element of a parsed subtree.

    A node stores the element name, its attributes, the child nodes in the
    order they were added, a reference to its parent node (``None`` for a
    root) and the text content assigned through ``setContent``.
    """

    def __init__(self, name, attrs, parent):
        """Create a node without children and with empty content.

        name   -- element name
        attrs  -- mapping-like object exposing ``items()`` with the
                  element attributes
        parent -- parent ``XMLNode`` or ``None`` for a root node

        The constructor does not register the new node with ``parent``;
        the caller is expected to call ``parent.addChild(node)`` itself.
        """
        self._name = name
        self._attrs = attrs
        self._children = []
        self._parent = parent
        self._content = ""

    def addChild(self, child):
        """Append ``child`` to the end of this node's list of children."""
        self._children.append(child)

    def setContent(self, content):
        """Replace the text content of this node with ``content``."""
        self._content = content

    def __str__(self):
        """Return ``name[attribute pairs](child;child;...)``.

        The content is not part of the representation.
        """
        # Materialise the pairs as a list: str() of a bare zip()/items()
        # object on Python 3 prints the object's repr rather than the
        # attribute pairs themselves.
        pairs = list(self._attrs.items())
        children = ';'.join([str(child) for child in self._children])
        return self._name + '[' + str(pairs) + '](' + children + ')'


class WalentyPreprocessTeiHandler(handler.ContentHandler):
    """SAX handler that builds a subtree per ``entry`` element and indexes it.

    From the first ``entry`` start tag onwards every element is turned into
    an ``XMLNode``. When an ``entry`` element ends, its subtree is passed to
    ``Entry`` and, if the resulting entry has semantics, its meanings and
    frames are stored in the public dictionaries:

    entry_meanings -- ``(base, pos)`` -> list of meaning ids
    meanings       -- meaning id -> ``((base, pos), meaning)``
    frames         -- frame id -> frame (only frames whose ``_base`` is set)
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        # Root node of the subtree currently (or most recently) built.
        self._subtree = None
        # Innermost open node, or None when no node is open.
        self._current = None
        # Becomes True at the first 'entry' start tag; it is never reset,
        # so elements following an entry are built as nodes as well.
        self._constructing = False
        # Text accumulated by characters() since the last reset.
        self._content = ""
        self.entry_meanings = {}
        self.meanings = {}
        self.frames = {}

    def startElement(self, name, attrs):
        """Open a node for ``name`` once subtree construction has started."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if not self._constructing:
            return
        node = XMLNode(name, attrs, self._current)
        if self._current is None:
            # No open node: this element becomes the root of a new subtree.
            self._subtree = node
        else:
            self._current.addChild(node)
        self._current = node

    def endElement(self, name):
        """Close the current node; process the subtree when an entry ends.

        Raises ``TEIStructureError`` if an ``entry`` element closes while
        another node is still open above it.
        """
        if self._current is None:
            # Element outside of any node being built (e.g. header parts).
            if name in ('title', 'publisher', 'licence'):
                # Discard the text collected for these elements. The
                # 'licence' case used to assign to ``self.content`` by
                # mistake, leaving the buffer untouched.
                self._content = ''
            elif name == 'p':
                self._content += '\n% '
            return

        self._current.setContent(self._content)
        self._current = self._current._parent
        if name == 'entry':
            if self._current is not None:
                # NOTE(review): TEIStructureError is neither defined nor
                # imported in the visible part of this file -- confirm it
                # is available, otherwise this line raises NameError.
                raise TEIStructureError()
            entry = Entry(self._subtree)
            if entry._semantics is not None:
                self.extend(entry._base, entry._pos, entry._meanings,
                            entry._semantics._frames)
        self._content = ''

    def characters(self, content):
        """Append the stripped text chunk to the content buffer.

        Each chunk is stripped on its own, so leading and trailing
        whitespace of every chunk delivered by the parser is dropped.
        """
        self._content += content.strip()

    def endDocument(self):
        """Nothing to finalise at the end of the document."""
        pass

    def extend(self, base, pos, meanings, frames):
        """Record the meanings and frames of one entry.

        base, pos -- together form the key identifying the entry
        meanings  -- object whose ``_meanings`` maps meaning id -> meaning
        frames    -- iterable of frames exposing ``_base`` and ``_id``

        Frames whose ``_base`` is ``None`` are skipped.
        """
        key = (base, pos)
        self.entry_meanings[key] = list(meanings._meanings)
        for meaning_id in meanings._meanings:
            self.meanings[meaning_id] = (key, meanings._meanings[meaning_id])
        for frame in frames:
            if frame._base is not None:
                self.frames[frame._id] = frame