# WalentyXML.py 3.67 KB
#! /usr/bin/python
# -*- coding: utf-8 -*-

import traceback

from xml.sax import handler
from importer.Entry import Entry

# Names of the two output files that WalentyTeiHandler opens for writing in its
# constructor and hands to every Entry it builds. Judging by the names they
# presumably collect ambiguous examples and examples that need re-attaching;
# what is written to them is decided outside this file.
examples_out_file = 'examples_ambig.txt'
misconnected_examples_out_file = 'examples_to_reattach.txt'


class XMLNode:
    """Lightweight tree node mirroring a single XML element.

    A node keeps the element name, the attributes object it was created with,
    a reference to its parent node (None for a root), its child nodes in the
    order they were added, and the element's text content.
    """

    def __init__(self, name, attrs, parent):
        """Create a childless node with empty text content."""
        self._name, self._attrs, self._parent = name, attrs, parent
        self._children = list()
        self._content = ""

    def addChild(self, child):
        """Append *child* at the end of this node's child list."""
        self._children += [child]

    def setContent(self, content):
        """Replace this node's text content with *content*."""
        self._content = content

    def __str__(self):
        """Render the subtree as ``name[attribute pairs](child;child;...)``."""
        attr_pairs = [pair for pair in zip(self._attrs.keys(), self._attrs.values())]
        rendered_children = ';'.join(map(str, self._children))
        return ''.join((self._name, '[', str(attr_pairs), '](', rendered_children, ')'))
        
    
class WalentyTeiHandler(handler.ContentHandler):
    """SAX content handler that builds one Entry per <entry> element.

    While the parser is inside an <entry> element, every element is mirrored
    as an XMLNode tree. When the <entry> closes, the finished tree is handed
    to Entry and stored. Failures raised while building or storing an entry
    are collected and printed at the end of the document; parsing is aborted
    once at least 100 entries were read and at least 10% of them failed.
    """

    # Label used in error reports when an entry's base form cannot be read.
    _UNKNOWN_BASE = '<unknown entry>'

    def __init__(self, entry_meanings, meanings, frames):
        """Set up parser state and open the two example output files.

        entry_meanings, meanings and frames are kept as-is and passed on to
        every Entry; their structure is defined by the caller and by Entry.
        """
        handler.ContentHandler.__init__(self)
        self._subtree = None        # root XMLNode of the entry being (or last) built
        self._current = None        # innermost open XMLNode, None outside an entry
        self._constructing = False  # True while inside an <entry> element
        self._content = ""          # text accumulated since the last reset
        self._entry_meanings = entry_meanings
        self._meanings = meanings
        self._frames = frames
        self._stored_positions = {}
        self._examples_in = None # @TODO: read disambiguated file
        # NOTE(review): both files are opened with the platform default
        # encoding; consider an explicit encoding if non-ASCII text is written.
        self._examples_out = open(examples_out_file, "w")
        self._misconnected_out = open(misconnected_examples_out_file, "w")
        self._errors = []           # one formatted message per failed entry
        self._counter = 0           # number of <entry> elements closed so far

    def startElement(self, name, attrs):
        """Open a new tree node for *name* if it is, or lies inside, an <entry>."""
        # Elements outside <entry> (e.g. header metadata) are not mirrored.
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if self._constructing:
            node = XMLNode(name, attrs, self._current)
            if self._current is not None:
                self._current.addChild(node)
            else:
                self._subtree = node
            self._current = node

    def endElement(self, name):
        """Close the current node; process the entry when an <entry> closes.

        Raises RuntimeError when too many entries have failed (see
        _finish_entry) and whatever _finish_entry raises for a structurally
        broken document.
        """
        if self._current is None:
            # Outside any entry nothing is stored; only the end of a paragraph
            # leaves a marker in the accumulated text.
            if name == 'p':
                self._content += '\n% '
            return
        self._current.setContent(self._content.strip())
        self._current = self._current._parent
        if name == 'entry':
            self._finish_entry()
        self._content = ''

    def _finish_entry(self):
        """Turn the completed entry subtree into an Entry and store it."""
        self._counter += 1
        # The entry is complete: stop mirroring elements until the next <entry>.
        self._constructing = False
        if self._current is not None:
            # An <entry> closed while an enclosing node is still open.
            # NOTE(review): TEIStructureError is not defined or imported in the
            # visible part of this module - confirm it is in scope.
            raise TEIStructureError()
        base = self._entry_base()
        try:
            entry = Entry(self._subtree, self._entry_meanings, self._meanings, self._frames, self._examples_in, self._examples_out, self._misconnected_out)
            entry.store(self._meanings, self._stored_positions)
        except Exception as e:
            # Best effort: report the failing entry and carry on with the next.
            traceback.print_exc()
            self._errors.append('{}: {} ({})'.format(base, type(e).__name__, str(e)))
            # Errors reach or exceed 10% of entries, but wait until some
            # entries are read - 1 out of 2 might not yet be a reason to panic.
            if self._counter >= 100 and len(self._errors) * 10 >= self._counter:
                # The parser will not call endDocument after an exception, so
                # close the files and print the report here.
                self.endDocument()
                raise RuntimeError('too many errors encountered, abandoning ship!')

    def _entry_base(self):
        """Return the text of the entry's first grandchild, used to label errors.

        Falls back to a placeholder when the entry lacks that structure, so a
        malformed entry is reported like any other failing entry instead of
        aborting the whole parse with an IndexError.
        """
        try:
            return self._subtree._children[0]._children[0]._content
        except IndexError:
            return self._UNKNOWN_BASE

    def characters(self, content):
        """Accumulate character data reported by the parser."""
        self._content += content

    def endDocument(self):
        """Close the output files and print all collected entry errors."""
        self._examples_out.close()
        self._misconnected_out.close()
        print('encountered errors:')
        for error in self._errors:
            print(error)