# tei.py
'TEI parser'

import os
from collections import namedtuple
from lxml import etree
from speakers import Speakers

# One spoken turn: `who` is the speaker reference, `u` is the utterance text.
Utterance = namedtuple('Utterance', ['who', 'u'])


class Tei:
    """Single TEI document written as ``text_structure.xml`` plus ``header.xml``.

    ``data`` is iterated in ``_body``; every item must expose ``who`` and
    ``u`` attributes (e.g. an ``Utterance``).  ``metadata`` is a mapping that
    is read for the keys ``file_id``, ``title``, ``house``, ``system``,
    ``termNo``, ``type``, ``sessionNo``, ``dayNo``, ``original_file`` and
    ``date``.
    """
    # Prefix -> namespace URI table used to resolve "prefix:name" strings.
    nsmap = {'nkjp': 'http://www.nkjp.pl/ns/1.0',
             'tei': 'http://www.tei-c.org/ns/1.0',
             'xmlns': 'http://www.tei-c.org/ns/1.0',
             'xml': 'http://www.w3.org/XML/1998/namespace',
             'xi': 'http://www.w3.org/2001/XInclude'}

    def __init__(self, data, metadata):
        """Store the utterances and metadata and create an empty speaker registry."""
        self.speakers = Speakers()
        self.data = data
        self.metadata = metadata

    def save(self, path):
        """Write the document into ``<path>/<file_id>/``.

        The target directory (including missing parents) is created when it
        does not exist yet; an already existing directory is reused.
        """
        path = f'{path}/{self.metadata["file_id"]}'
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # isdir()+mkdir() and also copes with missing parent directories.
        os.makedirs(path, exist_ok=True)
        self._save_text(f'{path}/text_structure.xml')
        self._save_header(f'{path}/header.xml')

    def _save_header(self, filename):
        """Build the ``teiHeader`` tree from metadata and speakers and write it."""
        root = etree.Element("teiHeader", attrib=None, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})
        root.set(self._qname('xml:id'), 'PPC-' + self.metadata['file_id'])

        # fileDesc: title, publication statement and bibliographic source.
        file_desc = self._add(root, 'fileDesc')
        title_stmt = self._add(file_desc, 'titleStmt')
        self._add(title_stmt, 'title', {}, self.metadata['title'])
        public_stmt = self._add(file_desc, 'publicationStmt')
        self._add(public_stmt, 'p', {},
                  'Prosimy o zapoznanie się z nagłówkiem korpusu (PPC_header.xml).')
        source_desc = self._add(file_desc, 'sourceDesc')
        bibl = self._add(source_desc, 'bibl')
        self._add(bibl, 'title', {}, self.metadata['title'])
        self._add(bibl, 'publisher', {},
                  f'Kancelaria {self.metadata["house"]}u Rzeczypospolitej Polskiej')
        # Each of these metadata entries becomes <note type="KEY">VALUE</note>.
        for tag in ['system', 'house', 'termNo', 'type', 'sessionNo', 'dayNo', 'original_file']:
            self._add(bibl, 'note', {'type': tag}, self.metadata[tag])
        self._add(bibl, 'date', {}, self.metadata['date'])

        # profileDesc: one <person> with a <persName> per registered speaker.
        profile_desc = self._add(root, 'profileDesc')
        partic_desc = self._add(profile_desc, 'particDesc')
        for (speaker_id, speaker) in self.speakers:
            person = self._add(partic_desc, 'person', {
                               'role': self.speakers.role(speaker), 'xml:id': speaker_id})
            self._add(person, 'persName', {}, speaker)
        self._save(filename, root)

    def _save_text(self, filename):
        """Build the ``teiCorpus`` tree holding all utterances and write it."""
        root = etree.Element("teiCorpus", attrib=None, nsmap={
                             'xi': self.nsmap['xi'], None: self.nsmap['tei']})
        self._add(root, 'xi:include', {'href': 'PPC_header.xml'})
        tei = self._add(root, 'TEI')
        self._add(tei, 'xi:include', {'href': 'header.xml'})
        text = self._add(tei, 'text')
        body = self._add(text, 'body')
        self._body(body, self.data)
        self._save(filename, root)

    def _body(self, parent, utterances):
        """Append utterances to ``parent``, grouped into ``div`` elements.

        A new ``div`` is opened for the first utterance, whenever the
        resolved speaker id differs from the previous one, and whenever the
        previous speaker id is falsy.  Utterance ids restart at 1 in every
        ``div``.
        """
        section = 0
        previous = None
        div = None
        index = 1
        for utter in utterances:
            who = self.speakers.find_id(utter.who)
            if not previous or previous != who:
                div = self._add(parent, 'div', {'xml:id': f'div-{section}'})
                previous = who
                # NOTE(review): section is bumped before the utterance ids are
                # built, so "div-0" contains "u-1.*".  Kept as-is because
                # existing output may depend on these ids -- confirm intent.
                section += 1
                index = 1
            # _add already attaches the new element to div; no extra append.
            item = self._add(
                div, 'u', {'who': f'#{who}', 'xml:id': f'u-{section}.{index}'})
            # Assigned directly (not via _add's text argument) so that the
            # value is set even when it is empty.
            item.text = utter.u
            index += 1

    def _save(self, filename, xml):
        """Serialize ``xml`` as pretty-printed UTF-8 with an XML declaration."""
        with open(filename, 'wb') as out:
            out.write(etree.tostring(
                xml, pretty_print=True, encoding='utf-8', xml_declaration=True))

    def _qname(self, name):
        """Resolve ``prefix:local`` through ``nsmap``; return other names unchanged.

        Raises ``KeyError`` when the prefix is not present in ``nsmap``.
        """
        prefix, colon, local = name.partition(':')
        if not colon:
            return name
        return etree.QName(self.nsmap[prefix], local)

    def _add(self, parent, tag, attributes=None, text=''):
        """Create a child of ``parent`` and return it.

        ``tag`` and the keys of ``attributes`` may carry a ``prefix:`` that is
        resolved through ``nsmap``.  ``text`` becomes the element text only
        when it is non-empty.
        """
        element = etree.SubElement(parent, self._qname(tag),
                                   attrib=None, nsmap=None)
        for name, value in (attributes or {}).items():
            element.set(self._qname(name), value)
        if text:
            element.text = text
        return element