# tei.py 4.69 KB
'TEI writer: serializes speakers, metadata and utterances to TEI XML files'

import os
from lxml import etree


class Tei:
    """Single TEI document, written to disk as a header and a text file.

    The instance holds three collaborators supplied by the caller:

    * ``speakers`` -- iterable yielding ``(speaker_id, speaker)`` pairs that
      also offers ``role(speaker)`` and ``find_id(who)``.
    * ``metadata`` -- mapping read with the keys ``file_id``, ``title``,
      ``termNo``, ``type``, ``session``, ``day`` and ``date``.
    * ``data`` -- iterable of sections; each section is an iterable of
      utterance objects exposing ``who`` and ``u`` attributes.
    """

    # Prefix -> namespace URI table used to resolve ``prefix:name`` strings
    # passed to ``_add``.
    nsmap = {'nkjp': 'http://www.nkjp.pl/ns/1.0',
             'tei': 'http://www.tei-c.org/ns/1.0',
             'xmlns': 'http://www.tei-c.org/ns/1.0',
             'xml': 'http://www.w3.org/XML/1998/namespace',
             'xi': 'http://www.w3.org/2001/XInclude'}

    def __init__(self, speakers, metadata, data):
        """Store the speakers registry, the metadata mapping and the sections."""
        self.speakers = speakers
        self.metadata = metadata
        self.data = data

    def save(self, path) -> None:
        """Write the document into the directory ``<path>/<file_id>``.

        The directory (including any missing parents) is created when it does
        not exist yet; an already existing directory is reused. Two files are
        written into it: ``text_structure.xml`` and ``header.xml``.

        Args:
            path: Base output directory.

        Raises:
            KeyError: If ``metadata`` lacks a key required for the output.
            OSError: If the directory or the files cannot be created.
        """
        target = os.path.join(path, str(self.metadata['file_id']))
        # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it also
        # creates missing parents and does not fail if the directory appears
        # between the check and the creation.
        os.makedirs(target, exist_ok=True)
        self._save_text(os.path.join(target, 'text_structure.xml'))
        self._save_header(os.path.join(target, 'header.xml'))

    def _save_header(self, filename) -> None:
        """Build the ``teiHeader`` tree from the metadata and write it out."""
        meta = self.metadata
        root = self._root('teiHeader')
        root.set(etree.QName(self.nsmap['xml'], 'id'),
                 'PPC-' + meta['file_id'])

        file_desc = self._add(root, 'fileDesc')
        title_stmt = self._add(file_desc, 'titleStmt')
        self._add(title_stmt, 'title', {}, meta['title'])
        public_stmt = self._add(file_desc, 'publicationStmt')
        self._add(public_stmt, 'p', {},
                  'Prosimy o zapoznanie się z nagłówkiem korpusu (PPC_header.xml).')

        source_desc = self._add(file_desc, 'sourceDesc')
        bibl = self._add(source_desc, 'bibl')
        self._add(bibl, 'title', {}, meta['title'])
        self._add(bibl, 'publisher', {},
                  'Kancelaria Sejmu Rzeczypospolitej Polskiej')
        # <note type="..."> entries, emitted in this fixed order.
        notes = (
            ('system', 'III RP'),
            ('house', 'Sejm'),
            ('termNo', meta['termNo']),
            ('type', meta['type']),
            ('sessionNo', meta['session']),
            ('dayNo', meta['day']),
        )
        for note_type, value in notes:
            self._add(bibl, 'note', {'type': note_type}, value)
        self._add(bibl, 'date', {}, meta['date'])

        profile_desc = self._add(root, 'profileDesc')
        partic_desc = self._add(profile_desc, 'particDesc')
        for speaker_id, speaker in self.speakers:
            person = self._add(partic_desc, 'person', {
                'role': self.speakers.role(speaker),
                'xml:id': speaker_id,
            })
            self._add(person, 'persName', {}, speaker)
        self._save(filename, root)

    def _save_text(self, filename) -> None:
        """Build the ``teiCorpus`` tree holding all utterances and write it out.

        The tree includes ``PPC_header.xml`` at corpus level and ``header.xml``
        at document level through ``xi:include`` elements.
        """
        root = self._root('teiCorpus')
        self._add(root, 'xi:include', {'href': 'PPC_header.xml'})
        tei = self._add(root, 'TEI')
        self._add(tei, 'xi:include', {'href': 'header.xml'})
        text = self._add(tei, 'text')
        body = self._add(text, 'body')
        self._body(body, self.data)
        self._save(filename, root)

    def _body(self, parent, sections) -> None:
        """Append one ``div`` per section and one ``u`` per utterance.

        Divs get the id ``div-<section>`` with sections numbered from 1;
        utterances get the id ``u-<section>.<index>`` with the index starting
        at 0 inside every section. The ``who`` attribute is ``#`` followed by
        the id returned by ``speakers.find_id`` for the utterance's speaker.
        """
        for section, utterances in enumerate(sections, start=1):
            div = self._add(parent, 'div', {'xml:id': f'div-{section}'})
            # NOTE: tagging the div with type="undelivered" when the first
            # utterance's speaker label contains 'niewygłoszony' is
            # currently disabled.
            for index, utter in enumerate(utterances):
                who = self.speakers.find_id(utter.who)
                item = self._add(div, 'u', {
                    'who': f'#{who}',
                    'xml:id': f'u-{section}.{index}',
                })
                # Assigned directly (not through _add) so that the text is set
                # even when it is empty. _add already attached the element to
                # the div, so no further append is needed.
                item.text = utter.u

    def _save(self, filename, xml) -> None:
        """Serialize ``xml`` as pretty-printed UTF-8 with an XML declaration."""
        payload = etree.tostring(
            xml, pretty_print=True, encoding='utf-8', xml_declaration=True)
        with open(filename, 'wb') as out:
            out.write(payload)

    def _root(self, tag):
        """Create a root element declaring the XInclude and default TEI namespaces."""
        return etree.Element(tag, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})

    def _qname(self, name):
        """Resolve ``prefix:local`` through ``nsmap``; return other names as is.

        Raises:
            KeyError: If the prefix is not present in ``nsmap``.
        """
        if ':' not in name:
            return name
        prefix, local = name.split(':', 1)
        return etree.QName(self.nsmap[prefix], local)

    def _add(self, parent, tag, attributes=None, text=''):
        """Append a child element to ``parent`` and return it.

        Args:
            parent: Element that receives the new child.
            tag: Element name, optionally written as ``prefix:name`` with a
                prefix known to ``nsmap``.
            attributes: Optional mapping of attribute names (same
                ``prefix:name`` convention) to values.
            text: Text content; left unset when falsy.

        Returns:
            The newly created child element.
        """
        element = etree.SubElement(parent, self._qname(tag))
        for name, value in (attributes or {}).items():
            element.set(self._qname(name), value)
        if text:
            element.text = text
        return element