tei_writer.py 1.27 KB

Edit Raw Blame History Permalink

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2016 Bartłomiej Nitoń
# This is a TEI writer for the thrift client.
#

import gzip
import os

from lxml import etree

# Annotation types that write_as_tei_dir stores as plain, uncompressed XML
# ("<type>.xml"). Every other annotation type is written gzip-compressed as
# "ann_<type>.xml.gz".
NOT_ARCHIVED_ANNS = ['text']

def write_as_tei_dir(result, output):
    """Split a TEI corpus document into one file per annotation layer.

    The XML in ``result`` is parsed, every ``TEI`` element (TEI namespace)
    is collected, and the root element is emptied.  Each ``TEI`` element is
    then re-attached to the otherwise empty root on its own, serialized, and
    written to ``output``:

    * ``<type>.xml`` (plain XML) when the type is in ``NOT_ARCHIVED_ANNS``;
    * ``ann_<type>.xml.gz`` (gzip-compressed) otherwise.

    The type is read from the ``type`` attribute of the element's
    ``teiHeader`` child.

    :param result: XML document (as accepted by ``etree.fromstring``).
    :param output: path of the target directory; it is not created here.
    :raises AttributeError: if a ``TEI`` element has no ``teiHeader`` child.
    :raises KeyError: if a ``teiHeader`` has no ``type`` attribute.
    """
    namespaces = {'ns': 'http://www.tei-c.org/ns/1.0'}

    corpus = etree.fromstring(result)
    annotations = corpus.xpath('//ns:TEI', namespaces=namespaces)
    # Empty the root so it can wrap exactly one annotation layer at a time.
    clean(corpus)

    for ann in annotations:
        header = ann.find('ns:teiHeader', namespaces=namespaces)
        ann_type = header.attrib['type']

        # Temporarily attach this layer, serialize root + layer, detach again.
        corpus.append(ann)
        xml_bytes = etree.tostring(corpus, pretty_print=True,
                                   xml_declaration=True, encoding='UTF-8')
        corpus.remove(ann)

        if ann_type in NOT_ARCHIVED_ANNS:
            ann_path = os.path.join(output, u'%s.xml' % ann_type)
            opener = open
        else:
            ann_path = os.path.join(output, u'ann_%s.xml.gz' % ann_type)
            opener = gzip.open

        # tostring() with an explicit encoding yields bytes, so both targets
        # must be opened in binary mode.
        with opener(ann_path, 'wb') as output_file:
            output_file.write(xml_bytes)


def clean(elem):
    """Remove every direct child of ``elem`` in place.

    Only child elements are detached; the element's own attributes and
    leading text are left untouched.

    :param elem: an ElementTree-style element supporting iteration over its
        children and ``remove(child)``.
    """
    # Iterate over a snapshot: removing from the element while iterating the
    # element itself can skip children on list-backed implementations.
    for child in list(elem):
        elem.remove(child)