tei_writer.py
1.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2016 Bartłomiej Nitoń
# This is a TEI writer for the Thrift client.
#
import gzip
import os
from lxml import etree
# Annotation types that write_as_tei_dir stores as plain ``<type>.xml`` files;
# every other type is written gzip-compressed as ``ann_<type>.xml.gz``.
NOT_ARCHIVED_ANNS = ['text']
def write_as_tei_dir(result, output):
    """Write each TEI annotation found in ``result`` to its own file.

    ``result`` is parsed as XML and every ``TEI`` element (TEI namespace)
    is collected. The root element is then emptied and reused as a wrapper:
    one annotation at a time is attached to it, the wrapper is serialized,
    and the annotation is detached again, so each output file holds the
    root element with exactly one ``TEI`` child.

    The file name comes from the ``type`` attribute of the annotation's
    ``teiHeader``: types listed in ``NOT_ARCHIVED_ANNS`` go to
    ``<type>.xml``, all others to a gzip-compressed ``ann_<type>.xml.gz``.

    Args:
        result: XML document, in any form accepted by ``etree.fromstring``.
        output: Directory the files are written into. It is not created
            here.

    Raises:
        AttributeError: If an annotation has no ``teiHeader`` child
            (``find`` returns ``None``).
        KeyError: If a ``teiHeader`` has no ``type`` attribute.
    """
    namespaces = {'ns': 'http://www.tei-c.org/ns/1.0'}

    corpus = etree.fromstring(result)
    annotations = corpus.xpath('//ns:TEI', namespaces=namespaces)
    clean(corpus)

    for ann in annotations:
        header = ann.find('ns:teiHeader', namespaces=namespaces)
        ann_type = header.attrib['type']

        # Temporarily attach this single annotation to the empty wrapper.
        corpus.append(ann)
        serialized = etree.tostring(corpus, pretty_print=True,
                                    xml_declaration=True, encoding='UTF-8')

        if ann_type in NOT_ARCHIVED_ANNS:
            ann_path = os.path.join(output, u'%s.xml' % ann_type)
            opener = open
        else:
            ann_path = os.path.join(output, u'ann_%s.xml.gz' % ann_type)
            opener = gzip.open

        # ``tostring`` with an explicit encoding yields bytes, so both kinds
        # of file must be opened in binary mode.
        with opener(ann_path, 'wb') as output_file:
            output_file.write(serialized)

        # Detach again so the next file contains only the next annotation.
        corpus.remove(ann)
def clean(elem):
    """Remove every direct child of ``elem`` in place.

    Only ``elem.remove`` is called, so the element itself and its
    attributes are kept.

    Args:
        elem: Element whose children are to be detached.
    """
    # Iterate over a snapshot: removing children while iterating the live
    # element can skip siblings on list-backed element implementations.
    for child in list(elem):
        elem.remove(child)