tei.py
4.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
'TEI parser'
import os
from collections import namedtuple
from lxml import etree
from speakers import Speakers
# One spoken turn: ``who`` is the speaker reference, ``u`` the utterance text.
Utterance = namedtuple('Utterance', ['who', 'u'])
class Tei:
    """Single TEI document, serialised as a text file plus a header file.

    Attributes:
        speakers: ``Speakers`` instance used to resolve speaker ids and roles.
        data: iterable of utterance records exposing ``who`` and ``u``.
        metadata: mapping of document-level fields; ``save`` reads
            ``file_id`` and ``_save_header`` reads ``title``, ``house``,
            ``system``, ``termNo``, ``type``, ``sessionNo``, ``dayNo``,
            ``original_file`` and ``date``.
    """

    # Prefix -> namespace URI; used to expand ``prefix:name`` tags/attributes.
    nsmap = {'nkjp': 'http://www.nkjp.pl/ns/1.0',
             'tei': 'http://www.tei-c.org/ns/1.0',
             'xmlns': 'http://www.tei-c.org/ns/1.0',
             'xml': 'http://www.w3.org/XML/1998/namespace',
             'xi': 'http://www.w3.org/2001/XInclude'}

    def __init__(self, data, metadata):
        """Store the utterances and metadata and create an empty speaker set."""
        self.speakers = Speakers()
        self.data = data
        self.metadata = metadata

    def save(self, path):
        """Write the document into the directory ``<path>/<file_id>``.

        Creates the target directory (including missing parents) when needed
        and writes ``text_structure.xml`` and ``header.xml`` into it.

        Args:
            path: base output directory.
        """
        target = os.path.join(path, self.metadata['file_id'])
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(target, exist_ok=True)
        # Keep this order: the body is written first and resolves speakers
        # through ``self.speakers``; the header then lists ``self.speakers``.
        # NOTE(review): presumably ``find_id`` registers speakers - confirm.
        self._save_text(os.path.join(target, 'text_structure.xml'))
        self._save_header(os.path.join(target, 'header.xml'))

    def _save_header(self, filename):
        """Build the ``teiHeader`` tree from the metadata and write it.

        The header holds a ``fileDesc`` (title, publication statement and
        bibliographic source notes) and a ``profileDesc`` with one ``person``
        entry per item yielded by ``self.speakers``.

        Args:
            filename: path of the XML file to write.
        """
        root = etree.Element("teiHeader", attrib=None, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})
        root.attrib[etree.QName(self.nsmap['xml'], 'id')
                    ] = 'PPC-' + self.metadata['file_id']

        file_desc = self._add(root, 'fileDesc')
        title_stmt = self._add(file_desc, 'titleStmt')
        self._add(title_stmt, 'title', {}, self.metadata['title'])
        public_stmt = self._add(file_desc, 'publicationStmt')
        self._add(public_stmt, 'p', {},
                  'Prosimy o zapoznanie się z nagłówkiem korpusu (PPC_header.xml).')

        source_desc = self._add(file_desc, 'sourceDesc')
        bibl = self._add(source_desc, 'bibl')
        self._add(bibl, 'title', {}, self.metadata['title'])
        self._add(bibl, 'publisher', {},
                  f'Kancelaria {self.metadata["house"]}u Rzeczypospolitej Polskiej')
        # Each of these metadata fields becomes <note type="..."> under <bibl>.
        for tag in ['system', 'house', 'termNo', 'type', 'sessionNo', 'dayNo', 'original_file']:
            self._add(bibl, 'note', {'type': tag}, self.metadata[tag])
        self._add(bibl, 'date', {}, self.metadata['date'])

        profile_desc = self._add(root, 'profileDesc')
        partic_desc = self._add(profile_desc, 'particDesc')
        for (speaker_id, speaker) in self.speakers:
            person = self._add(partic_desc, 'person', {
                'role': self.speakers.role(speaker), 'xml:id': speaker_id})
            self._add(person, 'persName', {}, speaker)

        self._save(filename, root)

    def _save_text(self, filename):
        """Build the ``teiCorpus`` tree holding the utterances and write it.

        The tree includes ``PPC_header.xml`` at corpus level and
        ``header.xml`` inside the ``TEI`` element via ``xi:include``.

        Args:
            filename: path of the XML file to write.
        """
        root = etree.Element("teiCorpus", attrib=None, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})
        self._add(root, 'xi:include', {'href': 'PPC_header.xml'})
        tei = self._add(root, 'TEI')
        self._add(tei, 'xi:include', {'href': 'header.xml'})
        text = self._add(tei, 'text')
        body = self._add(text, 'body')
        self._body(body, self.data)
        self._save(filename, root)

    def _body(self, parent, utterances):
        """Append the utterances to ``parent``, grouped into ``div`` elements.

        A new ``div`` is started whenever the resolved speaker id differs
        from the previous one; each utterance becomes a ``u`` child of the
        current ``div``.

        Args:
            parent: element receiving the ``div`` children.
            utterances: iterable of records exposing ``who`` and ``u``.
        """
        section = 0
        index = 0
        previous = None
        div = None
        for utter in utterances:
            who = self.speakers.find_id(utter.who)
            if not previous or previous != who:
                # NOTE(review): the div id uses the counter before the
                # increment while the u ids use it after, so ``div-0`` holds
                # ``u-1.x``. Kept as-is because consumers may rely on the
                # emitted ids - confirm whether this is intended.
                div = self._add(parent, 'div', {'xml:id': f'div-{section}'})
                previous = who
                section += 1
                index = 1
            # _add already attaches the new element to ``div``; no extra
            # append is needed.
            item = self._add(
                div, 'u', {'who': f'#{who}', 'xml:id': f'u-{section}.{index}'})
            item.text = utter.u
            index += 1

    def _save(self, filename, xml):
        """Serialise ``xml`` as pretty-printed UTF-8 with an XML declaration.

        Args:
            filename: path of the file to write (opened in binary mode).
            xml: root element to serialise.
        """
        with open(filename, 'wb') as out:
            out.write(etree.tostring(
                xml, pretty_print=True, encoding='utf-8', xml_declaration=True))

    def _add(self, parent, tag, attributes=None, text=''):
        """Create a child element under ``parent`` and return it.

        Args:
            parent: element that receives the new child.
            tag: element name; ``prefix:name`` is expanded through ``nsmap``.
            attributes: optional mapping of attribute names (plain or
                ``prefix:name``) to values.
            text: element text; left unset when empty.

        Returns:
            The newly created element.

        Raises:
            KeyError: if a prefix is not present in ``nsmap``.
        """
        attributes = attributes or {}
        element = etree.SubElement(
            parent, self._qualify(tag), attrib=None, nsmap=None)
        for name, value in attributes.items():
            element.attrib[self._qualify(name)] = value
        if text:
            element.text = text
        return element

    def _qualify(self, name):
        """Expand ``prefix:local`` into a namespaced ``QName`` via ``nsmap``.

        Names without a colon are returned unchanged.
        """
        if ':' in name:
            prefix, local = name.split(':', 1)
            return etree.QName(self.nsmap[prefix], local)
        return name