tei.py
4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
'TEI parser'
import os
from collections import namedtuple
from lxml import etree
from speakers import Speakers
from metadata import metadata
# One spoken turn: ``who`` is the speaker reference that gets resolved through
# ``Speakers.find_id`` and ``u`` is the text written into the ``<u>`` element.
Utterance = namedtuple('Utterance', ['who', 'u'])
class Tei:
    """Single TEI file.

    Holds a sequence of utterances plus the metadata derived from it and
    writes both out as a directory containing ``text_structure.xml`` and
    ``header.xml``.
    """

    # Prefix -> namespace URI table used to expand ``prefix:name`` tags and
    # attribute names (see ``_qname``).
    nsmap = {'nkjp': 'http://www.nkjp.pl/ns/1.0',
             'tei': 'http://www.tei-c.org/ns/1.0',
             'xmlns': 'http://www.tei-c.org/ns/1.0',
             'xml': 'http://www.w3.org/XML/1998/namespace',
             'xi': 'http://www.w3.org/2001/XInclude'}

    def __init__(self, data, filename):
        """Remember the utterances and derive the output name.

        ``data`` is iterated by ``_body``; every item must expose ``who``
        and ``u`` attributes.  It is also handed to ``metadata()``, whose
        result is read by key (``file_id``, ``title``, ``house``, ...).
        ``filename`` may be a full path: only the base name up to its first
        dot is kept and later used as the output directory name.
        """
        self.speakers = Speakers()
        self.data = data
        self.metadata = metadata(data)
        self.filename = str(os.path.basename(filename)).split('.')[0]

    def save(self, path):
        """Save file to TEI.

        Writes ``text_structure.xml`` and ``header.xml`` into
        ``<path>/<self.filename>``, creating that directory (and any missing
        parents) when needed.
        """
        target = self._output_dir(path)
        # The text is written before the header on purpose: the header lists
        # ``self.speakers``.
        # NOTE(review): presumably ``Speakers.find_id`` registers speakers
        # while the body is built - confirm before reordering these calls.
        self._save_text(os.path.join(target, 'text_structure.xml'))
        self._save_header(os.path.join(target, 'header.xml'))

    def _output_dir(self, path):
        """Create the per-file output directory under ``path`` and return it."""
        target = os.path.join(path, self.filename)
        # ``exist_ok`` removes the check-then-create race; missing parent
        # directories are created as well.
        os.makedirs(target, exist_ok=True)
        return target

    def _save_header(self, filename):
        """Save header file.

        Builds a ``teiHeader`` document from ``self.metadata`` and the
        speakers in ``self.speakers`` and writes it to ``filename``.
        """
        meta = self.metadata
        root = etree.Element("teiHeader", attrib=None, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})
        root.attrib[self._qname('xml:id')] = 'PPC-' + meta['file_id']

        file_desc = self._add(root, 'fileDesc')
        title_stmt = self._add(file_desc, 'titleStmt')
        self._add(title_stmt, 'title', {}, meta['title'])
        public_stmt = self._add(file_desc, 'publicationStmt')
        self._add(public_stmt, 'p', {},
                  'Prosimy o zapoznanie się z nagłówkiem korpusu (PPC_header.xml).')

        source_desc = self._add(file_desc, 'sourceDesc')
        bibl = self._add(source_desc, 'bibl')
        self._add(bibl, 'title', {}, meta['title'])
        self._add(bibl, 'publisher', {},
                  f'Kancelaria {meta["house"]}u Rzeczypospolitej Polskiej')
        # <note type="..."> children, emitted in this exact order.
        notes = (
            ('system', 'III RP'),
            ('house', meta['house']),
            ('termNo', meta['termNo']),
            ('type', meta['type']),
            ('sessionNo', meta['session']),
            ('dayNo', meta['day']),
        )
        for note_type, value in notes:
            self._add(bibl, 'note', {'type': note_type}, value)
        self._add(bibl, 'date', {}, meta['date'])

        profile_desc = self._add(root, 'profileDesc')
        partic_desc = self._add(profile_desc, 'particDesc')
        # Iterating ``self.speakers`` yields (id, name) pairs.
        for (speaker_id, speaker) in self.speakers:
            person = self._add(partic_desc, 'person', {
                'role': self.speakers.role(speaker), 'xml:id': speaker_id})
            self._add(person, 'persName', {}, speaker)
        self._save(filename, root)

    def _save_text(self, filename):
        """Save text file.

        Builds a ``teiCorpus`` document that XIncludes ``PPC_header.xml``
        and ``header.xml`` and carries the utterances in ``text/body``,
        then writes it to ``filename``.
        """
        root = etree.Element("teiCorpus", attrib=None, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})
        self._add(root, 'xi:include', {'href': 'PPC_header.xml'})
        tei = self._add(root, 'TEI')
        self._add(tei, 'xi:include', {'href': 'header.xml'})
        text = self._add(tei, 'text')
        body = self._add(text, 'body')
        self._body(body, self.data)
        self._save(filename, root)

    def _body(self, parent, utterances):
        """Append the utterances to ``parent``, grouped into ``div`` elements.

        A new ``div`` is opened whenever the resolved speaker id differs from
        the previous one (or the previous id is falsy).  Inside a ``div``
        each utterance becomes a ``u`` element numbered from 1.
        """
        section = 0
        index = 1
        previous = None
        div = None
        for utter in utterances:
            who = self.speakers.find_id(utter.who)
            if not previous or previous != who:
                # NOTE(review): the div id uses ``section`` *before* it is
                # incremented, so ``div-0`` holds ``u-1.x``.  Kept as-is to
                # leave the generated ids unchanged - confirm it is intended.
                div = self._add(parent, 'div', {'xml:id': f'div-{section}'})
                previous = who
                section += 1
                index = 1
            # ``_add`` already attaches the new element to ``div``; no extra
            # ``append`` is required.
            item = self._add(
                div, 'u', {'who': f'#{who}', 'xml:id': f'u-{section}.{index}'})
            # Assigned unconditionally (``_add`` would skip empty text).
            item.text = utter.u
            index += 1

    def _save(self, filename, xml):
        """Serialize ``xml`` as pretty-printed UTF-8 with an XML declaration."""
        payload = etree.tostring(
            xml, pretty_print=True, encoding='utf-8', xml_declaration=True)
        with open(filename, 'wb') as out:
            out.write(payload)

    def _qname(self, name):
        """Expand ``prefix:local`` through ``nsmap``; return plain names as-is.

        Raises ``KeyError`` when the prefix is not present in ``nsmap``.
        """
        prefix, colon, local = name.partition(':')
        if not colon:
            return name
        return etree.QName(self.nsmap[prefix], local)

    def _add(self, parent, tag, attributes=None, text=''):
        """Add a tag to parent.

        ``tag`` and the keys of ``attributes`` may carry a prefix known to
        ``nsmap`` (for example ``xi:include`` or ``xml:id``).  ``text`` is
        set only when it is non-empty.  Returns the newly created element.
        """
        element = etree.SubElement(
            parent, self._qname(tag), attrib=None, nsmap=None)
        for name, value in (attributes or {}).items():
            element.attrib[self._qname(name)] = value
        if text:
            element.text = text
        return element