tei.py
4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
'TEI writer: serialises speakers, metadata and utterances to TEI XML files'
import os
from lxml import etree
class Tei:
    """Single TEI file.

    Holds the speakers, metadata and utterance data of one document and
    writes them to disk as two XML files: ``text_structure.xml`` (the
    utterances) and ``header.xml`` (the bibliographic header and the list
    of participants).

    Attributes:
        speakers: Speaker registry. It is iterated as ``(speaker_id,
            speaker)`` pairs and must provide ``role(speaker)`` and
            ``find_id(who)``.
        metadata: Mapping read with the keys ``file_id``, ``title``,
            ``termNo``, ``type``, ``session``, ``day`` and ``date``.
        data: Iterable of sections; each section is an iterable of
            utterance objects exposing ``who`` and ``u`` attributes.
    """

    # Prefix -> namespace URI table used to resolve ``prefix:name`` tags
    # and attribute names passed to ``_add``.
    nsmap = {
        'nkjp': 'http://www.nkjp.pl/ns/1.0',
        'tei': 'http://www.tei-c.org/ns/1.0',
        'xmlns': 'http://www.tei-c.org/ns/1.0',
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'xi': 'http://www.w3.org/2001/XInclude',
    }

    def __init__(self, speakers, metadata, data):
        """Store the speakers, metadata and section data for later saving."""
        self.speakers = speakers
        self.metadata = metadata
        self.data = data

    def save(self, path):
        """Write the document into the directory ``<path>/<file_id>``.

        The directory (including any missing parents) is created when it
        does not exist yet; an already existing directory is reused.

        Args:
            path: Base output directory.
        """
        target = os.path.join(path, self.metadata['file_id'])
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # isdir() + mkdir() and also copes with a missing parent directory.
        os.makedirs(target, exist_ok=True)
        self._save_text(os.path.join(target, 'text_structure.xml'))
        self._save_header(os.path.join(target, 'header.xml'))

    def _new_root(self, tag):
        """Create a root element declaring the XInclude and default TEI namespaces."""
        return etree.Element(tag, attrib=None, nsmap={
            'xi': self.nsmap['xi'], None: self.nsmap['tei']})

    def _save_header(self, filename):
        """Build the ``teiHeader`` document and write it to ``filename``."""
        root = self._new_root('teiHeader')
        root.attrib[etree.QName(self.nsmap['xml'], 'id')] = (
            'PPC-' + self.metadata['file_id'])

        file_desc = self._add(root, 'fileDesc')
        title_stmt = self._add(file_desc, 'titleStmt')
        self._add(title_stmt, 'title', {}, self.metadata['title'])
        public_stmt = self._add(file_desc, 'publicationStmt')
        self._add(public_stmt, 'p', {},
                  'Prosimy o zapoznanie się z nagłówkiem korpusu (PPC_header.xml).')

        source_desc = self._add(file_desc, 'sourceDesc')
        bibl = self._add(source_desc, 'bibl')
        self._add(bibl, 'title', {}, self.metadata['title'])
        self._add(bibl, 'publisher', {},
                  'Kancelaria Sejmu Rzeczypospolitej Polskiej')
        self._add(bibl, 'note', {'type': 'system'}, 'III RP')
        self._add(bibl, 'note', {'type': 'house'}, 'Sejm')
        self._add(bibl, 'note', {'type': 'termNo'}, self.metadata['termNo'])
        self._add(bibl, 'note', {'type': 'type'}, self.metadata['type'])
        self._add(bibl, 'note', {'type': 'sessionNo'},
                  self.metadata['session'])
        self._add(bibl, 'note', {'type': 'dayNo'}, self.metadata['day'])
        self._add(bibl, 'date', {}, self.metadata['date'])

        # One <person> entry per speaker known to the registry.
        profile_desc = self._add(root, 'profileDesc')
        partic_desc = self._add(profile_desc, 'particDesc')
        for speaker_id, speaker in self.speakers:
            person = self._add(partic_desc, 'person', {
                'role': self.speakers.role(speaker), 'xml:id': speaker_id})
            self._add(person, 'persName', {}, speaker)

        self._save(filename, root)

    def _save_text(self, filename):
        """Build the ``teiCorpus`` text document and write it to ``filename``."""
        root = self._new_root('teiCorpus')
        self._add(root, 'xi:include', {'href': 'PPC_header.xml'})
        tei = self._add(root, 'TEI')
        self._add(tei, 'xi:include', {'href': 'header.xml'})
        text = self._add(tei, 'text')
        body = self._add(text, 'body')
        self._body(body, self.data)
        self._save(filename, root)

    def _body(self, parent, sections):
        """Append one ``div`` per section and one ``u`` per utterance.

        Sections are numbered from 1 (``div-<section>``); utterances are
        numbered from 0 within their section (``u-<section>.<index>``).

        Args:
            parent: Element that receives the ``div`` children.
            sections: Iterable of sections, each an iterable of utterances.
        """
        # NOTE: tagging divs of undelivered ('niewygłoszony') speeches with
        # type="undelivered" was disabled in the original code and stays off.
        for section, utterances in enumerate(sections, start=1):
            div = self._add(parent, 'div', {'xml:id': f'div-{section}'})
            for index, utter in enumerate(utterances):
                who = self.speakers.find_id(utter.who)
                item = self._add(
                    div, 'u',
                    {'who': f'#{who}', 'xml:id': f'u-{section}.{index}'})
                # Assigned unconditionally (not via _add's text argument) so
                # that an empty utterance is still written as given.
                item.text = utter.u

    def _save(self, filename, xml):
        """Serialise ``xml`` as pretty-printed UTF-8 with an XML declaration."""
        payload = etree.tostring(
            xml, pretty_print=True, encoding='utf-8', xml_declaration=True)
        with open(filename, 'wb') as out:
            out.write(payload)

    def _qualify(self, name):
        """Resolve a ``prefix:local`` name against ``nsmap``.

        Names without a colon are returned unchanged; prefixed names are
        turned into a namespace-qualified ``etree.QName``.

        Raises:
            KeyError: If the prefix is not present in ``nsmap``.
        """
        if ':' not in name:
            return name
        prefix, local = name.split(':', 1)
        return etree.QName(self.nsmap[prefix], local)

    def _add(self, parent, tag, attributes=None, text=''):
        """Add a tag to parent and return the new element.

        Args:
            parent: Element the new child is appended to.
            tag: Tag name, optionally written as ``prefix:name`` with a
                prefix from ``nsmap``.
            attributes: Optional mapping of attribute names (same prefix
                convention as ``tag``) to values.
            text: Text content; left unset when empty.

        Returns:
            The newly created child element.
        """
        element = etree.SubElement(
            parent, self._qualify(tag), attrib=None, nsmap=None)
        for name, value in (attributes or {}).items():
            element.attrib[self._qualify(name)] = value
        if text:
            element.text = text
        return element