plain_text.py
2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
import os, io, argparse
from lxml import etree
import normalize
from transcription import split_punct
# File names handed to normalize.readRules / normalize.readExceptions by
# process_dir when transcription is requested.
transcr_rules = 'rules_XIXw.csv'
transcr_excepts = 'excepts_XIXw.csv'

# Prefix -> namespace URI map used for the XPath queries in this module.
namespaces = dict(
    tei='http://www.tei-c.org/ns/1.0',
    xi='http://www.w3.org/2001/XInclude',
    nkjp='http://www.nkjp.pl/ns/1.0',
    xml='http://www.w3.org/XML/1998/namespace',
)
def transcript(text_div):
    """Convert the paragraph passed as argument into its transcribed version.

    The text is split on whitespace.  Each word is broken up by
    ``split_punct`` into three parts; the third part is run through
    ``normalize.processToken`` and then re-wrapped with the first part in
    front and the second part behind.  The resulting words are joined with
    single spaces, so any run of whitespace in the input collapses to one
    space and an empty or all-whitespace input yields an empty string.

    :param text_div: paragraph text to transcribe.
    :return: the transcribed paragraph as a single space-separated string.
    """
    pieces = []
    for word in text_div.split():
        # Presumably leading punctuation, trailing punctuation and the bare
        # token, judging by how they are recombined below.
        prefix, suffix, core = split_punct(word)
        # processToken returns a pair; its second element is not needed here.
        transcribed, _unused = normalize.processToken(core)
        pieces.append(prefix + transcribed + suffix)
    return ' '.join(pieces)
def process_dir(texts_dir, transcr):
    """Walk *texts_dir* and process every directory that holds a ``text.xml``.

    When *transcr* is true, ``normalize.readRules`` and
    ``normalize.readExceptions`` are called once up front with the
    module-level ``transcr_rules`` / ``transcr_excepts`` file names.  Each
    directory found by ``os.walk`` that contains a file named ``text.xml`` is
    announced on standard output and handed to ``process_text``.

    :param texts_dir: root directory to search recursively.
    :param transcr: whether transcription should also be performed; passed
        through unchanged to ``process_text``.
    """
    if transcr:
        normalize.readRules(transcr_rules)
        normalize.readExceptions(transcr_excepts)

    for current_dir, _subdirs, file_names in os.walk(texts_dir):
        if 'text.xml' not in file_names:
            continue
        xml_path = os.path.join(current_dir, 'text.xml')
        # Parenthesised single expression: still the Python 2 print
        # statement, emitting the UTF-8 encoded message.
        print(u'Processing text: {}'.format(xml_path).encode("utf-8"))
        process_text(current_dir, transcr)
def process_text(dirname, transcr):
    """Dump the text of every ``tei:ab`` element of ``text.xml`` to text files.

    Reads ``text.xml`` from *dirname* and writes the text content of each
    ``tei:ab`` element (document order, as returned by the XPath query) to
    ``text.txt`` in the same directory, one element per line.  When *transcr*
    is true, the same text is additionally passed through ``transcript`` and
    written to ``text_transcr.txt``, with a blank line after each element.

    Both output files are opened before the XML is parsed (so they are
    created/truncated even if parsing fails) and are now guaranteed to be
    closed even when parsing, the XPath query or the transcription raises.

    :param dirname: directory containing ``text.xml``; output files are
        written there as well.
    :param transcr: whether to also produce ``text_transcr.txt``.
    """
    with io.open(os.path.join(dirname, 'text.txt'), 'w', encoding='utf-8') as text_out:
        # The transcription file is optional, so it is managed by an explicit
        # try/finally rather than a second context manager.
        text_transcr_out = None
        if transcr:
            text_transcr_out = io.open(
                os.path.join(dirname, 'text_transcr.txt'), 'w', encoding='utf-8')
        try:
            text_tree = etree.parse(os.path.join(dirname, 'text.xml'))
            ab_list = text_tree.xpath('//tei:ab', namespaces=namespaces)
            for ab in ab_list:
                # method='text' drops the markup; encoding='unicode' makes
                # lxml return a text string instead of encoded bytes.
                s = etree.tostring(ab, method='text', encoding='unicode')
                text_out.write(s)
                text_out.write(u'\n')
                if text_transcr_out is not None:
                    text_transcr_out.write(transcript(s))
                    text_transcr_out.write(u'\n\n')
        finally:
            if text_transcr_out is not None:
                text_transcr_out.close()
# Command-line entry point: parse the arguments and process the given folder.
# Help texts are UTF-8 encoded byte strings, as in the rest of this Python 2
# script.
parser = argparse.ArgumentParser(
    description=u'Przetwarza pliki text.xml z folderu podanego jako argument na pliki tekstowe.'.encode("utf-8")
)
parser.add_argument(
    'texts_dir',
    help=u'Ścieżka do folderu z plikami'.encode("utf-8")
)
parser.add_argument(
    '--transcr',
    dest='transcription',
    action='store_true',
    default=False,
    help=u'Przepuszcza dodatkowo tekst przez reguły transkrypcji.'.encode("utf-8")
)
args = parser.parse_args()
process_dir(args.texts_dir, args.transcription)