transcription.py
3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
import normalize
import re
# Strips leading and trailing punctuation marks from a token and returns them
# together with the remaining core of the token.
def split_punct(token):
    """Split *token* into (left punctuation, right punctuation, core).

    Punctuation is any run of the characters ``; / , . - : ! ? ( )`` anchored
    at the very start or the very end of the token; punctuation inside the
    token is left alone.

    Returns a 3-tuple ``(left, right, tok)`` such that
    ``left + tok + right == token``. A token that consists only of
    punctuation (or is empty) is returned whole as ``tok`` with empty
    ``left`` and ``right``.
    """
    parts = re.split(r'(^[;/,.\-:!?()]+|[;/,.\-:!?()]+$)', token)
    if len(parts) == 1:
        # No punctuation at either edge.
        return u'', u'', token

    # Because the pattern is captured, an edge match leaves an empty string
    # at the corresponding end of the split result.
    has_left = parts[0] == u''
    has_right = parts[-1] == u''

    if has_left and has_right and len(parts) == 3:
        # The whole token is one punctuation run: the single match would be
        # reported as both left and right, so keep the token intact instead.
        return u'', u'', token

    left = parts[1] if has_left else u''
    right = parts[-2] if has_right else u''
    tok = parts[2] if has_left else parts[0]
    return left, right, tok
# Transcription of a block of text.
def transcription(text):
    """Transcribe *text* token by token and build a character offset map.

    The text is split into lines on ``\\n`` and each line into tokens on
    single spaces. Edge punctuation is split off every token with
    ``split_punct`` and copied through unchanged; the remaining core is
    passed to ``normalize.processToken``.

    Returns a tuple ``(transcribed_text, charmap)`` where ``charmap`` maps an
    offset in the transcribed text to a list of offsets in the original text.
    Every token also maps the one separator position that follows it, so the
    last token produces an entry for the index one past the end of the text.
    """
    out_lines = []
    charmap = {}
    ooffset = 0  # running offset in the original text
    noffset = 0  # running offset in the transcribed text
    for line in text.split(u'\n'):
        line_res = []
        for raw_token in line.split(u' '):
            left, right, token = split_punct(raw_token)
            left_len = len(left)

            # Leading punctuation is copied verbatim: one-to-one mapping.
            for i in range(left_len):
                charmap[noffset + i] = [ooffset + i]

            # processToken returns the transcribed core plus a token-local
            # map; presumably its keys index into `res` and its values into
            # `token` -- shift both into text-wide offsets.
            res, chmap = normalize.processToken(token)
            for k, v in chmap.items():
                charmap[noffset + left_len + k] = [ooffset + left_len + x for x in v]
            ooffset += left_len + len(token)
            noffset += left_len + len(res)

            # Trailing punctuation plus the single separator character
            # (space or newline) that follows the token.
            for i in range(len(right) + 1):
                charmap[noffset + i] = [ooffset + i]
            ooffset += len(right) + 1
            noffset += len(right) + 1

            # u''.join works on both Python 2 and 3 (unicode.join is 2-only).
            line_res.append(u''.join([left, res, right]))
        out_lines.append(u' '.join(line_res))
    return u'\n'.join(out_lines), charmap
# Transcription of a list of paragraphs
# (paragraph = list of sentences, sentence = list of fragments).
def transcr_paragraphs(parasents, text_ver):
    """Transcribe every fragment of *parasents* in place.

    Loads the rule and exception files selected by *text_ver* (``'modern'``
    or anything else for the original variant) into ``normalize``, then for
    each fragment dict:

    * if ``"orig_text"`` is present and not None, stores its transcription
      under ``"orig_trans"`` and the offset map under ``"orig_charmap"``;
    * replaces a missing or None ``"text"`` with an empty string;
    * stores the transcription of ``"text"`` under ``"trans"`` and its
      offset map under ``"charmap"``.

    Returns None; the fragments are modified in place.
    """
    if text_ver == 'modern':
        rules_file, excepts_file = 'rules_XIXw.csv', 'excepts_XIXw.csv'
    else:  # original
        rules_file, excepts_file = 'new_rules.csv', 'excepts.csv'
    normalize.readRules(rules_file)
    normalize.readExceptions(excepts_file)

    for paragraph in parasents:
        for sentence in paragraph:
            for frag in sentence:
                if frag.get("orig_text") is not None:
                    orig_trans, orig_map = transcription(frag["orig_text"])
                    frag["orig_trans"] = orig_trans
                    frag["orig_charmap"] = orig_map

                if frag.get("text") is None:
                    frag["text"] = u""

                # "text" is now always present and not None.
                text_trans, text_map = transcription(frag["text"])
                frag["trans"] = text_trans
                frag["charmap"] = text_map
def fake_transcr_paragraphs(parasents):
    """Fill in transcription fields without transcribing anything.

    Walks the same paragraph / sentence / fragment structure as
    ``transcr_paragraphs`` and, for each fragment dict:

    * if ``"orig_text"`` is present and not None, copies it to
      ``"orig_trans"`` and stores an identity offset map
      (``{i: [i]}`` for every character index) under ``"orig_charmap"``;
    * replaces a missing or None ``"text"`` with an empty string;
    * copies ``"text"`` to ``"trans"`` and stores its identity offset map
      under ``"charmap"``.

    Returns None; the fragments are modified in place.
    """
    for paragraph in parasents:
        for sentence in paragraph:
            for frag in sentence:
                # Skip a None "orig_text" (len(None) would raise), matching
                # the guard used by transcr_paragraphs.
                orig_text = frag.get("orig_text")
                if orig_text is not None:
                    frag["orig_trans"] = orig_text
                    frag["orig_charmap"] = {i: [i] for i in range(len(orig_text))}

                if frag.get("text") is None:
                    frag["text"] = u""

                text = frag["text"]
                frag["trans"] = text
                frag["charmap"] = {i: [i] for i in range(len(text))}