# transcription.py 3.19 KB
# -*- coding: utf-8 -*-
import normalize
import re

def split_punct(token):
    """Strip leading and trailing punctuation from a token.

    The punctuation characters recognised are ; / , . - : ! ? ( ).

    Returns a ``(left, right, tok)`` tuple where ``left`` is the run of
    punctuation at the start of ``token``, ``right`` is the run at its end
    and ``tok`` is what remains in between, so that
    ``left + tok + right == token``.

    A token that is empty or consists of punctuation only is returned
    unsplit as ``(u'', u'', token)``.
    """
    # The capturing group makes re.split keep the separators, e.g.
    #   u'(abc)' -> [u'', u'(', u'abc', u')', u'']
    #   u'abc.'  -> [u'abc', u'.', u'']
    res = re.split(r'(^[;/,.\-:!?()]+|[;/,.\-:!?()]+$)', token)
    left = u''
    right = u''
    tok = res[0]
    if len(res) > 1:
        if res[0] == u'':
            # An empty first piece means the token started with punctuation.
            left = res[1]
            tok = res[2]
        if res[-1] == u'':
            # An empty last piece means the token ended with punctuation.
            right = res[-2]
    if tok == u'':
        # Nothing but punctuation (or an empty token): keep it whole so the
        # caller still has a non-degenerate token to process.  Testing the
        # core rather than both edges keeps tokens such as u'(abc)' split.
        left, right, tok = u'', u'', token
    return left, right, tok

def transcription(text):
    """Transcribe a block of text token by token.

    ``text`` is split into lines on u'\\n' and each line into tokens on
    single spaces.  Every token has its edge punctuation split off by
    ``split_punct``; the remaining core is passed to
    ``normalize.processToken`` and the punctuation is re-attached unchanged.

    Returns ``(transcribed_text, charmap)``.  ``charmap`` maps a character
    position in the transcribed text to a list of character positions in
    the original ``text``.
    """
    out_lines = []
    charmap = {}
    ooffset = 0  # current position in the original text
    noffset = 0  # current position in the transcribed text
    for line in text.split(u'\n'):
        line_res = []
        for raw_token in line.split(u' '):
            left, right, token = split_punct(raw_token)

            # Leading punctuation is copied verbatim: one-to-one mapping.
            for i in range(len(left)):
                charmap[noffset + i] = [ooffset + i]

            # chmap is used as: index in `res` -> list of indices in `token`;
            # both sides are shifted to absolute positions here.
            res, chmap = normalize.processToken(token)
            for k, v in chmap.items():
                charmap[noffset + len(left) + k] = [
                    ooffset + len(left) + x for x in v
                ]

            ooffset += len(left) + len(token)
            noffset += len(left) + len(res)

            # Trailing punctuation plus the single separator character (the
            # space or newline) that follows the token.  For the last token
            # of the text this also records the position just past the end.
            for i in range(len(right) + 1):
                charmap[noffset + i] = [ooffset + i]

            ooffset += len(right) + 1
            noffset += len(right) + 1

            # Bound join instead of unicode.join(...): same result, and it
            # does not depend on the Python 2-only `unicode` builtin.
            line_res.append(u''.join([left, res, right]))
        out_lines.append(u' '.join(line_res))
    return u'\n'.join(out_lines), charmap

def transcr_paragraphs(parasents, text_ver):
    """Transcribe every fragment of a list of paragraphs, in place.

    ``parasents`` is a list of paragraphs; a paragraph is a list of
    sentences and a sentence is a list of fragment dicts.

    ``text_ver`` selects the rule set loaded into ``normalize``:
    'modern' loads rules_XIXw.csv / excepts_XIXw.csv, any other value
    loads new_rules.csv / excepts.csv.

    For each fragment:
      * a non-None "orig_text" yields "orig_trans" and "orig_charmap",
        and a missing or None "text" is replaced with u"";
      * a non-None "text" yields "trans" and "charmap".

    Nothing is returned; the fragment dicts are modified directly.
    """
    if text_ver == 'modern':
        rules_file, excepts_file = 'rules_XIXw.csv', 'excepts_XIXw.csv'
    else:  # original
        rules_file, excepts_file = 'new_rules.csv', 'excepts.csv'
    normalize.readRules(rules_file)
    normalize.readExceptions(excepts_file)

    for paragraph in parasents:
        for sentence in paragraph:
            for frag in sentence:
                orig_text = frag.get("orig_text")
                if orig_text is not None:
                    orig_trans, orig_charmap = transcription(orig_text)
                    frag["orig_trans"] = orig_trans
                    frag["orig_charmap"] = orig_charmap
                    # A fragment with original text always gets a "text"
                    # entry, so the branch below runs for it as well.
                    if frag.get("text") is None:
                        frag["text"] = u""

                current_text = frag.get("text")
                if current_text is not None:
                    trans, charmap = transcription(current_text)
                    frag["trans"] = trans
                    frag["charmap"] = charmap


def fake_transcr_paragraphs(parasents):
    """Fill in transcription fields without transcribing anything.

    Walks the same paragraph / sentence / fragment structure as
    ``transcr_paragraphs`` and writes the same keys, but the "transcribed"
    text is the input text itself and each charmap is the identity mapping
    ``{i: [i]}`` over the text's character positions.

    For each fragment dict:
      * a non-None "orig_text" yields "orig_trans" and "orig_charmap",
        and a missing or None "text" is replaced with u"";
      * a non-None "text" yields "trans" and "charmap".

    A fragment whose "orig_text" is None is treated like one without the
    key, matching ``transcr_paragraphs``.  The fragment dicts are modified
    in place; nothing is returned.
    """
    for paragraph in parasents:
        for sentence in paragraph:
            for frag in sentence:
                orig_text = frag.get("orig_text")
                # Guard against None: len(None) would raise TypeError.
                if orig_text is not None:
                    frag["orig_trans"] = orig_text
                    frag["orig_charmap"] = {
                        i: [i] for i in range(len(orig_text))
                    }
                    if frag.get("text") is None:
                        frag["text"] = u""

                current_text = frag.get("text")
                if current_text is not None:
                    frag["trans"] = current_text
                    frag["charmap"] = {
                        i: [i] for i in range(len(current_text))
                    }