# transcription.py 6.41 KB
# -*- coding: utf-8 -*-
import normalize
import re

#removes left or right punctuation marks, return them with a token
def split_punct(token):
  res = re.split(ur'(^[;/,.\-:!?()„"»«]+|[;/,.\-:!?()”"“»«]+$)', token)
  left = u''
  right = u''
  tok = res[0]
  if len(res) > 1:
    if res[0] == u'':
      left = res[1]
      tok = res[2]
    if res[-1] == u'':
      right = res[-2]
  if (res[0] == u'' and res[-1] == u'' and len(res) == 3):
    left, right, tok = u'',u'',token
  return left, right, tok

#transkrypcja bloku tekstu
def transcription(text):
    lines = text.split(u'\n')
    reses = []
    charmap = {}
    ooffset = 0
    noffset = 0
    for line in lines:
        line_res = []
        tkns = line.split(u' ')
        for token in tkns:
            left, right, token = split_punct(token)
            for i in range(len(left)):
                charmap[noffset+i] = [ooffset+i]
            #print('token: {}'.format(token.encode(encoding='utf-8')))
            res, chmap = normalize.processToken(token)
            for k,v in chmap.items():
                charmap[noffset+len(left)+k] = [ooffset+len(left)+x for x in v]

            ooffset += len(left) + len(token) 
            noffset += len(left) + len(res)

            for i in range(len(right)+1):
                charmap[noffset+i] = [ooffset+i]

            ooffset += len(right) + 1
            noffset += len(right) + 1

            line_res.append(unicode.join(u'',[left,res,right]))
        reses.append(u' '.join(line_res))
    return u'\n'.join(reses), charmap


# wygenerowanie charmap dla poprawianych fragmentów
def orig_charmap(orig, trans):
    charmap = {}
    j = 0
    for i in range(len(trans) + 1):
        if i != (len(trans) - 1):
            charmap[i] = [j]
            j += 1
        else: # jeśli pierwotny ciąg jest dłuższy, to skumuluj na ostatnim znaku
            if len(orig) > len(trans):
                charmap[i] = range(i, len(orig))
                j += len(orig) - len(trans) + 1
            else:
                charmap[i] = [j]
                j += 1
    return charmap

#transkrypcja listy akapitow (akapit = lista zdan, zdanie = lista fragmentow)
def transcr_paragraphs(parasents, text_ver):
    if text_ver == 'modern':
        normalize.readRules('rules_XIXw.csv')
        normalize.readExceptions('excepts_XIXw.csv')
    else: # original
        normalize.readRules('new_rules.csv')
        normalize.readExceptions('excepts.csv')
    for p in parasents:
        for s in p:
            for frag in s:
                # Jeśli fragment jest typu "foreign", to go nie transkrybuj
                # (transkrypcja = transliteracja)
                if "type" in frag and frag['type'] == 'foreign':
                    fake_transcr_frag(frag)
                else:
                    '''                    
                    if "orig_text" in frag and frag["orig_text"] is not None:
                        #print(u'orig_text: {}'.format(frag["orig_text"]))
                        trans, charmap = transcription(frag["orig_text"])
                        frag["orig_trans"] = trans
                        frag["orig_charmap"] = charmap
                        if "text" not in frag or frag["text"] is None:
                            frag["text"] = u""
                    if "text" in frag and frag["text"] is not None:
                        #print(u'text: {}'.format(frag["text"]))
                        trans, charmap = transcription(frag["text"])
                        frag["trans"] = trans
                        frag["charmap"] = charmap
                    '''
                    '''
                    Nowa wersja obsługi "choice" - robimy z tego jeden zwykły segment, w którym text = orig_text 
                    a trans = transcription(text). Zmiena się też id na orig_id
                    '''
                    if "orig_text" in frag and frag["orig_text"] is not None:
                        #print(u'orig_text: {}'.format(frag["orig_text"]))
                        if "text" not in frag or frag["text"] is None:
                            frag["text"] = u""
                        trans, charmap = transcription(frag["text"])
                        #print(frag, trans, charmap)
                        frag["trans"] = trans
                        '''
                        oryginalny charmap nie będzie pasował, bo jest w odniesieiu do text
                        a nie orig_text - trzeba go zmienić
                        '''
                        #frag["charmap"] = charmap
                        frag["charmap"] = orig_charmap(frag["orig_text"], trans)
                        
                        frag['text'] = frag.pop('orig_text')
                        frag['id']=frag.pop('orig_id')
                        
                    elif "text" in frag and frag["text"] is not None:
                        #print(u'text: {}'.format(frag["text"]))
                        trans, charmap = transcription(frag["text"])
                        #print(u'trans: {}'.format(trans))
                        #print(u'charmap: {}'.format(charmap))
                        frag["trans"] = trans
                        frag["charmap"] = charmap


def fake_transcr_paragraphs(parasents):
    for p in parasents:
        for s in p:
            for frag in s:
                fake_transcr_frag(frag)


def fake_transcr_frag(frag):
    if "orig_text" in frag and frag["orig_text"] is not None:
        if "text" not in frag or frag["text"] is None:
            frag["text"] = u""
        frag["trans"] = frag['text']
        '''
        oryginalny charmap nie będzie pasował, bo jest w odniesieiu do text
        a nie orig_text - trzeba go zmienić
        '''
        #frag["charmap"] = {i : [i] for i in range(len(frag["text"]))}
        frag["charmap"] = orig_charmap(frag["orig_text"], frag["text"])

        frag['text'] = frag.pop('orig_text')
        frag['id']=frag.pop('orig_id')
        
    elif "text" in frag and frag["text"] is not None:
        frag["trans"] = frag['text']
        frag["charmap"] = {i : [i] for i in range(len(frag["text"]))}

    '''
    if "orig_text" in frag:
        frag["orig_trans"] = frag["orig_text"] 
        frag["orig_charmap"] = {i : [i] for i in range(len(frag["orig_text"]))}
        if "text" not in frag or frag["text"] is None:
            frag["text"] = u""
    if "text" in frag and frag["text"] is not None:
        frag["trans"] = frag["text"]
        frag["charmap"] = {i : [i] for i in range(len(frag["text"]))}
    '''