# plain_text.py 2.23 KB
# -*- coding: utf-8 -*-
import os, io, argparse
from lxml import etree
import normalize
from transcription import split_punct

# File names handed to normalize.readRules / normalize.readExceptions when
# transcription is requested.
transcr_rules = 'rules_XIXw.csv'
transcr_excepts = 'excepts_XIXw.csv'

# Prefix -> namespace URI mapping passed to XPath queries.
namespaces = dict(
  tei='http://www.tei-c.org/ns/1.0',
  xi='http://www.w3.org/2001/XInclude',
  nkjp='http://www.nkjp.pl/ns/1.0',
  xml='http://www.w3.org/XML/1998/namespace',
)


def transcript(text_div):
  """Return the transcribed version of the paragraph ``text_div``.

  The paragraph is split on whitespace.  Each word goes through
  ``split_punct``, which yields three parts (presumably leading punctuation,
  trailing punctuation and the bare token -- confirm against
  ``transcription.split_punct``).  The token part is run through
  ``normalize.processToken`` and the other two parts are re-attached around
  the result.  Words are re-joined with single spaces, so any run of
  whitespace in the input (newlines included) collapses to one space.

  The result is always a unicode string.  Joining with a byte-string
  separator would give a byte ``''`` for an empty or whitespace-only
  paragraph under Python 2, and ``io`` text streams refuse to write that.
  """
  res = []
  for w in text_div.split():
    l, r, tok = split_punct(w)
    # processToken also returns a character map; it is not needed here.
    tok_transcr, _charmap = normalize.processToken(tok)
    res.append(l + tok_transcr + r)
  return u' '.join(res)

def process_dir(texts_dir, transcr):
  """Walk ``texts_dir`` and convert every directory that holds a text.xml.

  When ``transcr`` is true, the transcription rules and exceptions are
  loaded once up front via ``normalize.readRules`` and
  ``normalize.readExceptions``.  Each directory found by ``os.walk`` that
  contains a file named ``text.xml`` is announced on stdout and handed to
  ``process_text``.
  """
  if transcr:
    normalize.readRules(transcr_rules)
    normalize.readExceptions(transcr_excepts)
  for current_dir, _subdirs, filenames in os.walk(texts_dir):
    if 'text.xml' not in filenames:
      continue
    xml_path = os.path.join(current_dir, 'text.xml')
    # Parenthesised expression: still a plain print statement under Python 2.
    print(u'Processing text: {}'.format(xml_path).encode("utf-8"))
    process_text(current_dir, transcr)
    
def process_text(dirname, transcr):
  """Write the plain-text version(s) of ``dirname``/text.xml.

  Every ``tei:ab`` element of the document is serialised as text and written
  to ``text.txt``, followed by a single newline.  When ``transcr`` is true,
  the same text is also passed through ``transcript`` and written to
  ``text_transcr.txt``, followed by a blank line.

  The XML is parsed before any output file is opened, so a malformed input
  does not leave behind a truncated, empty ``text.txt``.  Both output files
  are closed even if serialisation or transcription raises.
  """
  text_tree = etree.parse(os.path.join(dirname, 'text.xml'))
  ab_list = text_tree.xpath('//tei:ab', namespaces=namespaces)
  with io.open(os.path.join(dirname, 'text.txt'), 'w', encoding='utf-8') as text_out:
    # The transcription output is optional; None means "not requested".
    text_transcr_out = None
    if transcr:
      text_transcr_out = io.open(os.path.join(dirname, 'text_transcr.txt'), 'w', encoding='utf-8')
    try:
      for ab in ab_list:
        s = etree.tostring(ab, method='text', encoding='unicode')
        text_out.write(s)
        text_out.write(u'\n')
        if text_transcr_out is not None:
          text_transcr_out.write(transcript(s))
          text_transcr_out.write(u'\n\n')
    finally:
      if text_transcr_out is not None:
        text_transcr_out.close()

# Command-line interface.  The user-facing texts are Polish:
#   description: "Converts text.xml files from the folder given as argument
#                 into text files."
#   texts_dir:   "Path to the folder with the files"
#   --transcr:   "Additionally passes the text through the transcription rules."
parser = argparse.ArgumentParser(
  description=u'Przetwarza pliki text.xml z folderu podanego jako argument na pliki tekstowe.'.encode("utf-8"),
)
parser.add_argument(
  'texts_dir',
  help=u'Ścieżka do folderu z plikami'.encode("utf-8"),
)
parser.add_argument(
  '--transcr',
  dest='transcription',
  action='store_true',
  default=False,
  help=u'Przepuszcza dodatkowo tekst przez reguły transkrypcji.'.encode("utf-8"),
)
args = parser.parse_args()

# Runs on import as well as on direct execution, as in the original script.
process_dir(args.texts_dir, args.transcription)