ameba.py 1.6 KB

Edit Raw Blame History

#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys
import time
from lxml import etree
import xml_utils

def check_modernized(path):
    """Tell whether the text in directory *path* is marked as modernized.

    Parses ``header.xml`` located in *path* and inspects the elements matched
    by ``tei:fileDesc/tei:sourceDesc/tei:bibl/tei:note`` (prefixes resolved
    through ``xml_utils.namespaces``).

    Args:
        path: directory that contains ``header.xml``. May be a UTF-8 encoded
            byte string (what ``sys.argv`` yields on Python 2) or an already
            decoded unicode string.

    Returns:
        False when no note element is found. When exactly one note is found,
        True if its stripped text equals ``"Modernized publication"`` and
        False otherwise.

    Raises:
        ValueError: when more than one note element is found; the message
            lists the header path and the stripped texts of all notes.
    """
    # Decode byte strings only: calling unicode(value, "utf-8") on a value
    # that is already unicode raises TypeError.
    if isinstance(path, bytes):
        path = path.decode("utf-8")
    structure_path = os.path.join(path, u'header.xml')
    text_tree = etree.parse(structure_path)
    notes = text_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl/tei:note",
        namespaces=xml_utils.namespaces)
    if not notes:
        return False
    # A note element without text is treated as an empty string.
    texts = [(unicode(n.text) if n.text else u"").strip() for n in notes]
    if len(texts) > 1:
        # %r keeps the message ASCII-safe even for non-ASCII paths or notes.
        raise ValueError(
            "expected at most one bibl note in %r, found %d: %r"
            % (structure_path, len(texts), texts))
    return texts[0] == u"Modernized publication"


# Script body: takes the text directory from the first command-line argument,
# parses it with largescale_segmentation, runs the transcription and
# morfosegment steps on the resulting paragraphs, and hands them to
# tei_writer together with the output name "text_structure.xml".

# Debugging aid: uncomment to make transcription.normalize verbose.
#transcription.normalize.verbose = True

# Exit with a usage hint instead of a bare IndexError when the argument
# is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s <text-directory>" % sys.argv[0])

paragraphs = largescale_segmentation.parse_file(sys.argv[1])

if check_modernized(sys.argv[1]):
    # The header marks the text as a modernized publication.
    transcription.fake_transcr_paragraphs(paragraphs)
    # Alternative: ignore the "modernized" marker and always call
    # transcription.transcr_paragraphs(paragraphs) instead.
else:
    transcription.transcr_paragraphs(paragraphs)

morfosegment.morfosegment_paragraphs(paragraphs)

tei_writer.write_files(sys.argv[1], paragraphs, "text_structure.xml")