ameba.py 2.41 KB

Edit Raw Blame History

#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys
import time
from lxml import etree
import xml_utils
import settings

def check_modernized(path):
    """Tell whether the document's TEI header marks it as modernized.

    The header is read from ``<path>/<settings.SUBFOL>/header.xml``.  Every
    ``tei:note`` found under ``tei:fileDesc/tei:sourceDesc/tei:bibl`` is
    inspected; the document counts as modernized when the whitespace-stripped
    text of at least one note is exactly ``Modernized publication``.

    :param path: document directory as a byte string; it is decoded as UTF-8
        before the header path is built.
    :return: ``True`` if such a note exists, ``False`` otherwise (including
        when the header has no matching notes at all).
    """
    header_file = os.path.join(unicode(path, "utf-8"), settings.SUBFOL, u'header.xml')
    header_tree = etree.parse(header_file)
    note_nodes = header_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl/tei:note",
        namespaces=xml_utils.namespaces)
    # A note without text is treated as an empty string, so it never matches.
    return any(
        (unicode(note.text) if note.text else u"").strip() == u"Modernized publication"
        for note in note_nodes
    )


# Command line: [[morfeusz_dictionaries_path] morfeusz_dictionary_name] <document_dir>
# The arguments are right-aligned: the document directory always comes last,
# optionally preceded by the dictionary name and, before that, the path to
# the dictionaries.  Anything other than 1-3 arguments prints the usage text
# and exits with status 1.
if len(sys.argv) not in (2, 3, 4):
    print("Uzycie:")
    print(sys.argv[0]+" [[sciezka_do_slownikow_morfeusza] nazwa_slownika_morfeusza] <sciezka_do_katalogu_z_dokumentem>")
    sys.exit(1)

filename = sys.argv[-1]
# Defaults used when the optional leading arguments are omitted.
dict_name = sys.argv[-2] if len(sys.argv) >= 3 else "sgjp"
dict_path = sys.argv[-3] if len(sys.argv) == 4 else ""

# Added for document folders in which no sample has been selected: when the
# expected sample file <filename>/<settings.SUBFOL>/<settings.FNAME> does not
# exist, report it and stop with exit status 2.
if not os.path.exists(os.path.join(unicode(filename, "utf-8"), settings.SUBFOL, settings.FNAME)):
    # Parenthesised form for consistency with the other print calls in this
    # script; with a single argument the output is the same as the bare
    # Python 2 print statement.
    print("Brak próbki w {}".format(filename))
    sys.exit(2)

# Debugging aid (disabled): transcription.normalize.verbose = True

# Stage 1: parse the document directory into the `paragraphs` structure that
# every later stage receives.
paragraphs = largescale_segmentation.parse_file(filename)

# Stage 2: transcription.  Documents whose header carries the
# "Modernized publication" note go through fake_transcr_paragraphs; all
# others go through transcr_paragraphs.  To ignore the modernized flag,
# call transcr_paragraphs unconditionally instead.
if not check_modernized(filename):
    transcription.transcr_paragraphs(paragraphs)
else:
    transcription.fake_transcr_paragraphs(paragraphs)

# Stage 3: morphological segmentation using the dictionary selected on the
# command line.
morfosegment.morfosegment_paragraphs(paragraphs, dict_path, dict_name)

# Stage 4: write the TEI output; settings.FNAME is passed as the file name.
tei_writer.write_files(filename, paragraphs, settings.FNAME)

#for p in paragraphs:
    #for s in p:
        #print
        #for frag in s:
            #if "segs" in frag:
                #del frag["segs"]
            #if "charmap" in frag:
                #del frag["charmap"]
            #print(repr(frag))