ameba.py 2.81 KB

Edit Raw Blame History

#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys
import time
from lxml import etree
import xml_utils
import settings

def check_modernized(path):
    """Tell whether the document stored under *path* is a modernized edition.

    Parses ``header.xml`` found in the ``settings.SUBFOL`` subdirectory of
    *path* and searches its source description for a
    ``bibl[@type='modernized']`` element nested directly inside
    ``bibl[@type='original']``.

    path -- document directory as a UTF-8 encoded byte string (it is decoded
            with ``unicode(path, "utf-8")`` before being joined).

    Returns the string 'modern' when at least one matching element exists,
    and 'original' otherwise.
    """
    header_path = os.path.join(unicode(path, "utf-8"), settings.SUBFOL, u'header.xml')
    header_tree = etree.parse(header_path)
    modernized_bibls = header_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl[@type='original']/tei:bibl[@type='modernized']",
        namespaces=xml_utils.namespaces)

    if not modernized_bibls:
        return 'original'

    # NOTE: an earlier, now disabled, variant inspected the tei:date/@when
    # value of the modernized entry and returned 'XIXw' for years before 1940
    # (falling back to 'modern' when no date was present, a case that was not
    # expected to occur).  Every modernized edition is currently reported
    # simply as 'modern'.
    return 'modern'


# Command line:
#   <script> [[morfeusz_dictionary_path] morfeusz_dictionary_name] <document_directory>
# The document directory is always the last argument; the dictionary name and
# the dictionary path are optional and precede it, in that order from the right.
cli_args = sys.argv[1:]
if not 1 <= len(cli_args) <= 3:
    # Wrong number of arguments: print the usage text and abort.
    print("Uzycie:")
    print(sys.argv[0]+" [[sciezka_do_slownikow_morfeusza] nazwa_slownika_morfeusza] <sciezka_do_katalogu_z_dokumentem>")
    sys.exit(1)

filename = cli_args[-1]
# Defaults used when the optional arguments are omitted.
dict_name = cli_args[-2] if len(cli_args) >= 2 else "sgjp"
dict_path = cli_args[-3] if len(cli_args) == 3 else ""

# Added for folders in which no sample has been selected: when the expected
# sample file (settings.SUBFOL/settings.FNAME under the document directory)
# does not exist, report it and stop with exit status 2.
sample_path = os.path.join(unicode(filename, "utf-8"), settings.SUBFOL, settings.FNAME)
if not os.path.exists(sample_path):
    print("Brak próbki w {}".format(filename))
    sys.exit(2)

# Disabled debugging switch: sets the `verbose` attribute on
# transcription.normalize.
#transcription.normalize.verbose = True

# ---------------------------------------------------------------------------
# Processing pipeline.  The stages below run strictly in this order on the
# same `paragraphs` object.  The return values of the transcription and
# morphological stages are discarded, so they presumably modify `paragraphs`
# in place (verify against those modules).
# The commented-out `time.clock()` lines around each stage are disabled
# per-stage timing probes (labels t0..t3).
# ---------------------------------------------------------------------------

# Stage 0: read the document given on the command line into `paragraphs`.
#s = time.clock()
paragraphs = largescale_segmentation.parse_file(filename)
#print("t0", time.clock()-s)


# Stage 1: transcription.  The text version ('modern' or 'original', as
# returned by check_modernized) is passed through to the transcriber.
#s = time.clock()
text_ver = check_modernized(filename)
# Disabled alternative: handle modernized texts separately, either with
# fake_transcr_paragraphs or with a plain transcr_paragraphs call.
#if text_ver == 'modern':
    #print("modern")
    #transcription.fake_transcr_paragraphs(paragraphs)
    # or without paying attention to 'modern'
    #transcription.transcr_paragraphs(paragraphs)
#else:
    #print(text_ver)
    #transcription.transcr_paragraphs(paragraphs, text_ver)
transcription.transcr_paragraphs(paragraphs, text_ver)

#print("t1", time.clock()-s)

# Stage 2: morphological segmentation using the dictionary path and
# dictionary name taken from the command line (or their defaults).
#s = time.clock()
morfosegment.morfosegment_paragraphs(paragraphs, dict_path, dict_name)
#print("t2", time.clock()-s)

# Stage 3: write the TEI output.  The file name comes from settings.FNAME;
# the previously hard-coded "text_structure.xml" call is kept for reference.
#s = time.clock()
#tei_writer.write_files(filename, paragraphs, "text_structure.xml")
tei_writer.write_files(filename, paragraphs, settings.FNAME)
#print("t3", time.clock()-s)

# Disabled debug dump: walks paragraphs -> sentences -> fragments, drops the
# "segs" and "charmap" entries of each fragment and prints its repr.
#for p in paragraphs:
    #for s in p:
        #print
        #for frag in s:
            #if "segs" in frag:
                #del frag["segs"]
            #if "charmap" in frag:
                #del frag["charmap"]
            #print(repr(frag))