# ameba.py
#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys, argparse
import time
from lxml import etree
import srx_segmenter
import xml_utils
import settings

def get_srx_rules():
    """Load the SRX segmentation rules shipped alongside this script.

    Returns whatever ``srx_segmenter.parse`` produces for ``segment.srx``
    located in the same directory as this module.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    return srx_segmenter.parse(os.path.join(here, 'segment.srx'))

# Parsed once at import time and reused by every processing stage below.
SRX_RULES = get_srx_rules()

def check_modernized(path):
    """Report which text version the document directory *path* holds.

    Parses ``header.xml`` in the document's subfolder and returns
    ``'modern'`` when the header declares a modernized edition
    (a ``bibl[@type='modernized']`` nested in ``bibl[@type='original']``),
    ``'original'`` otherwise.

    NOTE(review): an earlier variant also returned 'XIXw' for
    modernizations dated before 1940; that distinction is disabled.
    """
    header_path = os.path.join(unicode(path, "utf-8"), settings.SUBFOL, u'header.xml')
    header_tree = etree.parse(header_path)
    modernized_nodes = header_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl[@type='original']/tei:bibl[@type='modernized']",
        namespaces=xml_utils.namespaces)
    return 'modern' if modernized_nodes else 'original'


#if len(sys.argv) == 4:
    #dict_path = sys.argv[1]
    #dict_name = sys.argv[2]
    #filename = sys.argv[3]
#elif len(sys.argv) == 3:
    #dict_path = ""
    #dict_name = sys.argv[1]
    #filename = sys.argv[2]
#elif len(sys.argv) == 2:
    #dict_path = ""
    #dict_name = "sgjp"
    #filename = sys.argv[1]
#else:
    #print("Uzycie:")
    #print(sys.argv[0]+" [[sciezka_do_slownikow_morfeusza] nazwa_slownika_morfeusza] <sciezka_do_katalogu_z_dokumentem>")
    #sys.exit(1)

# Command-line interface.  Help strings are user-facing Polish text and are
# encoded to bytes because this is Python 2 argparse.
parser = argparse.ArgumentParser(description=u'Przetwarza pliki TEI korpusu i produkuje dla nich analizę morfosyntaktyczną.'.encode("utf-8"))
# Morfeusz morphological dictionary name (default 'sgjp') and optional path.
parser.add_argument('--morph_dict', dest='dict_name', default='sgjp', help=u'Nazwa słownika Morfeusza.'.encode("utf-8"))
parser.add_argument('--morph_path', dest='dict_path', default='', help=u'Ścieżka do słownika Morfeusza.'.encode("utf-8"))
# When set, disambiguation is taken from tagger output files.
parser.add_argument('--tager', dest='tager', action='store_true', default=False, help=u'Ujednoznaczenie na podstawie plików tagera.'.encode("utf-8"))
# Positional: path to the directory containing the document to process.
parser.add_argument('filename', type=str, action='store', help=u'Ścieżka do folderu, w którym jest plik do przetworzenia.'.encode("utf-8"))
args = parser.parse_args()

# Added for folders without a selected sample: bail out early (exit code 2)
# when the expected input file is missing from the document's subfolder.
if not os.path.exists(os.path.join(unicode(args.filename, "utf-8"), settings.SUBFOL, settings.FNAME)):
    print "Brak próbki w {}".format(args.filename)
    sys.exit(2)

#transcription.normalize.verbose = True

#s = time.clock()

# Stage 1: segmentation.
# paragraphs: a list of paragraphs, each of which is a list of sentences,
# each of which is a list of text runs of various types.
paragraphs = largescale_segmentation.parse_file(args.filename, SRX_RULES)
#print(paragraphs)
#print("t0", time.clock()-s)


#s = time.clock()
# Stage 2: transcription.  The document's version ('modern' or 'original',
# per the header) selects which transcription rules apply.
text_ver = check_modernized(args.filename)
#if text_ver == 'modern':
    #print("modern")
    #transcription.fake_transcr_paragraphs(paragraphs)
    # or without paying attention to 'modern'
    #transcription.transcr_paragraphs(paragraphs)
#else:
    #print(text_ver)
    #transcription.transcr_paragraphs(paragraphs, text_ver)
transcription.transcr_paragraphs(paragraphs, text_ver)
#print(paragraphs)
#print("t1", time.clock()-s)


#s = time.clock()
# Stage 3: morphosyntactic analysis in place, using the configured Morfeusz
# dictionary; with --tager, disambiguation comes from tagger output files.
morfosegment.morfosegment_paragraphs(paragraphs, args.dict_path, args.dict_name, SRX_RULES, args.tager)
#print(paragraphs)
#print("t2", time.clock()-s)

#s = time.clock()
#tei_writer.write_files(filename, paragraphs, "text_structure.xml")

# Stage 4: serialize the annotated paragraphs back into TEI files.
tei_writer.write_files(args.filename, paragraphs, settings.FNAME, args.tager)
#print("t3", time.clock()-s)

# Debug dump of the processed structure (disabled).
#for p in paragraphs:
    #for s in p:
        #print
        #for frag in s:
            #if "segs" in frag:
                #del frag["segs"]
            #if "charmap" in frag:
                #del frag["charmap"]
            #print(repr(frag))