ameba.py
4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys, argparse
import time
from lxml import etree
import srx_segmenter
import xml_utils
import settings
def get_srx_rules():
    """Load the SRX segmentation rules stored next to this script.

    The rules are read from the file ``segment.srx`` located in the same
    directory as this module.

    Returns:
        Whatever ``srx_segmenter.parse`` produces for that file.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return srx_segmenter.parse(os.path.join(script_dir, 'segment.srx'))
# Segmentation rules, parsed once when the module is loaded; the same object
# is passed to both the segmentation and the morphological-segmentation calls
# later in this script.
SRX_RULES = get_srx_rules()
def check_modernized(path):
    """Tell whether the document stored in *path* is a modernized edition.

    Parses ``header.xml`` found in ``<path>/<settings.SUBFOL>/`` and looks for
    a ``bibl[@type='modernized']`` element nested inside
    ``bibl[@type='original']`` of the TEI source description.

    Args:
        path: document directory as a byte string; it is decoded as UTF-8
            before being joined with the remaining path components.

    Returns:
        'modern' when at least one matching element exists, 'original'
        otherwise.
    """
    header_path = os.path.join(unicode(path, "utf-8"), settings.SUBFOL, u'header.xml')
    header_tree = etree.parse(header_path)
    modernized_bibls = header_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl[@type='original']/tei:bibl[@type='modernized']",
        namespaces=xml_utils.namespaces)

    if not modernized_bibls:
        return 'original'

    # NOTE: an earlier, now disabled, variant inspected the ``when`` attribute
    # of the nested ``tei:date`` and returned 'XIXw' for editions dated before
    # 1940 (a modernized entry without a date was not expected to occur).
    # Currently every modernized edition is reported as 'modern'.
    return 'modern'
# Command-line driver: processes one corpus document directory and writes the
# resulting TEI files.
#
# The options below replace an older positional interface of the form
#   <script> [[morfeusz_dict_path] morfeusz_dict_name] <document_dir>
# where the dictionary name defaulted to "sgjp" and the path to "".
parser = argparse.ArgumentParser(
    description=u'Przetwarza pliki TEI korpusu i produkuje dla nich analizę morfosyntaktyczną.'.encode("utf-8"),
)
parser.add_argument(
    '--morph_dict',
    dest='dict_name',
    default='sgjp',
    help=u'Nazwa słownika Morfeusza.'.encode("utf-8"),
)
parser.add_argument(
    '--morph_path',
    dest='dict_path',
    default='',
    help=u'Ścieżka do słownika Morfeusza.'.encode("utf-8"),
)
parser.add_argument(
    '--tager',
    dest='tager',
    action='store_true',
    default=False,
    help=u'Ujednoznaczenie na podstawie plików tagera.'.encode("utf-8"),
)
parser.add_argument(
    'filename',
    type=str,
    action='store',
    help=u'Ścieżka do folderu, w którym jest plik do przetworzenia.'.encode("utf-8"),
)
args = parser.parse_args()

# Added for folders in which no sample has been selected: report the missing
# sample file and stop with exit status 2 before any processing starts.
if not os.path.exists(
        os.path.join(unicode(args.filename, "utf-8"), settings.SUBFOL, settings.FNAME)):
    sys.stdout.write("Brak próbki w {}".format(args.filename) + "\n")
    sys.exit(2)

# Stage 1 - segmentation.
# `paragraphs` is a list of paragraphs, each of which is a list of sentences,
# each of which is a list of text fragments of various types.
paragraphs = largescale_segmentation.parse_file(args.filename, SRX_RULES)

# Stage 2 - transcription, driven by the edition kind ('modern' / 'original')
# read from the document header.  The return value is not used, so the call
# presumably updates `paragraphs` in place - verify against transcription.
text_ver = check_modernized(args.filename)
transcription.transcr_paragraphs(paragraphs, text_ver)

# Stage 3 - morphological segmentation with the selected Morfeusz dictionary.
morfosegment.morfosegment_paragraphs(
    paragraphs, args.dict_path, args.dict_name, SRX_RULES, args.tager)

# Stage 4 - write the output files for the processed document.
tei_writer.write_files(args.filename, paragraphs, settings.FNAME, args.tager)