ameba.py
2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys
import time
from lxml import etree
import xml_utils
import settings
def check_modernized(path):
    """Tell whether the document stored under *path* is a modernized edition.

    Parses ``header.xml`` located in the ``settings.SUBFOL`` subdirectory of
    *path* and looks for a ``tei:bibl[@type='modernized']`` element nested in
    ``tei:fileDesc/tei:sourceDesc/tei:bibl[@type='original']``.  The ``tei``
    prefix is resolved through ``xml_utils.namespaces``.

    Args:
        path: Document directory, either as a UTF-8 encoded byte string or
            as already-decoded text.

    Returns:
        ``'modern'`` if at least one matching element exists, otherwise
        ``'original'``.
    """
    # Decode only when needed: unicode(path, "utf-8") raised TypeError for a
    # path that was already decoded text.
    if isinstance(path, bytes):
        path = path.decode("utf-8")
    structure_path = os.path.join(path, settings.SUBFOL, u'header.xml')
    text_tree = etree.parse(structure_path)
    modernized = text_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl[@type='original']"
        "/tei:bibl[@type='modernized']",
        namespaces=xml_utils.namespaces)
    # NOTE: a finer split based on the modernized edition's tei:date/@when
    # (before 1940 -> 'XIXw') used to live here but is disabled; every
    # modernized edition, dated or not, is reported as 'modern'.
    return 'modern' if modernized else 'original'
# Command-line handling.  Accepted forms:
#   <script> <document_dir>
#   <script> <dict_name> <document_dir>
#   <script> <dict_path> <dict_name> <document_dir>
if len(sys.argv) == 4:
    dict_path = sys.argv[1]
    dict_name = sys.argv[2]
    filename = sys.argv[3]
elif len(sys.argv) == 3:
    dict_path = ""
    dict_name = sys.argv[1]
    filename = sys.argv[2]
elif len(sys.argv) == 2:
    # No dictionary given: fall back to the "sgjp" dictionary name.
    dict_path = ""
    dict_name = "sgjp"
    filename = sys.argv[1]
else:
    print("Uzycie:")
    print(sys.argv[0]+" [[sciezka_do_slownikow_morfeusza] nazwa_slownika_morfeusza] <sciezka_do_katalogu_z_dokumentem>")
    sys.exit(1)

# Added for folders without a selected sample: bail out early (exit code 2)
# when settings.SUBFOL/settings.FNAME is missing from the document directory.
# The directory name is decoded only if it is a byte string.
doc_dir = filename.decode("utf-8") if isinstance(filename, bytes) else filename
if not os.path.exists(os.path.join(doc_dir, settings.SUBFOL, settings.FNAME)):
    # Call form of print, consistent with the usage message above; for a
    # single argument it prints the same text as the old print statement.
    print("Brak próbki w {}".format(filename))
    sys.exit(2)

# Processing stages, run in order.  Disabled per-stage timing
# (time.clock) and the transcription.normalize.verbose switch were
# debugging aids only.
paragraphs = largescale_segmentation.parse_file(filename)

# The detected text version is always forwarded to the transcription step;
# older variants that special-cased 'modern' texts are no longer used.
text_ver = check_modernized(filename)
# Return values of the next two calls are not used -- presumably they
# update `paragraphs` in place (confirm in the respective modules).
transcription.transcr_paragraphs(paragraphs, text_ver)

morfosegment.morfosegment_paragraphs(paragraphs, dict_path, dict_name)

# The third argument was formerly hard-coded as "text_structure.xml";
# settings.FNAME is now passed instead.
tei_writer.write_files(filename, paragraphs, settings.FNAME)

# A disabled debug dump used to follow: it removed the "segs" and "charmap"
# keys from every fragment of every sentence and printed repr(fragment).