ameba.py
2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys
import time
from lxml import etree
import xml_utils
import settings
def check_modernized(path):
    """Tell whether the document header marks it as a modernized publication.

    Parses ``header.xml`` located in the ``settings.SUBFOL`` subdirectory of
    *path* and inspects every ``tei:note`` element found under
    ``tei:fileDesc/tei:sourceDesc/tei:bibl``.

    path -- document directory, either a UTF-8 encoded byte string (e.g.
            taken straight from ``sys.argv``) or an already decoded
            ``unicode`` object.

    Returns True when at least one note's text, stripped of surrounding
    whitespace, equals u"Modernized publication"; returns False otherwise,
    including when no such notes exist.
    """
    # unicode(obj, encoding) raises TypeError for an object that is already
    # unicode, so decode only when the caller handed us something else.
    if not isinstance(path, unicode):
        path = unicode(path, "utf-8")
    header_path = os.path.join(path, settings.SUBFOL, u'header.xml')
    header_tree = etree.parse(header_path)
    notes = header_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl/tei:note",
        namespaces=xml_utils.namespaces)
    # A note without text is treated as an empty string.
    note_texts = [(unicode(n.text) if n.text else u"").strip() for n in notes]
    # Membership in an empty list is False, which covers the "no notes" case.
    return u"Modernized publication" in note_texts
# Command line: [[morfeusz_dict_path] morfeusz_dict_name] <document_dir>
# The document directory is always the last argument. When omitted, the
# dictionary name defaults to "sgjp" and the dictionary path to "".
if not 2 <= len(sys.argv) <= 4:
    # Wrong number of arguments: show the usage text and exit with status 1.
    print("Uzycie:")
    print(sys.argv[0]+" [[sciezka_do_slownikow_morfeusza] nazwa_slownika_morfeusza] <sciezka_do_katalogu_z_dokumentem>")
    sys.exit(1)
# Optional arguments come first, so index from the end of argv.
filename = sys.argv[-1]
dict_name = sys.argv[-2] if len(sys.argv) >= 3 else "sgjp"
dict_path = sys.argv[-3] if len(sys.argv) == 4 else ""
# Added for folders that have no selected sample: stop early with exit
# status 2 when <filename>/<settings.SUBFOL>/<settings.FNAME> is missing.
if not os.path.exists(
        os.path.join(unicode(filename, "utf-8"),
                     settings.SUBFOL,
                     settings.FNAME)):
    # The parentheses wrap the whole expression, so the Python 2 print
    # statement emits exactly the same text as before.
    print("Brak próbki w {}".format(filename))
    sys.exit(2)
# Main processing chain: parse -> transcribe -> morfosegment -> write.
# NOTE(review): what each stage does is inferred from the module names only;
# confirm against the modules themselves.
# Debug aids that used to live here: setting
# transcription.normalize.verbose = True, and printing time.clock() deltas
# ("t0".."t3") after each stage.
paragraphs = largescale_segmentation.parse_file(filename)

# Documents whose header is not flagged as a modernized publication go
# through transcr_paragraphs; flagged ones go through
# fake_transcr_paragraphs instead. Calling transcr_paragraphs
# unconditionally would ignore the flag.
if not check_modernized(filename):
    transcription.transcr_paragraphs(paragraphs)
else:
    transcription.fake_transcr_paragraphs(paragraphs)

morfosegment.morfosegment_paragraphs(paragraphs, dict_path, dict_name)

# The output name comes from settings.FNAME; it used to be the literal
# "text_structure.xml".
tei_writer.write_files(filename, paragraphs, settings.FNAME)
# Disabled debug dump: print every fragment without its "segs"/"charmap" keys.
#for p in paragraphs:
#    for s in p:
#        print
#        for frag in s:
#            if "segs" in frag:
#                del frag["segs"]
#            if "charmap" in frag:
#                del frag["charmap"]
#            print(repr(frag))