ameba.py
1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/python
# -*- coding: utf-8 -*-
import largescale_segmentation
import transcription
import morfosegment
import tei_writer
import os, sys
import time
from lxml import etree
import xml_utils
def check_modernized(path):
    """Tell whether the header under *path* marks a modernized publication.

    Parses ``header.xml`` located in the directory *path* and looks at the
    ``tei:fileDesc/tei:sourceDesc/tei:bibl/tei:note`` elements, resolved
    with ``xml_utils.namespaces``.

    Args:
        path: Directory holding ``header.xml``.  May be a UTF-8 encoded
            byte string (e.g. ``sys.argv[1]`` on Python 2) or an
            already-decoded text string.

    Returns:
        ``False`` when no matching note exists; otherwise ``True`` exactly
        when the single note's stripped text is ``"Modernized publication"``.

    Raises:
        ValueError: If more than one matching note is present.  The header
            path and the note texts are included in the message.
    """
    # Decode only when needed: calling unicode(path, "utf-8") on text that
    # is already decoded raises TypeError.
    if isinstance(path, bytes):
        path = path.decode("utf-8")
    header_path = os.path.join(path, u"header.xml")
    header_tree = etree.parse(header_path)
    notes = header_tree.xpath(
        "tei:fileDesc/tei:sourceDesc/tei:bibl/tei:note",
        namespaces=xml_utils.namespaces,
    )
    if not notes:
        return False
    # An element without text has .text == None; treat it as empty.
    texts = [(note.text or u"").strip() for note in notes]
    if len(texts) > 1:
        raise ValueError(
            "expected at most one bibl/note in %s, found %d: %r"
            % (header_path, len(texts), texts)
        )
    return texts[0] == u"Modernized publication"
# Script body: parse the input named by the first command-line argument,
# transcribe its paragraphs, run morfosegment over them and hand the
# result to tei_writer.
# Debug aid: set transcription.normalize.verbose = True before running.
input_path = sys.argv[1]
paragraphs = largescale_segmentation.parse_file(input_path)

# Modernized publications go through fake_transcr_paragraphs; everything
# else goes through transcr_paragraphs.  To ignore the modernized flag,
# call transcription.transcr_paragraphs(paragraphs) unconditionally.
transcribe = (
    transcription.fake_transcr_paragraphs
    if check_modernized(input_path)
    else transcription.transcr_paragraphs
)
transcribe(paragraphs)

morfosegment.morfosegment_paragraphs(paragraphs)
tei_writer.write_files(input_path, paragraphs, "text_structure.xml")