# odt2tei.py
"Convert scanned ODT to TEI"
import argparse
import csv
import logging
from typing import Iterable
from odt2tei.converter import Converter
# Column keys of the exported CSV mapped to the (Polish) labels written as
# its header row. Insertion order is the column order of the export.
FIELD_NAMES = dict(
    committee="Komisja",
    number="Numer",
    date="Data",
    original_file="Plik",
    original_index="Numer w pliku",
    first="Pierwsza komisja",
    session="Identyfikator",
    error="Błąd",
)
def parse_options(argv=None):
    """Parse command-line options and configure the root logger.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour existing callers rely on.

    Returns:
        The ``argparse.Namespace`` holding ``output``, ``debug``, ``force``,
        ``verbose``, ``stop_on_error``, ``export_headers`` and ``filename``
        (a list with at least one entry).

    Side effects:
        Calls ``logging.basicConfig`` with a bare ``%(message)s`` format.
        The level is DEBUG when a debug category is given, INFO with
        ``--verbose`` and WARNING otherwise.
    """
    parser = argparse.ArgumentParser(description="Convert ODT documents to PCC TEI")
    parser.add_argument(
        "-o",
        "--output",
        default="output",
        metavar="<folder>",
        help="folder to save data",
    )
    parser.add_argument(
        "-d",
        "--debug",
        default="",
        # The option takes a debug category, not a folder.
        metavar="<category>",
        help="category to debug",
        choices=[
            "",
            "lines",
            "odt",
            "spaceout",
            "whitespace",
            "punctuation",
            "merge",
            "illegible",
            "speakers",
            "comments",
            "split",
        ],
    )
    parser.add_argument("-f", "--force", action="store_true", help="save invalid files")
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print correct files"
    )
    parser.add_argument(
        "-s", "--stop-on-error", action="store_true", help="stop on error"
    )
    parser.add_argument("-e", "--export-headers", help="export headers to CSV")
    parser.add_argument("filename", nargs="+", help="folder or filename")
    args = parser.parse_args(argv)

    # Any debug category implies the most verbose logging.
    if args.debug:
        loglevel = logging.DEBUG
    elif args.verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING
    logging.basicConfig(format="%(message)s", level=loglevel)
    return args
def session_rows(session: dict) -> Iterable:
    """Yield one flat row dict per committee listed in ``session["sessions"]``.

    Every row carries the session-wide fields (``date``, ``original_file``,
    ``original_index``, ``session``, ``error``) plus the committee's
    ``committee`` name and ``number``. Only the first yielded row has
    ``first`` set to ``"x"``; the rest have an empty string.

    ``session["original_file"]`` is split on ``":"``: the first piece is
    reported as the file and the second as the index inside that file.
    ``error`` is ``"x"`` when ``session["valid"]`` is falsy.
    """
    shared = {"date": session["date"]}
    source_parts = session["original_file"].split(":")
    shared["original_file"] = source_parts[0]
    shared["original_index"] = source_parts[1]
    shared["session"] = session.get("file_id")
    shared["error"] = "" if session["valid"] else "x"

    for position, committee in enumerate(session["sessions"]):
        yield {
            # Position 0 is the first committee of the session.
            "first": "" if position else "x",
            "committee": committee["name"],
            "number": committee["sessionNo"],
            **shared,
        }
def main():
    """Convert every input named on the command line, then optionally export headers.

    Each ``filename`` argument is passed to ``Converter.convert``. When
    ``--export-headers`` is given, a CSV file is written at that path: a
    header row with the labels from ``FIELD_NAMES``, followed by the rows
    produced by ``session_rows`` for each entry in ``converter.sessions``.
    """
    args = parse_options()
    converter = Converter(args)
    for filename in args.filename:
        converter.convert(filename)

    if not args.export_headers:
        return

    # newline="" hands line-ending control to the csv module (otherwise
    # Windows output gets blank rows). UTF-8 is explicit because the header
    # labels contain non-ASCII characters that locale defaults may reject.
    with open(args.export_headers, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.DictWriter(outfile, FIELD_NAMES.keys())
        # FIELD_NAMES maps each column key to its label, so writing it as a
        # row produces the header line.
        writer.writerow(FIELD_NAMES)
        for session in converter.sessions:
            writer.writerows(session_rows(session))
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()