# odt2tei.py (2.82 KB)
"Convert scanned ODT to TEI"
import argparse
import csv
import logging
from typing import Iterable
from odt2tei.converter import Converter

# Maps the row keys produced by session_rows() to the column titles that
# main() writes as the first row of the exported CSV. The key order of this
# dict is also the column order, because main() passes the keys to
# csv.DictWriter as its field names.
FIELD_NAMES = {
    "committee": "Komisja",
    "number": "Numer",
    "date": "Data",
    "original_file": "Plik",
    "original_index": "Numer w pliku",
    "first": "Pierwsza komisja",
    "session": "Identyfikator",
    "error": "Błąd",
}


def parse_options(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command line and configure the root logger.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is what the original
            zero-argument call did.

    Returns:
        The parsed namespace with the attributes ``output``, ``debug``,
        ``force``, ``verbose``, ``stop_on_error``, ``export_headers`` and
        ``filename`` (a list with at least one entry).

    Side effects:
        Calls ``logging.basicConfig`` with level DEBUG when a debug category
        is given, INFO when ``--verbose`` is set, and WARNING otherwise.
    """
    parser = argparse.ArgumentParser(description="Convert ODT documents to PCC TEI")
    parser.add_argument(
        "-o",
        "--output",
        default="output",
        metavar="<folder>",
        help="folder to save data",
    )
    parser.add_argument(
        "-d",
        "--debug",
        default="",
        # The option takes one of the category names below, not a folder.
        metavar="<category>",
        help="category to debug",
        choices=[
            "",
            "lines",
            "odt",
            "spaceout",
            "whitespace",
            "punctuation",
            "merge",
            "illegible",
            "speakers",
            "comments",
            "split",
        ],
    )
    parser.add_argument("-f", "--force", action="store_true", help="save invalid files")
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="print correct files"
    )
    parser.add_argument(
        "-s", "--stop-on-error", action="store_true", help="stop on error"
    )
    parser.add_argument(
        "-e", "--export-headers", metavar="<file>", help="export headers to CSV"
    )
    parser.add_argument("filename", nargs="+", help="folder or filename")
    args = parser.parse_args(argv)

    # A debug category takes precedence over --verbose.
    if args.debug:
        loglevel = logging.DEBUG
    elif args.verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING
    logging.basicConfig(format="%(message)s", level=loglevel)
    return args


def session_rows(session: dict) -> Iterable:
    """Yield one CSV row dict per committee listed in ``session["sessions"]``.

    Every row carries the same session-level values (date, the two
    colon-separated parts of ``original_file``, the optional ``file_id`` and
    an ``"x"`` error marker when ``session["valid"]`` is falsy). Only the
    first yielded row has ``"first"`` set to ``"x"``.
    """
    date = session["date"]
    source_parts = session["original_file"].split(":")
    shared = {
        "date": date,
        "original_file": source_parts[0],
        "original_index": source_parts[1],
        "session": session.get("file_id"),
        "error": "x" if not session["valid"] else "",
    }
    for position, committee in enumerate(session["sessions"]):
        per_committee = {
            "first": "" if position else "x",
            "committee": committee["name"],
            "number": committee["sessionNo"],
        }
        # Committee columns come first; the shared columns are appended.
        yield {**per_committee, **shared}


def main():
    """Run the conversion for every path given on the command line.

    Parses the options, passes each ``filename`` argument to
    ``Converter.convert`` and, when ``--export-headers`` is given, writes one
    CSV row per committee of every entry in ``converter.sessions`` to that
    file, preceded by the header row taken from ``FIELD_NAMES``.
    """
    args = parse_options()

    converter = Converter(args)
    for filename in args.filename:
        converter.convert(filename)
    if args.export_headers:
        # The csv module requires newline="" so it controls line endings
        # itself (otherwise blank rows appear on Windows). UTF-8 is set
        # explicitly so the non-ASCII header labels can always be written,
        # whatever the locale's default encoding is.
        with open(
            args.export_headers, "w", newline="", encoding="utf-8"
        ) as outfile:
            csvfile = csv.DictWriter(outfile, list(FIELD_NAMES))
            # FIELD_NAMES maps each field key to its column title, so writing
            # it as a row produces the header line.
            csvfile.writerow(FIELD_NAMES)
            for session in converter.sessions:
                csvfile.writerows(session_rows(session))


# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()