# to_raw.py - extract raw text from .conllu/.conllup files.
import argparse
import os


def main():
    """Validate the command-line arguments and run the extraction.

    Exits with a non-zero status (the message is written to stderr) as soon
    as one of the checks fails, instead of reporting the problem and then
    running the extraction on invalid arguments anyway.

    Raises:
        SystemExit: if the input or output argument is empty or does not
            name an existing directory.
    """
    args = parse_arguments()

    if not args.input:
        raise SystemExit('Error: Corpora root directory must be selected!')

    if not args.output:
        raise SystemExit('Error: Directory for raw data must be selected!')

    # os.walk() silently yields nothing for a missing path, so a mistyped
    # input directory would otherwise look like a successful empty run.
    if not os.path.isdir(args.input):
        raise SystemExit('Error: Selected input must be a directory!')

    if not os.path.isdir(args.output):
        raise SystemExit('Error: Selected output must be a directory!')

    create_raw_text_corpora(args.input, args.output)


def parse_arguments():
    """Parse the command line and return the resulting namespace.

    Two mandatory options are defined: ``-i/--input`` (corpora root
    directory) and ``-o/--output`` (output directory).

    Returns:
        argparse.Namespace with the attributes ``input`` and ``output``.
    """
    cli = argparse.ArgumentParser(
        description='Create raw text corpora from conllup files.',
    )
    # A dedicated group makes the help output list these options under
    # "required arguments" rather than among the optional ones.
    mandatory = cli.add_argument_group('required arguments')
    option_specs = (
        ('-i', '--input', 'corpora root directory'),
        ('-o', '--output', 'output directory (must exist)'),
    )
    for short_flag, long_flag, description in option_specs:
        mandatory.add_argument(short_flag, long_flag, help=description, required=True)
    return cli.parse_args()


def create_raw_text_corpora(conllu_root_directory, raw_text_directory):
    """Extract the text of every CoNLL-U file found below a root directory.

    The tree under ``conllu_root_directory`` is walked recursively. For each
    file ending in ``.conllup`` or ``.conllu`` the matching sub-directory is
    created under ``raw_text_directory`` and ``extract_text_and_save`` is
    called with the file path and that sub-directory.

    Args:
        conllu_root_directory: directory that is searched for input files.
        raw_text_directory: directory below which the output tree is created.
    """
    for current_dir, _subdirs, names in os.walk(conllu_root_directory):
        for name in names:
            if not name.endswith(('.conllup', '.conllu')):
                continue
            print('Extracting text from %s.' % name)
            source_path = os.path.join(current_dir, name)
            # Mirror the file's location relative to the input root so the
            # output tree has the same layout as the input tree.
            relative_path = os.path.relpath(source_path, conllu_root_directory)
            target_dir = os.path.dirname(os.path.join(raw_text_directory, relative_path))
            os.makedirs(target_dir, exist_ok=True)
            extract_text_and_save(source_path, target_dir)


def extract_text_and_save(filepath, raw_text_directory):
    """Write the raw text of each document in a CoNLL-U file to a ``.txt`` file.

    Only comment lines (as recognised by ``is_metadata``) are inspected:

    * ``newdoc id`` starts a new document; the text collected so far is
      saved as ``<document id>.txt`` in ``raw_text_directory``.
    * ``newpar id`` starts a new paragraph (a blank line in the output).
    * ``text`` contributes one sentence. Once any ``newpar id`` has been seen
      in the file, sentences inside a paragraph are joined with a single
      space; before that every sentence is put in a paragraph of its own.

    Text that appears before the first ``newdoc id`` (including files with no
    ``newdoc id`` at all) is saved under the input file's base name without
    its extension, instead of failing on an uninitialised buffer.

    Args:
        filepath: path of the CoNLL-U / CoNLL-U Plus file to read (UTF-8).
        raw_text_directory: existing directory that receives the ``.txt`` files.
    """
    # Deliberately never reset: once a file uses paragraph markers, the
    # space-joining mode stays on for all following documents in that file.
    paragraphs_specified = False
    # Fallback id for sentences that precede any "newdoc id" comment.
    document_id = os.path.splitext(os.path.basename(filepath))[0]
    document_text = ''
    # CoNLL-U files are UTF-8; do not depend on the platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if not is_metadata(line):
                continue
            name, value = get_metadata(line)
            if name == 'newdoc id':
                # Flush the previous document before starting the next one.
                _save_document(document_id, document_text, raw_text_directory)
                document_id = value
                document_text = ''
            elif name == 'newpar id':
                paragraphs_specified = True
                document_text += '\n\n'
            elif name == 'text':
                if not paragraphs_specified:
                    document_text += '\n\n%s' % value
                elif document_text.endswith('\n\n'):
                    # First sentence of a paragraph: no separating space.
                    document_text += value
                else:
                    document_text += ' %s' % value

    # Flush the last (or only) document of the file.
    _save_document(document_id, document_text, raw_text_directory)


def _save_document(document_id, document_text, raw_text_directory):
    """Save one document as ``<document_id>.txt`` unless its id or text is empty."""
    if document_text and document_id:
        raw_file_path = os.path.join(raw_text_directory, '%s.txt' % document_id)
        save_raw_text(document_text.strip(), raw_file_path)


def is_metadata(line):
    """Return True if ``line`` begins with ``#``, otherwise False."""
    return line.startswith('#')


def get_metadata(line):
    """Split a ``# name = value`` comment line into its name and value.

    The line is split at the first ``=`` only, so the value may itself
    contain ``=``. Leading ``#`` characters are removed from the name and
    both parts are stripped of surrounding whitespace.

    A comment without ``=`` (for example ``# newpar``) yields an empty value
    instead of raising ``IndexError``.

    Args:
        line: a comment line, normally one accepted by ``is_metadata``.

    Returns:
        A ``(name, value)`` tuple of strings.
    """
    # partition() returns empty strings for the separator and tail when '='
    # is absent, which covers value-less comments without a special case.
    name, _, value = line.partition('=')
    return name.lstrip('#').strip(), value.strip()


def save_raw_text(text, raw_file_path):
    """Write ``text`` to ``raw_file_path`` as UTF-8, replacing any existing file.

    Args:
        text: the string to write.
        raw_file_path: path of the file to create or overwrite.
    """
    # An explicit encoding keeps the output independent of the platform's
    # locale, which may be unable to represent non-ASCII corpus text.
    with open(raw_file_path, 'w', encoding='utf-8') as raw_file:
        raw_file.write(text)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()