# harmonize_metadata.py (8.96 KB)
import argparse
import json
import os


# Harmonized metadata field names, in the order copy_and_harmonize() writes
# them as '# Name = value' comments after each '# newdoc id' line.
METADATA_ORDER = [
    'Identifier',
    'Language',
    'Licence',
    'PublicationDate',
    'DocumentTitle',
    'ArticleTitle',
    'DocumentType',
    'Source',
    'Domain',
    'No_of_sentences',
    'No_of_words',
    'No_of_punctuation',
    'No_of_tokens',
    'Author',
    'SourceType',
    'Keywords',
    'Url',
    'Style',
    'Subdomain',
    'Issn_isbn_eisbn',
    'ScientificField',
    'EnTitle',
    'EnKeywords',
    'EnAbstract',
    'PublishingCompany',
    'IssueYear',
    'IssueVolume',
    'IssueNumber',
    'PageRange',
    'Reviewer',
    'Translator',
    'OriginalType',
]


def main():
    """Harmonize the corpus named on the command line and report its size.

    Parses the arguments, runs harmonize_metadata() on the input directory
    and prints the number of documents, sentences and tokens found.

    Raises:
        SystemExit: if the input path is not an existing directory.
    """
    args = parse_arguments()
    if not os.path.isdir(args.input):
        # Abort instead of carrying on: os.walk() yields nothing for a path
        # that is not a directory, so continuing would only write an empty
        # summary file.
        raise SystemExit('Error: Input must be a root corpora directory!')
    metadata_summary = harmonize_metadata(args.input, args.summary, args.output)
    sentences = sum(meta['No_of_sentences'] for meta in metadata_summary)
    tokens = sum(meta['No_of_tokens'] for meta in metadata_summary)
    # The totals used to be computed and then thrown away; report them.
    print(f'Documents: {len(metadata_summary)}, sentences: {sentences}, tokens: {tokens}')


def parse_arguments():
    """Parse the command line and return the resulting namespace.

    All three options are mandatory: -i/--input, -o/--output and -s/--summary.
    """
    cli = argparse.ArgumentParser(
        description='Harmonize metadata and write metadata summary to json file.')
    mandatory = cli.add_argument_group('required arguments')
    option_specs = (
        ('-i', '--input', 'corpora root directory'),
        ('-o', '--output', 'output directory'),
        ('-s', '--summary', 'path to json metadata summary file'),
    )
    for short_flag, long_flag, description in option_specs:
        mandatory.add_argument(short_flag, long_flag, help=description, required=True)
    return cli.parse_args()


def harmonize_metadata(root_directory, summary_json_path, harmonized_outpath):
    """Harmonize every .conllup/.conllu file found under root_directory.

    Each matching file is copied, with its metadata comments rewritten, to
    <harmonized_outpath>/<year>/<filename>, where <year> comes from
    get_year(). The collected metadata of all files is dumped as a JSON list
    to summary_json_path.

    Args:
        root_directory: directory that is walked recursively for input files.
        summary_json_path: path of the JSON summary file to write.
        harmonized_outpath: directory under which per-year folders are created.

    Returns:
        A list with one metadata dict per processed file.
    """
    metadata_summary = []
    for root, _dirs, files in os.walk(root_directory):
        for filename in files:
            if not filename.endswith(('.conllup', '.conllu')):
                continue
            src = os.path.join(root, filename)
            metadata = get_all_metadata(src)
            # get_year() falls back to the integer 0 when a file carries no
            # date comment; os.path.join() accepts only strings, so coerce.
            year = str(get_year(src))
            year_path = os.path.join(harmonized_outpath, year)
            os.makedirs(year_path, exist_ok=True)
            dst = os.path.join(year_path, filename)
            copy_and_harmonize(src, dst, metadata)
            metadata_summary.append(metadata)
    with open(summary_json_path, 'w', encoding='utf-8') as f:
        json.dump(metadata_summary, f, indent=4)
    return metadata_summary


# Source comment names whose non-empty value is copied verbatim under the
# harmonized key on the right.
_DIRECT_FIELDS = {
    'newdoc id': 'Identifier',
    'language': 'Language',
    'license': 'Licence',
    'title': 'DocumentTitle',
    'journal': 'Source',
    'domain': 'Domain',
    'authors': 'Author',
    'url': 'Url',
    'keywords': 'Keywords',
    'disciplines': 'Subdomain',
    'fields': 'ScientificField',
    'entitle': 'EnTitle',
    'enkeywords': 'EnKeywords',
    'enabstract': 'EnAbstract',
    'publishing_company': 'PublishingCompany',
    'issue_year': 'IssueYear',
    'issue_volume': 'IssueVolume',
    'issue_number': 'IssueNumber',
    'page_range': 'PageRange',
    'reviewers': 'Reviewer',
    'translators': 'Translator',
    'type': 'OriginalType',
}
# Source 'content_type' values and the harmonized 'DocumentType' they map to.
_DOCUMENT_TYPES = {'abstract': 'abstract', 'full_text': 'paper'}
# Date string length -> suffix that pads it to a 10-character date.
_DATE_SUFFIXES = {4: '-01-01', 7: '-01', 10: ''}
# Comment names that are recognised but contribute nothing to the metadata.
_IGNORED_COMMENTS = frozenset(('source', 'text', 'newpar id', 'global.columns'))


def get_all_metadata(filepath):
    """Collect harmonized metadata and token statistics from one corpus file.

    Token lines (see is_segment) increase 'No_of_tokens' and, depending on
    whether their fourth tab-separated column equals 'PUNCT', either
    'No_of_punctuation' or 'No_of_words'. Comment lines (see is_metadata) are
    translated to harmonized keys; every 'sent_id' comment increases
    'No_of_sentences'. Problems are reported with print() and do not abort.

    Args:
        filepath: path of the .conllu/.conllup file; read as UTF-8, the
            encoding the CoNLL-U format prescribes.

    Returns:
        A dict that always holds the four counters plus the defaults for
        'SourceType', 'Style' and 'Licence', and additionally every
        harmonized field for which the file provides a non-empty value.
    """
    metadata = {'No_of_sentences': 0,
                'No_of_words': 0,
                'No_of_punctuation': 0,
                'No_of_tokens': 0,
                'SourceType': 'Publishing House',
                'Style': 'scientific',
                'Licence': 'CC0'}
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line):
                metadata['No_of_tokens'] += 1
                pos = line.split('\t')[3]
                if pos == 'PUNCT':
                    metadata['No_of_punctuation'] += 1
                else:
                    metadata['No_of_words'] += 1
                continue
            if not is_metadata(line):
                continue

            name, value = get_metadata(line)
            if name == 'sent_id':
                metadata['No_of_sentences'] += 1
            elif name in _IGNORED_COMMENTS:
                continue
            elif name not in _DIRECT_FIELDS and name not in ('date', 'content_type'):
                print(f'Unknown metadata name: {name}.')
            elif not value:
                # A known field without a value used to be reported as an
                # unknown name; say what is actually wrong.
                print(f'Empty value for metadata name: {name}.')
            elif name == 'date':
                # Only the length is checked: YYYY, YYYY-MM and YYYY-MM-DD
                # are padded to a full date, anything else is rejected.
                suffix = _DATE_SUFFIXES.get(len(value))
                if suffix is None:
                    print(f'Something wrong with the publication date: {value}.')
                else:
                    metadata['PublicationDate'] = f'{value}{suffix}'
            elif name == 'content_type':
                if value in _DOCUMENT_TYPES:
                    metadata['DocumentType'] = _DOCUMENT_TYPES[value]
                else:
                    print(f'Unknown content_type value: {value}.')
            else:
                metadata[_DIRECT_FIELDS[name]] = value

    return metadata


def is_segment(line):
    """Return True when line is non-empty and its first character is a digit."""
    return bool(line) and line[0].isdigit()


def is_metadata(line):
    """Return True when line begins with '#', i.e. is a comment line."""
    return line.startswith('#')


def get_metadata(line):
    """Split a comment line of the form '# name = value' into its two parts.

    The line is split at the first '='. Leading '#' characters and
    surrounding whitespace are removed from the name, surrounding whitespace
    from the value.

    Args:
        line: a comment line, e.g. '# sent_id = 12'.

    Returns:
        A (name, value) tuple of strings. For a comment that contains no '='
        (such as a bare '# newpar') the value is the empty string; indexing
        the split result used to raise IndexError for such lines.
    """
    raw_name, _separator, raw_value = line.partition('=')
    return raw_name.lstrip('#').strip(), raw_value.strip()


def is_full_text(filepath):
    """Tell whether the file's first 'content_type' comment says 'full_text'.

    Args:
        filepath: path of the .conllu/.conllup file; read as UTF-8, the
            encoding the CoNLL-U format prescribes, instead of the
            platform's locale default.

    Returns:
        True if the first 'content_type' comment has the value 'full_text';
        False if it has any other value or the file has no such comment.
    """
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line) or not is_metadata(line):
                continue
            name, value = get_metadata(line)
            if name == 'content_type':
                # Only the first occurrence decides.
                return value.strip() == 'full_text'
    return False


def get_year(filepath):
    """Return the year part of the file's first 'date' comment.

    Args:
        filepath: path of the .conllu/.conllup file; read as UTF-8, the
            encoding the CoNLL-U format prescribes, instead of the
            platform's locale default.

    Returns:
        The text before the first '-' of the first 'date' value, as a string.
        If the file has no 'date' comment the integer 0 is returned; note
        that this fallback is an int while a found year is a str.
    """
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line) or not is_metadata(line):
                continue
            name, value = get_metadata(line)
            if name == 'date':
                return value.partition('-')[0]
    return 0


# Harmonized fields that may be absent without a 'Missing ...' warning.
_OPTIONAL_METADATA = frozenset((
    'Issn_isbn_eisbn', 'Translator', 'EnTitle', 'EnAbstract', 'IssueVolume',
    'ArticleTitle', 'Reviewer', 'EnKeywords', 'PageRange', 'Keywords',
    'IssueNumber',
))
# Comment lines that are copied through unchanged (besides '# newdoc id').
_PRESERVED_COMMENTS = frozenset(('sent_id', 'global.columns', 'newpar id', 'text'))


def copy_and_harmonize(src, dst, metadata):
    """Copy src to dst, replacing its metadata comments by harmonized ones.

    Non-comment lines and the comments 'sent_id', 'global.columns',
    'newpar id' and 'text' are copied (whitespace-stripped). Each
    '# newdoc id' line is copied and followed by one '# Name = value' comment
    per METADATA_ORDER entry present in metadata; absent entries that are not
    optional are reported with print(). All other comments are dropped.

    Both files are handled as UTF-8, the encoding the CoNLL-U format
    prescribes, and every output line is newline-terminated so that the
    blank line closing the last sentence of src survives the copy.

    Args:
        src: path of the file to read.
        dst: path of the file to write (overwritten if it exists).
        metadata: dict of harmonized field names to values for src.
    """
    harmonized_lines = []
    with open(src, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if not is_metadata(line):
                harmonized_lines.append(line)
                continue
            name, _value = get_metadata(line)
            if name == 'newdoc id':
                harmonized_lines.append(line)
                for meta_name in METADATA_ORDER:
                    if meta_name in metadata:
                        harmonized_lines.append(f'# {meta_name} = {metadata[meta_name]}')
                    elif meta_name not in _OPTIONAL_METADATA:
                        print(f'Missing {meta_name} in {src}.')
            elif name in _PRESERVED_COMMENTS:
                harmonized_lines.append(line)
            # Any other comment is an original metadata field; it is left
            # out because the harmonized block above supersedes it.

    with open(dst, 'w', encoding='utf-8') as dst_file:
        # Joining with '\n' used to drop the terminator of the last line.
        dst_file.writelines(f'{line}\n' for line in harmonized_lines)


if __name__ == '__main__':
    main()