# harmonize_licences.py (3.88 KB)
import argparse
import json
import os


# Maps the verbose (Polish) licence names found in the corpus metadata to
# their short Creative Commons identifiers.
LICENCES_MAP = {
    # Attribution
    'CC BY - Creative Commons Uznanie Autorstwa 4.0':
        'CC BY 4.0',
    'CC BY - Creative Commons Uznanie Autorstwa 3.0 PL':
        'CC BY 3.0 PL',
    # Attribution + ShareAlike / NonCommercial, version 4.0
    'CC BY-SA Creative Commons Uznanie Autorstwa - Na tych samych warunkach 4.0':
        'CC BY-SA 4.0',
    'CC BY-NC Creative Commons Uznanie Autorstwa - Użycie niekomercyjne 4.0':
        'CC BY-NC 4.0',
    # Attribution + ShareAlike / NonCommercial, version 3.0 PL
    'CC BY-SA Creative Commons Uznanie Autorstwa - Na tych samych warunkach 3.0 PL':
        'CC BY-SA 3.0 PL',
    'CC BY-NC Creative Commons Uznanie Autorstwa - Użycie niekomercyjne 3.0 PL':
        'CC BY-NC 3.0 PL',
    # Attribution + NonCommercial + ShareAlike
    'CC BY-NC-SA Creative Commons Uznanie autorstwa - Użycie niekomercyjne - Na tych samych warunkach 4.0':
        'CC BY-NC-SA 4.0',
    'CC BY-NC-SA Creative Commons Uznanie autorstwa - Użycie niekomercyjne - Na tych samych warunkach 3.0 PL':
        'CC BY-NC-SA 3.0 PL',
}


def main():
    """Parse the command line and harmonize the licences of a corpora tree.

    Raises:
        SystemExit: when ``--input`` is not an existing directory. The
            message is printed to stderr and the process exits with
            status 1.
    """
    args = parse_arguments()
    if not os.path.isdir(args.input):
        # Stop here: continuing would walk a non-directory, process nothing
        # and still report success through a zero exit status.
        raise SystemExit('Error: Input must be a root corpora directory!')
    harmonize_licenses(args.input, args.output)


def parse_arguments():
    """Build the command-line parser and return the parsed arguments.

    Both ``-i/--input`` (corpora root directory) and ``-o/--output``
    (output directory) are mandatory.
    """
    cli = argparse.ArgumentParser(description='Harmonize licences.')
    mandatory = cli.add_argument_group('required arguments')
    options = (
        ('-i', '--input', 'corpora root directory'),
        ('-o', '--output', 'output directory'),
    )
    for short_flag, long_flag, help_text in options:
        mandatory.add_argument(short_flag, long_flag, help=help_text, required=True)
    return cli.parse_args()


def harmonize_licenses(root_directory, harmonized_outpath):
    """Copy every ``.conllup``/``.conllu`` file under a tree with a harmonized licence.

    Files whose ``DocumentType`` is not ``paper`` get the licence ``CC0``;
    the others get the short form of their licence from ``LICENCES_MAP``.

    All copies are written directly into *harmonized_outpath* (the
    sub-directory structure is not reproduced), so files that share a name
    in different sub-directories overwrite one another.

    Args:
        root_directory: directory that is walked recursively.
        harmonized_outpath: directory receiving the copies; created on
            demand when the first matching file is found.
    """
    for root, _dirs, files in os.walk(root_directory):
        for filename in files:
            if not filename.endswith(('.conllup', '.conllu')):
                continue
            src = os.path.join(root, filename)
            if is_full_text(src):
                original = get_licence(src)
                licence = LICENCES_MAP.get(original)
                if licence is None:
                    # An unmapped (or already harmonized) licence must not
                    # abort the whole run with a KeyError: keep it as is.
                    print(f'Warning: unmapped licence {original!r} in {src}; keeping it unchanged')
                    licence = original
            else:
                licence = 'CC0'
            os.makedirs(harmonized_outpath, exist_ok=True)
            dst = os.path.join(harmonized_outpath, filename)
            copy_and_harmonize(src, dst, licence)


def get_licence(filepath):
    """Return the value of the first ``Licence`` metadata line of a file.

    Args:
        filepath: path of the CoNLL-U(-Plus) file, read as UTF-8.

    Returns:
        The licence text, or the string ``'0'`` when the file has no
        ``# Licence = ...`` line.
    """
    # Read explicitly as UTF-8: the licence names contain non-ASCII
    # characters and the platform default encoding may differ.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Comments without '=' (e.g. '# newdoc') hold no name/value pair.
            if not is_metadata(line) or '=' not in line:
                continue
            name, value = get_metadata(line)
            if name == 'Licence':
                return value
    return '0'


def is_segment(line):
    """Return True when *line* is non-empty and its first character is a digit."""
    # Slicing keeps the empty string safe: ''.isdigit() is False.
    return line[:1].isdigit()


def is_metadata(line):
    """Return True when *line* starts with the ``#`` comment marker."""
    return line[:1] == '#'


def get_metadata(line):
    """Split a ``# name = value`` metadata line into its name and value.

    Only the first ``=`` separates name from value, so the value may itself
    contain ``=``. A line without any ``=`` (e.g. ``# newdoc``) yields an
    empty value instead of raising ``IndexError``.

    Args:
        line: a comment line, already stripped of surrounding whitespace.

    Returns:
        A ``(name, value)`` tuple; both parts are whitespace-stripped and the
        leading ``#`` characters are removed from the name.
    """
    raw_name, _, raw_value = line.partition('=')
    name = raw_name.lstrip('#').strip()
    value = raw_value.strip()
    return name, value


def is_full_text(filepath):
    """Tell whether the file's first ``DocumentType`` metadata value is ``paper``.

    Args:
        filepath: path of the CoNLL-U(-Plus) file, read as UTF-8.

    Returns:
        True when the first ``# DocumentType = ...`` line has the value
        ``paper``; False for any other value or when no such line exists.
    """
    # Read explicitly as UTF-8 rather than the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Comments without '=' (e.g. '# newdoc') hold no name/value pair.
            if not is_metadata(line) or '=' not in line:
                continue
            name, value = get_metadata(line)
            if name == 'DocumentType':
                return value.strip() == 'paper'
    return False


def copy_and_harmonize(src, dst, licence):
    """Copy *src* to *dst*, replacing every ``Licence`` metadata line.

    Each line is stripped of surrounding whitespace; lines whose metadata
    name is ``Licence`` are rewritten as ``# Licence = <licence>``. Every
    output line is newline-terminated, so the blank line that closes the
    last sentence of the source is kept.

    Args:
        src: path of the source file, read as UTF-8.
        dst: path of the file to write (UTF-8); overwritten if it exists.
        licence: licence text to put on the ``Licence`` line(s).
    """
    harmonized_lines = []
    with open(src, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Comments without '=' (e.g. '# newdoc') have no name to inspect.
            if is_metadata(line) and '=' in line and get_metadata(line)[0] == 'Licence':
                line = f'# Licence = {licence}'
            harmonized_lines.append(line)

    # The source is read completely before the destination is opened, so
    # src and dst may be the same path.
    with open(dst, 'w', encoding='utf-8') as dst_file:
        # Terminate every line; joining with '\n' would drop the final
        # terminator and with it the trailing blank line.
        dst_file.writelines(f'{line}\n' for line in harmonized_lines)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()