# get_exact_domains_mapping.py
import argparse
import os


def main():
    """Build the domain -> disciplines mapping and emit it.

    Reads the command-line arguments, collects the mappings from the corpus
    files under the input directory, and writes them to the output file (or
    prints them when no output file was given).
    """
    args = parse_arguments()

    # The parser requires -i, but it still accepts an empty string; stop here
    # instead of reporting the error and then processing an empty path anyway.
    if not args.input:
        print('Error: Corpora root directory must be selected!')
        return

    mappings = get_mappings(args.input)
    write_mappings(mappings, args.output)


def parse_arguments():
    """Parse the command-line options of the script.

    Returns:
        The parsed namespace with ``input`` (corpora root directory,
        mandatory) and ``output`` (tsv file to write, ``None`` if omitted).
    """
    cli = argparse.ArgumentParser(
        description='Generate domains mapping list for conllup corpora.',
    )
    cli.add_argument(
        '-o', '--output',
        help='output tsv file with domains mappings',
    )

    # A dedicated group gives the mandatory option its own heading in --help.
    mandatory = cli.add_argument_group('required arguments')
    mandatory.add_argument(
        '-i', '--input',
        required=True,
        help='corpora root directory',
    )

    return cli.parse_args()


def get_mappings(root_directory):
    """Collect the distinct disciplines values seen for every domain.

    Walks ``root_directory`` recursively and, for every file whose name ends
    in ``.conllup`` or ``.conllu``, reads its domain and disciplines metadata.

    Args:
        root_directory: Directory whose tree is searched for corpus files.

    Returns:
        A dict mapping each domain string to the list of distinct
        disciplines strings found for it, in order of first appearance.
    """
    mappings = {}
    for root, _dirs, files in os.walk(root_directory):
        for filename in files:
            if not filename.endswith(('.conllup', '.conllu')):
                continue
            filepath = os.path.join(root, filename)
            domain = get_domain(filepath)
            scientific_disciplines = get_disciplines(filepath)
            # Create the domain's list on first sight, then add only new values.
            known = mappings.setdefault(domain, [])
            if scientific_disciplines not in known:
                known.append(scientific_disciplines)
    return mappings


def get_domain(filepath):
    """Return the value of the first ``domain`` metadata line in a file.

    Args:
        filepath: Path of the corpus file to read.

    Returns:
        The stripped value, or ``''`` when the file has no such line.
    """
    # CoNLL-U files are UTF-8 by specification; do not rely on the locale.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Only '#' comment lines carry metadata; everything else is skipped.
            if is_segment(line) or not is_metadata(line):
                continue
            name, value = get_metadata(line)
            if name == 'domain':
                return value.strip()
    return ''


def get_disciplines(filepath):
    """Return the value of the first ``disciplines`` metadata line in a file.

    Args:
        filepath: Path of the corpus file to read.

    Returns:
        The stripped value, or ``''`` when the file has no such line.
    """
    # CoNLL-U files are UTF-8 by specification; do not rely on the locale.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Only '#' comment lines carry metadata; everything else is skipped.
            if is_segment(line) or not is_metadata(line):
                continue
            name, value = get_metadata(line)
            if name == 'disciplines':
                return value.strip()
    return ''


def is_segment(line):
    """Return True when ``line`` is non-empty and starts with a digit.

    Presumably this identifies token rows, which begin with a numeric ID in
    CoNLL-U -- confirm against the corpus format.
    """
    return bool(line) and line[0].isdigit()


def is_metadata(line):
    """Return True when ``line`` begins with the ``#`` comment marker."""
    return line.startswith('#')


def get_metadata(line):
    """Split a ``# name = value`` comment line into its name and value.

    Only the first ``=`` separates name from value, so the value may itself
    contain ``=``. Comment lines without any ``=`` (for example ``# newpar``)
    yield an empty value instead of raising IndexError.

    Args:
        line: A comment line, normally starting with ``#``.

    Returns:
        A ``(name, value)`` tuple with leading ``#`` characters removed from
        the name and surrounding whitespace removed from both parts.
    """
    # partition never fails: with no '=' present the value part is simply ''.
    name, _separator, value = line.partition('=')
    return name.lstrip('#').strip(), value.strip()


def write_mappings(mappings, outpath):
    """Write the mappings table to ``outpath``, or print it if none is given.

    Args:
        mappings: Dict of domain -> list of disciplines strings.
        outpath: Destination file path; a falsy value sends the table to
            standard output instead.
    """
    text = stats_to_string(mappings)
    if outpath:
        # Write UTF-8 explicitly so non-ASCII values do not depend on the
        # platform's locale encoding.
        with open(outpath, 'w', encoding='utf-8') as outfile:
            outfile.write(text)
    else:
        print(text)


def stats_to_string(mappings):
    """Render the mappings as tab-separated text with a header row.

    Domains are listed in sorted order, and the disciplines values of each
    domain are sorted as well, one ``domain<TAB>value`` row per pair. Rows
    are joined with newlines and there is no trailing newline.
    """
    out = ['CURLICAT Domain\tScientificDiscipline(s)']
    for domain in sorted(mappings):
        out.extend(f'{domain}\t{entry}' for entry in sorted(mappings[domain]))
    return '\n'.join(out)


# Run the command-line tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()