# metadata.py (6.66 KB)
#
# Repository viewer links: Edit, Raw, Blame, History, Permalink

import argparse
import os


def main():
    """Run the script: parse the command line, count corpus statistics, report them.

    Exits with status 1 when the input directory argument is empty.
    """
    args = parse_arguments()

    if not args.input:
        # argparse guarantees that -i is present, but an empty value
        # (e.g. -i '') still gets through; stop here instead of walking
        # nothing and reporting an all-zero table.
        print('Error: Corpora root directory must be selected!')
        raise SystemExit(1)

    # Running totals plus one breakdown per metadata dimension; each breakdown
    # maps a metadata value to its own documents/paragraphs/sentences/segments
    # counters.
    statistics = {'documents': 0,
                  'paragraphs': 0,
                  'sentences': 0,
                  'segments': 0,
                  'by_year': {},
                  'by_type': {},
                  'by_content_type': {},
                  'by_licence': {}}
    count_corpora_stats(args.input, statistics)
    write_corpora_stats(statistics, args.output)


def parse_arguments():
    """Parse the command line of the script.

    Returns:
        The argparse namespace with ``input`` (mandatory corpora root
        directory) and ``output`` (optional path of the tsv report).
    """
    cli = argparse.ArgumentParser(description='Generate statistics for conllup corpora.')
    cli.add_argument('-o', '--output', help='output tsv file with collected statistics')

    # A dedicated group so that --help lists the mandatory option under its
    # own "required arguments" heading.
    mandatory = cli.add_argument_group('required arguments')
    mandatory.add_argument('-i', '--input', required=True, help='corpora root directory')

    return cli.parse_args()


def count_corpora_stats(root_directory, stats):
    """Accumulate statistics of every corpus file found below a directory.

    Walks ``root_directory`` recursively and hands each file whose name ends
    with ``.conllup`` or ``.conllu`` to ``count_document_statistics``, which
    updates ``stats`` in place.
    """
    corpus_suffixes = ('.conllup', '.conllu')
    for folder, _subfolders, names in os.walk(root_directory):
        for name in names:
            if not name.endswith(corpus_suffixes):
                continue
            print('Counting statistics for %s.' % name)
            count_document_statistics(os.path.join(folder, name), stats)


# Metadata names that describe the current document, mapped to the breakdown
# of `stats` they feed.
_DOCUMENT_FIELDS = {'Domain': 'by_type',
                    'DocumentType': 'by_content_type',
                    'Licence': 'by_licence'}

# Metadata names that mark the start of a countable unit, mapped to the
# counter they increment.
_UNIT_FIELDS = {'newpar id': 'paragraphs',
                'sent_id': 'sentences'}


def _new_counts():
    """Return a fresh, zeroed counter bucket for one breakdown value."""
    return {'documents': 0, 'paragraphs': 0, 'sentences': 0, 'segments': 0}


def _start_group(stats, current, group, key):
    """Make `key` the current value of `group` and count one document for it."""
    current[group] = key
    stats[group].setdefault(key, _new_counts())['documents'] += 1


def _count_unit(stats, current, unit):
    """Add one `unit` to the total and to every breakdown whose value is known."""
    stats[unit] += 1
    for group, key in current.items():
        # A breakdown value is unknown (None) until its metadata line has been
        # read; indexing the breakdown with it would raise KeyError.
        if key is not None:
            stats[group][key][unit] += 1


def count_document_statistics(filepath, stats):
    """Add the statistics of one corpus file to `stats` (modified in place).

    The file is read line by line as UTF-8:

    * a line accepted by ``is_segment`` counts as one segment;
    * a ``PublicationDate`` metadata line starts a new document, counted in
      the total and under its year, and forgets the previous ``Domain``;
    * ``Domain``, ``DocumentType`` and ``Licence`` metadata lines set the
      current value of the type / content type / licence breakdown and count
      one document for that value;
    * ``newpar id`` and ``sent_id`` metadata lines count one paragraph /
      sentence.

    Paragraphs, sentences and segments are added to the total and to each
    breakdown whose current value has already been seen in this file (an
    empty value counts as seen); breakdowns without a value yet are skipped.

    Args:
        filepath: path of the corpus file to read.
        stats: statistics dictionary with the counters ``documents``,
            ``paragraphs``, ``sentences``, ``segments`` and the breakdown
            dictionaries ``by_year``, ``by_type``, ``by_content_type`` and
            ``by_licence``.
    """
    current = {'by_year': None,
               'by_type': None,
               'by_content_type': None,
               'by_licence': None}
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line):
                _count_unit(stats, current, 'segments')
            elif is_metadata(line):
                name, value = get_metadata(line)
                if name == 'PublicationDate':
                    # New document: its domain is unknown until a Domain
                    # line follows.
                    current['by_type'] = None
                    _start_group(stats, current, 'by_year', get_year(value))
                    stats['documents'] += 1
                elif name in _DOCUMENT_FIELDS:
                    _start_group(stats, current, _DOCUMENT_FIELDS[name], value)
                elif name in _UNIT_FIELDS:
                    _count_unit(stats, current, _UNIT_FIELDS[name])


def is_segment(line):
    """Return True when `line` is non-empty and its first character is a digit."""
    # NOTE(review): presumably matches token rows, which start with a numeric
    # ID in CoNLL-U - confirm that range and decimal IDs should count as well.
    return bool(line) and line[0].isdigit()


def is_metadata(line):
    """Return True when `line` begins with the ``#`` marker, False otherwise."""
    return line.startswith('#')


def get_metadata(line):
    """Split a ``# name = value`` metadata line into its name and value.

    The line is split at the first ``=`` only, so the value may itself
    contain ``=``. Leading ``#`` characters and surrounding whitespace are
    removed from the name; surrounding whitespace is removed from the value.

    Args:
        line: a metadata line (one starting with ``#``).

    Returns:
        A ``(name, value)`` tuple of strings. For a line without ``=``
        (e.g. a bare ``# newpar``) the value is the empty string.
    """
    # partition() always yields three parts, so a line without '=' gives an
    # empty value instead of failing on a missing list element.
    raw_name, _separator, raw_value = line.partition('=')
    return raw_name.lstrip('#').strip(), raw_value.strip()


def get_year(date):
    """Return the part of `date` before its first ``-`` (all of it if there is none)."""
    year, _dash, _rest = date.partition('-')
    return year


def write_corpora_stats(stats, outpath):
    """Write the statistics report to a file, or print it when no file is given.

    Args:
        stats: statistics dictionary as rendered by ``stats_to_string``.
        outpath: path of the report file; when empty or None the report is
            printed to standard output instead.
    """
    # Render before opening the file so that a failure while formatting does
    # not leave an existing report truncated.
    report = stats_to_string(stats)
    if outpath:
        # Explicit encoding: metadata values may contain non-ASCII text and
        # the platform default encoding is not guaranteed to handle it.
        with open(outpath, 'w', encoding='utf-8') as outfile:
            outfile.write(report)
    else:
        print(report)


def stats_to_string(stats):
    """Render the collected statistics as tab-separated text.

    The report starts with a header row and a ``Total`` row, followed by one
    section per breakdown (year, type, content type, licence). Years are
    listed in descending order of the year value; the rows of the other
    sections are listed in descending order of their document count.

    Args:
        stats: statistics dictionary with the total counters and the
            ``by_year``, ``by_type``, ``by_content_type`` and ``by_licence``
            breakdowns.

    Returns:
        The report as a single string with lines joined by newlines (no
        trailing newline).
    """
    counters = ('documents', 'paragraphs', 'sentences', 'segments')

    def format_row(label, counts):
        # One table row: the label followed by the four counters.
        return '%s\t%d\t%d\t%d\t%d' % ((label,) + tuple(counts[name] for name in counters))

    def by_label(entry):
        return entry[0]

    def by_documents(entry):
        return entry[1]['documents']

    # (section heading, breakdown key in stats, sort key), in output order.
    sections = (('\nBy year:', 'by_year', by_label),
                ('\nBy type:', 'by_type', by_documents),
                ('\nBy content type:', 'by_content_type', by_documents),
                ('\nBy licence:', 'by_licence', by_documents))

    report = ['\tDocuments\tParagraphs\tSentences\tSegments',
              format_row('Total', stats)]
    for heading, breakdown, sort_key in sections:
        report.append(heading)
        ranked = sorted(stats[breakdown].items(), key=sort_key, reverse=True)
        report.extend(format_row(label, counts) for label, counts in ranked)

    return '\n'.join(report)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()