check_ispell.py 1.07 KB

Edit Raw Blame History

# -*- coding: utf-8 -*-

from django.core.management.base import BaseCommand
from common.util import uniprint, uniopen
from dictionary.models import Lexeme


class Command(BaseCommand):
    """Management command that runs ``check_ispell`` on a single input file."""

    # Placeholder naming the one positional argument the command expects.
    args = '<input file>'
    # No help text is defined for this command.
    help = ''

    def handle(self, input_file, **options):
        """Delegate to ``check_ispell``; ``options`` are accepted but unused."""
        check_ispell(input_file)


def inc_count(d, key):
    """Increment the counter stored under ``key`` in ``d`` in place.

    A key that is not yet present is treated as having a count of zero,
    so its first increment stores 1.  Nothing is returned.
    """
    previous = d.get(key, 0)
    d[key] = 1 + previous


def dict_repr(d):
    """Render a mapping as ``key:value`` pairs joined by ``|``.

    Pairs appear in the mapping's own iteration order; an empty mapping
    yields an empty string.
    """
    # ``items()`` exists on both Python 2 and Python 3 dicts, whereas
    # ``iteritems()`` is Python 2 only.
    return '|'.join('%s:%s' % (key, value) for key, value in d.items())


def check_ispell(input_file):
    """Print, per flag string, counts describing the matching lexemes.

    Every line of ``input_file`` is split at its first ``/`` into an entry
    and a flag string.  Lexemes whose ``entry`` equals that entry and whose
    status is not ``'cand'`` are tallied under the flag string, once by
    part-of-speech symbol and once by inflection characteristic.  After
    the whole file is read, one line per flag string is printed in the
    form ``flags=<pos counts>,<inflection characteristic counts>``.

    Raises:
        ValueError: if a line contains no ``/`` separator.
    """
    results = {}
    for line in uniopen(input_file):
        entry, flags = line.strip().split('/', 1)
        lexemes = Lexeme.objects.filter(entry=entry).exclude(status='cand')
        # A flag string gets a result row even when no lexeme matches.
        counts = results.setdefault(flags, {'pos': {}, 'ics': {}})
        for lexeme in lexemes:
            ics = lexeme.lip_data()['inflection_characteristics']
            inc_count(counts['pos'], lexeme.part_of_speech.symbol)
            inc_count(counts['ics'], ics)
    for flags, res in results.items():
        # The key must be 'ics', matching the dict built above; the former
        # 'ic' lookup raised KeyError before anything was printed.
        uniprint(
            '%s=%s,%s' % (flags, dict_repr(res['pos']), dict_repr(res['ics'])))