# check_morfologik.py
#-*- coding:utf-8 -*-

from django.core.management.base import BaseCommand
from dictionary.models import Lexeme


class Command(BaseCommand):
    """Management command that runs the Morfologik import check.

    Takes two positional arguments: a lexical class symbol and the path of
    the input file (the ``args`` usage string states the same in Polish).
    """

    help = 'Check Morfologik import'
    args = '<symbol części mowy> <nazwa pliku wejściowego>'

    def handle(self, lc_sym, input_file, **options):
        """Run the check for lexical class *lc_sym* on *input_file*.

        Extra keyword *options* are accepted and ignored.
        """
        check_morfologik(lc_sym=lc_sym, input_file=input_file)

# Suffix table used by get_forms() to expand verb base forms into full forms.
#
# Keys are (base form label, tag) pairs; each value is a '|'-separated list
# of suffixes appended to every base form carrying that label (an empty
# alternative means the base form is used unchanged).  get_forms() applies
# an entry only when its tag is active for the lexeme being checked.
#
# Original author's note, translated: "there are no q* at the moment
# anyway..." (presumably inflection characteristics whose symbol starts
# with 'q' -- confirm).
v_forms = {
    # 'allq' -- always active.
    ('1', 'allq'): u'',
    ('5', 'allq'): u'',
    ('8', 'allq'): u'o|oby',

    # 'all' -- active unless the inflection characteristic starts with 'q'.
    ('1', 'all'): u'cie|my|sz',
    ('2', 'all'): u'',
    ('3', 'all'): u'',
    ('4', 'all'): u'|że|my|myż|cie|cież',
    ('6', 'all'): u'|by|byś|bym',
    ('7', 'all'): u'em|eś',
    ('8', 'all'): (u'a|aby|abyś|abym|am|aś|'
                   u'obym|obyś|om|oś|'
                   u'y|yby|ybyście|ybyśmy|yście|yśmy'),
    ('9', 'all'): u'i|iby|ibyście|ibyśmy|iście|iśmy',
    ('10', 'all'): u'o',

    # 'ndk' / 'dk' -- derived from the inflection characteristic symbol.
    ('3', 'ndk'): u'c',
    ("6'", 'dk'): u'szy',

    # 'pact' / 'ppas' / 'ger' -- active when the lexeme has the matching
    # verpact / verppas / verger cross-reference; get_forms() additionally
    # generates 'nie'-prefixed variants for these.
    ('3', 'pact'): u'ca|cą|ce|cego|cej|cemu|cy|cych|cym|cymi',
    ('10', 'ppas'): u'a|ą|e|ego|ej|emu|y|ych|ym|ymi',
    ('12', 'ppas'): u'',
    ('11', 'ger'): u'ie|ia|iach|iami|iem|iom|iu',
    ('11pg', 'ger'): u'',
}


def _stored_forms(l, lc_sym):
    """Return the forms stored for non-verb lexeme *l* as a set."""
    forms = set(l.lexemeform_set.values_list('form', flat=True))
    if lc_sym != 'adj':
        return forms
    # Adjectives also take the forms of the lexeme linked through an
    # 'adjnie' cross-reference (only the first such reference is used).
    neg = l.refs_to.filter(type__symbol='adjnie')
    if neg:
        l_neg = neg[0].to_lexeme
        neg_forms = l_neg.lexemeform_set.values_list('form', flat=True)
        # NOTE(review): this pattern parses as "^0" OR "3\+$"; if
        # "^(0|3\+)$" was intended it needs fixing -- left unchanged here.
        added_forms = l_neg.all_forms(label_filter=r'^0|3\+$')
        forms.update(form for form in neg_forms if form not in added_forms)
    return forms


def _verb_forms(l):
    """Build the forms of verb lexeme *l* from its patterns and v_forms."""
    tags = ['allq']
    # Participle/gerund tags are enabled by the matching cross-reference.
    for ref_symbol, tag in (('verpact', 'pact'),
                            ('verppas', 'ppas'),
                            ('verger', 'ger')):
        if l.refs_to.filter(type__symbol=ref_symbol):
            tags.append(tag)

    # Fetched once and reused for both the characteristic and the endings.
    lips = l.lexemeinflectionpattern_set.all()
    if not lips:
        return set()

    # Only the first pattern's inflection characteristic decides the tags.
    ic = lips[0].inflection_characteristic.symbol
    if not ic.startswith('q'):
        tags.append('all')
        if 'ndk' in ic:
            tags.append('ndk')
        # Strip 'ndk' first so that it is not mistaken for 'dk'.
        if 'dk' in ic.replace('ndk', ''):
            tags.append('dk')

    # Map every base form label to the base forms (root + ending) it yields.
    base_forms = {}
    for lip in lips:
        for ending in lip.pattern.endings.all():
            label = ending.base_form_label.symbol
            base_forms.setdefault(label, set()).add(lip.root + ending.string)

    l_forms = set()
    for (label, tag), suffixes in v_forms.items():
        if tag not in tags or label not in base_forms:
            continue
        new_forms = set(
            base_form + suffix
            for base_form in base_forms[label]
            for suffix in suffixes.split('|'))
        l_forms |= new_forms
        # Participles and gerunds also get their 'nie'-prefixed variants.
        if tag in ('pact', 'ppas', 'ger'):
            l_forms |= set('nie' + form for form in new_forms)
    return l_forms


def get_forms(l, lc_sym):
    """Return the set of forms the dictionary yields for lexeme *l*.

    For verbs (``lc_sym == 'v'``) the forms are generated from the lexeme's
    inflection patterns combined with the ``v_forms`` suffix table; an empty
    set is returned when the lexeme has no inflection pattern.  For every
    other lexical class the stored forms are returned, and for adjectives
    (``'adj'``) they are extended with forms of the lexeme referenced by an
    'adjnie' cross-reference, minus those matched by the label filter.

    Args:
        l: lexeme object (accessed through its related managers).
        lc_sym: lexical class symbol, e.g. 'v', 'adj' or 'subst'.

    Returns:
        A set of form strings.
    """
    if lc_sym == 'v':
        return _verb_forms(l)
    return _stored_forms(l, lc_sym)


def _write_line(text):
    """Write *text* plus a newline to standard output, encoded as UTF-8."""
    import sys
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts byte strings directly.
    stream = getattr(sys.stdout, 'buffer', sys.stdout)
    stream.write(text.encode('utf-8') + b'\n')


def _forms_match(l, lc_sym, forms, l_forms):
    """Return True if *forms* are an accepted match for lexeme *l*."""
    if forms == l_forms:
        return True
    if lc_sym == 'subst':
        m1_lips = l.lexemeinflectionpattern_set.filter(
            inflection_characteristic__symbol='m1')
        # 'm1' lexemes whose comment mentions 'formę depr' may lack exactly
        # the forms labelled pl:nom on the input side.
        if m1_lips and u'formę depr' in l.comment:
            if forms | l.all_forms(label_filter='^pl:nom$') == l_forms:
                return True
        # Lexemes flagged in their comment, or owned by another vocabulary,
        # also match when the input equals just their sg: forms.
        if (u'rozszerzone singulare' in l.comment
                or u'rozszerzyć sgtant' in l.comment
                or l.owner_vocabulary.id != 'Morfologik'):
            if forms == l.all_forms(label_filter='^sg:'):
                return True
    elif lc_sym == 'adj':
        # NOTE: an earlier, commented-out version applied this only to
        # lexemes whose comment contained u' -o'.
        if forms | l.all_forms(label_filter='^0$') == l_forms:
            return True
    return False


def check_forms(lc_sym, forms):
    """Compare one group of input forms with the matching lexemes.

    The first element of *forms* is used as the entry.  Every lexeme in the
    'Morfologik' vocabulary with that entry and lexical class is tried; if
    none matches, the entry is printed followed by one ``missing|extra``
    line per lexeme (both lists sorted and comma separated).

    Nothing is printed when a lexeme matches or when a lexeme yields no
    forms at all.  An empty *forms* sequence is ignored.

    Args:
        lc_sym: lexical class symbol used to select lexemes.
        forms: sequence of form strings; the first one is the entry.
    """
    if not forms:
        # Nothing to compare (e.g. two consecutive blank lines in the input).
        return
    entry = forms[0]
    forms = set(forms)
    # The query already restricts the lexical class, so no per-lexeme
    # re-check of part_of_speech is needed.
    morf_lexemes = Lexeme.objects.filter(
        lexemeassociation__vocabulary__id='Morfologik', entry=entry,
        part_of_speech__lexical_class__symbol=lc_sym)

    candidate_forms = []
    for l in morf_lexemes:
        l_forms = get_forms(l, lc_sym)
        if not l_forms:
            return  # a lexeme without forms is not reported as a mismatch
        if _forms_match(l, lc_sym, forms, l_forms):
            return
        candidate_forms.append(l_forms)

    # No lexeme matched: report the entry and the differences per lexeme.
    _write_line(entry)
    for l_forms in candidate_forms:
        missing = ', '.join(sorted(forms - l_forms))
        extra = ', '.join(sorted(l_forms - forms))
        _write_line('%s|%s' % (missing, extra))


def check_morfologik(lc_sym, input_file):
    """Run check_forms() on every group of forms listed in *input_file*.

    The file is read as UTF-8 text.  Each non-empty line must have the
    shape ``form<TAB>tag``; only the form is used.  A blank line ends the
    current group.  Empty groups are skipped, and a final group that is not
    followed by a blank line is checked as well.

    Args:
        lc_sym: lexical class symbol passed through to check_forms().
        input_file: path of the file to read.

    Raises:
        ValueError: if a non-empty line does not consist of exactly two
            tab-separated fields.
    """
    import io  # local import keeps this function self-contained

    forms = []
    with io.open(input_file, encoding='utf-8') as lines:
        for line in lines:
            line = line.rstrip('\r\n')
            if line:
                form, _tag = line.split('\t')
                forms.append(form)
            elif forms:
                check_forms(lc_sym, forms)
                forms = []
    # Flush the last group when the file has no trailing blank line.
    if forms:
        check_forms(lc_sym, forms)