# auto_derivatives.py 7.8 KB
# -*- coding: utf-8 -*-
from django.db.models import Max
from dictionary.models import Lexeme, LexemeInflectionPattern, \
    Gender, LexemeAttributeValue, LexemeAttribute, REVERSE_CR_TYPE
from patterns.models import Pattern, Ending

# Characters that guess_osc() treats as vowels when looking at the letter
# preceding a final "i".
VOWELS = u'aeiouyąęó'

# Part-of-speech symbols; this is the same set of derivative types that
# create_derivative() marks with NO_POPRZ / NO_ZLOZ.
ADJ_POS = ('ppas', 'appas', 'pact', 'adjcom', 'nieadj')

# NOTE: everything below is looked up through the ORM while this module is
# being imported, so importing the module needs a populated database.
# Presumably a missing row makes the import fail (standard Django .get()
# behaviour) -- confirm before relying on it.

# Inflection patterns assigned by create_derivative() to new derivatives,
# selected by the derivative's part of speech (and, for ppas/appas and ger,
# by the suffix of the generated form).
Ppact = Pattern.objects.get(name='P2')  # hardcoded pattern
Pppas_ty = Pattern.objects.get(name='P4t')
Pppas_ny_ni = Pattern.objects.get(name='P4')
Pppas_ony_eni = Pattern.objects.get(name='P4no')
Pppas_iony_ieni = Pattern.objects.get(name='P4noi')
Pger_nie = Pattern.objects.get(name='C1n')
Pger_cie = Pattern.objects.get(name='C1c')
Posc = Pattern.objects.get(name='D1ć0+i')
Pcom = Pattern.objects.get(name='P4zs')
Pndm = Pattern.objects.get(name='ndm')
# Genders set on the inflection pattern of 'ger' (n2) and 'osc' (f)
# derivatives.
n2 = Gender.objects.get(symbol='n2')
f = Gender.objects.get(symbol='f')
# Attributes that create_derivative() never copies from the source lexeme.
ATTR_POPRZ = LexemeAttribute.objects.get(name=u'forma poprz.')
ATTR_ZLOZ = LexemeAttribute.objects.get(name=u'forma złoż.')
# The u'nieobecna' ("absent") values of the two attributes above; they are
# attached to derivatives whose part of speech is one of the ADJ_POS symbols.
NO_POPRZ = LexemeAttributeValue.objects.get(
    value=u'nieobecna', attribute__name=u'forma poprz.')
NO_ZLOZ = LexemeAttributeValue.objects.get(
    value=u'nieobecna', attribute__name=u'forma złoż.')


def ppas_data(lips, pos='ppas'):
    """Yield descriptions of 'verppas' derivatives for inflection patterns.

    For every item of ``lips`` the endings of its pattern labelled '10' and
    '12' are fetched, and one dict is produced for each (label-10,
    label-12) pair of endings.

    :param lips: iterable of objects exposing ``pattern``, ``root`` and
        ``index`` (LexemeInflectionPattern instances in this module).
    :param pos: value stored under the ``'pos'`` key of every result.
    :return: generator of dicts with the keys ``pos``, ``cr_type``,
        ``entry`` (root + label-10 ending + 'y'), ``pl`` (root + label-12
        ending) and ``index``.
    """
    for lip in lips:
        label10_endings = Ending.objects.filter(
            base_form_label__symbol='10', pattern=lip.pattern)
        label12_endings = Ending.objects.filter(
            base_form_label__symbol='12', pattern=lip.pattern)
        # Cartesian product of both ending sets; the inner queryset is only
        # touched when there is at least one label-10 ending.
        for end10 in label10_endings:
            for end12 in label12_endings:
                yield dict(
                    pos=pos,
                    cr_type='verppas',
                    entry=lip.root + end10.string + 'y',
                    pl=lip.root + end12.string,
                    index=lip.index)


def pact_data(lips):
    """Yield descriptions of 'verpact' derivatives for inflection patterns.

    :param lips: iterable of objects exposing ``pattern``, ``root`` and
        ``index`` (LexemeInflectionPattern instances in this module).
    :return: generator of dicts with the keys ``pos`` (always 'pact'),
        ``cr_type``, ``entry`` (root + label-3 ending + 'cy') and ``index``;
        one dict per ending of the pattern labelled '3'.
    """
    for lip in lips:
        label3_endings = Ending.objects.filter(
            base_form_label__symbol='3', pattern=lip.pattern)
        for end3 in label3_endings:
            yield dict(
                pos='pact',
                cr_type='verpact',
                entry=lip.root + end3.string + 'cy',
                index=lip.index)


def ger_data(lips):
    """Yield descriptions of 'verger' derivatives for inflection patterns.

    :param lips: iterable of objects exposing ``pattern``, ``root`` and
        ``index`` (LexemeInflectionPattern instances in this module).
    :return: generator of dicts with the keys ``pos`` (always 'ger'),
        ``cr_type``, ``entry`` (root + label-11 ending + 'ie') and
        ``index``; one dict per ending of the pattern labelled '11'.
    """
    for lip in lips:
        label11_endings = Ending.objects.filter(
            base_form_label__symbol='11', pattern=lip.pattern)
        for end11 in label11_endings:
            yield dict(
                pos='ger',
                cr_type='verger',
                entry=lip.root + end11.string + 'ie',
                index=lip.index)


def guess_osc(s):
    """Guess the "-ość" form derived from the entry ``s``.

    The stem is obtained from ``s`` by suffix rules, then u'ość' is appended:

    * final "i" after a vowel (see VOWELS) -> the "i" becomes "j";
    * final "i" after "g", "k" or "l" -> the "i" is dropped;
    * any other final "i" -> kept;
    * final "y" -> dropped;
    * final "ek" / "en" -> the "e" is dropped;
    * final "ój" / "ów" -> the "ó" becomes "o";
    * anything else -> unchanged.

    Strings too short to have a letter before the final "i" (the empty
    string and u'i') no longer raise IndexError; they are used as the stem
    unchanged.

    :param s: the base entry (unicode string).
    :return: the guessed derivative, always ending in u'ość'.
    """
    last = s[-1:]
    if last == u'i' and len(s) > 1:
        before_last = s[-2]
        if before_last in VOWELS:
            base = s[:-1] + u'j'
        elif before_last in u'gkl':
            base = s[:-1]
        else:
            base = s
    elif last == u'y':
        base = s[:-1]
    elif s[-2:] in (u'ek', u'en'):
        # drop the "e" but keep the final consonant
        base = s[:-2] + s[-1]
    elif s[-2:] in (u'ój', u'ów'):
        base = s[:-2] + u'o' + s[-1]
    else:
        base = s
    return base + u'ość'


def make_negation(s):
    """Return ``s`` with the negating prefix "nie" put in front.

    When the first character of ``s`` is a lowercase letter the prefix is
    glued on directly; otherwise it is joined with a hyphen.

    :param s: non-empty unicode string.
    :return: u'nie' + s or u'nie-' + s.
    """
    prefix = u'nie' if s[0].islower() else u'nie-'
    return prefix + s


def lexeme_derivatives(lexeme):
    """Yield descriptions of the derivatives that ``lexeme`` may have.

    Nothing is yielded when the lexeme has no inflection patterns or when
    its part of speech is neither 'v' nor 'adj'.

    For 'v' lexemes (only those with a u'właściwy' attribute value of ''
    or '(Q)'):

    * ppas_data() results, with pos 'ppas' when u'przechodniość' is 'T' and
      'appas' when it is only 'qT';
    * pact_data() results when there is an u'aspekt' value other than 'dk';
    * ger_data() results, always.

    For 'adj' lexemes: 'adjcom', 'adv' and 'advcom' descriptions with
    ``entry`` set to None, then an 'osc' description built with guess_osc()
    and an 'adj' description built with make_negation(); all with index 1.

    :param lexeme: the source Lexeme.
    :return: generator of dicts with the keys ``pos``, ``cr_type``,
        ``entry`` and ``index`` (plus ``pl`` for ppas_data() results).
    """
    lips = list(lexeme.lexemeinflectionpattern_set.all())
    if not lips:
        return

    pos_symbol = lexeme.part_of_speech.symbol

    if pos_symbol == 'v':
        if not lexeme.lexemeattributevalue_set.filter(
                attribute__name=u'właściwy', value__in=('', '(Q)')):
            return
        # 'T' takes precedence over 'qT'; the 'qT' query is only evaluated
        # when there is no 'T' value.
        if lexeme.lexemeattributevalue_set.filter(
                attribute__name=u'przechodniość', value='T'):
            participle_pos = 'ppas'
        elif lexeme.lexemeattributevalue_set.filter(
                attribute__name=u'przechodniość', value='qT'):
            participle_pos = 'appas'
        else:
            participle_pos = None
        if participle_pos is not None:
            for item in ppas_data(lips, participle_pos):
                yield item
        not_dk = lexeme.lexemeattributevalue_set.filter(
            attribute__name=u'aspekt').exclude(value='dk')
        if not_dk:
            for item in pact_data(lips):
                yield item
        for item in ger_data(lips):
            yield item
        return

    if pos_symbol == 'adj':
        # adjcom, adv, advcom, osc, nieadj
        entryless = (
            ('adjcom', 'adjcom'),
            ('adv', 'adjadv'),
            ('advcom', 'adjadvc'),
        )
        for pos, cr_type in entryless:
            yield dict(pos=pos, cr_type=cr_type, entry=None, index=1)
        yield dict(
            pos='osc', cr_type='adjosc',
            entry=guess_osc(lexeme.entry), index=1)
        yield dict(
            pos='adj', cr_type='adjnie',
            entry=make_negation(lexeme.entry), index=1)


def _ppas_pattern(entry, pl):
    """Choose the inflection pattern of a ppas/appas derivative by suffix."""
    # ``pl`` is optional in create_derivative(); without it only the -ty and
    # the default -ny/-ni patterns can be recognised.
    plural = pl or u''
    # -ty/-ci
    if entry.endswith('ty'):
        return Pppas_ty
    # -iony/-eni
    if entry.endswith('iony') and plural and not plural.endswith('ieni'):
        return Pppas_iony_ieni
    # -ony/-eni (recognised by the plural form: the entry itself always ends
    # in "y", so testing the entry for "eni" could never succeed)
    if entry.endswith('ony') and plural.endswith('eni'):
        return Pppas_ony_eni
    # -ny/-ni
    return Pppas_ny_ni


def create_derivative(lexeme, part_of_speech, cr_type, entry, index, pl=None):
    """Create a derivative of ``lexeme`` and link both with cross-references.

    The new lexeme inherits status, owner vocabulary, ``specialist`` and
    borrowing source from ``lexeme``, gets its inflection pattern(s),
    applicable attribute values and qualifiers, and is cross-referenced
    with ``lexeme`` in both directions.

    * For a negation (``cr_type`` ending in 'nie') every inflection pattern
      of ``lexeme`` is copied to the derivative together with its
      qualifiers.
    * Otherwise a single inflection pattern with index 1 is created; its
      pattern (and gender for 'ger' / 'osc') depends on ``part_of_speech``
      and, for 'ppas' / 'appas' / 'ger', on the suffix of the forms. The
      qualifiers of the source inflection pattern number ``index`` are
      copied to it.

    When ``cr_type`` is 'adjosc' or 'adjadv' the negation of the new
    derivative is created as well (recursively, with cr_type 'adjnie').

    :param lexeme: the source Lexeme.
    :param part_of_speech: part-of-speech symbol of the derivative (used as
        ``part_of_speech_id``).
    :param cr_type: cross-reference type symbol from ``lexeme`` to the
        derivative; must be a key of REVERSE_CR_TYPE.
    :param entry: entry (base form) of the derivative.
    :param index: index of the source inflection pattern whose qualifiers
        are copied (ignored for negations).
    :param pl: plural form used to tell the 'ppas' / 'appas' patterns apart;
        may be omitted for other parts of speech.
    :return: list of the created lexemes, the derivative first.
    """
    negation = cr_type.endswith('nie')
    # The aggregate is None when there are no lexemes at all.
    # NOTE(review): max + 1 is not atomic -- two concurrent calls can compute
    # the same id.
    max_id = Lexeme.all_objects.aggregate(Max('id'))['id__max']
    next_id = (max_id or 0) + 1
    der = Lexeme.objects.create(
        id=next_id, entry=entry, part_of_speech_id=part_of_speech,
        status=lexeme.status, owner_vocabulary_id=lexeme.owner_vocabulary_id,
        specialist=lexeme.specialist,
        borrowing_source_id=lexeme.borrowing_source_id)
    der.fix_homonym_number()
    lexeme.owner_vocabulary.add_lexeme(der)
    if not negation:
        lip = LexemeInflectionPattern(lexeme=der, index=1)
        if part_of_speech in ('ppas', 'appas'):
            lip.pattern = _ppas_pattern(entry, pl)
        elif part_of_speech == 'pact':
            lip.pattern = Ppact
        elif part_of_speech == 'ger':
            lip.gender = n2
            if entry.endswith('cie'):
                lip.pattern = Pger_cie
            else:  # -nie
                lip.pattern = Pger_nie
        elif part_of_speech == 'osc':
            lip.pattern = Posc
            lip.gender = f
        elif part_of_speech == 'adjcom':
            lip.pattern = Pcom
        elif part_of_speech in ('adv', 'advcom'):
            lip.pattern = Pndm
        lip.root = lip.get_root()
        lip.save()
        orig_lip = LexemeInflectionPattern.objects.get(
            lexeme=lexeme, index=index)
        # maybe the inflection-pattern qualifiers should be copied to the
        # lexeme instead?
        for q in orig_lip.qualifiers.all():
            lip.qualifiers.add(q)  # der instead of lip?
    else:
        # A negation inflects exactly like its base: copy every pattern.
        for orig_lip in lexeme.lexemeinflectionpattern_set.all():
            lip = LexemeInflectionPattern(
                lexeme=der, index=orig_lip.index, pattern=orig_lip.pattern,
                gender=orig_lip.gender)
            lip.root = lip.get_root()
            lip.save()
            for q in orig_lip.qualifiers.all():
                lip.qualifiers.add(q)
    # Copy the attribute values that apply to the derivative's part of
    # speech, except the two attributes handled separately below.
    for attr, attr_val in lexeme.attributes_values():
        if attr not in (ATTR_POPRZ, ATTR_ZLOZ) and attr_val \
                and attr.parts_of_speech.filter(
                    symbol=part_of_speech).exists():
            attr_val.add_lexeme(der)
    if part_of_speech in ADJ_POS:
        NO_POPRZ.add_lexeme(der)
        NO_ZLOZ.add_lexeme(der)
    for q in lexeme.qualifiers.all():
        der.qualifiers.add(q)
    lexeme.add_cross_reference(der, cr_type)
    der.add_cross_reference(lexeme, REVERSE_CR_TYPE[cr_type])
    der.refresh_data()
    new_lexemes = [der]
    if cr_type in ('adjosc', 'adjadv'):
        # These derivatives get their own negated counterpart.
        new_lexemes.extend(create_derivative(
            der, part_of_speech, 'adjnie', make_negation(entry),
            index))
    return new_lexemes