# extra_crs.py 8.18 KB
#-*- coding:utf-8 -*-

import sys
from django.core.management.base import BaseCommand
from common.util import no_history, debug, uniopen
from dictionary.models import Lexeme, Vocabulary, CrossReference

# WARNING: broken! (since the cross-reference overhaul)
# Not sure whether it is worth fixing...

class Command(BaseCommand):
    """Management command for adding extra cross-references.

    The command is currently switched off: ``handle`` raises right away
    instead of running the import.
    """

    args = '<type> <input file>'
    help = 'Adds extra cross-references'

    def handle(self, type, input_file, **options):
        """Refuse to run, because the underlying code is stale.

        Raises:
            Exception: always, with the message 'stale code'.
        """
        # The real work would be add_crs(type, input_file); that call is
        # deliberately left disabled.
        raise Exception('stale code')


def connect_real(from_l, to_l, cr_type):
    """Create cross-references between two lexemes in the database.

    A reference is only created for a direction whose starting lexeme
    has source 'Morfologik'.  ``cr_type`` is indexed as
    (type of from_l -> to_l, type of to_l -> from_l).  A debug message
    is logged when at least one reference was newly created.
    """
    made_forward = False
    made_backward = False

    if from_l.source == 'Morfologik':
        made_forward = CrossReference.objects.get_or_create(
            from_lexeme=from_l, to_lexeme=to_l, type=cr_type[0])[1]

    if to_l.source == 'Morfologik':
        made_backward = CrossReference.objects.get_or_create(
            from_lexeme=to_l, to_lexeme=from_l, type=cr_type[1])[1]

    if not (made_forward or made_backward):
        return
    debug(from_l.entry, u'Powiązano z %s' % to_l.entry)


def connect_dummy(from_l, to_l, cr_type):
    """Log what connecting two lexemes would do, without writing anything.

    Looks up the cross-reference in each direction (``cr_type[0]`` for
    from_l -> to_l, ``cr_type[1]`` for to_l -> from_l).  If either one
    is absent, a debug message is logged, plus a second one when neither
    lexeme has source 'Morfologik'.
    """
    forward_missing = not CrossReference.objects.filter(
        from_lexeme=from_l, to_lexeme=to_l, type=cr_type[0])
    backward_missing = not CrossReference.objects.filter(
        from_lexeme=to_l, to_lexeme=from_l, type=cr_type[1])

    if not (forward_missing or backward_missing):
        return

    debug(from_l.entry, u'Powiązanoby z %s' % to_l.entry)
    sources = (from_l.source, to_l.source)
    if 'Morfologik' not in sources:
        debug(from_l.entry, u'Powiązanoby leksemy spoza Morfologika!')


# Implementation that add_crs calls to link two lexemes.  Bound to the
# dry-run variant, which only logs; rebind to connect_real to actually
# create the cross-references.
connect = connect_dummy


def make_detail(qs, desc, morf, details):
    """Append a count summary for ``qs`` to ``details`` if it is ambiguous.

    Nothing is appended unless ``qs`` holds more than one object.  The
    summary gives ``desc``, the total count, and how many of the objects
    have their pk in ``morf``.
    """
    if qs.count() <= 1:
        return
    from_morfologik = qs.filter(pk__in=morf).count()
    template = u'%s: %s, w tym z Morfologika: %s'
    details.append(template % (desc, qs.count(), from_morfologik))


def which_lacks(qs1, desc1, qs2, desc2):
    """Return the description of the queryset that has no matches.

    Intended to be called when at least one of the querysets is empty
    (that is how add_crs uses it, to build a 'Brak ...' message).

    Args:
        qs1: first queryset; only ``count()`` is used.
        desc1: description returned when ``qs1`` is the empty one.
        qs2: second queryset; only ``count()`` is used.
        desc2: description returned when ``qs2`` is the empty one.

    Returns:
        ``desc1`` if only ``qs1`` is empty, ``desc2`` if only ``qs2`` is
        empty, and u'obu' if both are empty.  When neither is empty the
        result (``desc1``) carries no meaning.
    """
    # The previous version returned the description of the queryset that
    # DID have matches, so the log named the wrong lexeme as missing.
    if qs2.count() > 0:
        # qs2 is present, so the missing side must be qs1.
        return desc1
    if qs1.count() > 0:
        # qs1 is present and qs2 is not.
        return desc2
    return u'obu'


# The ambiguity / missing-lexeme reporting that used to be copy-pasted into
# every mode lives in _pair_is_unique; each mode has its own private helper.
def _read_pairs(lines):
    """Yield the whitespace-separated fields of each non-blank line."""
    for line in lines:
        fields = line.split()
        if fields:
            yield fields


def _pair_is_unique(key, first, second, morf):
    """Return True if both querysets hold exactly one lexeme, else log why not.

    ``first`` and ``second`` are (queryset, count_label, lack_label)
    triples: ``count_label`` is passed to make_detail for the ambiguity
    message, ``lack_label`` to which_lacks for the 'Brak ...' message.
    ``key`` is the entry the debug message is filed under.
    """
    qs1, count_label1, lack_label1 = first
    qs2, count_label2, lack_label2 = second
    if qs1.count() > 1 or qs2.count() > 1:
        details = []
        make_detail(qs1, count_label1, morf, details)
        make_detail(qs2, count_label2, morf, details)
        debug(key, u'Niejednoznaczność: %s' % '; '.join(details))
        return False
    if qs1.count() == 0 or qs2.count() == 0:
        lack = which_lacks(qs1, lack_label1, qs2, lack_label2)
        debug(key, u'Brak %s' % lack)
        return False
    return True


def _link_negated_adverbs(pairs, adv, adj, morf):
    """Mode 'advnie': link each negated adverb to its non-negated adverb."""
    # NOTE(review): the type names say 'adj' although adverbs are linked
    # here -- kept exactly as in the original, confirm they are intended.
    cr_type = ('nieadj', 'adjnie')
    negated_labels = (u'zanegowane', u'zanegowanego')
    for advneg_e, base_e in pairs:
        advnegs = adv.filter(entry=advneg_e)
        advs = adv.filter(entry=base_e)
        if advs:
            # The base entry is itself an adverb: link to it directly.
            if _pair_is_unique(
                    advneg_e,
                    (advnegs,) + negated_labels,
                    (advs, u'niezanegowane', u'niezanegowanego'),
                    morf):
                connect(advnegs[0], advs[0], cr_type)
        elif adj.filter(entry=base_e):
            # The base entry is an adjective: go through the adverb it is
            # cross-referenced with.  Mode 'advadj' has to be run first!
            adjs = adj.filter(
                entry=base_e, refs_to__type='adjadv',
                refs_to__to_lexeme__deleted=False)
            adjs = adjs | adj.filter(
                entry=base_e, refs_from__type='advadj',
                refs_from__from_lexeme__deleted=False)
            adjs = adjs.distinct()
            if not _pair_is_unique(
                    advneg_e,
                    (advnegs,) + negated_labels,
                    (adjs, u'przymiotniki', u'przymiotnika'),
                    morf):
                continue
            base_adj = adjs[0]
            adv_pks = [cr.to_lexeme.pk for cr
                       in base_adj.refs_to.filter(type='adjadv')]
            adv_pks += [cr.from_lexeme.pk for cr
                        in base_adj.refs_from.filter(type='advadj')]
            advs = adv.filter(pk__in=adv_pks).distinct()
            if len(advs) > 1:
                details = []
                make_detail(advs, u'niezanegowane', morf, details)
                debug(advneg_e,
                      u'Niejednoznaczność: %s' % '; '.join(details))
            elif len(advs) == 0:
                debug(advneg_e, u'Brak niezanegowanego')
            else:
                connect(advnegs[0], advs[0], cr_type)
        else:
            debug(advneg_e,
                  u'Brak drugiego leksemu [przymiotnik lub przysłówek]')


def _link_adverbs_to_adjectives(pairs, adv, adj, morf):
    """Mode 'advadj': link each adverb to the adjective listed with it."""
    cr_type = ('advadj', 'adjadv')
    for adv_e, adj_e in pairs:
        advs = adv.filter(entry=adv_e, part_of_speech__symbol='adv')
        adjs = adj.filter(
            entry=adj_e, patterns__name__contains='').distinct()
        if _pair_is_unique(
                adv_e,
                (advs, u'przysłówki', u'przysłówka'),
                (adjs, u'przymiotniki', u'przymiotnika'),
                morf):
            connect(advs[0], adjs[0], cr_type)


def _link_adverb_comparatives(pairs, adv, advcom, morf):
    """Mode 'advcom': link each adverb to the 'advcom' lexeme listed with it."""
    cr_type = ('advcom', 'comadv')
    for com_e, adv_e in pairs:
        advs = adv.filter(entry=adv_e)
        coms = advcom.filter(entry=com_e)
        if _pair_is_unique(
                adv_e,
                (advs, u'równe', u'równego'),
                (coms, u'wyższe', u'wyższego'),
                morf):
            connect(advs[0], coms[0], cr_type)


def _link_adjectives_to_comparatives():
    """Mode 'adjadvc': link adjectives to the 'advcom' lexemes of their adverbs.

    Works purely from existing cross-references, so it is meant to be
    run after the other modes.
    """
    cr_type = ('adjadvc', 'advcadj')
    # Adverbs that are cross-referenced with an adjective ...
    advs = Lexeme.objects.filter(
        refs_to__type='advadj', refs_to__to_lexeme__deleted=False)
    advs = advs | Lexeme.objects.filter(
        refs_from__type='adjadv', refs_from__from_lexeme__deleted=False)
    # ... and also with an 'advcom' lexeme.
    advs_with_com = advs.filter(
        refs_to__type='advcom', refs_to__to_lexeme__deleted=False)
    advs_with_com = advs_with_com | advs.filter(
        refs_from__type='comadv', refs_from__from_lexeme__deleted=False)
    for adv_lexeme in advs_with_com.distinct():
        adjs = Lexeme.objects.filter(
            refs_to__type='adjadv', refs_to__to_lexeme=adv_lexeme,
            refs_to__to_lexeme__deleted=False)
        adjs = adjs | Lexeme.objects.filter(
            refs_from__type='advadj', refs_from__from_lexeme=adv_lexeme)
        adjs = adjs.distinct()
        advcoms = Lexeme.objects.filter(
            refs_to__type='comadv', refs_to__to_lexeme=adv_lexeme,
            refs_to__to_lexeme__deleted=False)
        advcoms = advcoms | Lexeme.objects.filter(
            refs_from__type='advcom', refs_from__from_lexeme=adv_lexeme)
        for adj_lexeme in adjs:
            for com_lexeme in advcoms:
                if not adj_lexeme.refs_to.filter(
                        to_lexeme=com_lexeme, type='adjadvc'):
                    connect(adj_lexeme, com_lexeme, cr_type)


def add_crs(type, path):
    """Add extra cross-references between lexemes, driven by an input file.

    Lexemes are linked through the module-level ``connect`` callable;
    problems (ambiguous or missing lexemes) are reported through
    ``debug`` and the offending line is skipped.

    Args:
        type: mode selector, one of 'advnie', 'advadj', 'advcom' or
            'adjadvc'.  Any other value does nothing.
        path: file opened with ``uniopen``.  In the first three modes
            every non-blank line must hold exactly two
            whitespace-separated entries; 'adjadvc' does not read it.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            fields.
    """
    no_history()
    morfologik = Vocabulary.objects.get(id='Morfologik')
    morf = morfologik.owned_lexemes_pk()
    lexemes = Lexeme.objects.filter(
        lexemeassociation__vocabulary__id='Morfologik')
    adv = lexemes.filter(part_of_speech__symbol__in=('adv', 'advndm'))
    adj = lexemes.filter(part_of_speech__symbol='adj')
    advcom = lexemes.filter(part_of_speech__symbol='advcom')
    input_file = uniopen(path)
    try:
        # Blank lines are skipped instead of crashing on tuple unpacking.
        pairs = _read_pairs(input_file)
        if type == 'advnie':
            _link_negated_adverbs(pairs, adv, adj, morf)
        elif type == 'advadj':
            _link_adverbs_to_adjectives(pairs, adv, adj, morf)
        elif type == 'advcom':
            _link_adverb_comparatives(pairs, adv, advcom, morf)
        elif type == 'adjadvc':  # to be run last
            _link_adjectives_to_comparatives()
    finally:
        # NOTE(review): assumes the object returned by uniopen is
        # file-like and exposes close() -- confirm against common.util.
        input_file.close()