# extra_crs.py 8.77 KB
#-*- coding:utf-8 -*-

import sys
from django.core.management.base import BaseCommand
from common.util import no_history
from dictionary.models import Lexeme, Vocabulary, CrossReference

# NOTE: broken! (since the cross-reference overhaul)
# Not sure whether it is worth fixing...

class Command(BaseCommand):
    """Management command intended to add extra cross-references.

    Currently a no-op: ``handle`` returns before it reaches ``add_crs``.
    """
    help = 'Adds extra cross-references'
    args = '<type> <input file>'

    def handle(self, type, input_file, **options):
        """Entry point of the command; does nothing at the moment.

        type -- name of the pass that would be forwarded to ``add_crs``
        input_file -- path that would be forwarded to ``add_crs``
        """
        # Short-circuited on purpose (the module is flagged as broken in its
        # header comment); the call below is kept so that re-enabling the
        # command only requires removing this early return.
        return
        add_crs(type, input_file)


def debug(entry, text):
    """Print a ``<entry>: <text>`` diagnostic line, UTF-8 encoded, to stderr."""
    message = u'%s: %s' % (entry, text)
    print >> sys.stderr, message.encode('utf-8')


def connect_real(from_l, to_l, cr_type):
    """Create the pair of cross-references linking two lexemes.

    from_l, to_l -- the lexemes to link
    cr_type -- indexable pair: ``cr_type[0]`` is the type of the reference
        from_l -> to_l, ``cr_type[1]`` the type of the reverse reference

    A direction is only written when the lexeme it starts from has
    ``source == 'Morfologik'``.  A message is logged through ``debug`` when
    at least one reference was actually created (i.e. did not exist yet).
    """
    any_created = False
    # (origin, target, index into cr_type) -- forward direction first.
    directions = ((from_l, to_l, 0), (to_l, from_l, 1))
    for origin, target, index in directions:
        if origin.source == 'Morfologik':
            _, created = CrossReference.objects.get_or_create(
                from_lexeme=origin, to_lexeme=target, type=cr_type[index])
            any_created = any_created or created
    if any_created:
        debug(from_l.entry, u'Powiązano z %s' % to_l.entry)


def connect_dummy(from_l, to_l, cr_type):
    """Report which links are missing without writing anything.

    from_l, to_l -- the lexemes that would be linked
    cr_type -- indexable pair: ``cr_type[0]`` is the type of the reference
        from_l -> to_l, ``cr_type[1]`` the type of the reverse reference

    Only queries ``CrossReference`` and logs through ``debug``: one message
    when either direction is absent, plus a second one when neither lexeme
    has ``source == 'Morfologik'``.
    """
    forward_refs = CrossReference.objects.filter(
        from_lexeme=from_l, to_lexeme=to_l, type=cr_type[0])
    forward_missing = not forward_refs
    backward_refs = CrossReference.objects.filter(
        from_lexeme=to_l, to_lexeme=from_l, type=cr_type[1])
    backward_missing = not backward_refs

    if not (forward_missing or backward_missing):
        # Both directions already exist -- nothing to report.
        return

    debug(from_l.entry, u'Powiązanoby z %s' % to_l.entry)
    sources = (from_l.source, to_l.source)
    if 'Morfologik' not in sources:
        debug(from_l.entry, u'Powiązanoby leksemy spoza Morfologika!')


# Linking strategy called by add_crs. It is bound to the report-only variant,
# so nothing is written; rebind it to connect_real to actually create the
# cross-references.
connect = connect_dummy


def make_detail(qs, desc, morf, details):
    """Append an ambiguity summary for ``qs`` to ``details`` if it is ambiguous.

    qs -- queryset-like object providing ``count()`` and ``filter(pk__in=...)``
    desc -- label placed at the start of the summary
    morf -- collection of primary keys, used to count how many of the
        candidates belong to it
    details -- list that receives the summary (mutated in place)

    Nothing is appended when ``qs`` holds at most one element.
    """
    # Count once: the same number is used for the test and for the message,
    # which also avoids a second COUNT query.
    total = qs.count()
    if total <= 1:
        return
    morf_count = qs.filter(pk__in=morf).count()
    details.append(
        u'%s: %s, w tym z Morfologika: %s' % (desc, total, morf_count))


def which_lacks(qs1, desc1, qs2, desc2):
    """Return the description of whichever queryset is empty.

    qs1, qs2 -- queryset-like objects providing ``count()``
    desc1, desc2 -- descriptions matching ``qs1`` and ``qs2`` respectively

    Returns ``desc1`` when only ``qs1`` is empty, ``desc2`` when only ``qs2``
    is empty and ``u'obu'`` when both are.  Callers format the result as
    "Brak %s" ("missing %s"), so the description must name the side that is
    absent, not the one that was found.  If neither is empty, ``desc2`` is
    returned.
    """
    first_present = qs1.count() > 0
    second_present = qs2.count() > 0
    if not first_present and not second_present:
        return u'obu'
    if not first_present:
        return desc1
    return desc2


# Lots of copy-paste, but probably not worth refactoring.
def add_crs(type, path):
    """Find lexeme pairs to cross-reference and hand them to ``connect``.

    Candidates are looked up among lexemes associated with the 'Morfologik'
    vocabulary.  A pair is passed to the module-level ``connect`` callable
    only when each side resolves to exactly one lexeme; ambiguous or missing
    matches are reported through ``debug`` instead.

    type -- which pass to run:
        'advnie'  -- each line holds two entries; the first is matched
                     against adverbs (the negated form, judging by the
                     names used), the second against adverbs and, failing
                     that, against adjectives already linked to an adverb
        'advadj'  -- each line holds an adverb entry and an adjective entry
        'advcom'  -- each line holds an entry matched against 'advcom'
                     lexemes, followed by an adverb entry
        'adjadvc' -- derived purely from existing cross-references; the
                     file is opened but never read
        Any other value opens the file and does nothing else.
    path -- input file; every line is decoded as UTF-8 and must split on
        whitespace into exactly two tokens, otherwise the tuple unpacking
        raises ValueError.
    """
    # NOTE(review): presumably switches off history tracking -- confirm
    # against common.util.
    no_history()
    morfologik = Vocabulary.objects.get(id='Morfologik')
    # Passed to make_detail as the pk__in filter; presumably the primary keys
    # of lexemes owned by the vocabulary -- confirm.
    morf = morfologik.owned_lexemes_pk()
    lexemes = Lexeme.objects.filter(
        lexemeassociation__vocabulary__id='Morfologik')
    # Base querysets narrowed by part-of-speech symbol.
    adv = lexemes.filter(part_of_speech__symbol__in=('adv', 'advndm'))
    adj = lexemes.filter(part_of_speech__symbol='adj')
    advcom = lexemes.filter(part_of_speech__symbol='advcom')
    with open(path) as file:
        if type == 'advnie':
            cr_type = ('nieadj', 'adjnie')
            for line in file:
                advneg_e, base_e = line.strip().decode('utf-8').split()
                advnegs = adv.filter(entry=advneg_e)
                # Case 1: the base entry exists as an adverb.
                if adv.filter(entry=base_e):
                    advs = adv.filter(entry=base_e)
                    if advnegs.count() > 1 or advs.count() > 1:
                        details = []
                        make_detail(advnegs, u'zanegowane', morf, details)
                        make_detail(advs, u'niezanegowane', morf, details)
                        debug(advneg_e,
                            u'Niejednoznaczność: %s' % '; '.join(details))
                    elif advnegs.count() == 0 or advs.count() == 0:
                        lack = which_lacks(
                            advnegs, u'zanegowanego', advs, u'niezanegowanego')
                        debug(advneg_e, u'Brak %s' % lack)
                    else:
                        connect(advnegs[0], advs[0], cr_type)
                # Case 2: the base entry exists as an adjective.
                elif adj.filter(entry=base_e):
                    # The 'advadj' pass has to be run first!
                    # Keep only adjectives linked to a non-deleted lexeme by
                    # an 'adjadv' reference they own or an 'advadj' reference
                    # pointing at them.
                    adjs = adj.filter(
                        entry=base_e, refs_to__type='adjadv',
                        refs_to__to_lexeme__deleted=False)
                    adjs = adjs | adj.filter(
                        entry=base_e, refs_from__type='advadj',
                        refs_from__from_lexeme__deleted=False)
                    adjs = adjs.distinct()
                    if advnegs.count() > 1 or adjs.count() > 1:
                        details = []
                        make_detail(advnegs, u'zanegowane', morf, details)
                        make_detail(adjs, u'przymiotniki', morf, details)
                        debug(advneg_e,
                            u'Niejednoznaczność: %s' % '; '.join(details))
                    elif advnegs.count() == 0 or adjs.count() == 0:
                        lack = which_lacks(
                            advnegs, u'zanegowanego', adjs, u'przymiotnika')
                        debug(advneg_e, u'Brak %s' % lack)
                    else:
                        # Collect the primary keys of lexemes linked to the
                        # single adjective (both reference directions), then
                        # keep those that are in the adverb queryset.
                        advs = [cr.to_lexeme.pk for cr
                            in adjs[0].refs_to.filter(type='adjadv')]
                        advs += [cr.from_lexeme.pk for cr
                            in adjs[0].refs_from.filter(type='advadj')]
                        advs = adv.filter(pk__in=advs).distinct()
                        if len(advs) > 1:
                            details = []
                            make_detail(advs, u'niezanegowane', morf, details)
                            debug(advneg_e,
                                u'Niejednoznaczność: %s' % '; '.join(details))
                        elif len(advs) == 0:
                            debug(advneg_e, u'Brak niezanegowanego')
                        else:
                            connect(advnegs[0], advs[0], cr_type)
                else:
                    debug(advneg_e,
                        u'Brak drugiego leksemu [przymiotnik lub przysłówek]')
        elif type == 'advadj':
            cr_type = ('advadj', 'adjadv')
            for line in file:
                adv_e, adj_e = line.strip().decode('utf-8').split()
                # Only symbol 'adv' here; 'advndm' lexemes are excluded.
                advs = adv.filter(entry=adv_e, part_of_speech__symbol='adv')
                # NOTE(review): an empty `contains` matches any name, so this
                # presumably only forces the join on patterns (keeping
                # adjectives that have at least one pattern) -- confirm.
                adjs = adj.filter(entry=adj_e,
                    patterns__name__contains='').distinct()
                if advs.count() > 1 or adjs.count() > 1:
                    details = []
                    make_detail(advs, u'przysłówki', morf, details)
                    make_detail(adjs, u'przymiotniki', morf, details)
                    debug(adv_e, u'Niejednoznaczność: %s' % '; '.join(details))
                elif advs.count() == 0 or adjs.count() == 0:
                    lack = which_lacks(advs, u'przysłówka', adjs,
                        u'przymiotnika')
                    debug(adv_e, u'Brak %s' % lack)
                else:
                    connect(advs[0], adjs[0], cr_type)
        elif type == 'advcom':
            cr_type = ('advcom', 'comadv')
            for line in file:
                com_e, adv_e = line.strip().decode('utf-8').split()
                advs = adv.filter(entry=adv_e)
                coms = advcom.filter(entry=com_e)
                if advs.count() > 1 or coms.count() > 1:
                    details = []
                    make_detail(advs, u'równe', morf, details)
                    make_detail(coms, u'wyższe', morf, details)
                    debug(adv_e, u'Niejednoznaczność: %s' % '; '.join(details))
                elif advs.count() == 0 or coms.count() == 0:
                    lack = which_lacks(advs, u'równego', coms, u'wyższego')
                    debug(adv_e, u'Brak %s' % lack)
                else:
                    connect(advs[0], coms[0], cr_type)
        elif type == 'adjadvc': # to be run last
            cr_type = ('adjadvc', 'advcadj')
            # Ugh!
            # Lexemes linked by 'advadj'/'adjadv' references to a non-deleted
            # lexeme...
            advs = Lexeme.objects.filter(
                refs_to__type='advadj', refs_to__to_lexeme__deleted=False)
            advs = advs | Lexeme.objects.filter(
                refs_from__type='adjadv', refs_from__from_lexeme__deleted=False)
            # ...narrowed to those that also have an 'advcom'/'comadv' link.
            advs_with_com = advs.filter(
                refs_to__type='advcom', refs_to__to_lexeme__deleted=False)
            advs_with_com = advs_with_com | advs.filter(
                refs_from__type='comadv', refs_from__from_lexeme__deleted=False)
            advs = advs_with_com.distinct()
            # NOTE: the loop variables below (adv, adj, advcom) rebind the
            # names of the querysets built at the top of the function; those
            # querysets are not used again in this branch.
            for adv in advs:
                # Lexemes linked to this one by 'adjadv' / 'advadj'.
                adjs = Lexeme.objects.filter(
                    refs_to__type='adjadv', refs_to__to_lexeme=adv,
                    refs_to__to_lexeme__deleted=False)
                adjs = adjs | Lexeme.objects.filter(
                    refs_from__type='advadj', refs_from__from_lexeme=adv)
                adjs = adjs.distinct()
                # Lexemes linked to this one by 'comadv' / 'advcom'.
                advcoms = Lexeme.objects.filter(
                    refs_to__type='comadv', refs_to__to_lexeme=adv,
                    refs_to__to_lexeme__deleted=False)
                advcoms = advcoms | Lexeme.objects.filter(
                    refs_from__type='advcom', refs_from__from_lexeme=adv)
                # Connect every pair that has no 'adjadvc' reference yet.
                for adj in adjs:
                    for advcom in advcoms:
                        if not adj.refs_to.filter(to_lexeme=advcom,
                                type='adjadvc'):
                            connect(adj, advcom, cr_type)