# extra_crs.py 7.43 KB
#-*- coding:utf-8 -*-

import sys
from django.core.management.base import BaseCommand, CommandError
from common.util import no_history
from dictionary.models import Lexeme, Vocabulary, CrossReference

# WARNING: broken! (since the overhaul of cross-references)
# Not sure whether it is worth fixing...

class Command(BaseCommand):
  """Management command intended to add extra cross-references.

  The command is currently switched off: ``handle`` returns before doing
  any work (the file header marks this script as broken).
  """
  help = 'Adds extra cross-references'
  args = '<type> <input file>'

  def handle(self, type, input_file, **options):
    """Return immediately; the ``add_crs`` call below is never reached."""
    # Deliberately disabled -- everything after this return is dead code.
    return
    add_crs(type, input_file)

def debug(entry, text):
  """Write ``<entry>: <text>`` to standard error, encoded as UTF-8."""
  message = u'%s: %s' % (entry, text)
  print >>sys.stderr, message.encode('utf-8')

def connect_real(from_l, to_l, cr_type):
  """Create the pair of cross-references linking two lexemes.

  ``cr_type[0]`` is the type of the reference from ``from_l`` to ``to_l``
  and ``cr_type[1]`` the type of the reverse one. A direction is created
  (through ``get_or_create``) only when the lexeme it starts from has
  ``source == 'Morfologik'``. A debug line is emitted when at least one
  reference was newly created.
  """
  forward_created = False
  backward_created = False
  if from_l.source == 'Morfologik':
    # get_or_create returns (object, created); only the flag matters here.
    forward_created = CrossReference.objects.get_or_create(
      from_lexeme=from_l, to_lexeme=to_l, type=cr_type[0])[1]
  if to_l.source == 'Morfologik':
    backward_created = CrossReference.objects.get_or_create(
      from_lexeme=to_l, to_lexeme=from_l, type=cr_type[1])[1]
  if not (forward_created or backward_created):
    return
  debug(from_l.entry, u'Powiązano z %s' % to_l.entry)

def connect_dummy(from_l, to_l, cr_type):
  """Dry-run variant of connecting two lexemes: report, create nothing.

  Looks up the reference ``from_l -> to_l`` of type ``cr_type[0]`` and the
  reverse reference of type ``cr_type[1]``. If either is absent, a debug
  line says the lexemes would be linked; a second line is emitted when
  neither lexeme has ``source == 'Morfologik'``.
  """
  forward_missing = not CrossReference.objects.filter(
    from_lexeme=from_l, to_lexeme=to_l, type=cr_type[0])
  backward_missing = not CrossReference.objects.filter(
    from_lexeme=to_l, to_lexeme=from_l, type=cr_type[1])
  if not (forward_missing or backward_missing):
    # Both directions already exist, nothing to report.
    return
  debug(from_l.entry, u'Powiązanoby z %s' % to_l.entry)
  sources = (from_l.source, to_l.source)
  if 'Morfologik' not in sources:
    debug(from_l.entry, u'Powiązanoby leksemy spoza Morfologika!')

# Implementation called by add_crs. It is bound to the dry-run variant, so
# matching pairs are only reported and no CrossReference rows are created.
connect = connect_dummy

def make_detail(qs, desc, morf, details):
  """Append an ambiguity summary for ``qs`` to ``details``.

  Nothing is appended unless ``qs`` holds more than one item. Otherwise the
  appended text gives the total number of items and how many of them have
  a primary key listed in ``morf``.

  Args:
    qs: queryset-like object providing ``count()`` and ``filter(pk__in=...)``.
    desc: label placed at the start of the summary.
    morf: collection of primary keys passed to the ``pk__in`` filter.
    details: list that receives the summary (modified in place).
  """
  # Count once and reuse the number: on a queryset every count() call is
  # a separate query, and the original evaluated it twice.
  total = qs.count()
  if total <= 1:
    return
  morf_count = qs.filter(pk__in=morf).count()
  details.append(
    u'%s: %s, w tym z Morfologika: %s' % (desc, total, morf_count))

def which_lacks(qs1, desc1, qs2, desc2):
  """Return the description of whichever queryset is empty.

  Meant to be called when at least one of ``qs1`` / ``qs2`` is empty; the
  result is interpolated into a "Brak %s" (missing ...) message.

  Args:
    qs1, qs2: queryset-like objects providing ``count()``.
    desc1, desc2: descriptions belonging to ``qs1`` and ``qs2``.

  Returns:
    ``desc1`` when only ``qs1`` is empty, ``desc2`` when only ``qs2`` is
    empty, and ``u'obu'`` (both) when both are empty.
  """
  # The original returned the description of the non-empty side, so the
  # message named the lexeme that was present instead of the missing one.
  if qs2.count() > 0:
    # qs2 has items, therefore qs1 is the one that is missing.
    return desc1
  elif qs1.count() > 0:
    return desc2
  else:
    return u'obu'


# A lot of copy-paste, but it is probably not worth refactoring.
def add_crs(type, path):
  """Match lexeme pairs and hand each unambiguous pair to ``connect``.

  Args:
    type: selects the kind of cross-reference to process: 'advnie',
      'advadj', 'advcom' or 'adjadvc'. Any other value matches no branch,
      so nothing is processed.
    path: path of the input file. For 'advnie', 'advadj' and 'advcom' every
      line must split on whitespace into exactly two entries (UTF-8
      encoded). The 'adjadvc' branch does not read the file, although it
      is still opened.

  For the file-driven kinds, both entries of a line are looked up among
  lexemes associated with the 'Morfologik' vocabulary. When a lookup yields
  more than one lexeme an ambiguity message is written with ``debug``; when
  one yields none a "missing" message is written; otherwise the single
  matching pair is passed to ``connect`` together with the pair of
  cross-reference types for that kind.

  NOTE(review): a blank line, or a line with a number of tokens other than
  two, makes the two-name unpacking of ``split()`` raise ValueError.
  """
  # NOTE(review): no_history comes from common.util; presumably it switches
  # off history tracking for this run -- confirm.
  no_history()
  morfologik = Vocabulary.objects.get(id='Morfologik')
  # Primary keys used by make_detail to count Morfologik-owned lexemes.
  morf = morfologik.owned_lexemes_pk()
  lexemes = Lexeme.objects.filter(
    lexemeassociation__vocabulary__id='Morfologik')
  # Base querysets narrowed by part of speech.
  adv = lexemes.filter(part_of_speech__symbol__in=('adv', 'advndm'))
  adj = lexemes.filter(part_of_speech__symbol='adj')
  advcom = lexemes.filter(part_of_speech__symbol='advcom')
  with open(path) as file:
    if type == 'advnie':
      cr_type = ('nieadj', 'adjnie')
      for line in file:
        # Each line: <negated adverb> <base entry>.
        advneg_e, base_e = line.strip().decode('utf-8').split()
        advnegs = adv.filter(entry=advneg_e)
        # Case 1: the base entry is itself an adverb.
        if adv.filter(entry=base_e):
          advs = adv.filter(entry=base_e)
          if advnegs.count() > 1 or advs.count() > 1:
            details = []
            make_detail(advnegs, u'zanegowane', morf, details)
            make_detail(advs, u'niezanegowane', morf, details)
            debug(advneg_e, u'Niejednoznaczność: %s' % '; '.join(details))
          elif advnegs.count() == 0 or advs.count() == 0:
            lack = which_lacks(
              advnegs, u'zanegowanego', advs, u'niezanegowanego')
            debug(advneg_e, u'Brak %s' % lack)
          else:
            connect(advnegs[0], advs[0], cr_type)
        # Case 2: the base entry is an adjective; the adverb is reached
        # through existing adjective <-> adverb cross-references.
        elif adj.filter(entry=base_e):
          # the 'advadj' run has to be executed first!
          adjs = adj.filter(
            entry=base_e, refs_to__type='adjadv',
            refs_to__to_lexeme__deleted=False)
          adjs = adjs | adj.filter(
            entry=base_e, refs_from__type='advadj',
            refs_from__from_lexeme__deleted=False)
          adjs = adjs.distinct()
          if advnegs.count() > 1 or adjs.count() > 1:
            details = []
            make_detail(advnegs, u'zanegowane', morf, details)
            make_detail(adjs, u'przymiotniki', morf, details)
            debug(advneg_e, u'Niejednoznaczność: %s' % '; '.join(details))
          elif advnegs.count() == 0 or adjs.count() == 0:
            lack = which_lacks(
              advnegs, u'zanegowanego', adjs, u'przymiotnika')
            debug(advneg_e, u'Brak %s' % lack)
          else:
            # Collect the adverbs linked to the single adjective, looking
            # at both directions of the reference.
            advs = [cr.to_lexeme.pk for cr
                    in adjs[0].refs_to.filter(type='adjadv')]
            advs += [cr.from_lexeme.pk for cr
                     in adjs[0].refs_from.filter(type='advadj')]
            advs = adv.filter(pk__in=advs).distinct()
            if len(advs) > 1:
              details = []
              make_detail(advs, u'niezanegowane', morf, details)
              debug(advneg_e, u'Niejednoznaczność: %s' % '; '.join(details))
            elif len(advs) == 0:
              debug(advneg_e, u'Brak niezanegowanego')
            else:
              connect(advnegs[0], advs[0], cr_type)
        else:
          debug(advneg_e, u'Brak drugiego leksemu [przymiotnik lub przysłówek]')
    elif type == 'advadj':
      cr_type = ('advadj', 'adjadv')
      for line in file:
        # Each line: <adverb> <adjective>.
        adv_e, adj_e = line.strip().decode('utf-8').split()
        advs = adv.filter(entry=adv_e, part_of_speech__symbol='adv')
        # NOTE(review): the empty `contains` filter presumably only forces a
        # join that keeps adjectives having at least one pattern -- confirm.
        adjs = adj.filter(entry=adj_e, patterns__name__contains='').distinct()
        if advs.count() > 1 or adjs.count() > 1:
          details = []
          make_detail(advs, u'przysłówki', morf, details)
          make_detail(adjs, u'przymiotniki', morf, details)
          debug(adv_e, u'Niejednoznaczność: %s' % '; '.join(details))
        elif advs.count() == 0 or adjs.count() == 0:
          lack = which_lacks(advs, u'przysłówka', adjs, u'przymiotnika')
          debug(adv_e, u'Brak %s' % lack)
        else:
          connect(advs[0], adjs[0], cr_type)
    elif type == 'advcom':
      cr_type = ('advcom', 'comadv')
      for line in file:
        # Each line: <comparative adverb> <base adverb>.
        com_e, adv_e = line.strip().decode('utf-8').split()
        advs = adv.filter(entry=adv_e)
        coms = advcom.filter(entry=com_e)
        if advs.count() > 1 or coms.count() > 1:
          details = []
          make_detail(advs, u'równe', morf, details)
          make_detail(coms, u'wyższe', morf, details)
          debug(adv_e, u'Niejednoznaczność: %s' % '; '.join(details))
        elif advs.count() == 0 or coms.count() == 0:
          lack = which_lacks(advs, u'równego', coms, u'wyższego')
          debug(adv_e, u'Brak %s' % lack)
        else:
          connect(advs[0], coms[0], cr_type)
    elif type == 'adjadvc': # to be run last
      cr_type = ('adjadvc', 'advcadj')
      # ugh!
      # This branch ignores the input file and works purely from existing
      # cross-references: lexemes linked to an adjective (advadj/adjadv)
      # that are also linked to a comparative (advcom/comadv).
      advs = Lexeme.objects.filter(
        refs_to__type='advadj', refs_to__to_lexeme__deleted=False)
      advs = advs | Lexeme.objects.filter(
        refs_from__type='adjadv', refs_from__from_lexeme__deleted=False)
      advs_with_com = advs.filter(
        refs_to__type='advcom', refs_to__to_lexeme__deleted=False)
      advs_with_com = advs_with_com | advs.filter(
        refs_from__type='comadv', refs_from__from_lexeme__deleted=False)
      advs = advs_with_com.distinct()
      # NOTE(review): the loop variables adv, adj and advcom below rebind the
      # querysets of the same names defined at the top of the function.
      for adv in advs:
        adjs = Lexeme.objects.filter(
          refs_to__type='adjadv', refs_to__to_lexeme=adv,
          refs_to__to_lexeme__deleted=False)
        # NOTE(review): unlike the filter above, this half of the union has
        # no `deleted=False` condition -- confirm whether that is intended.
        adjs = adjs | Lexeme.objects.filter(
          refs_from__type='advadj', refs_from__from_lexeme=adv)
        adjs = adjs.distinct()
        advcoms = Lexeme.objects.filter(
          refs_to__type='comadv', refs_to__to_lexeme=adv,
          refs_to__to_lexeme__deleted=False)
        # NOTE(review): no `deleted=False` condition and no distinct() here.
        advcoms = advcoms | Lexeme.objects.filter(
          refs_from__type='advcom', refs_from__from_lexeme=adv)
        for adj in adjs:
          for advcom in advcoms:
            # Skip pairs already linked by an 'adjadvc' reference.
            if not adj.refs_to.filter(to_lexeme=advcom, type='adjadvc'):
              connect(adj, advcom, cr_type)