check_morfologik.py 4.42 KB

Edit Raw Blame History

#-*- coding:utf-8 -*-

import sys
from django.core.management.base import BaseCommand, CommandError
from common.util import debug
from dictionary.models import Lexeme

class Command(BaseCommand):
  """Management command that runs the Morfologik import check.

  It takes two positional arguments and hands them unchanged to
  check_morfologik().
  """
  help = 'Check Morfologik import'
  # Usage text, Polish for: '<part-of-speech symbol> <input file name>'.
  args = '<symbol części mowy> <nazwa pliku wejściowego>'

  def handle(self, lc_sym, input_file, **options):
    """Run the check for symbol ``lc_sym`` on the file ``input_file``.

    Extra keyword options supplied by the command framework are accepted
    and ignored.
    """
    check_morfologik(lc_sym=lc_sym, input_file=input_file)

# Suffix table used by get_forms() to build verb forms.
# Each key is a (label, tag) pair: get_forms() looks the label up among the
# base form label symbols of the lexeme's pattern endings and applies the
# entry only when the tag is active for that lexeme.  Each value is a
# '|'-separated list of suffixes appended to every base form with that label;
# an empty alternative means the base form itself is kept unchanged.
# Original note (translated from Polish): there are currently no q* ones
# anyway...  (presumably inflection characteristics starting with 'q', for
# which get_forms() skips the 'all', 'ndk' and 'dk' entries -- confirm)
v_forms = {
  ('1', 'allq'): u'',
  ('1', 'all'): u'cie|my|sz',
  ('2', 'all'): u'',
  ('3', 'all'): u'',
  ('3', 'ndk'): u'c',
  ('3', 'pact'): u'ca|cą|ce|cego|cej|cemu|cy|cych|cym|cymi',
  ('4', 'all'): u'|że|my|myż|cie|cież',
  ('5', 'allq'): u'',
  ('6', 'all'): u'|by|byś|bym',
  ("6'", 'dk'): u'szy',
  ('7', 'all'): u'em|eś',
  ('8', 'allq'): u'o|oby',
  ('8', 'all'): u'a|aby|abyś|abym|am|aś|obym|obyś|om|oś|'
                u'y|yby|ybyście|ybyśmy|yście|yśmy',
  ('9', 'all'): u'i|iby|ibyście|ibyśmy|iście|iśmy',
  ('10', 'all'): u'o',
  ('10', 'ppas'): u'a|ą|e|ego|ej|emu|y|ych|ym|ymi',
  ('11', 'ger'): u'ie|ia|iach|iami|iem|iom|iu',
  ('11pg', 'ger'): u'',
  ('12', 'ppas'): u'',
}

def get_forms(l, lc_sym):
  """Return the set of forms expected for lexeme ``l``.

  Arguments:
    l -- lexeme object; check_forms() passes Lexeme model instances.
    lc_sym -- symbol selecting the strategy: 'v' builds the forms from
      inflection patterns and the v_forms suffix table, anything else
      reads the stored forms of the lexeme ('adj' also merges forms of
      a negated lexeme).

  Returns a set of form strings.  For 'v' the set is empty when the
  lexeme has no inflection patterns.
  """
  if lc_sym != 'v':
    l_forms = set(l.lexemeform_set.values_list('form', flat=True))
    if lc_sym == 'adj':
      # Merge in the forms of the lexeme linked by an 'adjnie' reference,
      # leaving out whatever all_forms() returns for the label filter.
      neg = l.refs_to.filter(type__symbol='adjnie')
      if neg:
        l_neg = neg[0].to_lexeme
        neg_forms = l_neg.lexemeform_set.values_list('form', flat=True)
        # Raw string: same pattern text, without the invalid '\+' escape.
        added_forms = l_neg.all_forms(label_filter=r'^0|3\+$')
        l_forms |= set(form for form in neg_forms if form not in added_forms)
    return l_forms

  # Verbs: collect the tags that decide which v_forms entries apply.
  tags = ['allq']
  for ref_symbol, ref_tag in (('verpact', 'pact'),
                              ('verppas', 'ppas'),
                              ('verger', 'ger')):
    if l.refs_to.filter(type__symbol=ref_symbol):
      tags.append(ref_tag)

  lips = l.lexemeinflectionpattern_set.all()
  if not lips:
    return set()
  # Only the first pattern's inflection characteristic selects the tags.
  ic = lips[0].inflection_characteristic.symbol
  if not ic.startswith('q'):
    tags.append('all')
    if 'ndk' in ic:
      tags.append('ndk')
    # Strip 'ndk' first so that it is not mistaken for a plain 'dk'.
    if 'dk' in ic.replace('ndk', ''):
      tags.append('dk')

  # Base forms (root + ending) grouped by base form label symbol.  The
  # patterns fetched above are reused instead of being queried again.
  base_forms = {}
  for lip in lips:
    for ending in lip.pattern.endings.all():
      bfl = ending.base_form_label.symbol
      base_forms.setdefault(bfl, set()).add(lip.root + ending.string)

  l_forms = set()
  # items() rather than iteritems(): available on both Python 2 and 3.
  for (label, tag), suffixes in v_forms.items():
    if tag not in tags or label not in base_forms:
      continue
    suffix_list = suffixes.split('|')
    new_forms = set()
    for base_form in base_forms[label]:
      new_forms.update(base_form + suffix for suffix in suffix_list)
    l_forms |= new_forms
    if tag in ('pact', 'ppas', 'ger'):
      # These entries are also expected with the 'nie' prefix.
      l_forms.update('nie' + form for form in new_forms)
  return l_forms

def _lexeme_matches(l, lc_sym, forms):
  """Tell whether lexeme ``l`` accounts for the set ``forms``.

  Returns True when the entry should not be reported because of this
  lexeme, False when the next candidate has to be tried.
  """
  if l.part_of_speech.lexical_class.symbol != lc_sym:
    return False
  l_forms = get_forms(l, lc_sym)
  if not l_forms:
    # Original note (translated): a missing match is of no interest here,
    # so a lexeme without any generated forms silences the report.
    return True
  if forms == l_forms:
    return True
  if lc_sym == 'subst':
    m1_lips = l.lexemeinflectionpattern_set.filter(
      inflection_characteristic__symbol='m1')
    # The substrings below are looked up in the lexeme's comment field.
    if m1_lips and u'formę depr' in l.comment:
      if forms | l.all_forms(label_filter='^pl:nom$') == l_forms:
        return True
    if (u'rozszerzone singulare' in l.comment
        or u'rozszerzyć sgtant' in l.comment
        or l.owner_vocabulary.id != 'Morfologik'):
      if forms == l.all_forms(label_filter='^sg:'):
        return True
  elif lc_sym == 'adj':
    # NOTE: a former restriction to lexemes whose comment contains u' -o'
    # was disabled; the comparison is applied to every adjective.
    if forms | l.all_forms(label_filter='^0$') == l_forms:
      return True
  return False


def check_forms(lc_sym, forms):
  """Compare one group of Morfologik forms with the stored lexemes.

  Arguments:
    lc_sym -- lexical class symbol used to select candidate lexemes.
    forms -- list of forms of one entry; the first item is used as the
      entry to look up.

  Nothing is returned.  When no candidate lexeme matches, the entry is
  printed, followed by one 'missing|extra' line per candidate lexeme.
  An empty ``forms`` list is ignored instead of raising IndexError.
  """
  if not forms:
    return
  entry = forms[0]
  forms = set(forms)
  morf_lexemes = Lexeme.objects.filter(
    lexemeassociation__vocabulary__id='Morfologik', entry=entry,
    part_of_speech__lexical_class__symbol=lc_sym)
  if any(_lexeme_matches(l, lc_sym, forms) for l in morf_lexemes):
    return
  # None matched (also the case when there are no candidates at all).
  # A single parenthesised argument prints the same under Python 2.
  print(entry.encode('utf-8'))
  for l in morf_lexemes:
    l_forms = get_forms(l, lc_sym)
    # Sorted so that the report is stable between runs.
    missing = ', '.join(sorted(forms - l_forms))
    extra = ', '.join(sorted(l_forms - forms))
    print(('%s|%s' % (missing, extra)).encode('utf-8'))

def check_morfologik(lc_sym, input_file):
  """Check every group of forms listed in ``input_file``.

  The file is read as UTF-8.  Each non-empty line must hold exactly two
  tab-separated fields, a form and a tag (a different field count raises
  ValueError); only the form is used.  An empty line ends the current
  group, which is then passed to check_forms() with ``lc_sym``.

  Empty groups (leading or repeated blank lines) are skipped, and a
  final group that is not followed by a blank line is checked as well.
  """
  # Local import: io.open decodes while reading on Python 2 and 3 alike.
  import io

  forms = []
  with io.open(input_file, encoding='utf-8') as lines:
    for line in lines:
      line = line.rstrip('\n')
      if line == '':
        if forms:
          check_forms(lc_sym, forms)
        forms = []
      else:
        form, _tag = line.split('\t')  # the tag is not needed here
        forms.append(form)
  # Do not lose the last group when the file lacks a trailing blank line.
  if forms:
    check_forms(lc_sym, forms)