import_resztki.py 27.5 KB

Edit Raw Blame History

#-*- coding:utf-8 -*-

from django.core.management.base import BaseCommand
import json
from common.util import suffixes, cut_end, debug
from dictionary.models import Pattern, Lexeme, InflectionCharacteristic,\
  Ending, LexicalClass, BaseFormLabel, Vocabulary
from dictionary.pattern_blacklist import blacklist
from dictionary.management.commands.import_morfologik import join, join_many, \
  relevant_subst, relevant_adj, find_minimal_sets

class Command(BaseCommand):
  """Management command running the leftovers import on one input file.

  Takes a single positional argument: the path of the input file.
  """
  args = '<nazwa pliku wejściowego>'

  def handle(self, input_file, **options):
    """Open ``input_file`` and pass the open file to import_resztki.

    The file is opened in a ``with`` block so that the handle is closed
    even when the import raises.
    """
    with open(input_file) as source:
      import_resztki(source)

# When True, diagnostics are printed and process_forms emits no JSON records.
DEBUG = False

# Gender symbols for which noun basic-form endings are precomputed
# ('m' is expanded to m1, m2 and m3 by expand_gender).
GENDERS = ('m1', 'm2', 'm3', 'm', 'f', 'n1', 'n2', 'p1', 'p2', 'p3')

#morf = Vocabulary.objects.get(id='Morfologik').owned_lexemes.all()
# Every lexeme whose source is not 'Morfologik'; used throughout the module
# as the reference set in ``pk__in`` / ``lexeme__pk__in`` filters.
sgjp = Lexeme.objects.exclude(source='Morfologik')

def get_basic_endings(parts_of_speech, genders=None):
  """Return the Ending queryset carrying the basic-form labels in question.

  parts_of_speech -- parts of speech whose inflection characteristics
    supply the basic form labels
  genders -- optional collection of characteristic symbols; when non-empty
    only characteristics with these symbols are considered

  NOTE: the result is additionally restricted to patterns belonging to
  ``lexical_class``, which is not a parameter -- it is looked up in module
  scope at call time.
  """
  characteristics = InflectionCharacteristic.objects.filter(
    part_of_speech__in=parts_of_speech)
  if genders:
    characteristics = characteristics.filter(symbol__in=genders)
  label_pks = characteristics.values_list(
    'basic_form_label', flat=True).distinct()
  criteria = {
    'base_form_label__pk__in': label_pks,
    'pattern__type__lexical_class': lexical_class,
  }
  return Ending.objects.filter(**criteria)

def expand_gender(gender):
  """Return the list of gender symbols that ``gender`` stands for.

  The generic masculine 'm' expands to m1, m2 and m3; any other value is
  returned as a one-element list. A new list is created on every call, so
  callers may freely extend it.
  """
  return ['m1', 'm2', 'm3'] if gender == 'm' else [gender]

# Built once at import time: candidate basic-form endings keyed by lexical
# class, or by (lexical class, gender) for the 'subst' class.
# NOTE: get_basic_endings reads the loop variable ``lexical_class`` from
# module scope, so the loop variable must keep this exact name.
basic_form_endings_dict = {}
for lexical_class in LexicalClass.objects.all():
  parts_of_speech = lexical_class.partofspeech_set.all()
  if lexical_class.symbol == 'subst':
    # nouns: one entry per gender symbol listed in GENDERS
    for gender in GENDERS:
      basic_form_endings_dict[(lexical_class, gender)] = get_basic_endings(
        parts_of_speech, expand_gender(gender))
  else:
    # every other lexical class: a single entry, no gender restriction
    basic_form_endings_dict[lexical_class] = get_basic_endings(
      parts_of_speech)

def tantum_a_posteriori(form_set, patterns):
  """Guess from the forms alone whether they are singular- or plural-only.

  form_set -- set of known forms
  patterns -- sequence of (pattern, root) pairs

  The pairs are tried in order. For each one the forms produced by the
  pattern's endings whose label starts with 'sg' and with 'pl' are built;
  the first pair for which one of these groups contains every known form
  decides the answer ('pl' wins when both groups do).

  Returns 'sg' or 'pl'. When ``patterns`` is empty the answer is 'sg';
  when no pair settles it the result is None.
  """
  for pattern, root in patterns:
    verdict = None
    for number in ('sg', 'pl'):
      strings = pattern.endings.filter(
        base_form_label__symbol__startswith=number).values_list(
        'string', flat=True)
      candidate_forms = set(root + string for string in strings)
      if form_set.issubset(candidate_forms):
        verdict = number
    if verdict:
      return verdict
  return None if patterns else 'sg'

def relevant(lexical_class, ending, **extra):
  """Dispatch the relevance check for ``ending`` on the lexical class.

  Nouns ('subst') are delegated to relevant_subst together with the extra
  keyword arguments, adjectives ('adj') to relevant_adj. For every other
  lexical class the result is None.
  """
  symbol = lexical_class.symbol
  if symbol == 'adj':
    return relevant_adj(ending)
  if symbol == 'subst':
    return relevant_subst(ending, **extra)
  return None

from itertools import chain, combinations
def powerset(iterable):
  """Return an iterator over the subsets of ``iterable`` with at most 4 items.

  Subsets are produced as tuples, smallest first, each size in
  itertools.combinations order:
  powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

  Despite the name, the subset size is capped at 4, so for inputs with more
  than four elements this is NOT the complete powerset.
  """
  s = list(iterable)
  # range() rather than the Python-2-only xrange(): at most five values are
  # produced, so laziness buys nothing and the function runs on 2 and 3.
  return chain.from_iterable(
    combinations(s, r) for r in range(min(len(s) + 1, 5)))

# Primary keys of the base form labels sg:dat, sg:gen and sg:inst; passed as
# an SQL parameter by basic_form_endings when pruning candidate patterns.
sure_bfls_sg = tuple(
  BaseFormLabel.objects.filter(
    symbol__in=['sg:dat', 'sg:gen', 'sg:inst']).values_list('pk', flat=True))
# The plural counterpart: primary keys of pl:dat, pl:inst and pl:loc.
sure_bfls_pl = tuple(
  BaseFormLabel.objects.filter(
    symbol__in=['pl:dat', 'pl:inst', 'pl:loc']).values_list('pk', flat=True))

def basic_form_endings(lexical_class, basic_form, form_set, **extra):
  """Return the endings that may terminate ``basic_form``.

  lexical_class -- lexical class of the processed lexeme
  basic_form -- the entry whose suffixes are matched against ending strings
  form_set -- set of all known forms (used only for nouns)
  extra -- when it contains 'gender', the precomputed endings are looked up
    under the (lexical_class, gender) key instead of lexical_class alone

  For non-noun classes this is a plain filter on the suffixes of the basic
  form. For nouns the candidates are additionally pruned with raw SQL, one
  suffix at a time (see the comment inside the loop).

  NOTE(review): basic_form_endings_dict holds (lexical_class, gender) keys
  only for 'subst'; passing 'gender' for another class looks like it would
  raise KeyError -- confirm against callers.
  """
  if 'gender' in extra:
    key = (lexical_class, extra['gender'])
  else:
    key = lexical_class
  if lexical_class.symbol != 'subst':
    return basic_form_endings_dict[key].filter(string__in=suffixes(basic_form))
  else:
    # breakneck, but it speeds things up a bit
    endings = basic_form_endings_dict[key]
    new_endings = Ending.objects.none()
    for suf in suffixes(basic_form):
      root = cut_end(basic_form, suf)
      n = len(root)
      # what remains of every known form that starts with this root
      ending_strings = tuple(
        form[n:] for form in form_set if form.startswith(root))
      endings_part = endings.filter(string=suf)
      pattern_pks = endings_part.values_list('pattern', flat=True)
      # Keep a pattern when w_id is '0000', or when it has no ending with a
      # label from sure_bfls_sg whose string is missing from ending_strings,
      # or when the same holds for the labels from sure_bfls_pl.
      patterns = Pattern.objects.filter(pk__in=pattern_pks).extra(
        where=["(w_id = '0000' or not exists "
               "(select id from zakonczenia where w_id = wzory.id "
               "and zak not in %s and efobaz in %s) or not exists "
               "(select id from zakonczenia where w_id = wzory.id "
               "and zak not in %s and efobaz in %s))"],
        params=[ending_strings, sure_bfls_sg, ending_strings, sure_bfls_pl])
      # accumulate the surviving endings for this suffix
      new_endings = new_endings | endings_part.filter(pattern__in=patterns)
    return new_endings

# Cache: (pattern, tantum, gender) -> list of admissible ending strings.
memoized_good_endings = {}

def good_ending_set_subst(pattern, root, tantum, gender):
  """Return the set of forms a noun pattern yields for the given gender.

  pattern -- the inflection pattern
  root -- string prepended to every ending
  tantum -- when truthy, only endings whose label starts with it are used
  gender -- gender symbol; decides which 'pl:nom:mo' and 'pl:gen:*'
    endings are left out

  The list of ending strings is computed once per (pattern, tantum, gender)
  and cached in memoized_good_endings; only the root is applied per call.
  """
  cache_key = (pattern, tantum, gender)
  if cache_key not in memoized_good_endings:
    selected = pattern.endings
    if tantum:
      selected = selected.filter(base_form_label__symbol__startswith=tantum)
    if gender not in ('m1', 'p1'):
      selected = selected.exclude(base_form_label__symbol='pl:nom:mo')
    if gender[0] != 'p':
      # drop the genitive plural endings marked for the other genders
      for other in list(set('mfn') - set(gender[0])):
        selected = selected.exclude(
          base_form_label__symbol__startswith='pl:gen:' + other)
    if gender == 'p3':
      if pattern.type.symbol == 'f':
        selected = selected.exclude(base_form_label__symbol='pl:gen:m')
      if pattern.type.symbol == 'n':
        selected = selected.exclude(base_form_label__symbol='pl:gen:n')
    memoized_good_endings[cache_key] = list(
      selected.values_list('string', flat=True))
  return set(root + string for string in memoized_good_endings[cache_key])

def good_ending_set(lexical_class, pattern, root='', **extra):
  """Return the set of forms ``pattern`` produces when applied to ``root``.

  Nouns ('subst') are handled by good_ending_set_subst, which receives the
  extra keyword arguments; every other lexical class uses the pattern's
  own ``ending_set`` method.
  """
  if lexical_class.symbol == 'subst':
    return good_ending_set_subst(pattern, root, **extra)
  return pattern.ending_set(root)

# Cache: (pattern, gender) -> verdict of bad_pattern_subst.
memoized_pattern_ics = {}

def bad_pattern_subst(pattern, gender, tantum):
  """Tell whether a noun pattern should be rejected for the given gender.

  pattern -- candidate inflection pattern
  gender -- gender symbol of the processed lexeme
  tantum -- accepted for call compatibility; does not affect the result

  A pattern is rejected (True) when no reference lexeme (``sgjp``) uses it
  with a compatible inflection characteristic, when a masculine or neuter
  pattern type meets a feminine lexeme, or when a feminine or masculine
  pattern type meets a neuter lexeme. Verdicts are cached per
  (pattern, gender).
  """
  key = (pattern, gender)
  if key in memoized_pattern_ics:
    return memoized_pattern_ics[key]
  ics = expand_gender(gender)
  # plural-only genders also accept the matching singular characteristics
  if gender == 'p1':
    ics.append('m1')
  if gender in ('p2', 'p3'):
    ics += ['m2', 'm3', 'f', 'n1', 'n2']
  # exists() asks the database for a single row instead of loading every
  # matching object just to test for emptiness
  used_with_gender = pattern.lexemeinflectionpattern_set.filter(
    inflection_characteristic__symbol__in=ics).filter(
    lexeme__pk__in=sgjp).exists()
  if not used_with_gender:
    ret = True
  else:
    # tuple membership: the former ``in 'mn'`` substring test would also
    # have accepted an empty symbol
    type_symbol = pattern.type.symbol
    ret = ((type_symbol in ('m', 'n') and gender == 'f') or
           (type_symbol in ('f', 'm') and gender[0] == 'n'))
  memoized_pattern_ics[key] = ret
  return ret

def find_patterns(lexical_class, basic_form, forms, **extra):
  """Find the inflection patterns compatible with a lexeme's forms.

  lexical_class -- lexical class of the lexeme
  basic_form -- the entry (basic form)
  forms -- all known forms of the lexeme
  extra -- additional keyword arguments (e.g. gender, tantum) handed on to
    the helper functions

  Every pattern with an ending matching the end of the basic form is
  classified as:
    * included -- all of its relevant forms occur among the known forms,
    * including -- its forms cover all known forms, but some relevant forms
      are missing (those are recorded),
    * matching -- both at once.

  Returns a 4-tuple (type, patterns, included, including):
    * ('match', matching pairs, ...) when at least one pattern matches,
    * otherwise the outcome of find_many_patterns ('many' or 'none')
      followed by the included and including lists.
  ``included`` is a list of (pattern, root) pairs, ``including`` a list of
  ((pattern, root), bad_forms) pairs.
  """
  # find all patterns contained in the forms and all patterns containing them
  form_set = set(forms)
  ending_sets = {}
  included_patterns = set()
  including_patterns = set()
  matching_patterns = set()
  is_subst = lexical_class.symbol == 'subst'
  for basic_ending in basic_form_endings(
      lexical_class, basic_form, form_set, **extra):
    pattern = basic_ending.pattern
    if is_subst and bad_pattern_subst(pattern, **extra):
      # rejected because of the inflection characteristic
      continue
    root = basic_form[:len(basic_form) - len(basic_ending.string)]
    ending_sets[pattern] = good_ending_set(
      lexical_class, pattern, root, **extra)
    including = form_set.issubset(ending_sets[pattern])
    # relevant forms of the pattern that the lexeme does not have
    bad_forms = set(
      root + ending.string for ending in pattern.endings.all()
      if relevant(lexical_class, ending, **extra)
      and root + ending.string not in form_set)
    candidate = (pattern, root)
    if not bad_forms:
      included_patterns.add(candidate)
      if including:
        matching_patterns.add(candidate)
    elif including:
      including_patterns.add((candidate, tuple(bad_forms)))

  included_patterns = list(included_patterns)
  including_patterns = list(including_patterns)
  matching_patterns = list(matching_patterns)
  if matching_patterns:
    if DEBUG:
      print(u'dokładne wzory: %s' % join(matching_patterns))
    return 'match', matching_patterns, included_patterns, including_patterns
  # nothing matches exactly, or several patterns have to be combined
  if DEBUG and including_patterns:
    print(u'zawierające: %s' % join(p for p, b_f in including_patterns))
  if DEBUG and included_patterns:
    print(u'zawarte: %s' % join(included_patterns))
  return find_many_patterns(
    lexical_class, form_set, basic_form, included_patterns, ending_sets,
    **extra) + (included_patterns, including_patterns)

def find_many_patterns(lexical_class, form_set, basic_form, included_patterns,
                       ending_sets, **extra):
  """Try to cover all forms with a combination of included patterns.

  lexical_class, basic_form, extra -- accepted for call compatibility; not
    used by this function
  form_set -- set of forms that must be covered
  included_patterns -- (pattern, root) pairs whose forms all belong to the
    lexeme
  ending_sets -- dict mapping a pattern to the set of forms it produces

  Returns a pair (type, pattern_sets):
    * ('none', []) when some form is produced by no included pattern,
    * ('many', [necessary]) when the patterns that are the only source of
      some form already cover everything,
    * ('many', minimal_sets) otherwise, as computed by find_minimal_sets.
  """
  necessary_patterns = set()
  for form in form_set:
    having = [(pattern, root) for pattern, root in included_patterns
              if form in ending_sets[pattern]]
    if not having:
      # Report the uncovered form right here instead of remembering it and
      # testing its truth value later: an empty-string form is falsy and
      # would otherwise be mistaken for "nothing is missing".
      if DEBUG:
        print(u"brak formy: %s" % form)
      return 'none', []
    if len(having) == 1:
      # the only pattern able to produce this form is indispensable
      necessary_patterns.add(having[0])
  covered_forms = set()
  for pattern, root in necessary_patterns:
    covered_forms |= ending_sets[pattern]
  if form_set.issubset(covered_forms):
    if DEBUG:
      print(u"pokryte koniecznymi wzorami: %s" % join(necessary_patterns))
    return 'many', [list(necessary_patterns)]
  minimal_sets = find_minimal_sets(
    form_set, covered_forms, necessary_patterns, included_patterns,
    ending_sets)
  return 'many', minimal_sets

def check_sgjp(lc_sym, entry, form_set, **extra):
  """Look for a reference-dictionary lexeme with the same entry and forms.

  lc_sym -- lexical class symbol
  entry -- entry (basic form) of the processed lexeme
  form_set -- set of its known forms
  extra -- for nouns must contain 'tantum'

  A reference lexeme matches when its forms equal ``form_set``, or when
  ``form_set`` is a subset of them and every surplus form is an allowed
  exception (forms labelled 'pl:nom' for nouns having an m1/p1
  characteristic, forms labelled '0' for adjectives).

  Returns the first matching lexeme, or False when there is none. An
  ambiguity that cannot be resolved is reported through ``debug``.
  """
  is_subst = lc_sym == 'subst'
  is_adj = lc_sym == 'adj'
  candidates = Lexeme.objects.distinct()
  if is_adj:
    candidates = candidates.filter(
      entry=entry, part_of_speech__symbol__in=('adj', 'appas'))
  else:
    candidates = candidates.filter(
      entry=entry, part_of_speech__lexical_class__symbol=lc_sym)
  candidates = candidates.filter(pk__in=sgjp)
  matched_lexemes = []
  for lexeme in candidates:
    if is_adj and lexeme.refs_to.filter(type='nieadj'):
      continue
    # choose how the reference forms are generated
    if is_subst and extra['tantum'] == 'sg':
      form_options = {'affixes': False, 'label_filter': r'sg:'}
    elif lexeme.part_of_speech.symbol == 'appas':
      form_options = {'affixes': True}
    else:
      form_options = {'affixes': False}
    sgjp_forms = lexeme.all_forms(**form_options)
    if sgjp_forms == form_set:
      matched_lexemes.append(lexeme)
      continue
    surplus = sgjp_forms - form_set
    exceptions = []
    if is_subst:
      if lexeme.lexemeinflectionpattern_set.filter(
          inflection_characteristic__symbol__in=('m1', 'p1')).exists():
        # depr
        exceptions = lexeme.all_forms(affixes=False, label_filter=r'^pl:nom$')
    elif is_adj:
      # -o
      exceptions = lexeme.all_forms(affixes=False, label_filter=r'^0$')
    if form_set.issubset(sgjp_forms) and surplus.issubset(exceptions):
      matched_lexemes.append(lexeme)
  if len(matched_lexemes) > 1 and is_subst and entry.endswith(u'ość'):
    # among several matches prefer those whose part of speech is 'subst'
    nouns_only = [
      l for l in matched_lexemes if l.part_of_speech.symbol == 'subst']
    if nouns_only:
      matched_lexemes = nouns_only
  if len(matched_lexemes) > 1:
    debug(entry, u'niejednoznaczność dopasowanych leksemów')
  if matched_lexemes:
    return matched_lexemes[0]
  return False

def closest_lexeme_subst(entry, gender, patterns, included=None):
  """Pick the noun lexeme most similar to ``entry`` to borrow patterns from.

  entry -- entry of the processed lexeme
  gender -- its gender symbol (expanded with expand_gender)
  patterns -- without ``included``: a collection of patterns, one of which
    the lexeme must use; with ``included``: a collection of sets of
    (pattern, root) pairs, of which the lexeme must use one whole set
  included -- optional collection of patterns the lexeme is allowed to use

  Among the candidates only lexemes whose entry has the same
  capitalisation of the first letter and which use no pattern outside the
  allowed ones are kept. Returns the one sharing the longest suffix with
  ``entry`` (the first such on ties), or None when nothing qualifies.
  """
  candidates = Lexeme.objects.filter(
    part_of_speech__lexical_class__symbol='subst').distinct()
  # same gender
  candidates = candidates.filter(
    lexemeinflectionpattern__inflection_characteristic__symbol__in=
    expand_gender(gender))
  if included:
    # uses every pattern of at least one of the sets
    union = Lexeme.objects.none()
    for pattern_set in patterns:
      subset = candidates
      for pattern, root in pattern_set:
        subset = subset.filter(lexemeinflectionpattern__pattern=pattern)
      union |= subset
    candidates = union.distinct()
  else:
    # uses one of the fitting patterns
    candidates = candidates.filter(lexemeinflectionpattern__pattern__in=patterns)
  allowed = included if included else patterns
  capitalised = entry[0].isupper()
  reversed_entry = entry[::-1]
  best_length, best_lexeme = -1, None
  for lexeme in candidates:
    if lexeme.entry[0].isupper() != capitalised:
      continue
    if not all(lip.pattern in allowed
               for lip in lexeme.lexemeinflectionpattern_set.all()):
      continue
    # length of the longest common suffix
    shared = 0
    for own_char, other_char in zip(reversed_entry, lexeme.entry[::-1]):
      if own_char != other_char:
        break
      shared += 1
    if shared > best_length:
      best_length, best_lexeme = shared, lexeme
  return best_lexeme

def blacklist_filter(patterns):
  """Return the (pattern, root) pairs whose pattern name is not blacklisted.

  ``patterns`` may be any iterable of pairs; the result is always a list.
  """
  kept = []
  for pattern, root in patterns:
    if pattern.name in blacklist:
      continue
    kept.append((pattern, root))
  return kept

def filter_patterns(filter, action_name, type, patterns, included, including,
                    lexical_class, form_set, entry, **extra):
  """Apply a pattern filter to a matching result, keeping it usable.

  filter -- callable taking an iterable of (pattern, root) pairs and
    returning the pairs to keep
  action_name -- name of the filter, interpolated into debug messages
  type -- kind of the result: 'match', 'many' or 'none'
  patterns -- matched pairs ('match') or list of pattern sets ('many')
  included -- list of (pattern, root) pairs
  including -- list of ((pattern, root), bad_forms) pairs
  lexical_class, form_set, entry, extra -- context needed to rebuild the
    ending sets and to re-run find_many_patterns

  Returns (type, patterns, included, including, bad_patterns).
  ``bad_patterns`` is True when filtering would have destroyed the result,
  in which case the unfiltered patterns are restored.
  """
  old_patterns = patterns
  old_included = included
  bad_patterns = False
  if type == 'many':
    # recompute only when the filter would remove something from some set
    if any(pattern_set != filter(pattern_set) for pattern_set in patterns):
      included = filter(included)
      ending_sets = {}
      for pattern, root in included:
        ending_sets[pattern] = good_ending_set(
          lexical_class, pattern, root, **extra)
      type, patterns = find_many_patterns(
        lexical_class, form_set, entry, included, ending_sets, **extra)
      if type != 'many':
        # the filter broke the multi-pattern match: report and roll back
        debug(entry, u'mnogie dopasowanie zepsute przez %s (%s)' %
                     (action_name, join_many(old_patterns)))
        type = 'many'
        patterns, included = old_patterns, old_included
        bad_patterns = True
  elif type == 'none':
    # only the including patterns are filtered; iterating the dict hands
    # its (pattern, root) keys to the filter
    including_dict = dict(including)
    including = [(key, including_dict[key]) for key in filter(including_dict)]
  else: # type == 'match'
    patterns = filter(patterns)
    including_dict = dict(including)
    including = [(key, including_dict[key]) for key in filter(including_dict)]
    included = filter(included)
    if old_patterns and not patterns:
      # every exact match was filtered out: try combining what is left
      ending_sets = {}
      for pattern, root in included:
        ending_sets[pattern] = good_ending_set(
          lexical_class, pattern, root, **extra)
      type, patterns = find_many_patterns(
        lexical_class, form_set, entry, included, ending_sets, **extra)
      if type == 'none':
        # nothing is left: report and restore the unfiltered matches
        # (``included`` stays filtered here)
        debug(entry, u'znikły wzory przez %s (%s)' %
                     (action_name, join(old_patterns)))
        type = 'match'
        patterns = old_patterns
        bad_patterns = True
  return type, patterns, included, including, bad_patterns

def create_derived(pos, base_forms, forms, patterns):
  """List the derived lexemes of kind ``pos`` found among a verb's forms.

  pos -- part of speech of the derivative: 'ger', 'pact' or 'ppas'
  base_forms -- accepted for call compatibility; not used
  forms -- known forms of the verb
  patterns -- (pattern, root) pairs chosen for the verb

  For every pair the derivative's entry is built from the root, the ending
  of the base form label assigned to ``pos`` and a fixed suffix. Returns a
  list of (pos, entry, pattern_names) tuples for the entries that occur in
  ``forms``. Raises KeyError for an unknown ``pos`` when ``patterns`` is
  not empty.
  """
  tab = {'ger': ('11', u'ie'), 'pact': ('3', u'cy'), 'ppas': ('10', u'y')}
  entries = {}
  for pattern, root in patterns:
    bfl, suffix = tab[pos]
    ending = pattern.endings.get(base_form_label__symbol=bfl)
    entries.setdefault(root + ending.string + suffix, []).append(pattern.name)
  # items() works on Python 2 and 3 alike, and the loop variable no longer
  # shadows the ``patterns`` parameter
  return [(pos, entry, names) for entry, names in entries.items()
          if entry in forms]

def get_sgjp(lexeme):
  """Return the output record referring to an existing dictionary lexeme."""
  record = {}
  record['source'] = 'sgjp'
  record['id'] = lexeme.pk
  record['entry'] = lexeme.entry
  return record

def create_lexeme(entry, part_of_speech, status, comment):
  """Return the record describing a new lexeme coming from Morfologik."""
  fields = ('source', 'entry', 'part_of_speech', 'status', 'comment')
  values = ('morfologik', entry, part_of_speech, status, comment)
  return dict(zip(fields, values))

def create_lip(pattern, root, i, ic, part_of_speech):
  """Return the record of one lexeme inflection pattern.

  pattern -- a pattern name (string) or an object with a ``name`` attribute
  root -- explicit root; when empty or None the record asks for the root
    to be computed
  i -- index of the pattern within the lexeme
  ic -- inflection characteristic symbol
  part_of_speech -- part of speech stored next to the characteristic
  """
  # (str, type(u'')) is (str, unicode) on Python 2 -- exactly the types
  # covered by the Python-2-only ``basestring`` -- and is valid on Python 3.
  is_name = isinstance(pattern, (str, type(u'')))
  output = {
    'pattern': pattern if is_name else pattern.name,
    'ind': i,
    'ic': (ic, part_of_speech),
    }
  if root:
    output['root'] = {'type': 'string', 'root': root}
  else:
    output['root'] = {'type': 'compute'}
  return output

# Gender shown in the "alternative gender" comment that lexeme_creation adds
# when a second result exists for the other gender variant.
alternative_gender = {
  'p1': 'p3',
  'p3': 'p1',
  'm1': 'm2/m3',
  'm': 'm1',
  }

# Gender that process_forms switches to when the primary result had to keep
# blacklisted patterns and the alternative result is used instead.
alternative_gender2 = {
  'p1': 'p3',
  'p3': 'p1',
  'm1': 'm',
  'm': 'm1',
  }

def lexeme_creation(lc_sym, entry, ic, forms, type, patterns, fitting,
                    bad_patterns, included, other_result, tantum=None,
                    gender=None, negated=None, base_forms=None, derived=None):
  """Build the JSON-serialisable record of a newly created lexeme.

  lc_sym -- lexical class symbol ('subst', 'adj', 'v', ...)
  entry -- entry (basic form) of the lexeme
  ic -- inflection characteristic symbol, or None when none was determined
  forms -- known forms of the lexeme
  type -- kind of the matching result: 'match', 'many' or 'none'
  patterns -- the chosen (pattern, root) pairs
  fitting -- all candidates: pairs for 'match', lists of pairs for 'many',
    ((pattern, root), bad_forms) pairs for 'none'
  bad_patterns -- True when blacklisted patterns had to be kept
  included -- (pattern, root) pairs fully contained in the forms
  other_result -- None, or (type, patterns, included, including) obtained
    for the alternative gender
  tantum -- 'sg', 'pl' or None
  gender -- gender symbol, used to name the alternative gender
  negated -- for adjectives: marks the record as negated
  base_forms -- passed on to create_derived
  derived -- for verbs: parts of speech of the derivatives to generate

  Returns a dict with the keys 'lexeme' and 'lips', plus 'negated' for
  negated adjectives and 'derived' for verbs. The lexeme's status is 'desc'
  only when the result is unambiguous; otherwise it is 'cand' and the
  comment explains why.
  """
  status = 'desc' if type != 'none' else 'cand'
  comments = [u'Z importu resztek']
  copy_lips = False
  # NOTE(review): only nouns used to get a part of speech assigned, so any
  # other lexical class ended in a NameError further down. Falling back to
  # the lexical class symbol is an assumption -- confirm the intended
  # values for non-noun classes.
  part_of_speech = lc_sym
  if lc_sym == 'subst':
    part_of_speech = 'subst' # what about osc and skrs?
    if ic in ('m2', 'm3'):
      sure = False
      if type != 'none' and len(fitting) == 1:
        # m3 when every sg:gen ending of every chosen pattern ends in 'u';
        # each pattern is inspected, not just the first one
        if all(e.string.endswith('u')
               for pattern, root in patterns
               for e in pattern.endings.filter(
                 base_form_label__symbol='sg:gen')):
          ic = 'm3'
          sure = True
        # a feminine pattern type overrides that in favour of m2
        if any(pattern.type.symbol == 'f' for pattern, root in patterns):
          ic = 'm2'
          sure = True
      if not sure:
        status = 'cand'
    if tantum is None and ic == 'm1':
      # note when some pl:nom form of a chosen pattern is not a known form
      if any(root + e.string not in forms
             for pattern, root in patterns
             for e in pattern.endings.filter(
               base_form_label__symbol='pl:nom')):
        comments.append(u'Dodano formę depr')
    if ic == 'p1':
      for pattern, root in patterns:
        nmo_endings = pattern.endings.filter(base_form_label__symbol='pl:nom')
        other_endings = pattern.endings.exclude(
          base_form_label__symbol='pl:nom')
        other_strings = other_endings.values_list('string', flat=True)
        # pl:nom endings that do not double as any other ending
        nmo_strings = [
          e.string for e in nmo_endings if e.string not in other_strings]
        nmo_forms = set(root + s for s in nmo_strings)
        if nmo_forms & set(forms):
          comments.append(
            u'Usunięto formę depr: %s' % ', '.join(list(nmo_forms)))
          break
    if tantum == 'sg' and type != 'none':
      # borrow the full set of patterns from the most similar lexeme
      if type == 'match':
        search_patterns = [pattern for pattern, root in fitting]
        l = closest_lexeme_subst(entry, gender, search_patterns)
      else:
        included_patterns = [pattern for pattern, root in included]
        l = closest_lexeme_subst(entry, gender, fitting, included_patterns)
      if l:
        copy_lips = l.lexemeinflectionpattern_set.all()
        comments.append(u'Automatycznie rozszerzone singulare tantum')
      else:
        if type == 'match':
          p = join(fitting)
        else:
          p = join_many(fitting)
        debug(entry, u'nie ma pasujących leksemów dla rozszerzenia sgtant '
                     u'dla wzorów %s' % p)
        comments.append(u'Nie udało się rozszerzyć singulare tantum')
        # a qualifier is to be added after the import instead
  if bad_patterns:
    comments.append(u'Wzory z czarnej listy!')
    status = 'cand'
  if len(fitting) > 1 or (type == 'none' and fitting):
    status = 'cand'
    if type == 'none':
      comments.append(u'Zawierające wzory:')
      for (pattern, root), bad_forms in fitting:
        comments.append('%s: %s' % (pattern.name, ', '.join(bad_forms)))
    elif type != 'many':
      comments.append(u'Pasujące wzory: %s' % join(fitting))
    else:
      comments.append(u'Pasujące zestawy wzorów: %s' % join_many(fitting))
  if other_result:
    status = 'cand'
    type2, patterns2, _included2, _including2 = other_result
    comments.append(u'Alternatywny rodzaj: %s' % alternative_gender[gender])
    if type2 == 'match':
      comments.append(u'Pasujące wzory: %s' % join(patterns2))
    elif type2 == 'many':
      comments.append(u'Pasujące zestawy wzorów: %s' % join_many(patterns2))
  if ic is None and type != 'none':
    comments.append(u'Dopasowane wzory: %s' % join(patterns))
  comment = '\n'.join(comments)
  output = {
    'lexeme': create_lexeme(entry, part_of_speech, status, comment)
  }
  lips = []
  if ic is not None:
    if not copy_lips:
      for i, (pattern, root) in enumerate(patterns):
        lips.append(create_lip(pattern, root, i + 1, ic, part_of_speech))
    else:
      for lip in copy_lips:
        ic = lip.inflection_characteristic.symbol
        lips.append(
          create_lip(lip.pattern, None, lip.index, ic, part_of_speech))
  output['lips'] = lips
  if lc_sym == 'adj' and negated:
    output['negated'] = True
  if lc_sym == 'v':
    derived_data = []
    # ``derived`` defaults to None; treat that as "no derivatives"
    for pos in derived or ():
      derived_data += create_derived(pos, base_forms, forms, patterns)
    output['derived'] = derived_data
  return output

def process_forms(entry, forms, lc_sym, **extra):
  """Match one lexeme against the dictionary and print the resulting record.

  entry -- entry (basic form) of the lexeme
  forms -- all of its known forms
  lc_sym -- lexical class symbol
  extra -- for nouns must contain 'gender' and 'tantum'

  When a reference lexeme with the same forms exists (check_sgjp) and DEBUG
  is off, only a reference to it is printed. Otherwise inflection patterns
  are searched for (twice for the underspecified genders 'm' and 'p'),
  filtered through the blacklist, and the record built by lexeme_creation
  is printed. A noun for which nothing fits is retried once as a singulare
  or plurale tantum. With DEBUG on, no record is printed.
  """
  lexical_class = LexicalClass.objects.get(symbol=lc_sym)
  other_result = None
  form_set = set(forms)
  check = check_sgjp(lc_sym, entry, form_set, **extra)
  if check and not DEBUG:
    # the lexeme already exists: emit a reference to it
    data = {'lexeme': get_sgjp(check)}
    # TODO adjective negation
    print_data(data)
    return

  is_subst = lc_sym == 'subst'
  # NOTE(review): ``ic`` used to be left unbound for non-noun classes
  # whenever a match was found. None is what lexeme_creation treats as
  # "no characteristic determined" -- confirm this is the intended value.
  ic = extra['gender'] if is_subst else None
  extra2 = dict(extra)
  # masculine and plurale tantum nouns are run once per gender variant
  if is_subst and extra['gender'] in ('p', 'm'):
    masculine = extra['gender'] == 'm'
    extra2['gender'] = 'm1' if masculine else 'p1'
    type1, patterns1, included1, including1 = find_patterns(
      lexical_class, entry, forms, **extra2)
    extra2['gender'] = 'm' if masculine else 'p3'
    type2, patterns2, included2, including2 = find_patterns(
      lexical_class, entry, forms, **extra2)
    if type1 != 'none' and type2 == 'none':
      type, patterns, included, including = (
        type1, patterns1, included1, including1)
      ic = 'm1' if masculine else 'p1'
    elif type1 == 'none' and type2 != 'none':
      type, patterns, included, including = (
        type2, patterns2, included2, including2)
      ic = 'm' if masculine else 'p3'
    elif type1 == type2 == 'none':
      type = 'none'
      patterns = []
      included = list(set(included1) | set(included2))
      including = list(set(including1) | set(including2))
      # it seems worth setting something
      ic = 'm' if masculine else 'p3'
    else: # both variants produced something
      type, patterns, included, including = (
        type1, patterns1, included1, including1)
      # NOTE(review): the masculine case keeps 'm' although the primary
      # result comes from the 'm1' run (the plural case uses 'p1') --
      # preserved as found, confirm it is intended.
      ic = 'm' if masculine else 'p1'
      other_result = (type2, patterns2, included2, including2)
      if DEBUG:
        print(u"dwie możliwości na rodzaj")
  else:
    type, patterns, included, including = find_patterns(
      lexical_class, entry, forms, **extra)
  if type == 'none' and is_subst and not extra['tantum']:
    # nothing fits: maybe the noun is singular- or plural-only after all
    extra['tantum'] = tantum_a_posteriori(
      form_set, [p for p, b_f in including])
    if extra['tantum']:
      if extra['tantum'] == 'pl':
        extra['gender'] = 'p'
      return process_forms(entry, forms, lc_sym, **extra)

  if is_subst:
    extra2['gender'] = ic
  type, patterns, included, including, bad_patterns = filter_patterns(
    blacklist_filter, u'czarną listę', type, patterns, included, including,
    lexical_class, form_set, entry, **extra2)
  if bad_patterns and other_result:
    # the primary result needs blacklisted patterns: use the alternative
    type, patterns, included, including = other_result
    ic = extra2['gender'] = alternative_gender2[ic]
    type, patterns, included, including, bad_patterns = filter_patterns(
      blacklist_filter, u'czarną listę', type, patterns, included, including,
      lexical_class, form_set, entry, **extra2)
    other_result = None
  elif other_result:
    type2, patterns2, included2, including2 = other_result
    new_other_result = filter_patterns(
      blacklist_filter, u'czarną listę', type2, patterns2, included2,
      including2, lexical_class, form_set, entry, **extra2)
    # keep the alternative only when it survives the blacklist
    if not new_other_result[4]:
      other_result = new_other_result[:4]
    else:
      other_result = None

  # the patterns do not change any more from this point on
  if type == 'many':
    # or patterns[0]...
    all_patterns = [p for pattern_set in patterns for p in pattern_set]
  else:
    all_patterns = patterns
  if type == 'none':
    ic = None
  if is_subst:
    # correction for m2/m3
    if ic in ('m2', 'm3') and patterns and not bad_patterns:
      new_ic = ''
      for pattern, root in all_patterns:
        for ic2 in ('m2', 'm3'):
          # if every use of this pattern goes with ic2
          if not pattern.lexemeinflectionpattern_set.exclude(
            inflection_characteristic__symbol=ic2).filter(
            lexeme__pk__in=sgjp).exists():
            if new_ic == '':
              new_ic = ic2
            elif new_ic != ic2:
              new_ic = None
      if new_ic:
        ic = new_ic

  if type == 'none':
    debug(entry, u'zawiera się w %s' % join(p for p, b_f in including))
    chosen = []
    fitting = including
  elif type == 'match':
    patterns.sort(key=lambda p: p[0].name)
    fitting = patterns
    chosen = patterns[:1]
  elif type == 'many':
    chosen = patterns[0]
    if DEBUG:
      print(u'zestawy wielu wzorów: %s' % join_many(patterns))
    fitting = patterns
  if not DEBUG:
    data = lexeme_creation(
      lc_sym, entry, ic, forms, type, chosen, fitting, bad_patterns, included,
      other_result, **extra2)
    print_data(data)

def get_pos_ndm(tag):
  """Map a tag (a sequence of tag parts) to a part of speech symbol.

  A tag starting with 'xxx' gives 'burk'; an 'adv' tag gives 'advcom' when
  its last part is 'comp' and 'adv' otherwise; any other tag gives its
  first part unchanged.
  """
  head = tag[0]
  if head == 'xxx':
    return 'burk'
  if head != 'adv':
    return head
  return 'advcom' if tag[-1] == 'comp' else 'adv'

def print_data(data):
  """Print ``data`` to standard output serialised as JSON."""
  serialized = json.dumps(data)
  print(serialized)

def import_resztki(input_file):
  """Process every entry of the input file.

  input_file -- iterable of UTF-8 encoded lines of the shape
    ``entry:pos;ic;form1,form2,...``

  Apostrophes are replaced by the typographic one, compound gender
  descriptions are reduced to a single symbol, and the entry itself is
  added to the forms when missing. Blank lines are skipped. Each entry is
  then handed to process_forms.
  """
  for line in input_file:
    line = line.decode('utf-8').strip()
    if not line:
      # e.g. a trailing empty line: nothing to unpack
      continue
    entry, rest = line.replace("'", u'’').split(':')
    pos, ic, rest = rest.split(';')
    if ic in ('n2/m3', 'n2/m3/m2', 'm3/m2'):
      ic = 'm3'
    elif ic == 'p3/p2':
      ic = 'p3'
    elif ic in ('f/m3', 'f/n2'):
      ic = 'f'
    forms = rest.split(',')
    if entry not in forms:
      forms = [entry] + forms
    # startswith() also copes with an empty gender field
    tantum = 'pl' if ic.startswith('p') else None
    if entry.endswith(u'stwo') and ic == 'p1':
      # redundant plural forms; filtering instead of list.remove() so that
      # a form absent from the input does not abort the whole import
      root = entry[:-1]
      redundant = set(root + suffix for suffix in ('', 'ami', 'ach', 'om'))
      forms = [form for form in forms if form not in redundant]
    process_forms(entry, forms, pos, gender=ic, tantum=tantum)