# -*- coding: utf-8 -*-

import json
from itertools import chain, combinations

from django.core.management.base import BaseCommand

from common.util import suffixes, cut_end, debug, GroupDict, uniopen
from dictionary.models import Pattern, Lexeme, InflectionCharacteristic, \
    Ending, LexicalClass, BaseFormLabel
from dictionary.pattern_blacklist import blacklist
from dictionary.management.commands.import_morfologik import join, join_many, \
    relevant_subst, relevant_adj, find_minimal_sets


class Command(BaseCommand):
    args = '<input file name>'

    def handle(self, input_file, **options):
        import_resztki(uniopen(input_file))


DEBUG = False

GENDERS = ('m1', 'm2', 'm3', 'm', 'f', 'n1', 'n2', 'p1', 'p2', 'p3')

#morf = Vocabulary.objects.get(id='Morfologik').owned_lexemes.all()
sgjp = Lexeme.objects.exclude(source='Morfologik')


def get_basic_endings(lexical_class, parts_of_speech, genders=None):
    ics = InflectionCharacteristic.objects.filter(
        part_of_speech__in=parts_of_speech)
    if genders:
        ics = ics.filter(symbol__in=genders)
    basic_form_labels = ics.values_list('basic_form_label',
        flat=True).distinct()
    return Ending.objects.filter(base_form_label__pk__in=basic_form_labels,
        pattern__type__lexical_class=lexical_class)


def expand_gender(gender):
    if gender == 'm':
        return ['m1', 'm2', 'm3']
    else:
        return [gender]

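# For illustration, the helper above expands the underspecified masculine
# gender and leaves everything else as-is:
#   expand_gender('m')  -> ['m1', 'm2', 'm3']
#   expand_gender('p1') -> ['p1']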

basic_form_endings_dict = {}
for lexical_class in LexicalClass.objects.all():
    parts_of_speech = lexical_class.partofspeech_set.all()
    if lexical_class.symbol == 'subst':
        for gender in GENDERS:
            basic_form_endings_dict[
                (lexical_class, gender)] = get_basic_endings(
                lexical_class, parts_of_speech, expand_gender(gender))
    else:
        basic_form_endings_dict[lexical_class] = get_basic_endings(
            lexical_class, parts_of_speech)


def tantum_a_posteriori(form_set, patterns):
    tantum = None
    for pattern, root in patterns:
        tantum_forms = {
            'sg': set(root + e for e in
                pattern.endings.filter(
                    base_form_label__symbol__startswith='sg')
                .values_list('string', flat=True)),
            'pl': set(root + e for e in
                pattern.endings.filter(
                    base_form_label__symbol__startswith='pl')
                .values_list('string', flat=True)),
        }
        for num in ('sg', 'pl'):
            if form_set.issubset(tantum_forms[num]):
                tantum = num
        if tantum:
            return tantum
    if not patterns:
        return 'sg'
    return None

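# tantum_a_posteriori guesses a number tantum from the patterns containing
# the form set: 'sg' or 'pl' if all observed forms fit within that number's
# endings, 'sg' as a fallback when no containing patterns exist, else None.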

def relevant(lexical_class, ending, **extra):
    if lexical_class.symbol == 'subst':
        return relevant_subst(ending, **extra)
    elif lexical_class.symbol == 'adj':
        return relevant_adj(ending)


def powerset(iterable):
    """powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"""
    s = list(iterable)
    return chain.from_iterable(
        combinations(s, r) for r in xrange(min(len(s) + 1, 5)))

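# A quick illustration of the cap (otherwise per the itertools recipe):
#   list(powerset('ab')) -> [(), ('a',), ('b',), ('a', 'b')]
#   list(powerset(range(10))) yields only subsets of at most 4 elements.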

sure_bfls_sg = tuple(
    BaseFormLabel.objects.filter(
        symbol__in=['sg:dat', 'sg:gen', 'sg:inst']).values_list('pk',
        flat=True))
sure_bfls_pl = tuple(
    BaseFormLabel.objects.filter(
        symbol__in=['pl:dat', 'pl:inst', 'pl:loc']).values_list('pk',
        flat=True))


def basic_form_endings(lexical_class, basic_form, form_set, **extra):
    if 'gender' in extra:
        key = (lexical_class, extra['gender'])
    else:
        key = lexical_class
    if lexical_class.symbol != 'subst':
        return basic_form_endings_dict[key].filter(
            string__in=suffixes(basic_form))
    else:
        # convoluted, but it speeds things up a bit
        endings = basic_form_endings_dict[key]
        new_endings = Ending.objects.none()
        for suf in suffixes(basic_form):
            root = cut_end(basic_form, suf)
            n = len(root)
            ending_strings = tuple(
                form[n:] for form in form_set if form.startswith(root))
            endings_part = endings.filter(string=suf)
            pattern_pks = endings_part.values_list('pattern', flat=True)
            patterns = Pattern.objects.filter(pk__in=pattern_pks).extra(
                where=["(w_id = '0000' or not exists "
                       "(select id from zakonczenia where w_id = wzory.id "
                       "and zak not in %s and efobaz in %s) or not exists "
                       "(select id from zakonczenia where w_id = wzory.id "
                       "and zak not in %s and efobaz in %s))"],
                params=[ending_strings, sure_bfls_sg, ending_strings,
                    sure_bfls_pl])
            new_endings = new_endings | endings_part.filter(
                pattern__in=patterns)
        return new_endings

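# The raw SQL above is a prefilter: a pattern survives for a candidate root
# if it is pattern '0000', or if all of its endings for the "sure" singular
# labels (sg:dat, sg:gen, sg:inst) occur among the observed ending strings,
# or the same holds for the "sure" plural labels (pl:dat, pl:inst, pl:loc).
# A rough pure-Python sketch, with illustrative names ('pattern_endings' as
# (label, string) pairs, 'observed' as the observed ending strings):
#   def survives(pattern_endings, observed):
#       sg_ok = all(s in observed for bfl, s in pattern_endings
#                   if bfl in ('sg:dat', 'sg:gen', 'sg:inst'))
#       pl_ok = all(s in observed for bfl, s in pattern_endings
#                   if bfl in ('pl:dat', 'pl:inst', 'pl:loc'))
#       return sg_ok or pl_ok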

memoized_good_endings = {}


def good_ending_set_subst(pattern, root, tantum, gender):
    if (pattern, tantum, gender) in memoized_good_endings:
        good_endings = memoized_good_endings[(pattern, tantum, gender)]
        return set(root + e for e in good_endings)
    endings = pattern.endings
    if tantum:
        endings = endings.filter(base_form_label__symbol__startswith=tantum)
    if gender not in ('m1', 'p1'):
        endings = endings.exclude(base_form_label__symbol='pl:nom:mo')
    if gender[0] != 'p':
        for g in list(set('mfn') - set(gender[0])):
            endings = endings.exclude(
                base_form_label__symbol__startswith='pl:gen:' + g)
    if gender == 'p3':
        if pattern.type.symbol == 'f':
            endings = endings.exclude(base_form_label__symbol='pl:gen:m')
        if pattern.type.symbol == 'n':
            endings = endings.exclude(base_form_label__symbol='pl:gen:n')
    good_endings = list(endings.values_list('string', flat=True))
    memoized_good_endings[(pattern, tantum, gender)] = good_endings
    return set(root + e for e in good_endings)

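# Note that the memo above is keyed by (pattern, tantum, gender) only: the
# cached value holds bare ending strings and the root is prepended afterwards,
# so a single cache entry serves every root.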

def good_ending_set(lexical_class, pattern, root='', **extra):
    if lexical_class.symbol != 'subst':
        return pattern.ending_set(root)
    else:
        return good_ending_set_subst(pattern, root, **extra)


memoized_pattern_ics = {}


def bad_pattern_subst(pattern, gender, tantum):
    # 'tantum' arrives here via **extra but is not used by this heuristic
    if (pattern, gender) in memoized_pattern_ics:
        return memoized_pattern_ics[(pattern, gender)]
    ics = expand_gender(gender)
    if gender == 'p1':
        ics.append('m1')
    if gender in ('p2', 'p3'):
        ics += ['m2', 'm3', 'f', 'n1', 'n2']
    if not pattern.lexemeinflectionpattern_set.filter(
            inflection_characteristic__symbol__in=ics).filter(
            lexeme__pk__in=sgjp):
        ret = True
    elif pattern.type.symbol in 'mn' and gender == 'f':
        ret = True
    elif pattern.type.symbol in 'fm' and gender[0] == 'n':
        ret = True
    else:
        ret = False
    memoized_pattern_ics[(pattern, gender)] = ret
    return ret

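# In short: a pattern is "bad" for a gender when no SGJP lexeme uses it with
# a compatible inflection characteristic, or when the pattern's type symbol
# contradicts the gender (e.g. an 'm'/'n' pattern for a feminine noun).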

def find_patterns(lexical_class, basic_form, forms, **extra):
    #patterns = Pattern.objects.filter(type__lexical_class=lexical_class)
    # find all included and including patterns
    form_set = set(forms)
    ending_sets = {}
    included_patterns = set()
    including_patterns = set()
    matching_patterns = set()
    base_forms_changed = False
    for basic_ending in basic_form_endings(
            lexical_class, basic_form, form_set, **extra):
        pattern = basic_ending.pattern
        if lexical_class.symbol == 'subst' and bad_pattern_subst(pattern,
                **extra):
            #print 'rejected:', pattern
            continue # do we just skip noting forms rejected by the characteristic?
        root = basic_form[:len(basic_form) - len(basic_ending.string)]
        ending_sets[pattern] = good_ending_set(
            lexical_class, pattern, root, **extra)
        including = form_set.issubset(ending_sets[pattern])
        extra_base_forms = []  # never populated in this script (vestigial)
        bad_forms = set()
        for ending in pattern.endings.all():
            if relevant(lexical_class, ending, **extra):
                if root + ending.string not in form_set:
                    if DEBUG:
                        bfl = ending.base_form_label.symbol
                        #print pattern.name, root, ending.string, bfl
                    bad_forms.add(root + ending.string)
        if not bad_forms:
            if extra_base_forms:
                extra['base_forms'] += extra_base_forms
                base_forms_changed = True
            included_patterns.add((pattern, root))
            if including:
                matching_patterns.add((pattern, root))
        elif including:
            including_patterns.add(((pattern, root), tuple(bad_forms)))

    if base_forms_changed:
        #print extra['base_forms']
        return find_patterns(lexical_class, basic_form, forms, **extra)
        # not sure whether this is needed, but just in case
    included_patterns = list(included_patterns)
    including_patterns = list(including_patterns)
    matching_patterns = list(matching_patterns)
    if len(matching_patterns) > 0:
        if DEBUG:
            print u'dokładne wzory: %s' % join(matching_patterns)
        return 'match', matching_patterns, included_patterns, including_patterns
    # nothing matched exactly, or several patterns must be combined
    if DEBUG and len(including_patterns) > 0:
        print u'zawierające: %s' % join(p for p, b_f in including_patterns)
    if DEBUG and len(included_patterns) > 0:
        print u'zawarte: %s' % join(included_patterns)
    return find_many_patterns(
        lexical_class, form_set, basic_form, included_patterns, ending_sets,
        **extra) + (included_patterns, including_patterns)

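# find_patterns returns a 4-tuple: a status ('match', 'many' or 'none'), the
# matching patterns (for 'many': minimal sets of patterns), the included and
# the including patterns; individual patterns come as (pattern, root) pairs.
# A hypothetical 'match' result: ('match', [(p, u'kot')], [(p, u'kot')], []).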

def find_many_patterns(lexical_class, form_set, basic_form, included_patterns,
                       ending_sets, **extra):
    necessary_patterns = set()
    missing_form = None
    for form in form_set:
        having = []
        for pattern, root in included_patterns:
            if form in ending_sets[pattern]:
                having.append((pattern, root))
        if len(having) == 1:
            necessary_patterns.add(having[0])
        if not having:
            missing_form = form
            break
    if missing_form:
        if DEBUG:
            print u"brak formy: %s" % missing_form
        return 'none', []
    covered_forms = set()
    for pattern, root in necessary_patterns:
        covered_forms |= ending_sets[pattern]
    if form_set.issubset(covered_forms):
        if DEBUG:
            print u"pokryte koniecznymi wzorami: %s" % join(necessary_patterns)
        return 'many', [list(necessary_patterns)]
    else:
        #for pattern, root in included_patterns:
        #  print pattern, ending_sets[pattern]
        minimal_sets = find_minimal_sets(
            form_set, covered_forms, necessary_patterns, included_patterns,
            ending_sets)
        return 'many', minimal_sets

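# find_many_patterns first collects the patterns that are the sole source of
# some form; if those alone cover the whole form set, that single set is
# returned, otherwise find_minimal_sets (from import_morfologik) searches for
# minimal covering combinations.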

def check_sgjp(lc_sym, entry, form_set, **extra):
    if lc_sym != 'adj':
        lexemes = Lexeme.objects.distinct().filter(
            entry=entry, part_of_speech__lexical_class__symbol=lc_sym)
    else:
        lexemes = Lexeme.objects.distinct().filter(
            entry=entry, part_of_speech__symbol__in=('adj', 'appas'))
    lexemes = lexemes.filter(pk__in=sgjp)
    matched_lexemes = []
    for lexeme in lexemes:
        if lc_sym == 'adj' and lexeme.refs_to.filter(type='nieadj'):
            continue
        if lc_sym == 'subst' and extra['tantum'] == 'sg':
            sgjp_forms = lexeme.all_forms(label_filter=r'sg:')
        else:
            # (the 'appas' case is identical, so it needs no separate branch)
            sgjp_forms = lexeme.all_forms()
        if sgjp_forms == form_set:
            matched_lexemes.append(lexeme)
            continue
        diff = sgjp_forms - form_set
        exceptions = []
        if lc_sym == 'subst':
            if lexeme.lexemeinflectionpattern_set.filter(
                    inflection_characteristic__symbol__in=(
                        'm1', 'p1')).exists():
                # depr
                exceptions = lexeme.all_forms(label_filter=r'^pl:nom$')
        elif lc_sym == 'adj':
            # -o
            exceptions = lexeme.all_forms(label_filter=r'^0$')
        if form_set.issubset(sgjp_forms) and diff.issubset(exceptions):
            matched_lexemes.append(lexeme)
    if len(matched_lexemes) > 1:
        if lc_sym == 'subst' and entry.endswith(u'ość'):
            matched_lexemes_subst = [
                l for l in matched_lexemes if
                l.part_of_speech.symbol == 'subst']
            if matched_lexemes_subst:
                matched_lexemes = matched_lexemes_subst
        if len(matched_lexemes) > 1:
            debug(entry, u'niejednoznaczność dopasowanych leksemów')
    if len(matched_lexemes) > 0:
        return matched_lexemes[0]
    return False

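# check_sgjp returns a matching SGJP lexeme, or False when no SGJP lexeme
# covers the imported form set (modulo the depr and adjectival '-o'
# exceptions handled above).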

def closest_lexeme_subst(entry, gender, patterns, included=None):
    lexemes = Lexeme.objects.filter(
        part_of_speech__lexical_class__symbol='subst')
    lexemes = lexemes.distinct()
    # same gender
    genders = expand_gender(gender)
    lexemes = lexemes.filter(
        lexemeinflectionpattern__inflection_characteristic__symbol__in=
        genders)
    if not included:
        # it has a pattern contained in the matching ones
        lexemes = lexemes.filter(lexemeinflectionpattern__pattern__in=patterns)
    else:
        #print patterns, included
        new_lexemes = Lexeme.objects.none()
        # it has all the patterns from one of the sets
        for pattern_set in patterns:
            part = lexemes
            for pattern, root in pattern_set:
                part = part.filter(lexemeinflectionpattern__pattern=pattern)
            new_lexemes |= part
        lexemes = new_lexemes.distinct()
    # it must have no patterns outside the matching ones, and the same
    # capitalization as the entry
    uppercase = entry[0].isupper()
    good_lexemes = []
    for lexeme in lexemes:
        if lexeme.entry[0].isupper() == uppercase:
            for lip in lexeme.lexemeinflectionpattern_set.all():
                if not included:
                    if lip.pattern not in patterns:
                        break
                else:
                    if lip.pattern not in included:
                        break
            else:
                good_lexemes.append(lexeme)
    # pick the candidate with the longest common suffix with the entry
    best = (-1, None)
    for lexeme in good_lexemes:
        common_suffix = 0
        for char1, char2 in zip(entry[::-1], lexeme.entry[::-1]):
            if char1 == char2:
                common_suffix += 1
            else:
                break
        if common_suffix > best[0]:
            best = (common_suffix, lexeme)
    return best[1]

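# Ties are broken by the longest common suffix with the entry: e.g. for the
# entry u'malinówka', a lexeme u'żarówka' (common suffix u'ówka', 4 letters)
# would beat u'mapa' (u'a', 1 letter). The lexemes here are made up for
# illustration.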

def blacklist_filter(patterns):
    return [(pattern, root) for (pattern, root) in patterns
        if pattern.name not in blacklist]


def filter_patterns(filter_fn, action_name, type, patterns, included,
                    including, lexical_class, form_set, entry, **extra):
    old_patterns = patterns
    old_included = included
    bad_patterns = False
    if type == 'many':
        if any(pattern_set != filter_fn(pattern_set)
               for pattern_set in patterns):
            included = filter_fn(included)
            ending_sets = {}
            for pattern, root in included:
                ending_sets[pattern] = good_ending_set(
                    lexical_class, pattern, root, **extra)
            type, patterns = find_many_patterns(
                lexical_class, form_set, entry, included, ending_sets, **extra)
            if type != 'many':
                debug(entry, u'mnogie dopasowanie zepsute przez %s (%s)' %
                             (action_name, join_many(old_patterns)))
                type = 'many'
                patterns, included = old_patterns, old_included
                bad_patterns = True
    elif type == 'none':
        including_dict = dict(including)
        including = [(key, including_dict[key]) for key in
            filter_fn(including_dict)]
    else: # type == 'match'
        patterns = filter_fn(patterns)
        including_dict = dict(including)
        including = [(key, including_dict[key]) for key in
            filter_fn(including_dict)]
        included = filter_fn(included)
        if old_patterns and not patterns:
            ending_sets = {}
            for pattern, root in included:
                ending_sets[pattern] = good_ending_set(
                    lexical_class, pattern, root, **extra)
            type, patterns = find_many_patterns(
                lexical_class, form_set, entry, included, ending_sets, **extra)
            if type == 'none':
                debug(entry, u'znikły wzory przez %s (%s)' %
                             (action_name, join(old_patterns)))
                type = 'match'
                patterns = old_patterns
                bad_patterns = True
    return type, patterns, included, including, bad_patterns


def create_derived(pos, base_forms, forms, patterns):
    tab = {'ger': ('11', u'ie'), 'pact': ('3', u'cy'), 'ppas': ('10', u'y')}
    entries = GroupDict()
    for pattern, root in patterns:
        bfl = tab[pos][0]
        ending = pattern.endings.get(base_form_label__symbol=bfl)
        entry = root + ending.string + tab[pos][1]
        entries.add(entry, pattern.name)
    output = []
    for entry, pattern_names in entries.iteritems():
        if entry in forms:
            output.append((pos, entry, pattern_names))
    return output

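# Each output element is a (pos, entry, pattern_names) triple, e.g. a
# hypothetical ('ger', u'picie', [u'P1']) for a gerund that actually occurs
# among the imported forms.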

def get_sgjp(lexeme):
    return {'source': 'sgjp', 'id': lexeme.pk, 'entry': lexeme.entry}


def create_lexeme(entry, part_of_speech, status, comment):
    return {
        'source': 'morfologik',
        'entry': entry,
        'part_of_speech': part_of_speech,
        'status': status,
        'comment': comment,
    }


def create_lip(pattern, root, i, ic, part_of_speech):
    output = {
        'pattern': pattern if isinstance(pattern, basestring) else pattern.name,
        'ind': i,
        'ic': (ic, part_of_speech),
    }
    if root:
        output['root'] = {'type': 'string', 'root': root}
    else:
        output['root'] = {'type': 'compute'}
    return output

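# For example, create_lip(u'P1', u'kot', 1, 'm2', 'subst') would return
#   {'pattern': u'P1', 'ind': 1, 'ic': ('m2', 'subst'),
#    'root': {'type': 'string', 'root': u'kot'}}
# (pattern name and root are made up for illustration).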

alternative_gender = {
    'p1': 'p3',
    'p3': 'p1',
    'm1': 'm2/m3',
    'm': 'm1',
}

alternative_gender2 = {
    'p1': 'p3',
    'p3': 'p1',
    'm1': 'm',
    'm': 'm1',
}


def lexeme_creation(lc_sym, entry, ic, forms, type, patterns, fitting,
                    bad_patterns, included, other_result, tantum=None,
                    gender=None, negated=None, base_forms=None, derived=None):
    status = 'desc' if type != 'none' else 'cand'
    comments = [u'Z importu resztek']
    copy_lips = False
    # assume the part of speech defaults to the lexical class symbol; without
    # some default, non-subst classes would hit a NameError further down
    part_of_speech = lc_sym
    if lc_sym == 'subst':
        part_of_speech = 'subst' # what about osc and skrs?
        if ic in ('m2', 'm3'):
            sure = False
            if type != 'none' and len(fitting) == 1:
                for pattern, root in patterns:
                    for e in pattern.endings.filter(
                            base_form_label__symbol='sg:gen'):
                        if not e.string.endswith('u'):
                            break
                    else:
                        continue
                    break
                else: # every sg:gen ending ends in 'u'
                    ic = 'm3'
                    sure = True
                for pattern, root in patterns:
                    if pattern.type.symbol == 'f':
                        ic = 'm2'
                        sure = True
                        break
            if not sure:
                status = 'cand'
        if tantum is None and ic == 'm1':
            for pattern, root in patterns:
                nmo_endings = pattern.endings.filter(
                    base_form_label__symbol='pl:nom')
                for e in nmo_endings:
                    nmo_form = root + e.string
                    if nmo_form not in forms:
                        comments.append(u'Dodano formę depr')
                        break
                else:
                    continue
                break
        if ic == 'p1':
            for pattern, root in patterns:
                nmo_endings = pattern.endings.filter(
                    base_form_label__symbol='pl:nom')
                other_endings = pattern.endings.exclude(
                    base_form_label__symbol='pl:nom')
                other_strings = other_endings.values_list('string', flat=True)
                nmo_strings = [e.string for e in nmo_endings if
                    e.string not in other_strings]
                nmo_forms = set(root + s for s in nmo_strings)
                if nmo_forms & set(forms):
                    comments.append(
                        u'Usunięto formę depr: %s' % ', '.join(list(nmo_forms)))
                    break
        if tantum == 'sg' and type != 'none':
            if type == 'match':
                search_patterns = [pattern for pattern, root in fitting]
                l = closest_lexeme_subst(entry, gender, search_patterns)
            else:
                included_patterns = [pattern for pattern, root in included]
                l = closest_lexeme_subst(entry, gender, fitting,
                    included_patterns)
            if l:
                copy_lips = l.lexemeinflectionpattern_set.all()
                #print l
                comments.append(u'Automatycznie rozszerzone singulare tantum')
            else:
                if type == 'match':
                    p = join(fitting)
                else:
                    p = join_many(fitting)
                debug(entry,
                    u'nie ma pasujących leksemów dla rozszerzenia sgtant '
                    u'dla wzorów %s' % p)
                comments.append(u'Nie udało się rozszerzyć singulare tantum')
                #status = 'cand'
                # add a qualifier [after the import, though]
    if bad_patterns:
        comments.append(u'Wzory z czarnej listy!')
        status = 'cand'
    if len(fitting) > 1 or (type == 'none' and fitting):
        status = 'cand'
        if type == 'none':
            comments.append(u'Zawierające wzory:')
            for (pattern, root), bad_forms in fitting:
                comments.append('%s: %s' % (pattern.name, ', '.join(bad_forms)))
        elif type != 'many':
            comments.append(u'Pasujące wzory: %s' % join(fitting))
        else:
            comments.append(u'Pasujące zestawy wzorów: %s' % join_many(fitting))
    if other_result:
        status = 'cand'
        type2, patterns2, included2, including2 = other_result
        comments.append(u'Alternatywny rodzaj: %s' % alternative_gender[gender])
        if type2 == 'match':
            comments.append(u'Pasujące wzory: %s' % join(patterns2))
        elif type2 == 'many':
            comments.append(
                u'Pasujące zestawy wzorów: %s' % join_many(patterns2))
            # hm?
    if ic is None and type != 'none':
        comments.append(u'Dopasowane wzory: %s' % join(patterns))
    # collect the patterns for comparison [ugh, copy-paste!]
    # (all_patterns / other_patterns are not used further in this function)
    if len(fitting) > 1:
        if type == 'none':
            all_patterns = set(p for p, b_f in fitting)
        elif type != 'many':
            all_patterns = set(fitting)
        else:
            all_patterns = set()
            for pattern_set in fitting:
                all_patterns |= set(pattern_set)
    if other_result and len(patterns2) > 1:
        if type2 != 'many':
            other_patterns = set(patterns2)
        else:
            other_patterns = set()
            for pattern_set in patterns2:
                other_patterns |= set(pattern_set)
    comment = '\n'.join(comments)
    output = {
        'lexeme': create_lexeme(entry, part_of_speech, status, comment)
    }
    lips = []
    if ic is not None:
        if not copy_lips:
            for i, (pattern, root) in enumerate(patterns):
                lips.append(
                    create_lip(pattern, root, i + 1, ic, part_of_speech))
        else:
            for lip in copy_lips:
                ic = lip.inflection_characteristic.symbol
                lips.append(
                    create_lip(lip.pattern, None, lip.index, ic,
                        part_of_speech))
    output['lips'] = lips
    if lc_sym == 'adj' and negated:
        output['negated'] = True
    if lc_sym == 'v':
        derived_data = []
        for pos in derived:
            # we ought to report when this comes out empty... (?)
            derived_data += create_derived(pos, base_forms, forms, patterns)
        output['derived'] = derived_data
    return output

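# The dict built above always carries 'lexeme' and 'lips'; 'negated' is added
# for negated adjectives and 'derived' for verbs, and print_data serialises
# the whole thing to a JSON line.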

def process_forms(entry, forms, lc_sym, **extra):
    lexical_class = LexicalClass.objects.get(symbol=lc_sym)
    other_result = None
    form_set = set(forms)
    check = check_sgjp(lc_sym, entry, form_set, **extra)
    if check and not DEBUG:
        # append the lexeme to the dictionary
        data = {'lexeme': get_sgjp(check)}
        # TODO: adjective negation
        print_data(data)
    else:
        if lc_sym == 'subst':
            ic = extra['gender']
        extra2 = dict(extra)
        # masculine and plurale tantum nouns get more than one pass
        if lc_sym == 'subst' and extra['gender'] in 'pm':
            if extra['gender'] == 'm':
                extra2['gender'] = 'm1'
                type1, patterns1, included1, including1 = find_patterns(
                    lexical_class, entry, forms, **extra2)
                extra2['gender'] = 'm'
                type2, patterns2, included2, including2 = find_patterns(
                    lexical_class, entry, forms, **extra2)
            elif extra['gender'] == 'p':
                extra2['gender'] = 'p1'
                type1, patterns1, included1, including1 = find_patterns(
                    lexical_class, entry, forms, **extra2)
                extra2['gender'] = 'p3'
                type2, patterns2, included2, including2 = find_patterns(
                    lexical_class, entry, forms, **extra2)
            if type1 != 'none' and type2 == 'none':
                type, patterns, included, including = type1, patterns1, included1, including1
                if extra['gender'] == 'm':
                    ic = 'm1'
                else:
                    ic = 'p1'
            elif type1 == 'none' and type2 != 'none':
                type, patterns, included, including = type2, patterns2, included2, including2
                if extra['gender'] == 'm':
                    ic = 'm'
                else:
                    ic = 'p3'
            elif type1 == type2 == 'none':
                type = 'none'
                patterns = []
                included = list(set(included1) | set(included2))
                including = list(set(including1) | set(including2))
                # probably worth setting something here
                if extra['gender'] == 'm':
                    ic = 'm'
                else:
                    ic = 'p3'
            else: # both passes produced something
                type, patterns, included, including = type1, patterns1, included1, including1
                if extra['gender'] == 'm':
                    ic = 'm'
                else:
                    ic = 'p1'
                other_result = (type2, patterns2, included2, including2)
                if DEBUG:
                    print u"dwie możliwości na rodzaj"
        else:
            type, patterns, included, including = find_patterns(
                lexical_class, entry, forms, **extra)
        if type == 'none':
            if lc_sym == 'subst' and not extra['tantum']:
                extra['tantum'] = tantum_a_posteriori(
                    form_set, [p for p, b_f in including])
                if extra['tantum']:
                    if extra['tantum'] == 'pl':
                        extra['gender'] = 'p'
                    return process_forms(entry, forms, lc_sym, **extra)

        if lc_sym == 'subst':
            extra2['gender'] = ic
        type, patterns, included, including, bad_patterns = filter_patterns(
            blacklist_filter, u'czarną listę', type, patterns, included,
            including,
            lexical_class, form_set, entry, **extra2)
        if bad_patterns and other_result:
            type, patterns, included, including = other_result
            ic = extra2['gender'] = alternative_gender2[ic]
            type, patterns, included, including, bad_patterns = filter_patterns(
                blacklist_filter, u'czarną listę', type, patterns, included,
                including,
                lexical_class, form_set, entry, **extra2)
            other_result = None
        elif other_result:
            type2, patterns2, included2, including2 = other_result
            new_other_result = filter_patterns(
                blacklist_filter, u'czarną listę', type2, patterns2, included2,
                including2, lexical_class, form_set, entry, **extra2)
            if not new_other_result[4]:
                other_result = new_other_result[:4]
            else:
                other_result = None

        # from this point on the patterns no longer change
        if type == 'many':
            # or just patterns[0]...
            all_patterns = [p for pattern_set in patterns for p in pattern_set]
        else:
            all_patterns = patterns
        if type == 'none':
            ic = None
        if lc_sym == 'subst':
            # correction for m2/m3
            if ic in ('m2', 'm3') and patterns and not bad_patterns:
                new_ic = ''
                for pattern, root in all_patterns:
                    for ic2 in ('m2', 'm3'):
                        # if all SGJP uses of this pattern have ic2
                        if not pattern.lexemeinflectionpattern_set.exclude(
                                inflection_characteristic__symbol=ic2).filter(
                                lexeme__pk__in=sgjp).exists():
                            if new_ic == '':
                                new_ic = ic2
                            elif new_ic != ic2:
                                new_ic = None
                if new_ic:
                    ic = new_ic

        if type == 'none':
            debug(entry, u'zawiera się w %s' % join(p for p, b_f in including))
            chosen = []
            fitting = including
        elif type == 'match':
            patterns.sort(key=lambda p: p[0].name)
            fitting = patterns
            chosen = patterns[:1]
        elif type == 'many':
            chosen = patterns[0]
            if DEBUG:
                print u'zestawy wielu wzorów: %s' % join_many(patterns)
            fitting = patterns
        if not DEBUG:
            data = lexeme_creation(
                lc_sym, entry, ic, forms, type, chosen, fitting, bad_patterns,
                included,
                other_result, **extra2)
            print_data(data)


def get_pos_ndm(tag):
    if tag[0] == 'adv':
        return 'adv' if tag[-1] != 'comp' else 'advcom'
    elif tag[0] == 'xxx':
        return 'burk'
    else:
        return tag[0]

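# Examples: ('adv',) -> 'adv', ('adv', 'comp') -> 'advcom',
# ('xxx', ...) -> 'burk'; for any other tag its first element is returned.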

def print_data(data):
    print json.dumps(data)


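# Each input line is expected to look like (format reconstructed from the
# parsing below; the example itself is made up):
#   ENTRY:POS;IC;FORM1,FORM2,...
# e.g. u"zlewki:subst;p3;zlewek,zlewkom,zlewkami,zlewkach"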
def import_resztki(input_file):
    for line in input_file:
        entry, rest = line.strip().replace("'", u'’').split(':')
        pos, ic, rest = rest.split(';')
        if ic in ('n2/m3', 'n2/m3/m2', 'm3/m2'):
            ic = 'm3'
        elif ic == 'p3/p2':
            ic = 'p3'
        elif ic in ('f/m3', 'f/n2'):
            ic = 'f'
        forms = rest.split(',')
        if entry not in forms:
            forms = [entry] + forms
        if ic[0] == 'p':
            tantum = 'pl'
        else:
            tantum = None
        if entry.endswith(u'stwo') and ic == 'p1':
            # superfluous plural forms
            root = entry[:-1]
            for suffix in ('', 'ami', 'ach', 'om'):
                forms.remove(root + suffix)
        process_forms(entry, forms, pos, gender=ic, tantum=tantum)