RealizationDescriptions.py 45.9 KB

Edit Raw Blame History

import datetime
import os

from collections import Counter, defaultdict
from itertools import chain

from shellvalier.settings import BASE_DIR, DEBUG

from meanings.models import LexicalUnit, Synset
from semantics.models import SemanticRole, RoleAttribute

from entries.phrase_descriptions.utils import get_form
from entries.phrase_descriptions.polish_strings import TO
from entries.phrase_descriptions.descriptions import make_phraseologisms

from importer.Phrase import Case, Preposition, Modification, Words, LexPhrase, Fixed, NP, LexNP, LexNumP, PrepNP, LexPrepNP, LexPrepGerP, AdjP, LexAdjP, LexPrepAdjP, PActP, LexPActP
from importer.RealizationDescriptionUtils import *

def get_prefs_list(argument):
    """Return the argument's selectional preferences as a flat list of strings.

    Predefined preferences come first, then synsets, then relations; each of
    the three groups is sorted by string representation on its own.
    """
    groups = (argument.predefined, argument.synsets, argument.relations)
    prefs = []
    for group in groups:
        prefs.extend(sorted(str(pref) for pref in group.all()))
    return prefs

# Role names for which get_predefined_lemma() does NOT collapse the
# LUDZIE/PODMIOTY predefined preferences into a plain 'LUDZIE'.
LOCATION_ROLES = {'Location', 'Path'}

def select_predefined(predefs):
    """Choose one predefined-preference key out of `predefs`.

    A single key is returned as is; any other number of keys falls back to
    the generic 'ALL'.
    """
    if len(predefs) != 1:
        # TODO other heuristics? Once they exist, an unresolved choice is
        # meant to raise RealisationDescriptionError instead of using 'ALL'.
        return 'ALL'
    return predefs[0]

def select_predefined_for_xp(predefs, role):
    """Choose the predefined-preference key used to describe an xp phrase.

    Only the exact list ['ISTOTY'] is kept; everything else becomes 'ALL'.
    `role` is accepted but not consulted yet.
    """
    # TODO heuristics? Once they exist, an unresolved choice is meant to
    # raise RealisationDescriptionError instead of falling back to 'ALL'.
    return 'ISTOTY' if predefs == ['ISTOTY'] else 'ALL'

def get_predefined_lemma(argument, xp=False):
    """Return a one-element list with the predefined lemma key, or None.

    None is returned when the argument has no predefined preferences.
    Outside LOCATION_ROLES, the presence of LUDZIE or PODMIOTY always yields
    ['LUDZIE']; otherwise the key is chosen by select_predefined_for_xp()
    (when `xp` is true) or select_predefined().
    """
    predefined = argument.predefined.all()
    if not predefined:
        return None
    keys = sorted(p.key for p in predefined)
    role = argument.role.role.role
    is_location = role in LOCATION_ROLES
    if not is_location and {'LUDZIE', 'PODMIOTY'}.intersection(keys):
        return ['LUDZIE']
    chosen = select_predefined_for_xp(keys, role) if xp else select_predefined(keys)
    return [chosen]

def get_hyponyms(synset, seen=None, tab=' '):
    """Collect all direct and indirect hyponyms of `synset`.

    `seen` is a set of synsets to skip; it is extended in place with every
    synset collected here. `tab` is accepted for compatibility and has no
    influence on the result.
    """
    if seen is None:
        seen = set()
    collected = set()
    pending = [synset]
    while pending:
        current = pending.pop()
        for hypo in current.hyponyms.all():
            if hypo in seen:
                continue
            seen.add(hypo)
            collected.add(hypo)
            pending.append(hypo)
    return collected

# for benchmarking: select_synsets() appends (hyponym count, synset id, synset)
# here, keyed by the computation time in deciseconds, when DEBUG is on
BENCH3 = defaultdict(list)

# Synset id -> number of hyponyms, consulted and extended by select_synsets().
# Precalculated for the largest synsets; the comment above each entry lists
# the synset's lexical units (Polish lemma-sense labels, kept verbatim).
HYPONYM_CACHE = {
    # sklep-1
    4747 : 46,
    # obiekt budowlany-1
    53426 : 590,
    # konstrukcja-1
    7218 : 614,
    # cecha człowieka-1
    36347 : 676,
    # aberracja-1 nieprawidłowość-1 zaburzenie-2 zakłócenie-3
    4127 : 700,
    # znak-1
    7416 : 732,
    # coś na ząb-1 jedzenie-2 pokarm-1 pożywienie-3 żywność-1
    10738 : 766,
    # materiał-1 tworzywo-1
    1612 : 879,
    # jednostka miary-1 jednostka-4 miano-2 miara-3
    1161 : 881,
    # związek chemiczny-1 związek-1
    19589 : 882,
    # zjawisko naturalne-1
    5351 : 901,
    # dzieło-2 praca-6
    7469 : 927,
    # część-1
    462 : 957,
    # cecha czynności-1 cecha działania-1
    5953 : 1033,
    # część-3
    104936 : 1056,
    # cecha fizyczna-1
    5464 : 1056,
    # wypowiedź-1
    3998 : 1062,
    # proces-1
    54253 : 1103,
    # ciąg wydarzeń-1 ciąg zdarzeń-1
    47401 : 1107,
    # grupa-2 zespół ludzi-1 zespół-2
    7653 : 1176,
    # człowiek charakteryzowany ze względu na kwalifikacje-1
    6779 : 1188,
    # substancja chemiczna-1
    5233 : 1206,
    # przyrząd-1
    7425 : 1260,
    # ilość-1
    1078 : 1427,
    # grupa ludzi-1 grupa-5 ludzie-1
    7702 : 1510,
    # kategoria-3 pojęcie-2
    8170 : 1522,
    # urządzenie-5
    7446 : 1524,
    # historia-3 wydarzenie-1 wypadek-3 zdarzenie-2
    6526 : 1533,
    # grupa istot-1
    103330 : 1585,
    # miejsce-1
    4750 : 1632,
    # stan-1
    3243 : 1761,
    # narzędzie-1
    7610 : 1800,
    # roślina-1
    4603 : 1928,
    # artefakt-1 twór-5 wytwór-2
    2605 : 2029,
    # człowiek ze względu na swoje zajęcie-1
    6797 : 2184,
    # nazwa człowieka uwzględniająca jego cechy-1 nosiciel cechy-1
    6778 : 2308,
    # płód-3 wytwór umysłu-1
    8137 : 2599,
    # człowiek ze względu na relacje społeczne-1
    6775 : 2642,
    # fenomen-1 zjawisko-1
    5371 : 2674,
    # środek-1
    28294 : 2793,
    # człowiek, który coś robi-1
    241977 : 2828,
    # substancja-1
    5236 : 2871,
    # zwierzę-1
    5621 : 2966,
    # materia-3
    247979 : 2970,
    # spowodowanie-1 sprawienie-1
    102579 : 4255,
    # atrybut-1 cecha-1 przymiot-1 własność-2 właściwość-1
    323 : 4579,
    # grupa-4 zbiór-1
    1282 : 4587,
    # uczynienie-1 zrobienie-1
    102576 : 4851,
    # całość-1 ogół-1
    2129 : 5668,
    # człowiek-1 istota ludzka-1 jednostka-2 osoba-1
    6047 : 6151,
    # osoba-4
    28688 : 6170,
    # wytwór-1
    2903 : 7230,
    # efekt-1 rezultat-1 skutek-1 wynik-1
    # NOTE(review): the other entries are listed in ascending order of count;
    # this value is larger than the next one — verify it is not a typo.
    5195 : 7915,
    # przedmiot-1
    2646 : 7552,
    # istota żywa-1 stworzenie-5 twór-1
    6045 : 8448,
    # istota-1
    1027 : 8536,
    # czynność-1
    10765 : 8653,
    # rzecz-4
    103156 : 9480,
    # egzemplarz-1 indywiduum-1 jednostka-3 organizm-1 osobnik-2
    6731 : 10609,
    # obiekt-2
    234224 : 21435,
}

def select_synsets(synsets):
    """Return the synsets (as a list) that have the most hyponyms.

    Hyponym counts are taken from HYPONYM_CACHE; missing ones are computed
    with get_hyponyms() and stored there. When DEBUG is on, each fresh
    computation is recorded in BENCH3 under its duration in deciseconds.
    """
    grouped = defaultdict(set)
    for synset in synsets:
        synset_id = synset.id
        if synset_id not in HYPONYM_CACHE:
            started = datetime.datetime.now()
            HYPONYM_CACHE[synset_id] = len(get_hyponyms(synset))
            elapsed = datetime.datetime.now() - started
            if DEBUG:
                # deciseconds :)
                deciseconds = round(elapsed.total_seconds() * 10)
                BENCH3[deciseconds].append((HYPONYM_CACHE[synset_id], synset_id, synset))
        grouped[HYPONYM_CACHE[synset_id]].add(synset)
    largest = max(grouped.keys())
    return list(grouped[largest])

# Lemma -> summed frequency, loaded at import time from a tab-separated file
# with three columns per line: lemma, part of speech, frequency.
FREQ = Counter()
# NOTE(review): the file is opened with the platform's default encoding;
# confirm the data file's encoding and consider passing it explicitly.
with open(os.path.join(BASE_DIR, 'data/freq/sgjp-freq-23032021.tab')) as f:
    for l in f:
        lemma, pos, freq = l.strip('\n').split('\t')
        # only adjectives and nouns are counted
        if pos not in ('adj', 'subst'):
            continue
        freq = int(freq)
        # entries with frequency below 10 are ignored
        if freq < 10:
            continue
        # this is inaccurate, but conflate multiple occurrences
        FREQ[lemma] += freq

def rank_units(units, ranker):
    """Map every unit to its dense rank under `ranker`.

    Units with equal `ranker(unit)` values share a rank; ranks start at 0
    for the smallest value and grow by one per distinct value.
    """
    buckets = defaultdict(set)
    for unit in units:
        buckets[ranker(unit)].add(unit)
    return {
        unit: rank
        for rank, score in enumerate(sorted(buckets))
        for unit in buckets[score]
    }

def meaning_no_ranker(unit):
    """Rank by the unit's sense number."""
    return int(unit.sense)


# TODO is it better to prefer fewer meanings (more specific -> more precise?)
# or more meanings (more frequent -> easier to understand?)
def num_meanings_ranker(unit):
    """Rank by the number of lexical units sharing the unit's base form."""
    return LexicalUnit.objects.filter(base=unit.base).count()


# this way multi-word units also get the lowest priority whenever
# a single-word unit present on the frequency list exists
def freq_ranker(unit):
    """Rank by descending corpus frequency of the base form (0 if unlisted)."""
    return -FREQ.get(unit.base, 0)


def words_ranker(unit):
    """Rank by the number of whitespace-separated words in the base form."""
    return len(unit.base.split())


# differences were eyeballed on a sample for the following ranker orders:
# [meaning_no_ranker, freq_ranker, num_meanings_ranker]
# [freq_ranker, meaning_no_ranker, num_meanings_ranker] -> [freq_ranker, num_meanings_ranker, meaning_no_ranker] -> same results on the sample; TODO Ela's suggestion: the second option sounds more intuitive
# [num_meanings_ranker, meaning_no_ranker, freq_ranker]
# [meaning_no_ranker, num_meanings_ranker, freq_ranker]

def select_units(units, rankers=[freq_ranker, num_meanings_ranker, meaning_no_ranker, words_ranker]):
    """Return the set of best-ranked lexical units.

    Units listed in UNIT_KILL_LIST (by base and sense) are discarded. Each
    remaining unit gets one rank per ranker (see rank_units()); units are
    compared by these rank vectors, earlier rankers taking precedence, and
    all units sharing the smallest vector are returned.
    """
    candidates = [u for u in units if (u.base, u.sense) not in UNIT_KILL_LIST]
    rank_vectors = defaultdict(lambda: [0] * len(rankers))
    for position, ranker in enumerate(rankers):
        for unit, rank in rank_units(candidates, ranker).items():
            rank_vectors[unit][position] = rank
    grouped = defaultdict(set)
    for unit, vector in rank_vectors.items():
        grouped[tuple(vector)].add(unit)
    best_vector = sorted(grouped)[0]
    return grouped[best_vector]

# results of get_synsets_lemma(), keyed by the sorted string forms of the synsets
LEMMA_CACHE = {}

# returns ([lemmata], is_predef), or None when no usable synset is left
def get_synsets_lemma(argument, pos):
    """Choose lemmata describing the argument's synsets of part of speech `pos`.

    Synsets in SYNSET_KILL_LIST are skipped and those in SYNSET_MAP are
    replaced by their mapped synset. Results other than the SYNSET2LEMMA
    shortcut and the special-cased 'cecha' are memoised in LEMMA_CACHE.
    """
    candidates = []
    for s in argument.synsets.filter(lexical_units__pos=pos).distinct():
        if s.id in SYNSET_KILL_LIST:
            continue
        if s.id in SYNSET_MAP:
            s = Synset.objects.get(id=SYNSET_MAP[s.id])
        candidates.append(s)
    if not candidates:
        return None

    key = tuple(sorted(str(s) for s in candidates))
    if key in LEMMA_CACHE:
        return LEMMA_CACHE[key]

    if len(candidates) > 1:
        candidates = select_synsets(candidates)
    for synset in candidates:
        if synset.id in SYNSET2LEMMA:
            return [SYNSET2LEMMA[synset.id]], True

    units = list(chain.from_iterable(synset.lexical_units.all() for synset in candidates))
    if len(units) != 1:
        units = select_units(units)
    lemmata = sorted(unit.base for unit in units)
    if lemmata == ['cecha czynności', 'cecha działania']:
        return (['cecha'], False)
    ret = (lemmata, False)
    LEMMA_CACHE[key] = ret
    return ret

# for benchmarking: duration in deciseconds -> list of recorded calls
BENCH2 = defaultdict(list)

def get_argument_lemma(argument, xp=False):
    """Timing wrapper around get_argument_lemma2().

    Returns whatever get_argument_lemma2() returns; when DEBUG is on, the
    call is recorded in BENCH2 under its duration in deciseconds.
    """
    started = datetime.datetime.now()
    result = get_argument_lemma2(argument, xp=xp)
    elapsed = datetime.datetime.now() - started
    if DEBUG:
        # deciseconds :)
        deciseconds = round(elapsed.total_seconds() * 10)
        BENCH2[deciseconds].append((argument.predefined.all(), argument.synsets.all(), result))
    return result

def get_argument_lemma2(argument, xp=False):
    """Return ([lemmata], is_predef) describing the argument's preferences.

    Predefined preferences win; otherwise noun synsets, then adjective
    synsets are tried. When nothing is found, the generic (['ALL'], True)
    is returned.
    """
    predefined = get_predefined_lemma(argument, xp=xp)
    if predefined:
        return predefined, True
    for pos in ('noun', 'adj'):
        # get_synsets_lemma returns [lemmata], is_predef
        found = get_synsets_lemma(argument, pos)
        if found:
            return found
    # TODO!!! e.g. akuratność; a lemma derived from the argument's relations
    # (get_relations_lemma) is not used yet, so fall back to 'ALL'
    return ['ALL'], True

# the following should not occur together:
#   * LUDZIE + PODMIOTY
#   * MIEJSCE + OTOCZENIE + POŁOŻENIE

def _parse_multiword_lemma(lemma):
    """Split a space-separated lemma into (head word, modification).

    Raises RealisationDescriptionError when the words' simplified tags match
    none of the supported patterns.
    """
    # e.g. ‹środki pieniężne›
    words = lemma.split(' ')
    tags = [sorted(get_simplified_tags(word)) for word in words]
    if len(words) == 2:
        first, second = words
        first_tags, second_tags = tags
        if 'subst:nom' in first_tags and 'adj' in second_tags:
            # e.g. ‹środki pieniężne›
            # ‹napój wyskokowy› -> ‹napój› is also impt,
            # ‹stan psychiczny› -> ‹psychiczny› is also subst,
            mod = make_adjp_mod(second)
            mod._order = 'post'
            return first, mod
        if 'subst:nom' in second_tags and 'adj' in first_tags:
            # e.g. ‹zły uczynek›
            return second, make_adjp_mod(first)
        if 'subst:nom' in first_tags and 'pact' in second_tags:
            # e.g. ‹pojazd latający›
            mod = make_pactp_mod(second)
            mod._order = 'post'
            return first, mod
        if 'subst:nom' in first_tags and 'subst:gen' in second_tags:
            # e.g. ‹dziedzina wiedzy›
            return first, make_npgen_mod(second)
        if 'subst:nom' in first_tags and 'ger:gen' in second_tags:
            # e.g. ‹język programowania›
            # there is no lexgerp, so fixed is used instead
            mod = make_fixed_mod(second)
            mod._order = 'post'
            return first, mod
    elif len(words) == 3 and 'subst:nom' in tags[0] and 'prep:gen' in tags[1] and 'subst:gen' in tags[2]:
        # e.g. ‹maszyna do szycia›
        return words[0], make_prepnp_mod(words[2], words[1], 'gen')
    raise RealisationDescriptionError('couldn’t parse lemma: {} {}'.format(lemma, tags))


def process_lemma(lemma, phrase_type):
    """Return (lemma, gender, number, part of speech, modification) for `lemma`.

    Predefined keys are looked up in PREDEF2LEMMA (per phrase type, with '_'
    as the default entry). Multi-word lemmata are reduced to their head plus
    a modification; the remaining lemma is classified through a fixed
    sequence of morphological look-ups. Raises RealisationDescriptionError
    when the lemma cannot be parsed or classified.
    """
    if lemma in PREDEF2LEMMA:
        variants = PREDEF2LEMMA[lemma]
        lemma, gend, num, pos, mod = variants.get(phrase_type, variants['_'])
        return lemma, gend, num, pos, mod

    mod = NATR
    if ' ' in lemma:
        lemma, mod = _parse_multiword_lemma(lemma)

    if lemma == 'lata':
        return 'rok', 'm3', 'pl', 'subst', mod
    if lemma in GERUNDS:
        return lemma, 'n', 'sg', 'subst', mod

    # noun with this very lemma: singular first, then "plural" lemmata that
    # the analyser records as plurale tantum, e.g. ‹środki›
    for number in ('sg', 'pl'):
        interps = get_interps(lemma, lemma=lemma, tag_constraints=['subst', number, 'nom'])
        if interps:
            return lemma, get_gender(interps), number, 'subst', mod

    # "plural" lemma not recorded as plurale tantum, e.g. ‹pieniądze›
    pt_interps = get_interps(lemma, tag_constraints=['subst', 'pl', 'nom'])
    if pt_interps:
        base_forms = {base for base, _tag in pt_interps}
        if len(base_forms) == 1:
            return base_forms.pop(), get_gender(pt_interps), 'pl', 'subst', mod

    # adjective
    if get_interps(lemma, lemma=lemma, tag_constraints=['adj', 'sg', 'nom', 'm1']):
        return lemma, None, 'sg', 'adj', mod

    # gerund
    ger_interps = get_interps(lemma, tag_constraints=['ger', 'sg', 'nom'])
    if ger_interps:
        base_forms = {base for base, _tag in ger_interps}
        if len(base_forms) == 1:
            return base_forms.pop(), 'n', 'sg', 'ger', mod

    # TODO gender depending on hypernyms?
    raise RealisationDescriptionError('couldn’t process lemma: {} {}'.format(lemma, get_interps(lemma)))

# Two-word string -> frequency, loaded at import time from a tab-separated
# file with two columns per line; xp2prepnp() looks up '<prep> <form>' here.
PREP_2GRAMS = Counter()
# NOTE(review): the file is opened with the platform's default encoding;
# confirm the data file's encoding and consider passing it explicitly.
with open(os.path.join(BASE_DIR, 'data/freq/2grams_prep_nkjp')) as f:
    for l in f:
        digram, freq = l.strip('\n').split('\t')
        freq = int(freq)
        PREP_2GRAMS[digram] = freq

# adverbial category -> candidate (preposition, case) pairs
XP2PREPNP = {
    'abl'   : (('z', 'gen'),),
    # do domu / na basen ("to the house" / "to the pool")
    'adl'   : (('do', 'gen'), ('na', 'acc'),),
    # w mieście, na wsi, u Janka ("in the city, in the country, at Janek's")
    'locat' : (('w', 'loc'), ('na', 'loc',), ('u', 'gen'),),
    'perl'  : (('przez', 'acc'),),
    'temp'  : (('podczas', 'gen'),),
    'dur'   : (('przez', 'acc'),),
}

def xp2prepnp(advcat, lemma, num):
    """Pick the (preposition, case) pair realising adverbial category `advcat`.

    Returns (None, None) for categories absent from XP2PREPNP. With several
    candidates, the one whose '<prep> <inflected lemma>' string has the
    highest PREP_2GRAMS frequency wins; ties fall back to the ordering of
    the (preposition, case) pairs themselves.
    """
    if advcat not in XP2PREPNP:
        return None, None
    candidates = XP2PREPNP[advcat]
    if len(candidates) == 1:
        return candidates[0]
    scored = []
    for prep, case in candidates:
        inflected = get_form(lemma, ['subst', num, case])[0]
        frequency = PREP_2GRAMS['{} {}'.format(prep, inflected)]
        scored.append((-frequency, (prep, case)))
    scored.sort()
    return scored[0][1]

# adverbial category -> complex preposition used by generate_phrases()
# for xp phrases of that category
XP2COMPREPNP = {
     'caus'  : 'z powodu',
     # TODO: animate: dla ..., inanimate: w celu ...
     'dest'  : 'w celu',
     'instr' : 'za pomocą',
}

def generate_phrases(function, negativity, phrase, lemma, is_predef, head_gender, controller=None, controller_grammar=None):
    """Generate textual descriptions of `phrase` filled with `lemma`.

    Parameters:
        function: syntactic function value of the position (or None).
        negativity: negativity name; only the value 'neg' is tested here.
        phrase: importer phrase object (its `_name` selects the branch).
        lemma: lemma or predefined key to put into the phrase.
        is_predef: whether `lemma` is a predefined key.
        head_gender: gender of the head noun, or None.
        controller, controller_grammar: passed on to make_phraseologisms().

    Returns a triple (phrases, gender, number): a list of description
    strings plus gender and number, which are None for several phrase types.

    Raises RealisationDescriptionError for a 'rel' clause without
    realisations (and whatever process_lemma() raises); an unsupported
    clause type deliberately crashes with ZeroDivisionError.
    """
    phrase_type = phrase._name
    dummy_id = None

    if is_predef and phrase_type == 'xp' and not phrase._category._limitations:
        advcat = phrase._category._value
        # e.g. „komuś podobało się gdzieś”
        return [PREDEFXP[advcat][lemma]], 'n', 'sg'

    distrp = False
    processed_lemma, gend, num, pos, mod = process_lemma(lemma, phrase_type)
    if phrase_type in ('adjp', 'prepadjp') and pos != 'adj':
        # e.g. aborcja - Manner - lek - adjp(agr)/xp(instr) -> ‹jakaś aborcja›
        processed_lemma, gend, pos, mod = 'jakiś', None, 'adj', NATR
    if phrase_type == 'nonch':
        phrase_type = 'np'
        phrase = NP(Case('nom'), dummy_id)
        # because nonch can only be realised by ‹coś› and the like
        processed_lemma, gend, pos, mod = 'coś', 'n', 'subst', NATR
        # and it is processed further as np
    if phrase_type == 'distrp':
        # ‘po jabłku’ would be OK, but e.g. ‘po pieniądzach’ sounds silly, so
        # we build np(gen) and later prepend ‘po ileś’ (of something)
        distrp = True
        phrase_type = 'np'
        phrase = NP(Case('gen'), dummy_id)
        # and it is processed further as np

    print('PHRASE TYPE:', phrase_type, 'LEMMA:', processed_lemma, 'MODIFICATION:', mod, 'FUNCTION:', function)
    words = Words('concat', 'xor', [processed_lemma])

    # TODO
    if phrase_type in ('cp', 'ncp', 'prepncp'):
        cptype = phrase._type._value
        assert(cptype in ('int', 'rel') or not phrase._type._realisations)
        phr = None
        if cptype == 'int':
            if phrase._type._realisations:
                phr = '/'.join(phrase._type._realisations) + ' …'
            else:
                phr = 'kto/co/czy/… robi/się dzieje/…'
        elif cptype == 'rel':
            if phrase._type._realisations:
                phr = '/'.join(phrase._type._realisations) + ' …'
            else:
                raise RealisationDescriptionError('rel phrase without realisations: {}'.format(phrase))
        elif cptype == 'żeby2':
            # NOTE(review): an unused local used to pick 'że' vs 'żeby' from
            # `negativity` here; confirm whether the negated variant should
            # read 'żeby coś się stało' instead of the fixed text below.
            phr = 'że coś się stało'
        elif cptype in ('żeby', 'jakoby', 'jakby',):
            phr = '{} coś się stało'.format(cptype)
        elif cptype in ('że', 'bo', 'gdy', 'jak', 'jeśli', 'kiedy',):
            phr = '{} coś się dzieje'.format(cptype)
        elif cptype in ('aż', 'zanim',):
            phr = '{} coś się stanie'.format(cptype)
        else:
            # unsupported clause type: print it and crash on purpose
            print(phrase)
            1 / 0
        if phrase_type == 'cp':
            return [phr], 'n', 'sg'
        if phrase_type == 'ncp':
            return ['{}, {}'.format(TO[phrase._case._value], phr)], 'n', 'sg'
        if phrase_type == 'prepncp':
            return ['{} {}, {}'.format(phrase._prep._value, TO[phrase._prep._case._value], phr)], 'n', 'sg'
    if phrase_type == 'or':
        # TODO? absurd „coś się dzieje”? absurd: coś się dzieje?
        return ['„coś się dzieje”'], 'n', 'sg'
    if phrase_type in ('refl', 'recip'):
        # TODO?
        return ['się'], None, None
    if phrase_type == 'advp':
        # TODO!
        if pos == 'adj':
            return [adj2adv(processed_lemma)], None, None
        # for non-adjectives nothing better can be produced anyway
        return ['jakoś'], None, None
    if phrase_type == 'infp':
        # TODO?
        return ['coś robić' if negativity != 'neg' else 'czegoś robić'], 'n', 'sg'
    if phrase_type == 'E':
        # TODO?
        return ['∅'], 'n', 'sg'

    if pos == 'adj' and phrase_type not in ('possp', 'adjp', 'prepadjp',):
        # TODO? e.g. aktualizacja - Manner - automatyczny - xp(instr)
        # TODO wrong output for chlastać, but there Instrument has the
        # adjectival preference ‹ostry›; it should rather be ‹ostrze›
        phrase_type = 'adjp'
        phrase = AdjP(Case('agr'), dummy_id)
        # and it is processed further as adjp

    if phrase_type == 'possp' and processed_lemma == 'czyjś':
        return [get_form(processed_lemma, ['sg', 'nom', head_gender, 'pos'])[0]], None, None
    if phrase_type == 'comprepnp':
        # TODO multi-word lemmata! ‹abonament w wysokości środków pieniężnych›
        # TODO „w czyjejś sprawie”, „na czyjąś rzecz” might read nicer, but
        # that is a bit harder
        return make_comprepnp(phrase._prep._value, words, num, mod), None, None

    lex_phrases = []
    phrases = []

    if phrase_type == 'np':
        # gerund; TODO? a list of exceptions if there are more
        if (processed_lemma, function, phrase._case._value) == ('przyrządzanie', 'subj', 'str'):
            return ['przyrządzanie'], 'n', 'sg'
        if (processed_lemma, function, phrase._case._value) == ('szarpnięcie', None, 'inst'):
            return ['szarpnięciem'], 'n', 'sg'
        lex_phrases.append(LexNP(phrase, num, words, mod, dummy_id))
    if phrase_type == 'possp':
        np = NP(Case('gen'), dummy_id)
        lex_phrases.append(LexNP(np, num, words, mod, dummy_id))
    if phrase_type == 'prepnp':
        # gerund; TODO? a list of exceptions if there are more
        if (processed_lemma, phrase._prep._case._value, phrase._prep._value) == ('przyrządzanie', 'gen', 'do'):
            return ['do przyrządzania'], None, None
        if phrase._prep._value in ('między', 'pomiędzy', 'wśród', 'pośród') and processed_lemma not in ('ktoś', 'coś'):
            num = 'pl'
        if pos == 'subst':
            lex_phrases.append(LexPrepNP(phrase, num, words, mod, dummy_id))
        if pos == 'ger':
            lex_phrases.append(LexPrepGerP(phrase, num, 'aff', words, '', mod, dummy_id))
    if phrase_type == 'adjp':
        # TODO! gender & control
        lex_phrases.append(LexAdjP(phrase, 'sg', head_gender if head_gender else 'm1', 'pos', words, mod, dummy_id))
    if phrase_type == 'prepadjp':
        lex_phrases.append(LexPrepAdjP(phrase, 'sg', 'm1', 'pos', words, mod, dummy_id))
    if phrase_type == 'compar':
        lex_phrases.append(make_compar(phrase, words, num, mod, controller))
    if phrase_type == 'xp':
        if phrase._category._limitations:
            # describe every listed realisation and merge without duplicates
            for realisation in phrase._category._limitations:
                phrs, g, n = generate_phrases(function, negativity, realisation, lemma, is_predef, head_gender)
                for phr in phrs:
                    if phr not in phrases:
                        phrases.append(phr)
            return phrases, 'n', 'sg'
        else:
            advcat = phrase._category._value
            if advcat == 'mod':
                phrase2 = NP(Case('inst'), dummy_id)
                lex_phrases.append(LexNP(phrase2, num, words, mod, dummy_id))
            prep, case = xp2prepnp(advcat, processed_lemma, num)
            if prep:
                phrase2 = PrepNP(Preposition(prep, Case(case)), dummy_id)
                lex_phrases.append(LexPrepNP(phrase2, num, words, mod, dummy_id))
            if advcat in XP2COMPREPNP:
                # looked up before branching on `pos`, so that the gerund
                # branch below has it bound as well
                comprep = XP2COMPREPNP[advcat]
                if pos == 'subst':
                    return make_comprepnp(comprep, words, num, mod), None, None
                if pos == 'ger':
                    assert(mod == NATR)
                    return ['{} {}'.format(comprep, get_form(processed_lemma, ['ger', num, 'gen', head_gender])[0])], 'n', 'sg'

    for lex_phrase in lex_phrases:
        for phr in make_phraseologisms(lex_phrase, function, negativity, controller=controller, controller_grammar=controller_grammar):
            if phr not in phrases:
                # TODO? a proper list of exceptions if there are more
                if phr == 'na członek rodziny':
                    phr = 'na członka rodziny'
                if distrp:
                    # po iluś facetów/po ileś dziewczyn/kotów...
                    phr = 'po {} {}'.format('iluś' if gend == 'm1' else 'ileś', phr)
                phrases.append(phr)

    assert(phrases)
    return phrases, gend if phrase_type == 'np' else None, num if phrase_type == 'np' else None

def get_lex_gender_number(phrase):
    """Return (gender, number) of a lexicalised phrase, or (None, None).

    Only LexNP and LexNumP phrases are analysed. In both cases the first
    lemma is used, since the first expansion is taken for the whole meaning
    description.
    """
    if isinstance(phrase, LexNP):
        number = phrase._number
        lemma = phrase._words._lemmas[0]
        # lemmata whose gender is fixed by hand
        fixed_genders = {'siebie': 'm1', 'łupień': 'm2'}
        if lemma in fixed_genders:
            gender = fixed_genders[lemma]
        else:
            gender = get_gender(get_interps(lemma, lemma=lemma, tag_constraints=['subst', 'nom']))
        return gender, number if number != '_' else 'sg'

    if isinstance(phrase, LexNumP):
        noun = phrase._words._lemmas[0]
        gender = get_gender(get_interps(noun, lemma=noun, tag_constraints=['subst', 'nom']))
        numeral = phrase._nums._lemmas[0]
        if numeral == '2':
            recs = {'congr'}
        else:
            # the last tag segment of each interpretation of the numeral
            recs = {
                interp[1].split(':')[-1]
                for interp in get_interps(numeral, lemma=numeral, tag_constraints=['num', 'nom'])
            }
        assert(len(recs) == 1)
        rec = recs.pop()
        if rec == 'rec':
            # wiele/pięciu/trzydzieści osiem kotów/facetów/kobiet przyszło
            return 'n', 'sg'
        # trzy kobiety/koty przyszły/trzej faceci przyszli
        return gender, 'pl'

    return None, None

# memo of generate_phrases() results, filled and read by get_phrase_description2()
PHRASE_CACHE = dict()

# separator used when several alternative descriptions are joined into one string
PHRASE_SEP = ' / '

# for benchmarking: get_phrase_description() records its calls here when
# DEBUG is on, keyed by duration in deciseconds
BENCH = defaultdict(list)

def get_phrase_description(subentry, argument, position, phrase, controller_grammar=None):
    """Timing wrapper around get_phrase_description2().

    Returns whatever get_phrase_description2() returns; when DEBUG is on,
    the entry name, role and resulting description are recorded in BENCH
    under the call's duration in deciseconds.
    """
    started = datetime.datetime.now()
    result = get_phrase_description2(subentry, argument, position, phrase, controller_grammar=controller_grammar)
    elapsed = datetime.datetime.now() - started
    if DEBUG:
        # deciseconds :)
        deciseconds = round(elapsed.total_seconds() * 10)
        BENCH[deciseconds].append((subentry.entry.name, argument.role.role.role, result[0]))
    return result

# subentry, argument: DB model objects
# schema, phrase: importer objects
def get_phrase_description2(subentry, argument, position, phrase, controller_grammar=None):
    """Describe how `phrase` realises `argument` at `position`.

    Returns (description, gender, number), where description joins the
    alternative phrasings with PHRASE_SEP. Lexicalised and fixed phrases are
    rendered directly with make_phraseologisms(); other phrases are filled
    with a lemma chosen by get_argument_lemma() and rendered by
    generate_phrases(), whose results are memoised in PHRASE_CACHE.

    Raises RealisationDescriptionError when the controller's grammar
    features cannot be looked up or when no single lemma can be chosen.
    """
    print()
    print(argument)
    print(phrase)
    gender, number = None, None
    function = position._function._value if position._function else None
    control = None
    if position._control:
        ee = [c._function for c in position._control if c._function.endswith('controllee')]
        er = [c._function for c in position._control if c._function.endswith('controller')]
        assert(len(ee) <= 1)
        assert(len(er) <= 1)
        # e.g. ‹uznać› — controllee and pred_controller on the same position, take controllee
        if ee:
            control = ee[0]
        else:
            control = er[0]
    negativity = subentry.negativity.name if subentry.negativity else '_'
    head_lemma, head_gender = subentry.entry.name, None

    controller, controller_features, controller_function = None, None, None
    if control and control.endswith('controllee'):
        controller = position._schema.getController(control)
        try:
            controller_features = controller_grammar[controller]
        except (KeyError, TypeError):
            # TypeError covers controller_grammar being None (the default):
            # the features are just as unavailable as with a missing key
            raise RealisationDescriptionError('couldn’t determine grammar features for {}: {} {}'.format(' '.join(map(str, argument.frame.lexical_units.all())), control, phrase))
        controller_function = controller._function._value if controller._function else None

    if subentry.entry.pos.tag == 'noun':
        interps = get_interps(head_lemma, lemma=head_lemma, tag_constraints=['subst', 'nom'])
        head_gender = get_gender(interps)

    # TODO
    # TODO gender, number
    # TODO (‹jakieś›) oko * (‹jakieś›) oczy *błyszczy* z powodu substancji
    if isinstance(phrase, LexPhrase) or isinstance(phrase, Fixed):
        phrs = []
        # TODO this should only be the missing [...] in lex(cp)
        try:
            for phr in make_phraseologisms(phrase, function, negativity, controller=controller, controller_grammar=controller_features):
                if phr not in phrs:
                    phrs.append(phr)
        except Exception:
            # best effort: mark the failure in the description, but let
            # KeyboardInterrupt/SystemExit propagate
            phrs.append('!!!???')
        gender, number = get_lex_gender_number(phrase)
        return PHRASE_SEP.join(phrs), gender, number
    lemmata, is_predef = get_argument_lemma(argument, xp=(phrase._name == 'xp' and not phrase._category._limitations))
    if len(lemmata) != 1:
        raise RealisationDescriptionError('couldn’t choose single lemma: {}'.format('/'.join(lemmata)))
    phrases = []
    # TODO since there’s one lemma, drop the loop
    for lemma in lemmata:
        key = (function, negativity, str(phrase), lemma, str(head_gender), control, controller_features, controller_function)
        if key in PHRASE_CACHE:
            lemma_phrases, gender, number = PHRASE_CACHE[key]
        else:
            lemma_phrases, gender, number = generate_phrases(function, negativity, phrase, lemma, is_predef, head_gender, controller=controller, controller_grammar=controller_features)
            PHRASE_CACHE[key] = (lemma_phrases, gender, number)
        phrases += lemma_phrases
    return PHRASE_SEP.join(phrases), gender, number

def get_only_value(d):
    """Return the first value of mapping `d` (IndexError when it is empty)."""
    values = tuple(d.values())
    return values[0]

# keys of the per-phrase-type entries in PHRASE_PRIORITY
PRIORITY, ATTR, SUBPRIORITY = 'priority', 'attr', 'subpriority'
# priority given to anything not listed explicitly (lower value = preferred)
LOW_PRIORITY = 200
# subpriorities of clause types, shared by cp, ncp and prepncp; the comments
# name the competing clause types
CP_PRIO = {
    'żeby'   : 0, # że
    'kiedy'  : 0, # gdy, jak
    'żeby2'  : 1, # jak
    'że'     : 2, # jak
    # prefer phrases introduced by complementisers where present
    'int' : LOW_PRIORITY + 1,
}
# Phrase type -> its priority, a function extracting the attribute that
# distinguishes phrases of that type, and the subpriority of attribute values.
# The comments give example entries and the competing realisations.
PHRASE_PRIORITY = {
    'xp' : {
        PRIORITY : 10,
        ATTR : lambda phrase: phrase._category._value,
        SUBPRIORITY : {
            'adl'   : 0, # nawigacja xp(adl)/xp(locat)
            'locat' : 1, # powycierać xp(abl)/xp(locat)
            'caus'  : 2, # ucierpieć xp(caus)/xp(temp)
        },
    },
    'np' : {
        PRIORITY : 20,
        ATTR : lambda phrase: phrase._case._value,
        SUBPRIORITY : {
            'str' : 0,
        },
    },
    'prepnp' : {
        PRIORITY : 22,
        ATTR : lambda phrase: (phrase._prep._value, phrase._prep._case._value),
        SUBPRIORITY : {
            ('do', 'gen')      : 0, # adekwatny do/dla; kolejka do/za
            ('za', 'inst')     : 1, # agitować za/przeciw
            ('w', 'acc')       : 1, # całować w/po
            ('w', 'loc')       : 1, # defilada w/na pojeździe
            ('między', 'inst') : 2, # debata między/z/wśród
            ('wobec', 'gen')   : 2, # dług wobec/względem, konsekwentny wobec/dla
            ('dla', 'gen')     : 3, # certyfikat dla/za
            ('z', 'gen')       : 2, # dochód z/za/od
            # NOTE(review): ('o', 'acc') used to be listed twice — with 1
            # (apel o/przeciw) and with 3 (kampania o/za). In a dict literal
            # the last duplicate wins, so 3 has always been the effective
            # value; confirm which one is intended.
            ('o', 'acc')       : 3, # kampania o/za
            ('pod', 'inst')    : 4, # kruszyć się pod/od
            # NOTE(review): ('o', 'loc') used to be listed twice — with 2
            # (debata o/wokół/nad) and with 4 (książka o czymś/z czegoś);
            # 4 has always been the effective value; confirm the intent.
            ('o', 'loc')       : 4, # książka o czymś/z czegoś
            ('po', 'loc')      : 5, # odlatywać od/po
            ('od', 'gen')      : 6, # podatek od/za
            ('przeciw', 'dat') : 7, # przestępstwo z/przeciw
            ('na', 'loc')      : 7, # skoncentrować się na/nad
            ('za', 'acc')      : 7, # zabulić na/za
            ('z', 'acc')       : LOW_PRIORITY + 1, # mandat – data error, za:acc is there too
        },
    },
    'comprepnp' : {
        PRIORITY : 24,
        ATTR : lambda phrase: phrase._prep._value,
        SUBPRIORITY : {
            'w sprawie'   : 0, # w kwestii
            'w zakresie'  : 0, # dyletant w zakresie/w kwestii
            'w kwestii'   : 1, # dyskrecja co do/w kwestii
            'z dziedziny' : 1, # referat w dziedzinie/z dziedziny
        },
    },
    'cp' : {
        PRIORITY : 30,
        ATTR : lambda phrase: phrase._type._value,
        SUBPRIORITY : CP_PRIO,
    },
    'ncp' : {
        PRIORITY : 32,
        ATTR : lambda phrase: phrase._type._value,
        SUBPRIORITY : CP_PRIO,
    },
    'prepncp' : {
        PRIORITY : 34,
        ATTR : lambda phrase: phrase._type._value,
        SUBPRIORITY : CP_PRIO,
    },
}

def get_phrase_priority(phrase):
    """Return the (priority, subpriority) pair used to rank *phrase*.

    Lexicalised phrases are ranked through their underlying phrase, with the
    main priority increased by 1, e.g. dostępność
    prepnp(dla, gen)/lex(prepnp(‹dla kieszeni›)).  An 'xp' phrase with
    limitations is ranked through its first limitation.  Phrase types absent
    from PHRASE_PRIORITY get (LOW_PRIORITY, LOW_PRIORITY).
    """
    is_lex = isinstance(phrase, LexPhrase)
    if is_lex:
        phrase = phrase._lex_phrase()
    kind = phrase._name
    if kind == 'xp':
        limitations = phrase._category._limitations
        if limitations:
            # TODO? heuristic: take the first limitation
            phrase = limitations[0]
            kind = phrase._name
    if kind not in PHRASE_PRIORITY:
        return (LOW_PRIORITY, LOW_PRIORITY)
    spec = PHRASE_PRIORITY[kind]
    key = spec[ATTR](phrase)
    main_priority = spec[PRIORITY] + (1 if is_lex else 0)
    return (main_priority, spec[SUBPRIORITY].get(key, LOW_PRIORITY))

# position: importer object (not used by the selection itself)
# phrase_descriptions: dict
#    key: phrase importer object
#    value: (description, gender, number)
# result: phrase description to use in the realisation description
def select_phrase_description(position, phrase_descriptions):
    """Pick the single phrase description that represents a position.

    A lone description is returned as is.  Otherwise the descriptions are
    grouped by get_phrase_priority() and the lowest-priority group wins; if
    that group still holds several candidates they must all be lexicalised
    phrases of one grammatical type, and the one whose description text sorts
    first is returned.
    """
    if len(phrase_descriptions) == 1:
        only = get_only_value(phrase_descriptions)
        assert only[0] != '???'
        return only

    grouped = defaultdict(set)
    for phrase, description in phrase_descriptions.items():
        grouped[get_phrase_priority(phrase)].add((phrase, description))
    candidates = grouped[min(grouped)]

    if len(candidates) == 1:
        (_, description), = candidates
        assert description[0] != '???'
        return description

    # all are lex phrases
    assert all(isinstance(phrase, LexPhrase) for phrase, _ in candidates)
    # all have the same grammatical type
    assert len({str(phrase._lex_phrase()) for phrase, _ in candidates}) == 1
    # heuristic: return first lexicographically (by description text)
    return min(candidates, key=lambda pair: pair[1][0])[1]


# Initial primary rank of a position, keyed by its function value (None when
# the position has no function).  get_argument_realisation_priority() starts
# from this value and may override it with 0, 1, 3 or 5.
FUNCTION_RANK = {
    'subj' : 0,
    'head' : 0,
    'obj'  : 2,
    None   : 4,
}

def is_np(phrase, case):
    """Tell whether *phrase* is an 'np' phrase whose case value equals *case*.

    For a lexicalised phrase the case is read from its wrapped ``_np``.
    """
    if phrase._name == 'np':
        nominal = phrase._np if isinstance(phrase, LexPhrase) else phrase
        return nominal._case._value == case
    return False

# TODO: put possp at the beginning only if it is adjectival
def get_argument_realisation_priority(ar, entry_pos):
    """Compute the sort key [primary, secondary, (role_prio, attribute_prio)] for *ar*.

    The primary rank starts from FUNCTION_RANK for the position's function and
    is then overridden by the checks below; they are applied in order, so a
    later matching check wins.  The last component is read from the
    SemanticRole / RoleAttribute tables.
    """
    position = ar._position
    function = position._function._value if position._function else None

    # first rank by subj or possp, obj, rest
    primary = FUNCTION_RANK[function]
    names = {phrase._name for phrase in position._phrases}
    if names == {'possp'} or (names == {'adjp'} and entry_pos == 'noun'):
        # jakieś COŚ, but UCZYNIĆ kogoś jakimś
        primary = 0
    # np(dat) goes right after the verb: ‹ktoś daje komuś coś›
    if [phrase for phrase in position._phrases if is_np(phrase, 'dat')]:
        primary = 1
    # np(str) without function (TODO? error in data, e.g. chwytać ustami *powietrze* – should be obj?)
    if function is None and [phrase for phrase in position._phrases if is_np(phrase, 'str')]:
        primary = 3
    # clauses at the end
    if names <= {'cp', 'ncp', 'prepncp'}:
        primary = 5

    # then rank by phrase type: refl/recip, then nominal, then rest
    if names & {'refl', 'recip'}:
        secondary = 0
    elif 'np' in names:
        secondary = 1
    else:
        secondary = 2

    # finally rank by semantic argument priority
    sem_role = ar._argument._semantic_role
    role_prio = SemanticRole.objects.get(role=sem_role._value).priority
    if sem_role._attribute:
        attribute_prio = RoleAttribute.objects.get(attribute=sem_role._attribute).priority
    else:
        attribute_prio = 0
    return [primary, secondary, (role_prio, attribute_prio)]

# if nothing is placed at the beginning and there is an np(dat), move it to the beginning
def rerank(ars):
    """Promote np(dat) realisations to the front when nothing has primary rank 0.

    ars: list of (rank, fallback, ar) tuples, where rank is a list whose first
        item is the primary rank
    returns: *ars* itself if any item already has primary rank 0; otherwise a
        new list with the np(dat) items first (their primary rank replaced by
        0) followed by the remaining items in their original order
    """
    np_dat, after = [], []
    for rank, fallback_rank, ar in ars:
        if rank[0] == 0:
            # something already precedes the entry: leave the order untouched
            return ars
        if any(is_np(phrase, 'dat') for phrase in ar._position._phrases):
            np_dat.append(([0] + rank[1:], fallback_rank, ar))
        else:
            after.append((rank, fallback_rank, ar))
    # TODO? more than one np(dat) can occur (entry: daleki), so uniqueness is not asserted
    return np_dat + after

# for multi-position Lemma arguments, e.g. dostać się z deszczu pod rynnę
# Maps an argument-realisation description to an ordering hint (1 or 2);
# fallback() returns 0 for descriptions not listed here.  The hint is the
# second component of the sort key built in get_realisation_description(),
# so among equally ranked realisations a fragment marked 1 comes before the
# one marked 2.

FALLBACK = {
    'z deszczu' : 1,
    'pod rynnę' : 2,
    'od ściany' : 1,
    'do ściany' : 2,
    'żywcem'   : 1,
    'ze skóry' : 2,
    'pięknym'    : 1,
    'za nadobne' : 2,
    'od Annasza'  : 1,
    'do Kajfasza' : 2,
    'z (brudnymi) buciorami / z (swoimi) buciorami / z (brudnymi swoimi) buciorami / z (brudnymi) butami / z (swoimi) butami / z (brudnymi swoimi) butami' : 1,
    'do łóżka / do łóżek'                                                                                                                                  : 2,
    'samego'            : 1,
    'w (‹jakieś›) ręce' : 2,
    'z (‹jakiejś›) radości / z (‹jakiegoś›) szczęścia' : 1,
    'pod sufit'                                        : 2,
    'z jednej skrajności' : 1,
    'w drugą'             : 2,
    'ze skrajności' : 1,
    'w skrajność'   : 2,
    'z motyką'  : 1,
    'na słońce' : 2,
    'z nogi'  : 1,
    'na nogę' : 2,
    'z pustego' : 1,
    'w próżne'  : 2,
    'z (‹jakiejś›) klasy'  : 1,
    'do (‹jakiejś›) klasy' : 2,
    'z (‹jakiegoś›) kwiatka' : 1,
    'na (‹jakiś›) kwiatek'   : 2,
    'w dno'    : 1,
    'od spodu' : 2,
    'po rozum' : 1,
    'do głowy' : 2,
    'z pazurami / z pięściami' : 1,
    'do oczu'                  : 2,
    'na ziemię' : 1,
    'z obłoków' : 2,
    'prosto' : 1,
    'w (‹jakieś›) serce / w (‹jakieś›) serca' : 2,
    'z rąk'  : 1,
    'do rąk' : 2,
    'z ręki'  : 1,
    'do ręki' : 2,
    'o pomstę' : 1,
    'do nieba' : 2,
    'ze zbiornika' : 1,
    'do zbiornika' : 2,
}

def fallback(description):
    """Return the ordering hint stored in FALLBACK for *description*, or 0 if it has none."""
    if description in FALLBACK:
        return FALLBACK[description]
    return 0

# Entries that get_realisation_description() inflects with the dedicated
# 'winien' tag (with subject number and gender) instead of fin/praet.
WINIEN = ('powinien', 'winien',)

# realisation: importer object
# subentry: DB model object
# TODO: probably all lex-es should go into this representation,
# e.g. ktoś babrze ‹sobie› ‹rączki›: ‹sobie› is not linked to any argument...
def get_realisation_description(realisation, subentry, aspect):
    """Render one realisation as a sentence-like string around the inflected entry.

    The argument realisations are sorted by
    (get_argument_realisation_priority(...), fallback(...)); for verbs the
    order is then adjusted by rerank().  Realisations with primary rank 0 are
    placed before the entry form, the others after it.  The entry form and
    every 'Lemma' argument are wrapped in <b>…</b>.

    realisation: importer object; its _argument_realizations and
        _frame._meanings are read
    subentry: DB model object; its entry, negativity and inherent_sie are read
    aspect: aspect of the entry; asserted to be truthy in the generic verb
        branch, where '_' is treated as 'imperf'
    returns: the rendered elements joined with single spaces
    raises RealisationDescriptionError: when the realisations cannot be
        ordered, or when more than one of them has the 'subj' (or 'head')
        function
    """
    def label(ar):
        # role + attribute + description, as shown in error messages
        role = ar._argument._semantic_role
        return '{}{} {}'.format(role._value, role._attribute, ar._description)

    def render(ar):
        # only the part of the description before PHRASE_SEP is shown; Lemma arguments are bolded
        template = '<b>{}</b>' if ar._argument._semantic_role._value == 'Lemma' else '{}'
        return template.format(ar._description.split(PHRASE_SEP)[0])

    def with_function(name):
        # realisations whose position carries the given function value
        return [ar for ar in realisation._argument_realizations if ar._position._function and ar._position._function._value == name]

    entry = subentry.entry
    ars = [(get_argument_realisation_priority(ar, entry.pos.tag), fallback(ar._description), ar) for ar in realisation._argument_realizations]
    try:
        ars = sorted(ars)
    except Exception as exc:
        # the tuples end with the realisation objects themselves, so a full tie
        # on the leading keys presumably ends up comparing those objects
        raise RealisationDescriptionError('couldn’t order argument realisations: {}'.format(' * '.join(label(ar) for ar in realisation._argument_realizations))) from exc
    if entry.pos.tag == 'verb':
        # for other parts of speech np(dat) is not moved: bliski *komuś*
        ars = rerank(ars)
    before = [render(ar) for rank, _, ar in ars if rank[0] == 0]
    after = [render(ar) for rank, _, ar in ars if rank[0] > 0]

    subj_ars = with_function('subj')
    if len(subj_ars) > 1:
        raise RealisationDescriptionError('> 1 subject argument realisations: {}'.format(' * '.join(label(ar) for ar in subj_ars)))
    subj_ar = subj_ars[0] if subj_ars else None
    head_ars = with_function('head')
    if len(head_ars) > 1:
        raise RealisationDescriptionError('> 1 head argument realisations: {}'.format(' * '.join(label(ar) for ar in head_ars)))
    head_ar = head_ars[0] if head_ars else None

    entry_form = entry.name
    if entry.name == 'naleźć':
        # TODO error in the dictionary
        aspect = 'perf'
    if entry.pos.tag == 'adj' and head_ar:
        # adjective agrees with its head in number and gender
        entry_form = get_form(entry.name, ['adj', head_ar._number, 'nom', head_ar._gender, 'pos'])[0]
    elif entry.name == 'bootować':
        # not recorded in Morfeusz
        entry_form = 'bootuje'
    elif entry.name == 'wtyczkować':
        # not recorded in Morfeusz
        entry_form = 'wtyczkuje'
    elif entry.pos.tag == 'verb':
        assert aspect
        entry_base = entry.name
        if entry_base == 'doprząc':
            entry_base = 'doprzęgnąć'
        if aspect == '_':
            # eg. aresztować
            aspect = 'imperf'
        try:
            subj_num = subj_ar._number if subj_ar else 'sg'
            if subj_ar and (aspect == 'perf' or entry_base in WINIEN):
                # needed only for perfective verbs (zrobił/a/o) and winien/na
                if subj_ar._gender:
                    subj_gend = subj_ar._gender
                else:
                    raise RealisationDescriptionError('couldn’t determine subject’s gender: {} {} {}'.format(subj_ar, subj_ar._position._phrases, subj_ar._argument))
            else:
                # no subject: ‹jestem kotem — olśniło kogoś›
                subj_gend = 'n'
            if entry_base in WINIEN:
                entry_form = get_form(entry_base, ['winien', subj_num, subj_gend, 'imperf'])[0]
            elif aspect == 'imperf':
                # imperfective: fin (present tense)
                # TODO? a list of exceptions, if there are more of them
                if entry_base == 'sparować' and subj_num == 'sg':
                    # bokser sparuje — imperf not recorded in Morfeusz
                    entry_form = 'sparuje'
                else:
                    entry_form = get_form(entry_base, ['fin', subj_num, 'ter', 'imperf'])[0]
            else:
                # perfective: praet (past tense)
                # TODO? a list of exceptions, if there are more of them
                if entry_base == 'nasuwać' and (subj_num, subj_gend) == ('sg', 'm1'):
                    # „Nasuwał się mebli przy odnawianiu mieszkania.” — perf not recorded in Morfeusz
                    entry_form = 'nasuwał'
                elif entry_base == 'wybzykać' and (subj_num, subj_gend) == ('sg', 'm1'):
                    # not recorded in Morfeusz
                    entry_form = 'wybzykał'
                elif entry_base == 'wytuszować' and (subj_num, subj_gend) == ('sg', 'm1'):
                    # not recorded in Morfeusz
                    entry_form = 'wytuszował'
                elif entry_base == 'zależeć' and (subj_num, subj_gend) == ('sg', 'm2'):
                    # not recorded in Morfeusz
                    entry_form = 'zależał'
                elif entry_base == 'zemdlić' and (subj_num, subj_gend) == ('sg', 'f'):
                    # forms other than „zemdliło” are not recorded in Morfeusz
                    entry_form = 'zemdliła'
                else:
                    entry_form = get_form(entry_base, ['praet', subj_num, subj_gend, 'perf', ['nagl', '']])[0]
        except Exception:
            # any failure above falls back to the 'pred' form
            # NOTE(review): this also swallows the "couldn’t determine subject’s
            # gender" error raised inside the try — confirm that is intended
            entry_form = get_form(entry_base, ['pred'])[0]
        if entry.name == 'napaść' and {'wal_69620-mng', 'wal_80242-mng', 'wal_174604-mng', 'wal_174605-mng', 'wal_174603-mng', 'wal_174606-mng'}.issuperset(realisation._frame._meanings):
            # the sense ‹napaść (się) jedzeniem›
            entry_form = entry_form.replace('dł', 'sł')
        if entry.name == 'oblec' and {'wal_85605-mng', 'wal_85615-mng'}.issuperset(realisation._frame._meanings):
            # the sense ‹oblec twierdzę›
            entry_form = entry_form.replace('kł', 'gł')
        if entry.name == 'odpaść' and {'wal_68230-mng', 'wal_68225-mng', 'wal_79689-mng'}.issuperset(realisation._frame._meanings):
            # the sense ‹odpaść (się) jedzeniem›
            entry_form = entry_form.replace('dł', 'sł')
        if entry.name == 'podpaść' and {'wal_86356-mng', 'wal_86350-mng', 'wal_174582-mng', 'wal_174584-mng', 'wal_174585-mng', 'wal_174586-mng'}.issuperset(realisation._frame._meanings):
            # the sense ‹podpaść (się) jedzeniem›
            entry_form = entry_form.replace('dł', 'sł')
        if entry.name == 'popaść' and {'wal_174529-mng', 'wal_174530-mng'}.issuperset(realisation._frame._meanings):
            # the sense ‹popaść (się) jedzeniem›
            entry_form = entry_form.replace('dł', 'sł')

    if subentry.negativity and subentry.negativity.name == 'neg':
        entry_form = 'nie ' + entry_form
    if subentry.inherent_sie.name == 'true':
        entry_form += ' się'
    elements = before + ['<b>{}</b>'.format(entry_form)] + after

    if entry_form[0] > 'z':
        # NOTE(review): diagnostic dump of the BENCH3 timing buckets, emitted only
        # when the entry form starts with a character sorting after 'z' —
        # presumably meant to fire near the end of an alphabetical run; confirm
        for t in sorted(BENCH3.keys()):
            if t > 4:
                print('    ************', t, len(BENCH3[t]), BENCH3[t][:10])
                for n, sid, synset in BENCH3[t]:
                    print('    ************', synset)
                    print('    ************', sid, ':', n)

    return ' '.join(elements)