utils.py 7.91 KB
from itertools import product

import re

from importer.Phrase import *

from .morph_generation import MorphologyError, select_form

# Linear position of a dependent relative to its head.
PRE, POST = 0, 1


def _pre_or_post(dep_type, pre_types, post_types):
    """Return POST or PRE depending on which tuple contains *dep_type*.

    POST takes precedence if *dep_type* occurs in both tuples; None is
    returned if it occurs in neither.
    """
    if dep_type in post_types:
        return POST
    if dep_type in pre_types:
        return PRE
    return None


def _default_order(head_type, dep_type):
    """Return the default PRE/POST order for a head/dependent type pair.

    The phrase-type names used here are not defined in this module; they are
    expected to be provided by the ``importer.Phrase`` wildcard import.
    Returns None when no rule applies.
    """
    if head_type == NP:
        return _pre_or_post(
            dep_type,
            (AdjP, LexAdjP, LexPPasP, LexPActP, PossP, LexQub, Fixed),
            # LexAdvP follows the noun, e.g. 'nic więcej' ("nothing more")
            (NP, LexNP, PrepNP, ComPrepNP, LexPrepNP, LexPrepGerP, CP, LexCP,
             NCP, XP, LexXP, LexAdvP, OR),
        )
    if head_type == NumP:
        return _pre_or_post(
            dep_type,
            (AdjP, LexAdjP, PossP),
            # XP: 'w pół drogi ‹dokądś›' ("halfway ‹to somewhere›")
            # NP: 'na dwóch biegunach ‹kogoś/czegoś›'
            #     ("at the two poles ‹of sb/sth›")
            (NP, XP),
        )
    if head_type == AdjP:
        return _pre_or_post(
            dep_type,
            (AdvP, LexAdvP, AdjP, LexAdjP, LexQub),
            # NP: 'pełny czegoś' ("full of something")
            # Fixed: 'samo przez się' ("by itself")
            (NP, LexNP, PrepNP, LexPrepNP, XP, LexXP, Compar, LexCompar, Fixed),
        )
    if head_type == AdvP:
        return _pre_or_post(
            dep_type,
            (XP, AdvP),
            # LexNP: 'dalej własnego nosa' ("further than one's own nose")
            # LexPrepNP: 'prosto w oczy' ("straight in the eyes")
            # LexCP: 'tak, że...' ("so that...")
            (LexCompar, NP, LexNP, PrepNP, LexPrepNP, LexCP),
        )
    if head_type == InfP:
        # every dependent of an InfP head follows it
        return POST
    if head_type == Qub and dep_type in (LexQub,):
        return PRE
    return None


def build_phrase(head, dep, head_type, dep_type, order_override=None):
    """Join *head* and *dep* with a single space in the appropriate order.

    Args:
        head: text of the head.
        dep: text of the dependent.
        head_type: phrase type of the head; only consulted when no override
            is given.
        dep_type: phrase type of the dependent; only consulted when no
            override is given.
        order_override: if not None, forces the order: 'pre' puts the
            dependent first, any other value puts it after the head.

    Returns:
        '<dep> <head>' for the PRE order, '<head> <dep>' for the POST order.

    Raises:
        RuntimeError: if no override is given and no ordering rule matches
            the head/dependent type pair.
    """
    if order_override is None:
        order = _default_order(head_type, dep_type)
    elif order_override == 'pre':
        order = PRE
    else:
        order = POST

    if order == PRE:
        first, second = dep, head
    elif order == POST:
        first, second = head, dep
    else:
        raise RuntimeError('couldn’t build phrase: {} {} {} {}'.format(head, dep, head_type, dep_type))
    return '{} {}'.format(first, second)

def correct_lemma(lemma):
    """Normalise *lemma*: drop surrounding apostrophes, fix one plural lemma.

    Apostrophes are stripped from both ends of the string; the plural lemma
    'bliźnięta' is then replaced by its singular 'bliźnię'.
    """
    # TODO see notes
    unquoted = lemma.strip("'")
    return 'bliźnię' if unquoted == 'bliźnięta' else unquoted

# Numerals written with digits, mapped to their spelled-out lemmas.
NUM_LEMMA = {
    '2': 'dwa',
    '3': 'trzy',
    '5': 'pięć',
}


def correct_num_lemma(lemma):
    """Return the spelled-out lemma for a digit numeral listed in NUM_LEMMA.

    Any lemma not present in NUM_LEMMA is returned unchanged.
    """
    return NUM_LEMMA[lemma] if lemma in NUM_LEMMA else lemma

def correct_pos(lemma, pos):
    """Return the part-of-speech tag to use for *lemma*.

    A handful of lemmas get a fixed tag regardless of *pos*; for every other
    lemma *pos* is returned unchanged.
    """
    fixed_tags = (
        (('siebie',), 'siebie'),
        (('ja', 'ty', 'my', 'wy'), 'ppron12'),
        (('on',), 'ppron3'),
        (('oba',), 'num'),
        (('jeden',), 'adj'),
    )
    for lemmas, tag in fixed_tags:
        if lemma in lemmas:
            return tag
    return pos

def correct_num(lemma, num):
    """Return the grammatical number value(s) to use for *lemma*.

    Returns:
        '' for 'siebie'; 'pl' or 'sg' for lemmas with a fixed number;
        'sg' for the symbolic value 'agr'; for the underspecified value '_'
        either 'sg' ('ja', 'ty') or the list ['sg', 'pl']; otherwise *num*
        itself.
    """
    if lemma == 'siebie':
        return ''
    if lemma in ('oba', 'plecy', 'usta'):
        return 'pl'
    if lemma in ('pół', 'półtora'):
        return 'sg'
    # TODO (?)
    if num == 'agr':
        return 'sg'
    if num == '_':
        if lemma in ('ja', 'ty'):
            return 'sg'
        # TODO _ -> sg or _ -> sg and pl?
        return ['sg', 'pl']
    return num

def correct_gend(gend):
    """Replace the symbolic gender 'agr' with 'm1'; pass other values through."""
    return 'm1' if gend == 'agr' else gend

# TODO is the mapping for no function correct?
# TODO the mapping should be more complex, e.g. most lex(np)s should be in acc (dać kosza etc.),
# but adjps seem to need nom: chrzest bojowy
# Structural case ('str'): syntactic function -> negativity -> concrete case.
STR_CASE = {
    'subj': {'_': 'nom', 'aff': 'nom', 'neg': 'nom'},
    'obj': {'_': 'acc', 'aff': 'acc', 'neg': 'gen'},
    None: {'_': 'acc', 'aff': 'acc', 'neg': 'gen'},
}
# Agreeing case ('agr'): syntactic function -> concrete case.
AGR_CASE = {
    'subj': 'nom',
    'obj': 'acc',
    'head': 'nom',
    None: 'nom',
}
# Partitive case ('part'): negativity -> concrete case.
PART_CASE = {
    '_': 'acc',
    'aff': 'acc',
    'neg': 'gen',
}
# Predicative case ('pred'): syntactic function -> negativity -> concrete case.
PRED_CASE = {
    'subj': {'_': 'nom', 'aff': 'nom', 'neg': 'nom'},
    'obj': {'_': 'acc', 'aff': 'acc', 'neg': 'gen'},
    None: {'_': 'inst', 'aff': 'inst', 'neg': 'inst'},
}


def correct_case(case, function, negativity='_'):
    """Resolve a symbolic case value to a concrete case.

    Args:
        case: a case value; 'str', 'agr', 'part', 'pred' and 'postp' are
            resolved, anything else is returned unchanged.
        function: syntactic function used as a key into the lookup tables
            ('subj', 'obj', None; additionally 'head' for 'agr').
        negativity: '_', 'aff' or 'neg'.

    Returns:
        The resolved case, or *case* itself when it is not symbolic.

    Raises:
        KeyError: if *function* or *negativity* is missing from the table
            consulted for the given symbolic case.
    """
    if case == 'postp':
        return 'dat'
    if case == 'agr':
        return AGR_CASE[function]
    if case == 'part':
        return PART_CASE[negativity]
    # 'str' and 'pred' share the same two-level lookup shape
    for symbolic, table in (('str', STR_CASE), ('pred', PRED_CASE)):
        if case == symbolic:
            return table[function][negativity]
    return case

def correct_deg(deg):
    """Expand a degree value into the list of acceptable degree values.

    'pos' also accepts forms with no degree at all (''), and '_' accepts any
    degree or none. Other values are returned unchanged.
    """
    if deg == '_':
        return ['pos', 'com', 'sup', '']
    if deg == 'pos':
        # positive degree = positive or no degree at all
        return [deg, '']
    return deg

def correct_congr(lemma):
    """Return the congr/rec value(s) to try for the numeral *lemma*.

    'pół' and 'półtora' always get 'rec'; every other lemma gets a list of
    alternatives.
    """
    if lemma not in ('pół', 'półtora'):
        # heuristic: if both congr and rec forms available, prefer congr
        # no congr/rec also possible
        return ['congr', 'rec', '']
    return 'rec'

def correct_aff(aff):
    """Expand the underspecified value '_' into ['aff', 'neg']; keep others."""
    return ['aff', 'neg'] if aff == '_' else aff

# Negativity value -> text of the negation particle 'nie' (with trailing space).
NEG = {
    '_': '(nie) ',
    'aff': '',
    'neg': 'nie ',
}


def correct_neg(neg):
    """Return the negation prefix text for *neg* ('_', 'aff' or 'neg').

    Raises:
        KeyError: if *neg* is not a key of NEG.
    """
    prefix = NEG[neg]
    return prefix

# Reflexive marker value -> text to emit (with trailing space when present).
SIE = {
    '': '',
    'się': 'się ',
}


def correct_sie(sie):
    """Return the text to emit for the reflexive marker *sie* ('' or 'się').

    Raises:
        KeyError: if *sie* is not a key of SIE.
    """
    text = SIE[sie]
    return text

def correct_feats(lemma, feats, praep=False):
    """Return *feats* extended with extra features required by some lemmas.

    Args:
        lemma: the lemma the features are meant for.
        feats: list of features; it is never modified in place.
        praep: only used for 'on'; selects 'praep' instead of 'npraep'.

    Returns:
        A new, extended list for the pronouns 'on', 'ja', 'ty', 'my', 'wy'
        and for 'oba'; the original *feats* object for any other lemma.
    """
    if lemma == 'on':
        praep_feat = 'praep' if praep else 'npraep'
        return feats + ['m1', 'nakc', praep_feat]
    if lemma in ('ja', 'ty'):
        # unaccented forms: mi, ci, cię
        unaccented = 'dat' in feats or ({'acc', 'gen'}.intersection(feats) and lemma == 'ty')
        if unaccented:
            akc = 'nakc'
        else:
            akc = 'akc'
        # the accent value is paired with '' as an alternative
        return feats + ['m1', [akc, '']]
    if lemma in ('my', 'wy'):
        return feats + ['m1']
    if lemma == 'oba':
        return feats + ['congr', 'ncol']
    return feats
        
def get_subst_attrs(lemma, tag):
    """Extract attributes from a colon-separated morphological *tag*.

    For 'siebie' only the case is read (second field of the tag); for any
    other lemma the second, third and fourth fields are read as number, case
    and gender.

    Raises:
        IndexError: if the tag has fewer fields than are read.
    """
    fields = tag.split(':')
    if lemma != 'siebie':
        return {'num': fields[1], 'case': fields[2], 'gend': fields[3]}
    return {'case': fields[1]}

def get_gender(lemma):
    """Return gender features derived from the 'subst:sg:nom' form of *lemma*.

    The second element of the value returned by get_form is split on ':' and
    everything from the fourth field on is treated as the gender part
    (1 or 2 values: ['f'], ['n', 'ncol'], ...).

    Returns:
        A two-element list: the gender, followed by a list of acceptable
        col/ncol values (always including '' for "no such value").
    """
    tag = get_form(lemma, ['subst', 'sg', 'nom'])[1]
    gender_fields = tag.split(':')[3:]
    if len(gender_fields) == 2:
        # no col/ncol variant for 'jeden', 'wiele' etc.
        return [gender_fields[0], [gender_fields[1], '']]
    # choose ncol for e.g. czterech/czworo m1
    return [gender_fields[0], ['ncol', '']]

def get_form(lemma, feats):
    """Select a single form of *lemma* matching *feats*.

    Lemmas starting with 'E(' are not looked up: an empty form is returned
    with a 'subst:pl:nom:<x>' tag, where <x> is the lemma with the characters
    'E', '(' and ')' stripped from both ends. Otherwise every callable
    element of *feats* is replaced by its result for *lemma* and the lookup
    is delegated to select_form.
    """
    if lemma.startswith('E('):
        stripped = lemma.strip('E()')
        return ('', 'subst:pl:nom:{}'.format(stripped))
    resolved = []
    for feat in feats:
        if hasattr(feat, '__call__'):
            feat = feat(lemma)
        resolved.append(feat)
    return select_form(lemma, resolved)

def get_forms(lemma, feats):
    """Select all forms of *lemma* matching any combination of *feats*.

    Each element of *feats* may be a string (a single required value), an
    iterable of alternative values, or a callable taking the lemma and
    returning either of those. select_form is tried for every combination
    of alternatives (Cartesian product).

    Returns:
        A list with the select_form result of every combination that
        succeeded.

    Raises:
        MorphologyError: if no combination yields a form; the message joins
            the messages of the individual failures with ' + '.
    """
    resolved = [f(lemma) if callable(f) else f for f in feats]
    # A string is one value, not an iterable of characters; isinstance also
    # covers str subclasses, which product() would otherwise split up.
    alternatives = [[f] if isinstance(f, str) else f for f in resolved]
    forms = []
    errors = []
    for combination in product(*alternatives):
        try:
            forms.append(select_form(lemma, combination))
        except MorphologyError as e:
            errors.append(str(e))
    if forms:
        return forms
    raise MorphologyError('couldn’t select form: {}'.format(' + '.join(errors)))

# Prepositions that take an extra final 'e' before certain words, mapped to
# regular expressions matched against the start of the lower-cased text that
# follows the preposition.
WOK_PREP = {
    'bez': ('^mn',),  # beze mnie
    'nad': (
        '^mn',
        '^wszystko',  # nade wszystko, but: nad wszystkim
    ),
    'od': ('^mn',),
    'pod': ('^mn',),
    'przed': (
        '^mn',
        '^wszystkim$',  # przede wszystkim, but: przed wszystkimi
    ),
    'przez': ('^mn',),
    'w': (
        '^dwoje',  # but: w dwojaki
        '^dwój',
        '^fr',
        '^mgl',
        '^mnie$',  # but: w mniejszych
        '^wc', '^wn', '^wp', '^wr', '^ws', '^wt', '^wz', '^wł',
        '^znaki$',
        '^śnie',
    ),
    'z': (
        '^mnie$', '^mną$', '^sobą$',
        '^sc', '^sf', '^sk', '^sm', '^sn', '^sp', '^st', '^sw',
        '^szc', '^szk', '^szp', '^szt', '^szw',
        '^sł',
        '^wsc', '^wsi', '^wsk', '^wsp', '^wst',
        '^wszec', '^wszystk',
        '^wz',
        '^zb', '^zd', '^zg', '^zj', '^zm', '^zn', '^zr', '^zw',
        '^zł',
        '^łz',
        '^ś',
        '^ź',
    ),
}


def combine_with_prep(prep, rest):
    """Join preposition *prep* and the following text *rest* with a space.

    If *prep* is listed in WOK_PREP and the lower-cased *rest* matches one of
    its patterns, 'e' is appended to the preposition (e.g. 'z' + 'mną' ->
    'ze mną'). *rest* itself is emitted unchanged (case is preserved).
    """
    if prep in WOK_PREP:
        lowered = rest.lower()
        if any(re.match(pattern, lowered) for pattern in WOK_PREP[prep]):
            return '{}e {}'.format(prep, rest)
    return '{} {}'.format(prep, rest)