import_sejfek.py 14 KB

Edit Raw Blame History

#-*- coding:utf-8 -*-

from django.core.management.base import BaseCommand
from common.util import debug, suffixes, cut_end, uniopen
from dictionary.models import Lexeme, Pattern, InflectionCharacteristic, \
    Ending, BaseFormLabel
from dictionary.management.commands.import_morfologik import create_lexeme, \
    create_lip, print_data, find_minimal_sets, blacklist_filter, join_many, \
    join, get_sgjp


class Command(BaseCommand):
    """Management command that feeds one input file to ``import_sejfek``."""

    args = '<input file name>'
    help = 'importuje leksemy z KIPI 1.0'

    def handle(self, filename, **options):
        """Open ``filename`` with ``uniopen`` and import its contents."""
        input_file = uniopen(filename)
        import_sejfek(input_file)


# When true, the matching functions below print diagnostic messages and
# process_forms() skips building and printing the lexeme data.
DEBUG = False


def inflection_characteristic(forms, pos):
    """Derive the inflection characteristic symbol for a lexeme.

    ``forms`` is a sequence of ``(form, tag)`` pairs and ``pos`` is either
    ``'subst'`` or ``'adj'`` (the only parts of speech found in SEJFEK).

    For nouns the decision is based on the tag of the first form: ``'m1'``
    when the tag contains ``'depr'`` or ends with ``'m1'``; otherwise the
    last colon-separated field of the tag, where ``'n1.n2'`` is mapped to
    ``'n2'`` and any other dotted value to ``'m3'``.

    For adjectives the result is ``''`` when any form is tagged exactly
    ``'adja'`` and ``'0-'`` otherwise ("3+" forms do not occur here).
    """
    first_tag = forms[0][1]
    if pos == 'subst':
        if 'depr' in first_tag or first_tag.endswith('m1'):
            ic = 'm1'
        else:
            gender = first_tag.rsplit(':', 1)[1]
            # Clean up ambiguous (dotted) gender values.
            if gender == 'n1.n2':
                ic = 'n2'
            elif '.' in gender:
                ic = 'm3'
            else:
                ic = gender
    elif pos == 'adj':
        has_adja = any(tag == 'adja' for _form, tag in forms)
        ic = '' if has_adja else '0-'
    return ic

# COPYPASTA HEAVEN
def get_basic_endings(lexical_class, ic):
    """Return the endings that can realise the basic form for ``ic``.

    The queryset selects ``Ending`` rows whose base form label equals
    ``ic.basic_form_label`` and whose pattern type belongs to the lexical
    class with symbol ``lexical_class``.
    """
    criteria = {
        'base_form_label': ic.basic_form_label,
        'pattern__type__lexical_class__symbol': lexical_class,
    }
    return Ending.objects.filter(**criteria)


# Module-level caches, built when this module is imported.

# Maps (part-of-speech symbol, inflection characteristic symbol) to the
# queryset of endings returned by get_basic_endings() for that pair.
basic_form_endings_dict = {}
for pos in ('adj', 'subst'):
    for ic in InflectionCharacteristic.objects.filter(
            part_of_speech__symbol=pos):
        basic_form_endings_dict[(pos, ic.symbol)] = get_basic_endings(pos, ic)

# Primary keys of the singular base form labels (dative, genitive,
# instrumental), materialised as a tuple; used as an SQL parameter in
# basic_form_endings().
sure_bfls_sg = tuple(
    BaseFormLabel.objects.filter(
        symbol__in=['sg:dat', 'sg:gen', 'sg:inst']).values_list('pk',
        flat=True))
# Primary keys of the plural base form labels (dative, instrumental,
# locative), materialised as a tuple; used the same way as sure_bfls_sg.
sure_bfls_pl = tuple(
    BaseFormLabel.objects.filter(
        symbol__in=['pl:dat', 'pl:inst', 'pl:loc']).values_list('pk',
        flat=True))


def basic_form_endings(lexical_class, ic, basic_form, form_set):
    """Return candidate basic-form endings for ``basic_form``.

    ``lexical_class`` and ``ic`` (an inflection characteristic symbol) select
    the pre-built queryset from ``basic_form_endings_dict``.

    For classes other than ``'subst'`` the queryset is simply narrowed to
    endings whose string is one of ``suffixes(basic_form)``.

    For nouns each suffix is handled separately: the endings with that
    string are further limited to patterns accepted by a raw SQL condition
    that compares the pattern's endings with the remainders of the forms in
    ``form_set`` that start with the corresponding root.  The per-suffix
    results are OR-ed together into one queryset.
    """
    if lexical_class != 'subst':
        return basic_form_endings_dict[(lexical_class, ic)].filter(
            string__in=suffixes(basic_form))
    else:
        # Convoluted, but it speeds things up a bit.
        endings = basic_form_endings_dict[(lexical_class, ic)]
        new_endings = Ending.objects.none()
        for suf in suffixes(basic_form):
            # presumably cut_end() strips ``suf`` from the end of
            # ``basic_form`` -- helper is defined in common.util, confirm.
            root = cut_end(basic_form, suf)
            n = len(root)
            # What remains of every attested form after removing the root.
            ending_strings = tuple(
                form[n:] for form in form_set if form.startswith(root))
            endings_part = endings.filter(string=suf)
            pattern_ids = endings_part.values_list('pattern', flat=True)
            # Keep a pattern when its id is '0000', or when none of its
            # rows in ``zakonczenia`` with a "sure" singular label has a
            # ``zak`` value outside ``ending_strings``, or when the same
            # holds for the "sure" plural labels.
            # NOTE(review): ``zakonczenia``/``wzory`` look like the database
            # tables behind Ending/Pattern -- confirm against the models.
            patterns = Pattern.objects.filter(pk__in=pattern_ids).extra(
                where=["(id = '0000' or not exists "
                       "(select id from zakonczenia where w_id = wzory.id "
                       "and zak not in %s and efobaz in %s) or not exists "
                       "(select id from zakonczenia where w_id = wzory.id "
                       "and zak not in %s and efobaz in %s))"],
                params=[ending_strings, sure_bfls_sg, ending_strings,
                    sure_bfls_pl])
            new_endings = new_endings | endings_part.filter(
                pattern__in=patterns)
        return new_endings


# Cache of bad_pattern_subst() verdicts, keyed by (pattern, ic symbol).
memoized_pattern_ics = {}


def bad_pattern_subst(pattern, ic):
    """Tell whether a noun ``pattern`` should be rejected for ``ic``.

    A pattern is rejected when:

    * no lexeme other than a candidate (status ``'cand'``) uses it with the
      inflection characteristic ``ic``, or
    * its type symbol is contained in ``'mn'`` and ``ic`` is ``'f'``, or
    * its type symbol is contained in ``'fm'`` and ``ic`` starts with
      ``'n'``.

    Verdicts are memoized in ``memoized_pattern_ics``.
    """
    cache_key = (pattern, ic)
    try:
        return memoized_pattern_ics[cache_key]
    except KeyError:
        pass
    confirmed_uses = pattern.lexemeinflectionpattern_set.filter(
        inflection_characteristic__symbol=ic).exclude(
        lexeme__status='cand')
    # The checks are evaluated lazily, in this exact order.
    verdict = (
        not confirmed_uses
        or (pattern.type.symbol in 'mn' and ic == 'f')
        or (pattern.type.symbol in 'fm' and ic[0] == 'n'))
    memoized_pattern_ics[cache_key] = verdict
    return verdict


# Cache of ending strings per (pattern, ic symbol), filled by
# good_ending_set_subst().
memoized_good_endings = {}


def good_ending_set_subst(pattern, ic, root):
    """Return the forms ``pattern`` yields for a noun with ``ic``.

    The pattern's endings are narrowed by their base form label symbol:

    * ``'pl:nom:mo'`` is dropped unless ``ic`` is ``'m1'`` or ``'p1'``;
    * when ``ic`` starts with ``'p'`` only labels starting with ``'pl'``
      are kept, otherwise the ``'pl:gen:<g>'`` labels of the genders other
      than ``ic[0]`` are dropped;
    * for ``ic == 'p3'`` the label ``'pl:gen:m'`` is dropped for patterns of
      type ``'f'`` and ``'pl:gen:n'`` for patterns of type ``'n'``.

    The surviving ending strings are cached per ``(pattern, ic)``; the
    result is the set of ``root`` + ending for each of them.
    """
    cache_key = (pattern, ic)
    if cache_key not in memoized_good_endings:
        queryset = pattern.endings
        if ic not in ('m1', 'p1'):
            queryset = queryset.exclude(base_form_label__symbol='pl:nom:mo')
        if ic[0] == 'p':
            queryset = queryset.filter(
                base_form_label__symbol__startswith='pl')
        else:
            for gender in set('mfn') - set(ic[0]):
                queryset = queryset.exclude(
                    base_form_label__symbol__startswith='pl:gen:' + gender)
        if ic == 'p3':
            dropped_label = {'f': 'pl:gen:m', 'n': 'pl:gen:n'}.get(
                pattern.type.symbol)
            if dropped_label is not None:
                queryset = queryset.exclude(
                    base_form_label__symbol=dropped_label)
        memoized_good_endings[cache_key] = list(
            queryset.values_list('string', flat=True))
    return set(root + suffix for suffix in memoized_good_endings[cache_key])


def good_ending_set(lexical_class, ic, pattern, root=''):
    """Return the set of forms ``pattern`` generates from ``root``.

    Nouns (``lexical_class == 'subst'``) go through
    ``good_ending_set_subst`` so that ``ic`` is taken into account; every
    other class uses ``pattern.ending_set(root)`` directly.
    """
    if lexical_class == 'subst':
        return good_ending_set_subst(pattern, ic, root)
    return pattern.ending_set(root)


def relevant_subst(ending, ic):
    """Tell whether a noun ``ending`` must be attested for ``ic``.

    The decision uses the ending's base form label symbol (split on ``':'``)
    and the type symbol of the ending's pattern.  An ending is irrelevant
    when any of the following holds:

    * ``ic`` is ``'m1'``/``'p1'`` and the label is ``'pl:nom'``;
    * the label has at least three fields, ``ic`` does not start with
      ``'p'`` and the first letter of the third field differs from
      ``ic[0]``;
    * ``ic`` starts with ``'p'`` and the label is not plural;
    * ``ic`` is ``'p3'``, the label starts with ``'pl:gen:'`` and the
      (pattern type, third field) pair is ``('n', 'n')`` or ``('f', 'm')``;
    * ``ic`` is neither ``'m1'`` nor ``'p1'`` and the label is
      ``'pl:nom:mo'``.
    """
    label = ending.base_form_label.symbol
    fields = label.split(':')
    pattern_type = ending.pattern.type.symbol
    personal = ic in ('m1', 'p1')

    if personal and label == 'pl:nom':
        return False
    if len(fields) >= 3 and ic[0] != 'p' and fields[2][0] != ic[0]:
        return False
    if ic[0] == 'p' and fields[0] != 'pl':
        return False
    if ic == 'p3' and label.startswith('pl:gen:'):
        if (pattern_type, fields[2]) in (('n', 'n'), ('f', 'm')):
            return False
    if not personal and label == 'pl:nom:mo':
        return False
    return True


def relevant_adj(ending):
    """Tell whether an adjective ``ending`` must be attested.

    Endings whose base form label symbol is ``'0'`` or ``'3+'`` are not
    required; all others are.
    """
    symbol = ending.base_form_label.symbol
    return not (symbol == '0' or symbol == '3+')


def relevant(lexical_class, ending, ic):
    """Dispatch the relevance check for ``ending`` by lexical class.

    Returns the result of ``relevant_subst`` for ``'subst'``, of
    ``relevant_adj`` for ``'adj'`` and ``None`` for any other class.
    """
    if lexical_class == 'subst':
        return relevant_subst(ending, ic)
    if lexical_class == 'adj':
        return relevant_adj(ending)
    return None


def find_patterns(basic_form, pos, ic, forms):
    """Classify inflection patterns against the attested forms of a lexeme.

    ``basic_form`` is the lexeme's basic form, ``pos`` its part of speech
    symbol, ``ic`` its inflection characteristic symbol and ``forms`` a
    sequence of ``(form, tag)`` pairs.

    Every candidate ending returned by ``basic_form_endings`` gives a
    ``(pattern, root)`` pair, which is sorted into:

    * *included* -- every relevant form generated by the pattern is
      attested;
    * *including* -- the pattern generates all attested forms but also some
      unattested ones (stored together with those extra forms);
    * *matching* -- both of the above, i.e. an exact match.

    Returns a 4-tuple ``(kind, patterns, included, including)``.  When
    there is at least one exact match, ``kind`` is ``'match'`` and
    ``patterns`` is the list of matching pairs; otherwise ``kind`` and
    ``patterns`` come from ``find_many_patterns``.
    """
    # Find all included and including patterns.
    form_set = set(form for form, tag in forms)
    ending_sets = {}
    included_patterns = set()
    including_patterns = set()
    matching_patterns = set()
    for basic_ending in basic_form_endings(pos, ic, basic_form, form_set):
        pattern = basic_ending.pattern
        if pos == 'subst' and bad_pattern_subst(pattern, ic):
            # Rejected patterns are skipped silently; no comment is
            # recorded about forms rejected for this reason.
            continue
        root = basic_form[:len(basic_form) - len(basic_ending.string)]
        ending_sets[pattern] = good_ending_set(pos, ic, pattern, root)
        including = form_set.issubset(ending_sets[pattern])
        # Relevant forms the pattern would generate but which are not
        # attested.
        bad_forms = set()
        for ending in pattern.endings.all():
            if relevant(pos, ending, ic):
                candidate = root + ending.string
                if candidate not in form_set:
                    bad_forms.add(candidate)
        if not bad_forms:
            included_patterns.add((pattern, root))
            if including:
                matching_patterns.add((pattern, root))
        elif including:
            including_patterns.add(((pattern, root), tuple(bad_forms)))

    # Callers receive lists (process_forms sorts the matching list in place).
    included_patterns = list(included_patterns)
    including_patterns = list(including_patterns)
    matching_patterns = list(matching_patterns)
    if len(matching_patterns) > 0:
        if DEBUG:
            print(u'dokładne wzory: %s' % join(matching_patterns))
        return 'match', matching_patterns, included_patterns, including_patterns
    # Nothing fits exactly, or several patterns have to be combined.
    if DEBUG and len(including_patterns) > 0:
        print(u'zawierające: %s' % join(p for p, b_f in including_patterns))
    if DEBUG and len(included_patterns) > 0:
        print(u'zawarte: %s' % join(included_patterns))
    return find_many_patterns(
        pos, ic, form_set, basic_form, included_patterns, ending_sets) + (
               included_patterns, including_patterns)


def find_many_patterns(pos, ic, form_set, basic_form, included_patterns,
                       ending_sets):
    """Try to cover ``form_set`` with a combination of included patterns.

    ``included_patterns`` is a sequence of ``(pattern, root)`` pairs and
    ``ending_sets`` maps each pattern to the set of forms it generates.
    ``pos``, ``ic`` and ``basic_form`` are accepted for interface
    compatibility and are not used here.

    Returns ``('none', [])`` when some form is generated by no included
    pattern.  Otherwise returns ``('many', sets)``: a single list holding
    the *necessary* patterns (the only ones generating some form) when they
    already cover every form, or else whatever ``find_minimal_sets``
    returns.
    """
    necessary_patterns = set()
    missing_form = None
    for form in form_set:
        having = [(pattern, root) for pattern, root in included_patterns
                  if form in ending_sets[pattern]]
        if not having:
            missing_form = form
            break
        if len(having) == 1:
            necessary_patterns.add(having[0])
    # Compare with None explicitly: an empty-string form is falsy, and a
    # plain truth test would treat it as "nothing is missing".
    if missing_form is not None:
        if DEBUG:
            print(u"brak formy: %s" % missing_form)
        return 'none', []
    covered_forms = set()
    for pattern, root in necessary_patterns:
        covered_forms |= ending_sets[pattern]
    if form_set.issubset(covered_forms):
        if DEBUG:
            print(u"pokryte koniecznymi wzorami: %s" % join(necessary_patterns))
        return 'many', [list(necessary_patterns)]
    minimal_sets = find_minimal_sets(
        form_set, covered_forms, necessary_patterns, included_patterns,
        ending_sets)
    return 'many', minimal_sets


def filter_patterns(filter, action_name, type, patterns, included, including,
                    lexical_class, form_set, entry, ic):
    """Apply ``filter`` to a matching result and repair it if needed.

    ``filter`` is a callable taking an iterable of ``(pattern, root)`` pairs
    and returning the pairs to keep; ``action_name`` names it in debug
    messages.  ``type``, ``patterns``, ``included`` and ``including`` are a
    result in the shape produced by ``find_patterns``.

    * ``'many'``: if the filter would change any pattern set, ``included``
      is filtered and the combination is recomputed; when that no longer
      yields ``'many'`` the original patterns and included list are
      restored and the result is flagged.
    * ``'none'``: only ``including`` is filtered.
    * ``'match'``: ``patterns``, ``including`` and ``included`` are
      filtered; if every match disappears a combination is computed
      instead, and when that yields ``'none'`` the original matches are
      restored and the result is flagged.

    Returns ``(type, patterns, included, including, bad_patterns)`` where
    ``bad_patterns`` is true when filtered-out patterns had to be kept.
    """
    def recompute(candidates):
        # Rebuild the generated-form sets for the surviving candidates and
        # look for a covering combination again.
        sets_by_pattern = {}
        for pattern, root in candidates:
            sets_by_pattern[pattern] = good_ending_set(
                lexical_class, ic, pattern, root)
        return find_many_patterns(
            lexical_class, ic, form_set, entry, candidates, sets_by_pattern)

    def filter_including(pairs):
        # ``pairs`` holds ((pattern, root), bad_forms) items; the filter
        # decides on the (pattern, root) keys.
        bad_forms_by_key = dict(pairs)
        return [(key, bad_forms_by_key[key])
                for key in filter(bad_forms_by_key)]

    previous_patterns, previous_included = patterns, included
    bad_patterns = False
    if type == 'many':
        affected = any(
            pattern_set != filter(pattern_set) for pattern_set in patterns)
        if affected:
            included = filter(included)
            type, patterns = recompute(included)
            if type != 'many':
                debug(entry, u'mnogie dopasowanie zepsute przez %s (%s)' %
                             (action_name, join_many(previous_patterns)))
                type = 'many'
                patterns, included = previous_patterns, previous_included
                bad_patterns = True
    elif type == 'none':
        including = filter_including(including)
    else:
        # type == 'match'
        patterns = filter(patterns)
        including = filter_including(including)
        included = filter(included)
        if previous_patterns and not patterns:
            type, patterns = recompute(included)
            if type == 'none':
                debug(entry, u'znikły wzory przez %s (%s)' %
                             (action_name, join(previous_patterns)))
                type = 'match'
                patterns = previous_patterns
                bad_patterns = True
    return type, patterns, included, including, bad_patterns


def process_forms(forms, base, pos):
    """Match one lexeme against the patterns and print the import data.

    ``forms`` is the list of ``(form, tag)`` pairs of the lexeme, ``base``
    its entry and ``pos`` its part of speech symbol.

    If lexemes with this entry already exist, the single existing one is
    printed via ``get_sgjp`` (or a debug message is emitted when there are
    several) and nothing else happens.  Otherwise patterns are looked up,
    passed through the blacklist filter, and -- unless ``DEBUG`` is set --
    the lexeme data with its chosen patterns, status and comment is handed
    to ``print_data``.
    """
    existing = Lexeme.objects.filter(entry=base)
    if existing:
        if existing.count() > 1:
            debug(base, u'więcej niż jedna wersja w Kuźni')
        else:
            print_data({'lexeme': get_sgjp(existing.get())})
        return

    ic = inflection_characteristic(forms, pos)
    form_set = set(form for form, tag in forms)
    kind, patterns, included, including = find_patterns(base, pos, ic, forms)
    kind, patterns, included, including, bad_patterns = filter_patterns(
        blacklist_filter, u'czarną listę', kind, patterns, included, including,
        pos, form_set, base, ic)
    # The patterns do not change past this point.

    if kind == 'none':
        debug(base, u'zawiera się w %s' % join(p for p, b_f in including))
        chosen, fitting = [], including
    elif kind == 'match':
        patterns.sort(key=lambda p: p[0].name)
        fitting = patterns
        chosen = patterns[:1]
    else:
        # kind == 'many': take the first pattern set.
        chosen = patterns[0]
        if DEBUG:
            print(u'zestawy wielu wzorów: %s' % join_many(patterns))
        fitting = patterns

    if DEBUG:
        return

    comments = []
    status = 'cand' if (kind != 'match' or len(fitting) > 1) else 'desc'
    if bad_patterns:
        comments.append(u'Wzory z czarnej listy!')
        status = 'cand'
    if kind == 'none':
        if fitting:
            comments.append(u'Zawierające wzory:')
            comments.extend(
                '%s: %s' % (pattern.name, ', '.join(bad_forms))
                for (pattern, root), bad_forms in fitting)
    elif len(fitting) > 1:
        if kind != 'many':
            comments.append(u'Pasujące wzory: %s' % join(fitting))
        else:
            comments.append(
                u'Pasujące zestawy wzorów: %s' % join_many(fitting))
    comment = '\n'.join(comments)

    lips = [create_lip(pair[0], pair[1], index, ic, pos)
            for index, pair in enumerate(chosen, 1)]
    lexeme_data = create_lexeme(base, 1, pos, status, comment)
    print_data({
        'lexeme': lexeme_data,
        'lips': lips,
    })


def import_sejfek(input_file):
    """Read SEJFEK entries from ``input_file`` and process each lexeme.

    ``input_file`` is an iterable of text lines.  A line that splits on
    single spaces into more than one field must have exactly three:
    ``form``, a base field (ignored) and ``tag``.  Consecutive such lines
    form one lexeme whose entry is the form on its first line; any other
    line ends the current lexeme.  Apostrophes are replaced with the
    typographic ``’`` before splitting.

    The part of speech is ``'subst'`` when the tag's first colon-separated
    field is ``'subst'`` and ``'adj'`` otherwise; the value from the
    lexeme's last line is the one passed on.

    Raises ``ValueError`` when a multi-field line does not have exactly
    three fields.
    """
    forms = []
    base = pos = None
    for line in input_file:
        data = line.replace("'", u'’').split(' ')
        if len(data) > 1:
            form, _base, tag = data
            pos = 'subst' if tag.split(':', 1)[0] == 'subst' else 'adj'
            if not forms:
                base = form
            forms.append((form, tag))
        elif forms:
            process_forms(forms, base, pos)
            forms = []
    # Flush the last lexeme only if there is one: the input may be empty or
    # end with a separator line, leaving nothing to process.
    if forms:
        process_forms(forms, base, pos)