# import_kipi.py 14.5 KB
#-*- coding:utf-8 -*-

from django.core.management.base import BaseCommand
from common.util import debug, suffixes, cut_end
from dictionary.models import Lexeme, Pattern, InflectionCharacteristic, \
    Ending, BaseFormLabel
from dictionary.management.commands.import_morfologik import create_lexeme, \
    create_lip, print_data, find_minimal_sets, blacklist_filter, join_many, \
    join, print_forms


class Command(BaseCommand):
    """Management command that runs ``import_kipi`` on a given input file."""
    args = '<input file name>'
    help = 'importuje leksemy z KIPI 1.0'

    def handle(self, filename, **options):
        """Open ``filename`` and pass the open file to ``import_kipi``.

        The file is closed when the import finishes, also when it fails.
        """
        with open(filename) as input_file:
            import_kipi(input_file)


# Maps the commonness code read from the third input column to the value
# stored under the 'commonness' key of the generated lexeme data.
COMMONNESS = dict(
    geog=u'geograficzna',
    imie=u'imię',
    inna=u'własna',
    nazw=u'nazwisko',
    orga=u'organizacja',
    posp=u'pospolita',
)

# When true, matching diagnostics are printed and no lexeme data is emitted.
DEBUG = False


def inflection_characteristic(forms, pos):
    """Return the inflection characteristic symbol for a group of forms.

    ``forms`` is a non-empty sequence of ``(form, tag)`` pairs and ``pos``
    is the part of speech; KIPI contains only 'subst' and 'adj'.

    For 'subst' only the tag of the first form is inspected: the result is
    'm1' when that tag contains 'depr' or ends with 'm1', otherwise the
    text after the last ':' of the tag.  For 'adj' the result is '' when
    any form is tagged exactly 'adja' and '0-' otherwise (forms 3+ do not
    occur here).

    Raises ValueError for any other ``pos`` (previously this surfaced as an
    UnboundLocalError on the return statement).
    """
    first_tag = forms[0][1]
    if pos == 'subst':
        if 'depr' in first_tag or first_tag.endswith('m1'):
            return 'm1'
        return first_tag.rsplit(':', 1)[1]
    if pos == 'adj':
        if any(form_tag == 'adja' for _, form_tag in forms):
            return ''
        return '0-'
    raise ValueError('unsupported part of speech: %r' % (pos,))

# COPYPASTA HEAVEN
def get_basic_endings(lexical_class, ic):
    """Return the ``Ending`` queryset for the basic form of ``ic``.

    Selected are endings whose base form label equals
    ``ic.basic_form_label`` and whose pattern type belongs to the lexical
    class with the symbol ``lexical_class``.
    """
    criteria = {
        'base_form_label': ic.basic_form_label,
        'pattern__type__lexical_class__symbol': lexical_class,
    }
    return Ending.objects.filter(**criteria)


def _base_form_label_pks(symbols):
    """Return a tuple of primary keys of the labels named in ``symbols``."""
    labels = BaseFormLabel.objects.filter(symbol__in=symbols)
    return tuple(labels.values_list('pk', flat=True))


# Basic-form ending querysets keyed by (part of speech, inflection
# characteristic symbol); built once, when the module is imported.
basic_form_endings_dict = {}
for pos in ('adj', 'subst'):
    characteristics = InflectionCharacteristic.objects.filter(
        part_of_speech__symbol=pos)
    for ic in characteristics:
        basic_form_endings_dict[(pos, ic.symbol)] = get_basic_endings(pos, ic)

# Primary keys of the base form labels passed to the raw SQL condition in
# basic_form_endings, separately for singular and plural labels.
sure_bfls_sg = _base_form_label_pks(['sg:dat', 'sg:gen', 'sg:inst'])
sure_bfls_pl = _base_form_label_pks(['pl:dat', 'pl:inst', 'pl:loc'])


def basic_form_endings(lexical_class, ic, basic_form, form_set):
    """Return the basic-form endings that may apply to ``basic_form``.

    The candidates come from ``basic_form_endings_dict[(lexical_class, ic)]``.
    For classes other than 'subst' they are simply narrowed to endings
    whose string is one of ``suffixes(basic_form)``.

    For 'subst' each suffix is handled separately: the suffix is cut off
    the basic form, the remainders of all forms in ``form_set`` starting
    with the resulting root are collected, and a raw SQL condition keeps
    only pattern '0000' and patterns for which, for the singular or for
    the plural group of "sure" base form labels, no ending in that group
    lies outside the collected remainders.
    """
    candidates = basic_form_endings_dict[(lexical_class, ic)]
    if lexical_class != 'subst':
        return candidates.filter(string__in=suffixes(basic_form))

    # breakneck, but it speeds things up a little
    result = Ending.objects.none()
    for suffix in suffixes(basic_form):
        stem = cut_end(basic_form, suffix)
        stem_length = len(stem)
        observed = tuple(
            form[stem_length:] for form in form_set if form.startswith(stem))
        with_suffix = candidates.filter(string=suffix)
        compatible = Pattern.objects.filter(
            pk__in=with_suffix.values_list('pattern', flat=True)
        ).extra(
            where=["(id = '0000' or not exists "
                   "(select id from zakonczenia where w_id = wzory.id "
                   "and zak not in %s and efobaz in %s) or not exists "
                   "(select id from zakonczenia where w_id = wzory.id "
                   "and zak not in %s and efobaz in %s))"],
            params=[observed, sure_bfls_sg, observed, sure_bfls_pl])
        result = result | with_suffix.filter(pattern__in=compatible)
    return result


# Cache of bad_pattern_subst results keyed by (pattern, ic).
memoized_pattern_ics = {}


def bad_pattern_subst(pattern, ic):
    """Tell whether ``pattern`` should be rejected for characteristic ``ic``.

    A pattern is rejected when:

    * no lexeme inflection pattern with this characteristic symbol uses it,
      not counting lexemes whose status is 'cand', or
    * its type symbol is contained in 'mn' and ``ic`` is 'f', or
    * its type symbol is contained in 'fm' and ``ic`` starts with 'n'.

    Results are cached in ``memoized_pattern_ics``.
    """
    key = (pattern, ic)
    if key in memoized_pattern_ics:
        return memoized_pattern_ics[key]
    # exists() only asks the database whether a row is there instead of
    # loading every matching row just to test for emptiness.
    attested = pattern.lexemeinflectionpattern_set.filter(
        inflection_characteristic__symbol=ic).exclude(
        lexeme__status='cand').exists()
    if not attested:
        ret = True
    else:
        type_symbol = pattern.type.symbol
        # startswith() instead of ic[0] so that an empty ic cannot raise.
        ret = ((type_symbol in 'mn' and ic == 'f') or
               (type_symbol in 'fm' and ic.startswith('n')))
    memoized_pattern_ics[key] = ret
    return ret


# Cache of acceptable ending strings keyed by (pattern, ic).
memoized_good_endings = {}


def good_ending_set_subst(pattern, ic, root):
    """Return the set of forms ``root + ending`` acceptable for ``ic``.

    The pattern's endings are narrowed according to ``ic``:

    * unless ``ic`` is 'm1' or 'p1', the 'pl:nom:mo' ending is dropped;
    * when ``ic`` starts with 'p', only endings whose label starts with
      'pl' are kept; otherwise 'pl:gen:<g>' endings are dropped for every
      g in 'mfn' different from the first character of ``ic``;
    * for 'p3', 'pl:gen:m' is dropped when the pattern type is 'f' and
      'pl:gen:n' when it is 'n'.

    The ending strings are cached per (pattern, ic); only the
    concatenation with ``root`` is repeated on each call.
    """
    key = (pattern, ic)
    if key not in memoized_good_endings:
        candidates = pattern.endings
        if ic not in ('m1', 'p1'):
            candidates = candidates.exclude(
                base_form_label__symbol='pl:nom:mo')
        if ic[0] == 'p':
            candidates = candidates.filter(
                base_form_label__symbol__startswith='pl')
        else:
            for gender in set('mfn') - set(ic[0]):
                candidates = candidates.exclude(
                    base_form_label__symbol__startswith='pl:gen:' + gender)
        if ic == 'p3':
            type_symbol = pattern.type.symbol
            if type_symbol == 'f':
                candidates = candidates.exclude(
                    base_form_label__symbol='pl:gen:m')
            elif type_symbol == 'n':
                candidates = candidates.exclude(
                    base_form_label__symbol='pl:gen:n')
        memoized_good_endings[key] = list(
            candidates.values_list('string', flat=True))
    return set(root + ending for ending in memoized_good_endings[key])


def good_ending_set(lexical_class, ic, pattern, root=''):
    """Return the set of forms that ``pattern`` yields for ``root``.

    For 'subst' the set is computed by ``good_ending_set_subst`` (which
    takes ``ic`` into account); for every other lexical class it is
    ``pattern.ending_set(root)``.
    """
    if lexical_class == 'subst':
        return good_ending_set_subst(pattern, ic, root)
    return pattern.ending_set(root)


def relevant_subst(ending, ic):
    """Tell whether a noun ``ending`` counts for the characteristic ``ic``.

    The decision is based on the symbol of the ending's base form label
    (split on ':') and on the symbol of the ending's pattern type.  The
    ending is not relevant when any of the checks below applies; they are
    tried in this order.
    """
    label = ending.base_form_label.symbol
    parts = label.split(':')
    type_symbol = ending.pattern.type.symbol
    is_m1_or_p1 = ic in ('m1', 'p1')

    # plain 'pl:nom' is skipped for m1/p1
    if is_m1_or_p1 and label == 'pl:nom':
        return False
    # label carries a third part whose first letter differs from ic's
    if len(parts) >= 3 and ic[0] != 'p' and parts[2][0] != ic[0]:
        return False
    # ic starting with 'p' only takes labels whose first part is 'pl'
    if ic[0] == 'p' and parts[0] != 'pl':
        return False
    # p3: one 'pl:gen:*' variant is skipped depending on the pattern type
    if ic == 'p3' and label.startswith('pl:gen:'):
        if type_symbol == 'n' and parts[2] == 'n':
            return False
        if type_symbol == 'f' and parts[2] == 'm':
            return False
    # 'pl:nom:mo' is only taken for m1/p1
    if not is_m1_or_p1 and label == 'pl:nom:mo':
        return False
    return True


def relevant_adj(ending):
    """Return False for endings labelled '0' or '3+', True otherwise."""
    label = ending.base_form_label.symbol
    return not (label == '0' or label == '3+')


def relevant(lexical_class, ending, ic):
    """Dispatch the relevance check of ``ending`` on the lexical class.

    'subst' is handled by ``relevant_subst`` and 'adj' by ``relevant_adj``;
    for any other class the result is None.
    """
    if lexical_class == 'adj':
        return relevant_adj(ending)
    if lexical_class == 'subst':
        return relevant_subst(ending, ic)
    return None


def find_patterns(basic_form, pos, ic, forms):
    """Match the forms of a lexeme against the known inflection patterns.

    ``forms`` is a sequence of ``(form, tag)`` pairs.  Every candidate
    basic-form ending from ``basic_form_endings`` gives a
    ``(pattern, root)`` pair, which is classified as:

    * *included* - every relevant ending of the pattern yields a form that
      is present in the lexeme;
    * *including* - the pattern's good ending set covers all forms of the
      lexeme but some relevant endings yield forms that are absent (those
      forms are stored alongside);
    * *matching* - both of the above hold.

    Returns ``(kind, patterns, included, including)``.  When a matching
    pattern exists, ``kind`` is 'match' and ``patterns`` lists the matching
    pairs; otherwise ``kind`` and ``patterns`` come from
    ``find_many_patterns``.
    """
    # find all included and including patterns
    form_set = set(form for form, tag in forms)
    ending_sets = {}
    included_patterns = set()
    including_patterns = set()
    matching_patterns = set()
    for basic_ending in basic_form_endings(pos, ic, basic_form, form_set):
        pattern = basic_ending.pattern
        if pos == 'subst' and bad_pattern_subst(pattern, ic):
            # TODO: should the lexeme comment mention patterns rejected
            # because of the inflection characteristic?
            continue
        root = basic_form[:len(basic_form) - len(basic_ending.string)]
        ending_sets[pattern] = good_ending_set(pos, ic, pattern, root)
        including = form_set.issubset(ending_sets[pattern])
        # forms the pattern requires but the lexeme does not have
        bad_forms = set(
            root + ending.string for ending in pattern.endings.all()
            if relevant(pos, ending, ic)
            and root + ending.string not in form_set)
        if not bad_forms:
            included_patterns.add((pattern, root))
            if including:
                matching_patterns.add((pattern, root))
        elif including:
            including_patterns.add(((pattern, root), tuple(bad_forms)))

    # not sure whether this is needed, but just in case
    included_patterns = list(included_patterns)
    including_patterns = list(including_patterns)
    matching_patterns = list(matching_patterns)
    if matching_patterns:
        if DEBUG:
            print(u'dokładne wzory: %s' % join(matching_patterns))
        return ('match', matching_patterns, included_patterns,
                including_patterns)
    # nothing matches exactly, or several patterns have to be combined
    if DEBUG and including_patterns:
        print(u'zawierające: %s' % join(p for p, b_f in including_patterns))
    if DEBUG and included_patterns:
        print(u'zawarte: %s' % join(included_patterns))
    many_result = find_many_patterns(
        pos, ic, form_set, basic_form, included_patterns, ending_sets)
    return many_result + (included_patterns, including_patterns)


def find_many_patterns(pos, ic, form_set, basic_form, included_patterns,
                       ending_sets):
    """Try to cover ``form_set`` with several included patterns.

    ``included_patterns`` holds ``(pattern, root)`` pairs and
    ``ending_sets`` maps each pattern to the set of forms it yields.

    Returns ``('none', [])`` as soon as some form is produced by no
    included pattern.  Otherwise returns ``('many', sets)``: when the
    patterns that are the only source of some form already cover all
    forms, ``sets`` holds that single list of pairs; if not, ``sets`` is
    the result of ``find_minimal_sets``.
    """
    necessary_patterns = set()
    missing_form = None
    for form in form_set:
        having = [(pattern, root) for pattern, root in included_patterns
                  if form in ending_sets[pattern]]
        if not having:
            missing_form = form
            break
        if len(having) == 1:
            # the only pattern producing this form cannot be left out
            necessary_patterns.add(having[0])
    # compare with None so that an empty-string form is still reported
    if missing_form is not None:
        if DEBUG:
            print(u"brak formy: %s" % missing_form)
        return 'none', []
    covered_forms = set()
    for pattern, root in necessary_patterns:
        covered_forms |= ending_sets[pattern]
    if form_set.issubset(covered_forms):
        if DEBUG:
            print(u"pokryte koniecznymi wzorami: %s" %
                  join(necessary_patterns))
        return 'many', [list(necessary_patterns)]
    minimal_sets = find_minimal_sets(
        form_set, covered_forms, necessary_patterns, included_patterns,
        ending_sets)
    return 'many', minimal_sets


def filter_patterns(filter, action_name, type, patterns, included, including,
                    lexical_class, form_set, entry, ic):
    """Apply ``filter`` to a matching result and repair it when needed.

    ``filter`` is a callable taking a collection of ``(pattern, root)``
    pairs and returning the pairs that are kept; ``action_name`` is used
    only in debug messages.  ``type``, ``patterns``, ``included`` and
    ``including`` have the shape returned by ``find_patterns``.

    * 'many': when the filter changes any pattern set, ``included`` is
      filtered and the sets are recomputed; if that no longer gives
      'many', the original result is restored.
    * 'none': only ``including`` is filtered.
    * otherwise ('match'): ``patterns``, ``including`` and ``included`` are
      filtered; if no pattern survives, a multi-pattern match is tried and,
      should it give 'none', the original patterns are restored.

    Returns ``(type, patterns, included, including, bad_patterns)`` where
    ``bad_patterns`` is True when an original result had to be restored.
    """
    def rematch(candidates):
        # recompute the multi-pattern match for the filtered candidates
        sets = dict(
            (pattern, good_ending_set(lexical_class, ic, pattern, root))
            for pattern, root in candidates)
        return find_many_patterns(
            lexical_class, ic, form_set, entry, candidates, sets)

    def filtered_including():
        # filter the keys, then re-attach their stored values
        by_key = dict(including)
        return [(key, by_key[key]) for key in filter(by_key)]

    original_patterns, original_included = patterns, included
    bad_patterns = False
    if type == 'many':
        changed = any(
            pattern_set != filter(pattern_set) for pattern_set in patterns)
        if changed:
            included = filter(included)
            type, patterns = rematch(included)
            if type != 'many':
                debug(entry, u'mnogie dopasowanie zepsute przez %s (%s)' %
                             (action_name, join_many(original_patterns)))
                type = 'many'
                patterns, included = original_patterns, original_included
                bad_patterns = True
    elif type == 'none':
        including = filtered_including()
    else:  # type == 'match'
        patterns = filter(patterns)
        including = filtered_including()
        included = filter(included)
        if original_patterns and not patterns:
            type, patterns = rematch(included)
            if type == 'none':
                debug(entry, u'znikły wzory przez %s (%s)' %
                             (action_name, join(original_patterns)))
                type = 'match'
                patterns = original_patterns
                bad_patterns = True
    return type, patterns, included, including, bad_patterns


def process_forms(forms, base, pos, commonness):
    """Match one lexeme against the patterns and print the import data.

    ``forms`` is the list of ``(form, tag)`` pairs collected for the entry
    ``base`` with part of speech ``pos``; ``commonness`` is stored in the
    generated lexeme data.

    Nothing is done when a lexeme with this entry already exists.  The
    patterns found by ``find_patterns`` are passed through the blacklist
    filter; then, unless ``DEBUG`` is set, a lexeme record with its
    inflection patterns is built and handed to ``print_data``.  An
    adjective that is only contained in some patterns is instead passed to
    ``print_forms`` and skipped.
    """
    # exists() avoids loading the matching lexemes just to test for one
    if Lexeme.objects.filter(entry=base).exists():
        return
    ic = inflection_characteristic(forms, pos)
    form_set = set(form for form, tag in forms)
    match_type, patterns, included, including = find_patterns(
        base, pos, ic, forms)
    match_type, patterns, included, including, bad_patterns = filter_patterns(
        blacklist_filter, u'czarną listę', match_type, patterns, included,
        including, pos, form_set, base, ic)
    # the patterns do not change from this point on

    if match_type == 'none':
        debug(base, u'zawiera się w %s' % join(p for p, b_f in including))
        chosen = []
        fitting = including
        if pos == 'adj' and including:
            print_forms(forms, 'rzeczownik#')
            return
    elif match_type == 'match':
        patterns.sort(key=lambda p: p[0].name)
        fitting = patterns
        chosen = patterns[:1]
    elif match_type == 'many':
        chosen = patterns[0]
        if DEBUG:
            print(u'zestawy wielu wzorów: %s' % join_many(patterns))
        fitting = patterns

    if DEBUG:
        return

    comments = [u'z Korpusu IPI 1.0']
    if commonness == u'własna' or match_type != 'match' or len(fitting) > 1:
        status = 'cand'
    else:
        status = 'desc'
    if bad_patterns:
        comments.append(u'Wzory z czarnej listy!')
        status = 'cand'
    if len(fitting) > 1 or (match_type == 'none' and fitting):
        if match_type == 'none':
            comments.append(u'Zawierające wzory:')
            for (pattern, root), bad_forms in fitting:
                comments.append(
                    u'%s: %s' % (pattern.name, ', '.join(bad_forms)))
        elif match_type != 'many':
            comments.append(u'Pasujące wzory: %s' % join(fitting))
        else:
            comments.append(
                u'Pasujące zestawy wzorów: %s' % join_many(fitting))
    comment = '\n'.join(comments)
    lips = [create_lip(pattern[0], pattern[1], index + 1, ic, pos)
            for index, pattern in enumerate(chosen)]
    lexeme_data = create_lexeme(base, 1, pos, status, comment)
    lexeme_data['commonness'] = commonness
    print_data({
        'lexeme': lexeme_data,
        'lips': lips,
    })


def import_kipi(input_file):
    """Read KIPI forms from ``input_file`` and process them lexeme by lexeme.

    Every non-blank line holds four tab-separated, UTF-8 encoded fields:
    form, base, commonness code and tag.  Consecutive lines sharing the
    same (base, part of speech, commonness code) are collected and passed
    to ``process_forms`` together; the part of speech is 'subst' when the
    tag starts with 'subst' and 'adj' otherwise.

    Blank lines are skipped and an input without any data lines is a
    no-op (both used to raise).  A line with a different number of fields
    still raises ValueError, and an unknown commonness code KeyError.
    """
    last_key = None
    forms = None
    for line in input_file:
        line = line.strip()
        if not line:
            continue
        form, base, comm, tag = line.decode('utf-8').split('\t')
        # startswith rather than split, because of the bare 'adja' tag
        pos = 'subst' if tag.startswith('subst') else 'adj'
        key = (base, pos, comm)
        if key != last_key:
            if last_key is not None:
                process_forms(forms, last_key[0], last_key[1],
                              COMMONNESS[last_key[2]])
            last_key = key
            forms = []
        forms.append((form, tag))
    # flush the last group, if the input contained any data at all
    if last_key is not None:
        process_forms(forms, last_key[0], last_key[1],
                      COMMONNESS[last_key[2]])