# load_import.py 5.66 KB
# -*- coding: utf-8 -*-

import json
from django.db.models import Max
from django.core.management.base import BaseCommand
from django.db.transaction import atomic

from common.util import no_history, debug, uniopen
from dictionary.models import Lexeme, Inflection, PartOfSpeech, Vocabulary, \
    InflectionCharacteristic, CrossReference, CrossReferenceType, \
    ClassificationValue
from patterns.models import Pattern

# Bounds of the primary-key range that is scanned for already used lexeme
# ids: START_ID is inclusive, END_ID is exclusive.
START_ID = 500000
END_ID = 1000000

# next_id is the id given to the next lexeme built by create_lexeme().
# Continue right after the highest id found inside the range; when nothing
# is found there, start from START_ID.
_ids_in_range = Lexeme.objects.filter(pk__gte=START_ID, pk__lt=END_ID)
_highest_used_id = _ids_in_range.aggregate(Max('id'))['id__max']
if _highest_used_id:
    next_id = _highest_used_id + 1
else:
    next_id = START_ID


class Command(BaseCommand):
    """Management command entry point for the lexeme import.

    The command is currently disabled: ``handle`` raises unconditionally
    and the call to ``load_morfologik`` survives only as a comment.
    """

    help = 'Load prepared lexeme data'
    # Usage string (Polish): <input file name> <vocabulary name> <source name>
    args = '<nazwa pliku wejściowego> <nazwa słownika> <nazwa źródła>'

    def handle(self, input_file, vocab_name, source, **options):
        """Refuse to run; always raises ``Exception('stale code')``."""
        # Formerly delegated to:
        #     load_morfologik(input_file, vocab_name, source)
        raise Exception('stale code')


# Module-level state read by the helper functions below; both names are
# rebound by load_morfologik() before it processes any input.
# (Ugly, but reworking everything to pass them explicitly was not worth it.)
vocab = None
source = None

# Lookup table: part-of-speech symbol -> PartOfSpeech object.
parts_of_speech = PartOfSpeech.objects.all()
pos_table = dict((pos.symbol, pos) for pos in parts_of_speech)

# Lookup table keyed by the pair (characteristic symbol, symbol of its part
# of speech) -> InflectionCharacteristic object.
ic_list = InflectionCharacteristic.objects.all()
ic_table = dict(
    ((char.symbol, char.part_of_speech.symbol), char) for char in ic_list)

# Lookup table: pattern name -> Pattern object.
pattern_list = Pattern.objects.all()
pattern_table = dict((pattern.name, pattern) for pattern in pattern_list)


def associate(l):
    """Add lexeme ``l`` to the module-level ``vocab``.

    When ``vocab.add_lexeme`` returns a falsy value, a debug message
    (Polish: "lexeme assigned to the vocabulary more than once") is emitted,
    except for lexemes whose part-of-speech symbol is 'ppas', 'pact' or
    'ger'.
    """
    if vocab.add_lexeme(l):
        return
    # Repeated assignment is not reported for these parts of speech.
    if l.part_of_speech.symbol in ('ppas', 'pact', 'ger'):
        return
    debug(l.entry, u'wielokrotne przypisanie leksemu do słownika!')


def add_cr(l_from, l_to, symbol):
    """Save a cross-reference leading from ``l_from`` to ``l_to``.

    The cross-reference type is looked up by ``symbol`` together with the
    parts of speech of both lexemes; ``CrossReferenceType.objects.get``
    raises if that lookup does not yield exactly one type.
    """
    type_lookup = {
        'symbol': symbol,
        'from_pos': l_from.part_of_speech,
        'to_pos': l_to.part_of_speech,
    }
    cr_type = CrossReferenceType.objects.get(**type_lookup)
    reference = CrossReference(
        from_lexeme=l_from, to_lexeme=l_to, type=cr_type)
    reference.save()


def create_lexeme(entry, part_of_speech, status, comment, commonness=None):
    """Create a lexeme under the next free id and add it to ``vocab``.

    The new lexeme takes its id from the module-level ``next_id`` counter,
    its source from the module-level ``source`` and ``vocab`` as its owner
    vocabulary. If ``commonness`` is truthy, the lexeme is also added to the
    ClassificationValue carrying that label. The counter is advanced only
    after all of these steps have succeeded.

    Returns the created lexeme.
    """
    global next_id
    fields = dict(
        id=next_id,
        entry=entry,
        part_of_speech=part_of_speech,
        source=source,
        status=status,
        comment=comment,
        owner_vocabulary=vocab,
    )
    lexeme = Lexeme.objects.create(**fields)
    lexeme.fix_homonym_number()
    if commonness:
        classification = ClassificationValue.objects.get(label=commonness)
        classification.add_lexeme(lexeme)
    associate(lexeme)
    next_id += 1
    return lexeme


def create_negated(l):
    """Create the "nie"-prefixed counterpart of lexeme ``l``.

    The new lexeme copies the part of speech of ``l``; its status is "cand"
    when ``l`` is "cand" and "desc" otherwise. Every inflection of ``l`` is
    duplicated with "nie" prepended to the root. Finally the two lexemes are
    linked with "adjnie" / "nieadj" cross-references.
    """
    if l.status == "cand":
        neg_status = "cand"
    else:
        neg_status = "desc"
    lneg = create_lexeme(u"nie" + l.entry, l.part_of_speech, neg_status, '')

    for lip in l.inflection_set.all():
        own_ic = lip.inflection_characteristic
        if own_ic.symbol == "0-":
            # Already carries the "0-" symbol: reuse the same object.
            neg_ic = own_ic
        else:
            neg_ic = ic_table[("0-", "adj")]
        neg_lip = Inflection(
            lexeme=lneg,
            index=lip.index,
            pattern=lip.pattern,
            root=u"nie" + lip.root,
            inflection_characteristic=neg_ic,
        )
        neg_lip.save()

    add_cr(l, lneg, "adjnie")
    add_cr(lneg, l, "nieadj")


def check_der(verb, pos, entry, patterns):
    """Find an existing lexeme that already is the requested derivative.

    A lexeme matches when it has the given ``entry`` and part-of-speech
    symbol ``pos``, its first inflection carries the same inflection
    characteristic symbol as the first inflection of ``verb``, and the set
    of its pattern names equals ``set(patterns)``.

    Returns the first matching lexeme, or None when ``verb`` has no
    inflections or nothing matches. If more than one distinct lexeme
    matches, a debug message (Polish: "ambiguous derivative") is emitted and
    the first match is still returned.
    """
    lips = verb.inflection_set.all()
    if not lips:
        return None
    ic = lips[0].inflection_characteristic.symbol
    wanted_patterns = set(patterns)

    candidates = Lexeme.objects.filter(
        entry=entry, part_of_speech__symbol=pos,
        inflection__inflection_characteristic__symbol=ic)
    matched = []
    seen_ids = set()
    for candidate in candidates:
        # The filter joins across the inflection relation, so a lexeme with
        # several inflections sharing this characteristic symbol comes back
        # once per such inflection. Count each lexeme only once, otherwise
        # it would be reported as an ambiguous derivative of itself.
        if candidate.pk in seen_ids:
            continue
        seen_ids.add(candidate.pk)

        first_lip = candidate.inflection_set.all()[0]
        if first_lip.inflection_characteristic.symbol != ic:
            continue
        candidate_patterns = set(
            candidate.patterns.values_list('name', flat=True))
        if candidate_patterns == wanted_patterns:
            matched.append(candidate)

    if len(matched) > 1:
        debug(entry, u'niejednoznaczny derywat')
    if matched:
        return matched[0]
    return None


def create_derived(l, pos, entry, patterns):
    """Link lexeme ``l`` with its derivative, creating the latter if needed.

    If ``check_der`` finds an existing derivative, it is reused and added to
    ``vocab`` when it is not there yet. Otherwise a new lexeme with part of
    speech ``pos`` and the status of ``l`` is created; it receives a copy of
    every inflection of ``l`` whose pattern name is in ``patterns``, with
    leading "q" characters stripped from the inflection characteristic
    symbol. In both cases cross-references "ver<pos>" and "<pos>ver" are
    saved between the two lexemes.
    """
    lder = check_der(l, pos, entry, patterns)
    if lder:
        if vocab not in lder.vocabularies.all():
            associate(lder)
    else:
        lder = create_lexeme(entry, pos_table[pos], l.status, u'')
        for lip in l.inflection_set.all():
            if lip.pattern.name not in patterns:
                continue
            ic_symbol = lip.inflection_characteristic.symbol.lstrip("q")
            derived_lip = Inflection(
                lexeme=lder,
                index=lip.index,
                pattern=lip.pattern,
                root=lip.root,
                inflection_characteristic=ic_table[(ic_symbol, pos)],
            )
            derived_lip.save()

    add_cr(l, lder, "ver" + pos)
    add_cr(lder, l, pos + "ver")


def _resolve_root(lexeme, lip_data, pattern, ic):
    """Return the root for one inflection record from the input.

    ``lip_data['root']['type']`` selects the strategy: 'string' takes the
    root verbatim from the record, 'compute' asks the lexeme for it.

    Raises ValueError for any other type, so that an inflection is never
    saved with a root left over from a previously processed record.
    """
    root_data = lip_data['root']
    root_type = root_data['type']
    if root_type == 'string':
        return root_data['root']
    if root_type == 'compute':
        return lexeme.get_root(pattern, ic)
    raise ValueError('unknown root type: %r' % (root_type,))


@atomic
def load_morfologik(filename, vocab_name, source_):
    """Import lexemes from a file holding one JSON document per line.

    Sets the module-level ``vocab`` (the Vocabulary whose id equals
    ``vocab_name``) and ``source`` (``source_``), calls ``no_history()``,
    then handles each line according to ``data['lexeme']['source']``:

    * 'sgjp' -- the lexeme with the given id must already exist; it is only
      added to the vocabulary.
    * 'morfologik' -- a new lexeme is created from the 'entry',
      'part_of_speech', 'status', 'comment' and optional 'commonness' keys,
      together with one inflection per item of ``data['lips']`` and, when
      present, the derivatives listed under 'derived'.
    * anything else -- the record is skipped.

    If the record has a 'negated' key, a negated counterpart of the lexeme
    is created as well.

    Raises ValueError when an inflection record has an unknown root type;
    lookup failures (missing keys, unknown symbols, missing objects)
    propagate unchanged.
    """
    global vocab, source
    vocab = Vocabulary.objects.get(id=vocab_name)
    source = source_
    no_history()
    # NOTE(review): whatever uniopen() returns is never closed explicitly
    # here -- confirm whether it needs to be.
    for line in uniopen(filename):
        data = json.loads(line)
        l_data = data['lexeme']
        origin = l_data['source']
        if origin == 'sgjp':
            l = Lexeme.objects.get(pk=l_data['id'])
            associate(l)
        elif origin == 'morfologik':
            l = create_lexeme(
                l_data['entry'],
                pos_table[l_data['part_of_speech']],
                l_data['status'], l_data['comment'],
                l_data.get('commonness'))
            for lip_data in data['lips']:
                pattern = pattern_table[lip_data['pattern']]
                ic = ic_table[tuple(lip_data['ic'])]
                root = _resolve_root(l, lip_data, pattern, ic)
                Inflection(lexeme=l, index=lip_data['ind'],
                           pattern=pattern, root=root,
                           inflection_characteristic=ic).save()
            if 'derived' in data:
                for pos, entry, patterns in data['derived']:
                    create_derived(l, pos, entry, patterns)
        else:
            # No lexeme was produced for this record. Skip it, so that the
            # 'negated' step below cannot act on the lexeme left over from
            # the previous line.
            continue
        if 'negated' in data:
            create_negated(l)