import_witek.py 4.74 KB

Edit Raw Blame History

# -*- coding: utf-8 -*-
import sys
from django.core.management.base import BaseCommand
from django.db.models import Max
from django.db.transaction import atomic

from common.util import uniopen, no_history
from dictionary.models import Lexeme, Vocabulary, LexemeInflectionPattern, \
    Qualifier, ClassificationValue, LexemeCV, Gender, \
    LexemeAttributeValue, CrossReferenceType, CrossReference
from patterns.models import Pattern


class Command(BaseCommand):
    """Management command that imports lexemes from a text file.

    Takes two positional arguments: the path of a ';'-separated file and a
    comment that is stored on every lexeme created from it.
    """
    # Usage string for the two positional arguments handle() requires.
    args = '<filename> <comment>'
    help = ("Imports noun, adverb and adjective lexemes from a "
            "semicolon-separated file into the WSJP vocabulary.")

    def handle(self, filename, comment, *args, **options):
        """Open ``filename`` and import all lexeme records found in it.

        filename -- path of the input file, opened with uniopen()
        comment -- comment stored on every created lexeme
        """
        # NOTE(review): the object returned by uniopen() is never closed
        # explicitly here -- confirm whether uniopen() needs a close().
        import_lexemes(uniopen(filename), comment)


# Vocabulary that every lexeme created by this import is added to.  The
# lookup runs when the module is loaded, so a vocabulary with id 'WSJP'
# must already exist in the database.
WSJP = Vocabulary.objects.get(id='WSJP')


@atomic
def import_lexemes(lines, comment):
    """Import all lexeme records from ``lines`` inside one transaction.

    Every non-blank line is a ';'-separated record whose second column
    names the part of speech.  Records marked 'subst', 'adv' or 'adj' are
    handed to the matching import_* function; records with any other part
    of speech are ignored.  Blank lines are skipped.

    lines -- iterable of text lines
    comment -- comment stored on every created lexeme

    Raises IndexError for a non-blank line that has no second column (the
    offending record is reported on stderr first).
    """
    # Built on each call because the import_* functions are defined further
    # down in the module than this function.
    importers = {
        'subst': import_subst,
        'adv': import_adv,
        'adj': import_adj,
    }
    # NOTE(review): presumably switches off history tracking for the
    # import -- confirm against common.util.no_history.
    no_history()
    for line in lines:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no part-of-speech column and
            # would otherwise abort the whole import with an IndexError.
            continue
        elements = line.split(';')
        try:
            pos = elements[1]
        except IndexError:
            # Same diagnostic ("wrong number of columns") as the import_*
            # functions print before re-raising.
            print >>sys.stderr, 'zla liczba kolumn', elements
            raise
        importer = importers.get(pos)
        if importer is not None:
            importer(elements, comment)


# Id assigned to the most recently created lexeme.  Stays None until
# new_lexeme() runs for the first time and seeds it from the database.
next_id = None


def new_lexeme(entry, pos, comment):
    """Create, save and return a new lexeme belonging to the WSJP vocabulary.

    entry -- the lexeme's entry (headword)
    pos -- primary key of the part of speech, e.g. 'subst'
    comment -- text stored in the lexeme's comment field

    Ids are handed out sequentially from the module-level ``next_id``
    counter, which is seeded from the highest existing Lexeme id on the
    first call.
    """
    global next_id
    if next_id is None:
        # Max() over an empty table yields None, so fall back to 0 and
        # start numbering at 1.
        max_id = Lexeme.all_objects.aggregate(Max('id'))['id__max']
        next_id = (max_id or 0) + 1
    else:
        next_id += 1
    lexeme = Lexeme(
        id=next_id, entry=entry, part_of_speech_id=pos,
        status=Lexeme.STATUS_DESCRIBED,
        # The *_id attribute expects the key value, not the model instance.
        owner_vocabulary_id=WSJP.pk,
        comment=comment)
    lexeme.save()
    WSJP.add_lexeme(lexeme)
    return lexeme


def import_subst(elements, comment):
    """Import one noun ('subst') record.

    elements -- list of exactly five columns: entry, part of speech,
        gender symbols joined with '/', inflection patterns joined with
        '/' (each pattern name may be preceded by a qualifier label and a
        space), and the label of a value of the commonness classification
    comment -- comment stored on the created lexeme

    A record may list several genders or several patterns, but not both;
    such a record is reported on stderr and skipped.  With several genders
    the single pattern is repeated once per gender.

    Raises ValueError when the number of columns is wrong or when no root
    can be derived for one of the inflection patterns.
    """
    try:
        entry, pos, gender, pattern_data, commonness = elements
    except ValueError:
        # Message reads "wrong number of columns".
        print >>sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'subst'
    genders = [Gender.objects.get(symbol=g) for g in gender.split('/')]
    # Each item becomes [pattern] or [qualifier, pattern].
    lip_data = [p.rsplit(' ', 1) for p in pattern_data.split('/')]
    if len(genders) > 1 and len(lip_data) > 1:
        # Message reads "multiple patterns and genders".
        print >>sys.stderr, 'mnogie wzory i rodzaje', elements
        return
    if len(genders) == 1:
        lip_data = [(ld, genders[0]) for ld in lip_data]
    else:
        lip_data = [(lip_data[0], g) for g in genders]
    l = new_lexeme(entry, 'subst', comment)
    comm_value = ClassificationValue.objects.get(
        classification__name=u'pospolitość', label=commonness)
    LexemeCV.objects.create(lexeme=l, classification_value=comm_value)
    for i, (ld, gender) in enumerate(lip_data, 1):
        lip = LexemeInflectionPattern(lexeme=l, index=i, gender=gender)
        if len(ld) == 1:
            pattern = ld[0]
            qualifier = None
        else:
            qualifier, pattern = ld
        lip.pattern = Pattern.objects.get(name=pattern)
        lip.root = lip.get_root()
        # Check the value that is about to be saved rather than computing
        # the root a second time.
        if lip.root is None:
            raise ValueError(u"%s: can't find root" % repr(entry))
        lip.save()
        if qualifier:
            lip.qualifiers.add(Qualifier.objects.get(label=qualifier))


# Inflection pattern given to every imported adverb; import_adv() only
# accepts records whose pattern column is 'ndm'.
NDM = Pattern.objects.get(name='ndm')  # hardcoded pattern
# Cross-reference type used to link an imported adverb to its adjective.
ADVADJ = CrossReferenceType.objects.get(symbol='advadj')


def import_adv(elements, comment):
    """Import one adverb ('adv') record and link it to its adjective.

    elements -- list of exactly four columns: entry, part of speech,
        pattern name (must be 'ndm') and the entry of the related adjective
    comment -- comment stored on the created lexeme

    The adverb is always created.  The cross-reference is added only when
    exactly one lexeme has the adjective's entry; otherwise a message
    saying that the adjective is missing or ambiguous goes to stderr.

    Raises ValueError when the number of columns is wrong.
    """
    try:
        entry, pos, pattern_name, adj_entry = elements
    except ValueError:
        # Message reads "wrong number of columns".
        print >>sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'adv'
    assert pattern_name == 'ndm'

    adverb = new_lexeme(entry, 'adv', comment)
    inflection = LexemeInflectionPattern(lexeme=adverb, index=1, pattern=NDM)
    inflection.root = inflection.get_root()
    inflection.save()

    candidates = Lexeme.objects.filter(entry=adj_entry)
    found = len(candidates)
    if found == 0:
        # "No such adjective".
        print >>sys.stderr, 'Brak przymiotnika: %s (%s)' % (adj_entry, entry)
        return
    if found != 1:
        # "Ambiguous adjective".
        print >>sys.stderr, 'Niejednoznaczny przymiotnik: %s (%s)' % (
            adj_entry, entry)
        return
    target = candidates.get()
    CrossReference.objects.create(
        from_lexeme=adverb, to_lexeme=target, type=ADVADJ)


# Attribute values attached to imported adjectives by import_adj().  Each
# of the two attributes has a "present" ('obecna') and an "absent"
# ('nieobecna') value; import_adj() picks one per attribute depending on
# whether the matching input column starts with '+'.
POPRZ = LexemeAttributeValue.objects.get(
    value=u'obecna', attribute__name=u'forma poprz.')
NO_POPRZ = LexemeAttributeValue.objects.get(
    value=u'nieobecna', attribute__name=u'forma poprz.')
ZLOZ = LexemeAttributeValue.objects.get(
    value=u'obecna', attribute__name=u'forma złoż.')
NO_ZLOZ = LexemeAttributeValue.objects.get(
    value=u'nieobecna', attribute__name=u'forma złoż.')


def import_adj(elements, comment):
    """Import one adjective ('adj') record.

    elements -- list of exactly five columns: entry, part of speech,
        inflection pattern name, and two flag columns (``zloz`` and
        ``poprz``); a flag starting with '+' selects the "present"
        attribute value, anything else (including an empty column)
        selects the "absent" one
    comment -- comment stored on the created lexeme

    Raises ValueError when the number of columns is wrong or when no root
    can be derived for the inflection pattern.
    """
    try:
        entry, pos, pattern_name, zloz, poprz = elements
    except ValueError:
        # Message reads "wrong number of columns".
        print >> sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'adj'
    l = new_lexeme(entry, 'adj', comment)
    # startswith() rather than [0] so that an empty column does not raise
    # IndexError.
    if zloz.startswith('+'):
        ZLOZ.add_lexeme(l)
    else:
        NO_ZLOZ.add_lexeme(l)
    if poprz.startswith('+'):
        POPRZ.add_lexeme(l)
    else:
        NO_POPRZ.add_lexeme(l)
    pattern = Pattern.objects.get(name=pattern_name)
    lip = LexemeInflectionPattern(lexeme=l, index=1, pattern=pattern)
    lip.root = lip.get_root()
    # Same guard as import_subst(): do not store a pattern without a root.
    if lip.root is None:
        raise ValueError(u"%s: can't find root" % repr(entry))
    lip.save()