# import_witek.py 4.67 KB

# Edit Raw Blame History

# -*- coding: utf-8 -*-
import sys
from django.core.management.base import BaseCommand
from django.db.transaction import atomic

from common.util import uniopen, no_history
from dictionary.models import Lexeme, Vocabulary, Inflection, Qualifier, \
    ClassificationValue, LexemeCV, Gender, LexemeAttributeValue, \
    CrossReferenceType, CrossReference
from patterns.models import Pattern


class Command(BaseCommand):
    """Management command that imports lexemes from a ';'-separated file.

    Positional arguments:
        filename -- path of the file to import; opened with ``uniopen``.
        comment -- text stored in the ``comment`` field of every new lexeme.
    """
    help = ("Imports subst, adv and adj lexemes from a semicolon-separated "
            "file into the WSJP vocabulary.")
    args = 'filename comment'

    def handle(self, filename, comment, *args, **options):
        """Open ``filename`` and hand its lines to ``import_lexemes``."""
        # NOTE(review): the object returned by uniopen is never closed
        # explicitly here -- confirm whether it needs closing.
        import_lexemes(uniopen(filename), comment)


# Vocabulary that owns every lexeme created by new_lexeme(). The lookup runs
# once, when this module is imported, so the 'WSJP' row must already exist.
WSJP = Vocabulary.objects.get(id='WSJP')


@atomic
def import_lexemes(lines, comment):
    """Import one lexeme per input line, all inside a single transaction.

    Each line is stripped and split on ';'. The second column selects the
    importer ('subst', 'adv' or 'adj'); lines with any other value there are
    ignored. Blank lines are skipped.

    lines -- iterable of text lines.
    comment -- passed unchanged to the importer of every line.

    A non-blank line that contains no ';' still raises IndexError.
    """
    no_history()
    # Built inside the function because the importers are defined further
    # down in the module.
    importers = {
        'subst': import_subst,
        'adv': import_adv,
        'adj': import_adj,
    }
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line has no columns; without this guard elements[1]
            # below would raise IndexError and abort the import.
            continue
        elements = line.split(';')
        importer = importers.get(elements[1])
        if importer is not None:
            importer(elements, comment)


def new_lexeme(entry, pos, comment):
    """Create, save and return a lexeme owned by the WSJP vocabulary.

    The lexeme is saved with status ``Lexeme.STATUS_DESCRIBED`` and then
    added to WSJP via ``WSJP.add_lexeme``.

    entry -- value for the lexeme's ``entry`` field.
    pos -- part-of-speech key, stored as ``part_of_speech_id``.
    comment -- value for the lexeme's ``comment`` field.
    """
    lexeme = Lexeme(
        entry=entry, part_of_speech_id=pos, status=Lexeme.STATUS_DESCRIBED,
        # The *_id attribute takes the raw key, not the Vocabulary instance.
        owner_vocabulary_id=WSJP.pk, comment=comment)
    lexeme.save()
    WSJP.add_lexeme(lexeme)
    return lexeme


def import_subst(elements, comment):
    """Import one 'subst' lexeme from an already split input line.

    elements -- exactly five columns: entry, part of speech ('subst'),
        gender symbols separated by '/', pattern items separated by '/',
        and the label of the classification value to attach. A pattern item
        is either a pattern name or "<qualifier label> <pattern name>"
        (split on the last space).
    comment -- passed to ``new_lexeme``.

    A line may list several genders or several patterns, but not both; a line
    with both is reported on stderr and skipped without creating anything.
    With several genders, each gender gets an inflection using the single
    pattern item; otherwise each pattern item gets an inflection with the
    single gender.

    Raises ValueError when the column count is wrong (after reporting the
    line on stderr) or when an inflection's root cannot be determined.
    """
    try:
        entry, pos, gender_symbols, pattern_data, commonness = elements
    except ValueError:
        print >>sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'subst'
    genders = [Gender.objects.get(symbol=g) for g in gender_symbols.split('/')]
    inflection_data = [p.rsplit(' ', 1) for p in pattern_data.split('/')]
    if len(genders) > 1 and len(inflection_data) > 1:
        print >>sys.stderr, 'mnogie wzory i rodzaje', elements
        return
    if len(genders) == 1:
        inflection_data = [(ld, genders[0]) for ld in inflection_data]
    else:
        inflection_data = [(inflection_data[0], g) for g in genders]
    lexeme = new_lexeme(entry, 'subst', comment)
    comm_value = ClassificationValue.objects.get(
        classification__name=u'pospolitość', label=commonness)
    LexemeCV.objects.create(lexeme=lexeme, classification_value=comm_value)
    for i, (ld, gender) in enumerate(inflection_data, 1):
        inflection = Inflection(lexeme=lexeme, index=i, gender=gender)
        if len(ld) == 1:
            # Bare pattern name, no qualifier.
            pattern = ld[0]
            qualifier = None
        else:
            qualifier, pattern = ld
        inflection.pattern = Pattern.objects.get(name=pattern)
        # Compute the root once and reuse it for both the check and the save.
        root = inflection.get_root()
        if root is None:
            raise ValueError(u"%s: can't find root" % repr(entry))
        inflection.root = root
        inflection.save()
        if qualifier:
            # Qualifiers can only be attached after the inflection is saved.
            inflection.qualifiers.add(Qualifier.objects.get(label=qualifier))


# Pattern and cross-reference type used by import_adv(); both are looked up
# once, when this module is imported.
NDM = Pattern.objects.get(name='ndm')  # hardcoded pattern
ADVADJ = CrossReferenceType.objects.get(symbol='advadj')


def import_adv(elements, comment):
    """Import one 'adv' lexeme and link it to its adjective when possible.

    elements -- exactly four columns: entry, part of speech ('adv'),
        pattern name (must be 'ndm') and the entry of the related adjective.
    comment -- passed to ``new_lexeme``.

    The new lexeme gets a single inflection with the NDM pattern. If exactly
    one lexeme has ``adj_entry`` as its entry, an ADVADJ cross-reference from
    the new lexeme to it is created; otherwise a message is written to stderr
    and no cross-reference is made.

    Raises ValueError (after reporting the line on stderr) when the column
    count is wrong.
    """
    try:
        entry, pos, pattern_name, adj_entry = elements
    except ValueError:
        print >>sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'adv' and pattern_name == 'ndm'

    adverb = new_lexeme(entry, 'adv', comment)
    inflection = Inflection(lexeme=adverb, index=1, pattern=NDM)
    inflection.root = inflection.get_root()
    inflection.save()

    # NOTE(review): the lookup matches on entry only, not on part of speech
    # -- confirm that this is intended.
    candidates = Lexeme.objects.filter(entry=adj_entry)
    found = len(candidates)
    if found == 1:
        CrossReference.objects.create(
            from_lexeme=adverb, to_lexeme=candidates.get(), type=ADVADJ)
    elif found == 0:
        print >>sys.stderr, 'Brak przymiotnika: %s (%s)' % (adj_entry, entry)
    else:
        print >>sys.stderr, 'Niejednoznaczny przymiotnik: %s (%s)' % (
            adj_entry, entry)


# Attribute values attached by import_adj(): the 'obecna' / 'nieobecna'
# values of the 'forma poprz.' and 'forma złoż.' lexeme attributes. All four
# are looked up once, when this module is imported.
POPRZ = LexemeAttributeValue.objects.get(
    value=u'obecna', attribute__name=u'forma poprz.')
NO_POPRZ = LexemeAttributeValue.objects.get(
    value=u'nieobecna', attribute__name=u'forma poprz.')
ZLOZ = LexemeAttributeValue.objects.get(
    value=u'obecna', attribute__name=u'forma złoż.')
NO_ZLOZ = LexemeAttributeValue.objects.get(
    value=u'nieobecna', attribute__name=u'forma złoż.')


def import_adj(elements, comment):
    """Import one 'adj' lexeme from an already split input line.

    elements -- exactly five columns: entry, part of speech ('adj'), pattern
        name, and two flag columns. A flag starting with '+' attaches ZLOZ
        (fourth column) or POPRZ (fifth column); anything else, including an
        empty column, attaches NO_ZLOZ or NO_POPRZ.
    comment -- passed to ``new_lexeme``.

    The new lexeme gets a single inflection with the named pattern.

    Raises ValueError (after reporting the line on stderr) when the column
    count is wrong.
    """
    try:
        entry, pos, pattern_name, zloz, poprz = elements
    except ValueError:
        print >> sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'adj'
    lexeme = new_lexeme(entry, 'adj', comment)
    # startswith() rather than [0]: an empty column must not raise IndexError.
    if zloz.startswith('+'):
        ZLOZ.add_lexeme(lexeme)
    else:
        NO_ZLOZ.add_lexeme(lexeme)
    if poprz.startswith('+'):
        POPRZ.add_lexeme(lexeme)
    else:
        NO_POPRZ.add_lexeme(lexeme)
    pattern = Pattern.objects.get(name=pattern_name)
    inflection = Inflection(lexeme=lexeme, index=1, pattern=pattern)
    # NOTE(review): unlike import_subst, a None root is not rejected here --
    # confirm whether that is intended.
    inflection.root = inflection.get_root()
    inflection.save()