# export.py 9.87 KB
#-*- coding:utf-8 -*-

import sys
from django.db import connection
from common.util import debug, flatten
from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender

# Forms that keep their 'adjc' tag when exported.  In export_lexemes any
# other form tagged 'adjc' is written out with two regular 'adj:...' tags
# instead.  (Judging by the name these are predicative adjective forms --
# NOTE(review): confirm the linguistic criterion with the data owners.)
ADJPREDYKATYWNE = [
    u'ciekaw',
    u'godzien',
    u'gotów',
    u'łaskaw',
    u'świadom',
    u'winien',
    u'zdrów',
    # doubtful:
    u'dłużen',
    u'miłościw',
    u'praw',
    u'wesół',
    u'żyw',
]

# Maps the value of a lexeme's reflexivity attribute (looked up in
# export_lexemes under the attribute name u'zwrotność') to the marker that
# is appended, after a ':', to the exported tag.  Optional reflexive
# markers (written in parentheses) yield the combined 'refl.nonrefl'.
REFL_TRANSLATION = {
    u'—': 'nonrefl',
    u'się': 'refl',
    u'sobie': 'refl',
    u'się/sobie': 'refl',
    u'(się)': 'refl.nonrefl',
    u'(sobie)': 'refl.nonrefl',
}

# Maps the value of a lexeme's aspect attribute (looked up in
# export_lexemes under the attribute name u'aspekt') to the text that
# replaces the 'ASPEKT' placeholder in a tag template.  Every two-aspect
# variant, whatever its order or parentheses, maps to 'imperf.perf'.
ASPECT_TRANSLATION = {
    u'dk': 'perf',
    u'ndk': 'imperf',
    u'dk/ndk': 'imperf.perf',
    u'dk/(ndk)': 'imperf.perf',
    u'ndk/dk': 'imperf.perf',
    u'ndk/(dk)': 'imperf.perf',
}


def qualifier_clause(q_id):
    """Return a SQL WHERE fragment that rejects rows carrying qualifier q_id.

    The fragment consists of three ``not exists (...) and `` conditions,
    checking the qualifier link tables for the lexeme (alias ``l``), the
    inflection pattern row (alias ``o``) and the ending (alias ``z``).
    It ends with a trailing ``and `` so that further conditions can follow.

    q_id is rendered with ``%d``, so it must be numeric; it is embedded
    directly in the SQL text rather than passed as a bound parameter.
    """
    qualifier_test = 'qualifier_id = %d' % (q_id,)
    # (link table, first condition, second condition) for each sub-query.
    subqueries = (
        ('kwalifikatory_leksemow', 'lexeme_id = l.id', qualifier_test),
        ('kwalifikatory_odmieniasiow', qualifier_test,
         'lexemeinflectionpattern_id = o.id'),
        ('kwalifikatory_zakonczen', qualifier_test, 'ending_id = z.id'),
    )
    return ''.join(
        'not exists (\n    select * from %s where %s and\n      %s) and '
        % subquery
        for subquery in subqueries)


def magic_qualifier_clause():
    """Return a SQL fragment excluding tag/qualifier combinations.

    The fragment starts with ``and`` and holds two ``%s`` placeholders to
    be bound by the database driver, in this order: a LIKE pattern matched
    against ``tag``, then a qualifier id.  A row is rejected when its tag
    matches the pattern and the lexeme (alias ``l``) has that qualifier.
    """
    sql_lines = [
        'and not (tag like %s and exists (',
        '    select kw.id',
        '    from kwalifikatory kw',
        '      join kwalifikatory_leksemow kwl on kw.id = kwl.qualifier_id',
        '    where kwl.lexeme_id = l.id and kw.id = %s)) ',
    ]
    return '\n'.join(sql_lines)


def export_lexemes(data=None, output_file=None):
    """Write the selected lexeme forms to output_file as tab-separated lines.

    Each output line is ``form<TAB>entry<TAB>tag`` (plus a fourth column
    with the commonness label when ``data['commonness']`` is true), encoded
    as UTF-8.

    data -- dict of export settings; when empty or None a default export
        (vocabulary 'PoliMorf', variant 'Morfeusz', nothing excluded) is
        used.  Keys read here:
          'vocabs'               -- vocabularies a lexeme must belong to
          'antivocabs'           -- vocabularies a lexeme must NOT belong to
          'variant'              -- value compared with variant_id
          'excluding_qualifiers' -- qualifier ids; rows whose lexeme,
                                    inflection pattern or ending carries
                                    one of them are dropped
          'magic_qualifiers'     -- (LIKE pattern, qualifier id) pairs; in
                                    the first select only, rows whose tag
                                    matches the pattern and whose lexeme
                                    has the qualifier are dropped
          'refl'                 -- append a reflexivity marker to tags of
                                    lexemes with pos 'v', 'pact' or 'ger'
          'commonness'           -- also export the commonness label
    output_file -- object with a write() method; defaults to sys.stdout.
    """
    if not data:
        data = {
            'vocabs': ['PoliMorf'],
            'antivocabs': [],
            'variant': 'Morfeusz',
            'excluding_qualifiers': [],
            'magic_qualifiers': [],
            'refl': False,
            'commonness': False,
        }
    if output_file is None:
        output_file = sys.stdout
    # One %s placeholder per vocabulary; the values themselves are bound by
    # cursor.execute() below.
    vocabs_placeholders = ', '.join('%s' for v in data['vocabs'])

    # Optional condition rejecting lexemes that belong to any anti-vocabulary.
    if data['antivocabs']:
        antivocabs_placeholders = ', '.join('%s' for v in data['antivocabs'])
        antivocabs_clause = '''not exists (
      select * from leksemy_w_slownikach ls2 where ls2.l_id = l.id
      and ls2.slownik in (%s)) and''' % antivocabs_placeholders
    else:
        antivocabs_clause = ''

    # The excluding-qualifier conditions embed the ids in the SQL text; the
    # magic-qualifier conditions only add %s placeholders (one pair each).
    qualifier_clauses = ''.join(
        qualifier_clause(q_id) for q_id in data['excluding_qualifiers'])
    magic_qualifier_clauses = ''.join(
        magic_qualifier_clause() for pattern, q_id in data['magic_qualifiers'])

    # Cross-reference types followed by the second select of the union to
    # find the lexeme whose entry a derived form is listed under.
    crtypes = ['comadv', 'comadj', 'gerver', 'pactver', 'ppasver']
    crtype_ids = CrossReferenceType.objects.filter(
        symbol__in=crtypes).values_list('id', flat=True)

    # Ids of the commonness classification values; only used in the query
    # when data['commonness'] is set.
    cv_ids = ClassificationValue.objects.filter(
        classification__name=u'pospolitość').values_list('id', flat=True)

    # id -> symbol / value lookup tables.  They restrict the attribute joins
    # in the query and later fill the placeholders in tag templates.
    genders = dict(Gender.objects.values_list('id', 'symbol'))

    refls = dict(LexemeAttributeValue.objects.filter(
        attribute__name=u'zwrotność').values_list('id', 'value'))
    aspects = dict(LexemeAttributeValue.objects.filter(
        attribute__name=u'aspekt').values_list('id', 'value'))
    persons = dict(LexemeAttributeValue.objects.filter(
        attribute__name=u'osoba').values_list('id', 'value'))
    cases = dict(LexemeAttributeValue.objects.filter(
        attribute__name=u'przypadek').values_list('id', 'value'))

    # Columns shared by both halves of the union.  Together with the leading
    # entry column their order must match the row unpacking in the loop below.
    select = '''prefix||rdzen||zak||suffix, l.pos, tag_template,
        l.id as leksem_id, refl.attribute_value_id, o.gender_id,
        aspect.attribute_value_id, person.attribute_value_id,
        "case".attribute_value_id
    '''

    # Joins shared by both halves of the union.  Each dictionary_lexemeav
    # alias is limited to the value ids of one attribute (aspect, person,
    # case), so it yields that attribute's value for the lexeme or NULL.
    table_joins = '''
    join leksemy_w_slownikach ls on (ls.l_id = l.id)
    left join dictionary_lexemeav aspect
        on (l.id = aspect.lexeme_id and %(aspect)s)
    left join dictionary_lexemeav person
        on (l.id = person.lexeme_id and %(person)s)
    left join dictionary_lexemeav "case"
        on (l.id = "case".lexeme_id and %(case)s)
    join odmieniasie o on (o.l_id = l.id)
    join wzory w on (o.w_id = w.id)
    join dictionary_tabletemplate_pattern_types tt_pt on
        w.typ = tt_pt.patterntype_id
    join dictionary_tabletemplate tt on
        (tt_pt.tabletemplate_id = tt.id)
    join dictionary_tabletemplate_parts_of_speech tt_pos on
        (tt.id = tt_pos.tabletemplate_id and
            l.pos = tt_pos.partofspeech_id)
    join dictionary_exportcell ec on tt.id = ec.table_template_id
    join dictionary_exportcell_pattern_types ec_pt on
        (ec.id = ec_pt.exportcell_id and w.typ = ec_pt.patterntype_id)
    left join dictionary_exportcell_genders ec_g on
        ec.id = ec_g.exportcell_id
    join zakonczenia z on
        (o.w_id = z.w_id and ec.base_form_label_id = z.efobaz)
    ''' % {
        'aspect': 'aspect.attribute_value_id in (%s)'
            % ', '.join(str(id) for id in aspects),
        'person': 'person.attribute_value_id in (%s)'
            % ', '.join(str(id) for id in persons),
        'case': '"case".attribute_value_id in (%s)'
            % ', '.join(str(id) for id in cases),
    }

    # Two conditions of the same shape: every attribute value of the lexeme
    # whose attribute is linked to the table template must be among the
    # values allowed by (1) the table template and (2) the export cell.
    table_clause = '''
    true = all (
        select attr_val.id in (select lexemeattributevalue_id from
                dictionary_tabletemplate_attribute_values tt_attr_val
            where tt_attr_val.tabletemplate_id = tt.id)
        from dictionary_lexemeav lav
            join dictionary_lexemeattributevalue attr_val
                on lav.attribute_value_id = attr_val.id
            join dictionary_tabletemplate_attributes tt_attr
                on (attr_val.attribute_id = tt_attr.lexemeattribute_id and
                    tt.id = tt_attr.tabletemplate_id)
        where lav.lexeme_id = l.id) and
    true = all (
        select attr_val.id in (select lexemeattributevalue_id from
                dictionary_exportcell_attribute_values ec_attr_val
            where ec_attr_val.exportcell_id = ec.id)
        from dictionary_lexemeav lav
            join dictionary_lexemeattributevalue attr_val
                on lav.attribute_value_id = attr_val.id
            join dictionary_tabletemplate_cell_attributes ec_attr
                on (attr_val.attribute_id = ec_attr.lexemeattribute_id and
                    tt.id = ec_attr.tabletemplate_id)
        where lav.lexeme_id = l.id) and
    '''

    cursor = connection.cursor()
    # The query is a union of two selects:
    #   1. lexemes of the listed parts of speech, exported under their own
    #      entry (haslo);
    #   2. lexemes with pos adjcom/advcom/ger/pact/ppas/appas, exported under
    #      the entry of the lexeme `g` they are cross-referenced to (the SQL
    #      comment says these need nesting under another entry).
    # The %-formatting below only pastes in the named fragments; the doubled
    # %%s survives as a plain %s placeholder for variant_id.  The magic
    # qualifier fragment is inserted into the first select only.
    query = """
select distinct haslo, %(select)s %(clas_field)s
from leksemy l
    left join dictionary_lexemeav refl
        on (l.id = refl.lexeme_id and %(refl)s)
    %(table_joins)s
    %(clas_join)s
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    l.pos in ('v','subst','osc','adj','adv', 'num','advndm','fraz','comp',
              'conj','interj','prep','part','ppron','pred') and
    variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s
    --and haslo < 'b'
union all
-- wymagające gniazdowania przy hasłowaniu: adjcom, advcom, derywaty:
select distinct g.haslo as haslo, %(select)s %(clas_field)s
from leksemy l
    join odsylacze on l.id=l_id_od
    join leksemy g on (l_id_do=g.id and g.usuniety = false)
    left join dictionary_lexemeav refl
        on (g.id = refl.lexeme_id and %(refl)s)
    %(table_joins)s
    %(clas_join)s
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    typods_id in (%(crtype_ids)s) and
    l.pos in ('adjcom','advcom','ger','pact','ppas','appas') and
    variant_id=%%s and l.status<>'cand' and l.usuniety = false
    --and g.haslo < 'b'
order by haslo, leksem_id
    """ % {
        'vocabs': vocabs_placeholders,
        'antivocabs': antivocabs_clause,
        'x_qual': qualifier_clauses,
        'magic': magic_qualifier_clauses,
        'crtype_ids': ', '.join(str(id) for id in crtype_ids),
        'clas_field': ', classification_value_id' if data['commonness'] else '',
        'select': select,
        'table_joins': table_joins,
        'table_clause': table_clause,
        'clas_join':
            'left outer join dictionary_lexemecv wkl '
            'on (wkl.lexeme_id=l.id and wkl.classification_value_id in (%s))'
            % ', '.join(str(id) for id in cv_ids) if data['commonness'] else '',
        'refl':
            'refl.attribute_value_id in (%s)'
            % ', '.join(str(id) for id in refls),
    }
    # Bound parameters must follow the textual order of the %s placeholders:
    # first select (vocabs, antivocabs, variant), then the magic qualifier
    # (pattern, id) pairs, then the second select (vocabs, antivocabs,
    # variant) again.
    # NOTE(review): flatten() comes from common.util; presumably it turns the
    # list of pairs into one flat list -- confirm there.
    params_part = (list(data['vocabs']) + list(data['antivocabs']) +
                   [data['variant']])
    params = params_part + flatten(data['magic_qualifiers']) + params_part
    cursor.execute(query, params)
    refl = data['refl']
    # id -> label for all classification values (used for the extra column).
    cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
    for row in cursor:
        # Rows carry one extra trailing column when commonness was requested.
        if data['commonness']:
            entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
                person_id, case_id, cv_id = row
        else:
            entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
                person_id, case_id = row
        form = form.lstrip('+') # inflected postfixes
        # `tags` stays None unless the single tag is replaced by several.
        tags = None
        if tag == 'adja':
            form = form.rstrip('+')
        if tag == 'adjc':
            # Only the listed forms keep 'adjc'; all others are exported as
            # ordinary adjective forms with the two tags below.
            if form not in ADJPREDYKATYWNE:
                tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
        if refl and pos in ('v', 'pact', 'ger'):
            if refl_id in refls:
                tag += ':' + REFL_TRANSLATION[refls[refl_id]]
            else:
                debug(entry, u'Nieznana zwrotność: %s' % refl_id)
        # Fill the placeholders of the tag template with the lexeme's values.
        if 'RODZAJ' in tag:
            tag = tag.replace('RODZAJ', genders[gender_id])
        if 'ASPEKT' in tag:
            tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]])
        if 'OSOBA' in tag:
            tag = tag.replace('OSOBA', persons[person_id])
        if 'PRZYPADEK' in tag:
            tag = tag.replace('PRZYPADEK', cases[case_id])
        # If replacement tags were chosen above they win over the (possibly
        # modified) single tag.
        tags = tags or [tag]
        for tag in tags:
            if data['commonness']:
                # A missing classification value is exported as an empty column.
                cv = cv_table[cv_id] if cv_id else ''
                output_file.write(
                    (u'%s\t%s\t%s\t%s\n' % (form, entry, tag, cv)).encode('utf-8'))
            else:
                output_file.write(
                    (u'%s\t%s\t%s\n' % (form, entry, tag)).encode('utf-8'))