# export.py 6.78 KB
#-*- coding:utf-8 -*-

import sys
from django.db import connection
from common.util import debug, flatten
from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue

# Adjective forms that export_lexemes leaves untouched when their tag is
# 'adjc': a form listed here keeps the plain 'adjc' tag, any other 'adjc'
# form is written out with two 'adj:...:pos' tags instead.
# (Judging by the name these are predicative adjective forms -- TODO confirm.)
ADJPREDYKATYWNE = [
    u'ciekaw',
    u'godzien',
    u'gotów',
    u'łaskaw',
    u'świadom',
    u'winien',
    u'zdrów',
    # doubtful:
    u'dłużen',
    u'miłościw',
    u'praw',
    u'wesół',
    u'żyw',
]

# Maps a reflexivity attribute value (the ``value`` of a LexemeAttributeValue
# of the attribute named u'zwrotność', as loaded in export_lexemes) to the
# suffix that export_lexemes appends to the tag after a ':'.
REFL_TRANSLATION = {
    u'—': 'nonrefl',
    u'się': 'refl',
    u'sobie': 'refl',
    u'się/sobie': 'refl',
    u'(się)': 'refl.nonrefl',
    u'(sobie)': 'refl.nonrefl',
}


def qualifier_clause(q_id):
    """Return the SQL condition excluding rows marked with qualifier q_id.

    The fragment chains three ``not exists`` sub-selects, one each for the
    lexeme, inflection-pattern and ending qualifier tables, and finishes
    with a trailing ``and `` so further conditions can follow it directly.
    It relies on the aliases ``l``, ``o`` and ``z`` of the enclosing query.

    q_id is rendered with ``%d``, so it must be a number.
    """
    template = '\n'.join((
        'not exists (',
        '    select * from kwalifikatory_leksemow where lexeme_id = l.id and',
        '      qualifier_id = %(q)d) and not exists (',
        '    select * from kwalifikatory_odmieniasiow where qualifier_id = %(q)d and',
        '      lexemeinflectionpattern_id = o.id) and not exists (',
        '    select * from kwalifikatory_zakonczen where qualifier_id = %(q)d and',
        '      ending_id = z.id) and ',
    ))
    return template % {'q': q_id}


def magic_qualifier_clause():
    """Return the SQL condition for one "magic" qualifier.

    The fragment starts with ``and`` and rejects rows whose tag matches a
    ``like`` pattern while the lexeme ``l`` carries a given qualifier.  It
    contains two ``%s`` placeholders -- the tag pattern first, then the
    qualifier id -- which are left for the database driver to fill in.
    """
    pieces = (
        'and not (tag like %s and exists (',
        '    select kw.id',
        '    from kwalifikatory kw',
        '      join kwalifikatory_leksemow kwl on kw.id = kwl.qualifier_id',
        '    where kwl.lexeme_id = l.id and kw.id = %s)) ',
    )
    return '\n'.join(pieces)


def export_lexemes(data=None, output_file=None):
    """Export inflected forms as tab-separated, UTF-8 encoded lines.

    Runs one raw SQL query (a ``union all`` of two selects) and, for every
    returned row, writes one line per tag to output_file:
    ``form<TAB>entry<TAB>tag``, followed by ``<TAB>label`` of the row's
    classification value (or an empty string) when data['commonness'] is
    true.

    The first select returns forms under the lexeme's own ``haslo``; the
    second one returns forms of cross-referenced lexemes (pos 'adjcom',
    'advcom', 'ger', 'pact', 'ppas', 'appas') under the ``haslo`` of the
    lexeme the cross-reference points to.

    data -- dict of export options; when falsy, a default configuration
        is used.  Keys read:
        'vocabs'               -- values ``ls.slownik`` must be one of
        'antivocabs'           -- values excluding a lexeme when it also
                                  appears under one of them
        'variant'              -- value compared with the ``wariant`` column
        'excluding_qualifiers' -- numeric qualifier ids, each turned into a
                                  qualifier_clause() condition
        'magic_qualifiers'     -- (tag pattern, qualifier id) pairs, each
                                  bound to one magic_qualifier_clause()
        'refl'                 -- if true, tags of 'v', 'pact' and 'ger'
                                  rows get a reflexivity suffix
        'commonness'           -- if true, select and write the
                                  classification value column
    output_file -- object with a ``write`` method; defaults to sys.stdout.
    """
    if not data:
        data = {
            'vocabs': ['PoliMorf'],
            'antivocabs': [],
            'variant': 'Morfeusz',
            'excluding_qualifiers': [],
            'magic_qualifiers': [],
            'refl': False,
            'commonness': False,
        }
    if output_file is None:
        output_file = sys.stdout
    commonness = data['commonness']

    # One driver-level placeholder per vocabulary; the values themselves are
    # passed to cursor.execute() below.
    vocabs_placeholders = ', '.join('%s' for v in data['vocabs'])

    if data['antivocabs']:
        antivocabs_placeholders = ', '.join('%s' for v in data['antivocabs'])
        antivocabs_clause = '''not exists (
      select * from leksemy_w_slownikach ls2 where ls2.l_id = l.id
      and ls2.slownik in (%s)) and''' % antivocabs_placeholders
    else:
        antivocabs_clause = ''

    qualifier_clauses = ''.join(
        qualifier_clause(q_id) for q_id in data['excluding_qualifiers'])
    magic_qualifier_clauses = ''.join(
        magic_qualifier_clause() for pattern, q_id in data['magic_qualifiers'])

    crtypes = ['comadv', 'comadj', 'gerver', 'pactver', 'ppasver']
    crtype_ids = CrossReferenceType.objects.filter(
        symbol__in=crtypes).values_list('pk', flat=True)

    cv_ids = ClassificationValue.objects.filter(
        classification__name=u'pospolitość').values_list('pk', flat=True)

    # pk -> value of every reflexivity attribute value.
    refls = dict(LexemeAttributeValue.objects.filter(
        attribute__name=u'zwrotność').values_list('pk', 'value'))

    if commonness:
        clas_field = ', classification_value_id'
        clas_join = (
            'left outer join dictionary_lexemecv wkl '
            'on (wkl.lexeme_id=l.id and wkl.classification_value_id in (%s))'
            % ', '.join(str(pk) for pk in cv_ids))
    else:
        clas_field = ''
        clas_join = ''

    cursor = connection.cursor()
    # The template is formatted twice: here with the named fragments, then
    # by the database driver, which is why the variant placeholder is
    # written as %%s.
    query = """
    select distinct haslo, prefiks||rdzen||zak||sufiks, l.pos, ch.charfl, tag,
      l.id as leksem_id, refl.attribute_value_id %(clas_field)s
    from leksemy l
      join leksemy_w_slownikach ls on (ls.l_id = l.id)
      left outer join dictionary_lexemeav refl
        on (l.id = refl.lexeme_id and %(refl)s)
      join odmieniasie o on (o.l_id = l.id)
      join charfle ch on ch.id = o.charfl
      join wzory w on (o.w_id = w.id)
      join szablony_tabel s on (w.typ=s.wtyp and o.charfl=s.charfl)
      join klatki k on k.st_id = s.id
      join zakonczenia z on (o.w_id=z.w_id and k.efobaz=z.efobaz)
      join efobazy e on (e.id = k.efobaz)
      %(clas_join)s
    where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s
      l.status<>'cand' and wariant=%%s and
      l.pos in ('v','subst','osc','adj','adv', 'num','advndm','burk','comp',
                'conj','interj','prep','qub','ppron','pred') and
      l.usuniety = false %(magic)s
      --and haslo < 'b'
    union all
    -- wymagające gniazdowania przy hasłowaniu: adjcom, advcom, derywaty:
    select distinct g.haslo as haslo, prefiks||rdzen||zak||sufiks, l.pos,
      ch.charfl, tag, l.id as leksem_id, refl.attribute_value_id
      %(clas_field)s
    from leksemy l
      join leksemy_w_slownikach ls on (ls.l_id = l.id)
      join odsylacze on l.id=l_id_od
      join leksemy g on (l_id_do=g.id and g.usuniety = false)
      left outer join dictionary_lexemeav refl
        on (g.id = refl.lexeme_id and %(refl)s)
      join odmieniasie o on l.id=o.l_id
      join charfle ch on ch.id = o.charfl
      join wzory w on (o.w_id = w.id)
      join szablony_tabel s on (w.typ=s.wtyp and o.charfl=s.charfl)
      join klatki k on k.st_id = s.id
      join zakonczenia z on (o.w_id=z.w_id and k.efobaz=z.efobaz)
      %(clas_join)s
    where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s
      typods_id in (%(crtype_ids)s) and
      l.pos in ('adjcom','advcom','ger','pact','ppas','appas') and
      wariant=%%s and l.status<>'cand' and l.usuniety = false
      --and g.haslo < 'b'
    order by haslo, leksem_id
    """ % {
        'vocabs': vocabs_placeholders,
        'antivocabs': antivocabs_clause,
        'x_qual': qualifier_clauses,
        'magic': magic_qualifier_clauses,
        # Ugly, but good enough: the ids are pasted straight into the SQL
        # text instead of being passed as parameters.
        'crtype_ids': ', '.join(str(pk) for pk in crtype_ids),
        'clas_field': clas_field,
        'clas_join': clas_join,
        'refl':
            'refl.attribute_value_id in (%s)'
            % ', '.join(str(pk) for pk in refls),
    }

    # Parameter order follows placeholder order in the query text: vocabs,
    # antivocabs and variant for the first select, then the magic qualifier
    # pairs that close the first select, then vocabs, antivocabs and variant
    # again for the second select.
    params_part = (list(data['vocabs']) + list(data['antivocabs']) +
                   [data['variant']])
    params = params_part + flatten(data['magic_qualifiers']) + params_part
    cursor.execute(query, params)

    refl = data['refl']
    cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
    for row in cursor:
        if commonness:
            entry, form, pos, _ic, tag, _id, refl_id, cv_id = row
        else:
            entry, form, pos, _ic, tag, _id, refl_id = row
            cv_id = None
        # Drop the leading '+' marker (original note: "odmienne postfiksy",
        # presumably inflected postfixes -- TODO confirm).
        form = form.lstrip('+')
        tags = None
        if tag == 'adja':
            form = form.rstrip('+')
        if tag == 'adjc' and form not in ADJPREDYKATYWNE:
            tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
        if refl and pos in ('v', 'pact', 'ger'):
            if refl_id in refls:
                tag += ':' + REFL_TRANSLATION[refls[refl_id]]
            else:
                debug(entry, u'Nieznana zwrotność: %s' % refl_id)
        # The output step has to run once per row.  It used to sit after
        # the loop, so only the last row was written and an empty result
        # set raised NameError.
        for out_tag in tags or [tag]:
            if commonness:
                cv = cv_table[cv_id] if cv_id else ''
                line = u'%s\t%s\t%s\t%s\n' % (form, entry, out_tag, cv)
            else:
                line = u'%s\t%s\t%s\n' % (form, entry, out_tag)
            output_file.write(line.encode('utf-8'))