# export.py 10.7 KB
#-*- coding:utf-8 -*-
import locale
import sys
import time

from django.db import connection
from common.util import debug, flatten, uniprint
from dictionary.lexeme_form_query import attr_clauses_combinations, \
    EXPORT_FROM_CLAUSES, WHERE_CLAUSES
from dictionary.models import CrossReferenceType, ClassificationValue, \
    LexemeAttributeValue, Gender, TableTemplate

# Import-time side effect: switch every locale category of the whole process
# to Polish UTF-8.  locale.setlocale raises locale.Error when the
# 'pl_PL.utf8' locale is not available on the host.
locale.setlocale(locale.LC_ALL, 'pl_PL.utf8')

class LexemeExport(object):
    """Export lexeme forms with their tags as tab-separated text.

    The data is fetched with raw SQL through the Django ``connection`` (see
    ``export``) and written to ``output_file``, one
    ``lexeme_id<TAB>form<TAB>entry<TAB>tag[<TAB>classification label]`` line
    per exported form.
    """

    # Forms that keep their 'adjc' tag in ``export_row``; every other 'adjc'
    # form is re-tagged there with two plain ``adj:...`` tags.
    # NOTE(review): the name suggests "predicative adjectives" -- confirm.
    ADJPREDYKATYWNE = [
        u'ciekaw',
        u'godzien',
        u'gotów',
        u'łaskaw',
        u'świadom',
        u'winien',
        u'zdrów',
        # doubtful:
        u'dłużen',
        u'miłościw',
        u'praw',
        u'wesół',
        u'żyw',
    ]

    # Maps the values held in ``self.refls`` (attribute u'zwrotność') to the
    # suffix that ``export_row`` appends to the tag when data['refl'] is set.
    REFL_TRANSLATION = {
        u'—': 'nonrefl',
        u'się': 'refl',
        u'sobie': 'refl',
        u'się/sobie': 'refl',
        u'(się)': 'refl.nonrefl',
        u'(sobie)': 'refl.nonrefl',
    }

    # Maps the values held in ``self.aspects`` (attribute u'aspekt') to the
    # text substituted for the ``ASPEKT`` placeholder of a tag.
    ASPECT_TRANSLATION = {
        u'dk': 'perf',
        u'ndk': 'imperf',
        u'dk/ndk': 'imperf.perf',
        u'dk/(ndk)': 'imperf.perf',
        u'ndk/dk': 'imperf.perf',
        u'ndk/(dk)': 'imperf.perf',
    }

    # Part-of-speech symbols for which ``export`` asks ``export_part`` for
    # the "nested" query variant (the one joining through ``odsylacze``).
    NESTED_POS = ('adjcom','advcom','ger','pact','ppas','appas')

    def __init__(self, data=None, output_file=None):
        """Precompute the SQL fragments, parameters and lookup tables.

        ``data`` is the export configuration dict; when it is falsy a default
        configuration is used (vocabulary ``SGJP``, variant ``Morfeusz``, no
        qualifiers, reflexivity and commonness switched off).
        ``output_file`` is where ``export`` writes its lines; a falsy value
        means ``sys.stdout``.
        """
        if not data:
            data = {
                'vocabs': ['SGJP'],
                'antivocabs': [],
                'variant': 'Morfeusz',
                'excluding_qualifiers': [],
                'magic_qualifiers': [],
                'refl': False,
                'commonness': False,
            }
        self.data = data
        if not output_file:
            output_file = sys.stdout
        self.output_file = output_file

        def placeholders(values):
            # One DB-API "%s" placeholder per value.
            return ', '.join('%s' for _ in values)

        def id_list(ids):
            # Comma-separated ids that get inlined literally into the SQL.
            return ', '.join(str(pk) for pk in ids)

        def attribute_values(attribute_name):
            # {id: value} of every value of the named lexeme attribute.
            values = LexemeAttributeValue.objects.filter(
                attribute__name=attribute_name)
            return dict(values.values_list('id', 'value'))

        def value_filter(alias, values):
            # Restricts a dictionary_lexemeav join to the given value ids.
            return '%s.attribute_value_id in (%s)' % (alias, id_list(values))

        self.vocabs_placeholders = placeholders(self.data['vocabs'])

        # Lexemes that belong to any of the "antivocabs" are excluded.
        self.antivocabs_clause = ''
        antivocabs = self.data['antivocabs']
        if antivocabs:
            self.antivocabs_clause = (
                'not exists (\n'
                '          select * from leksemy_w_slownikach ls2'
                ' where ls2.l_id = l.id\n'
                '          and ls2.slownik in (%s)) and'
            ) % placeholders(antivocabs)

        self.qualifier_clauses = ''.join(
            map(self.qualifier_clause, self.data['excluding_qualifiers']))
        # Each magic qualifier is a (pattern, qualifier id) pair; the values
        # themselves are passed later as query parameters.
        self.magic_qualifier_clauses = ''.join(
            self.magic_qualifier_clause()
            for _pattern, _q_id in self.data['magic_qualifiers'])

        nested_crtypes = ['comadv', 'comadj', 'gerver', 'pactver', 'ppasver']
        crtype_query = CrossReferenceType.objects.filter(
            symbol__in=nested_crtypes)
        self.crtype_ids = crtype_query.values_list('id', flat=True)

        commonness_values = ClassificationValue.objects.filter(
            classification__name=u'pospolitość')
        self.cv_ids = commonness_values.values_list('id', flat=True)

        self.genders = dict(Gender.objects.values_list('id', 'symbol'))

        self.refls = attribute_values(u'zwrotność')
        self.aspects = attribute_values(u'aspekt')
        self.persons = attribute_values(u'osoba')
        self.cases = attribute_values(u'przypadek')

        self.select = '''prefix||rdzen||zak||suffix, l.pos, tag_template,
            refl.attribute_value_id, o.gender_id,
            aspect.attribute_value_id, person.attribute_value_id,
            "case".attribute_value_id
        '''

        # Only the joins added here are %-formatted; EXPORT_FROM_CLAUSES is
        # concatenated untouched.
        attribute_joins = '''
        join leksemy_w_slownikach ls on (ls.l_id = l.id)
        left join dictionary_lexemeav aspect
            on (l.id = aspect.lexeme_id and %(aspect)s)
        left join dictionary_lexemeav person
            on (l.id = person.lexeme_id and %(person)s)
        left join dictionary_lexemeav "case"
            on (l.id = "case".lexeme_id and %(case)s)
        ''' % {
            'aspect': value_filter('aspect', self.aspects),
            'person': value_filter('person', self.persons),
            'case': value_filter('"case"', self.cases),
        }
        self.table_joins = EXPORT_FROM_CLAUSES + attribute_joins

        self.table_clause = WHERE_CLAUSES

        # Parameters shared by every query: vocabularies, excluded
        # vocabularies, then the variant.
        shared_params = list(self.data['vocabs'])
        shared_params.extend(self.data['antivocabs'])
        shared_params.append(self.data['variant'])
        self.params_part = shared_params

        self.cv_table = dict(
            ClassificationValue.objects.values_list('id', 'label'))

    @staticmethod
    def qualifier_clause(q_id):
        return '''not exists (
        select * from kwalifikatory_leksemow where lexeme_id = l.id and
          qualifier_id = %(q)d) and not exists (
        select * from kwalifikatory_odmieniasiow where qualifier_id = %(q)d and
          lexemeinflectionpattern_id = o.id) and not exists (
        select * from kwalifikatory_zakonczen where qualifier_id = %(q)d and
          ending_id = z.id) and ''' % {'q': q_id}

    @staticmethod
    def magic_qualifier_clause():
        return '''and not (tag like %s and exists (
        select kw.id
        from kwalifikatory kw
          join kwalifikatory_leksemow kwl on kw.id = kwl.qualifier_id
        where kwl.lexeme_id = l.id and kw.id = %s)) '''

    def export_row(self, row, acc):
        if self.data['commonness']:
            lexeme_id, entry, form, pos, tag, refl_id, gender_id, \
                aspect_id, person_id, case_id, cv_id = row
        else:
            lexeme_id, entry, form, pos, tag, refl_id, gender_id, \
                aspect_id, person_id, case_id = row
        form = form.lstrip('+') # odmienne postfiksy
        tags = None
        if tag == 'adja':
            form = form.rstrip('+')
        if tag == 'adjc':
            if form not in self.ADJPREDYKATYWNE:
                tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
        if self.data['refl'] and pos in ('v', 'pact', 'ger'):
            if refl_id in self.refls:
                tag += ':' + self.REFL_TRANSLATION[self.refls[refl_id]]
            else:
                debug(entry, u'Nieznana zwrotność: %s' % refl_id)
        if 'RODZAJ' in tag:
            tag = tag.replace('RODZAJ', self.genders[gender_id])
        if 'ASPEKT' in tag:
            tag = tag.replace('ASPEKT',
                self.ASPECT_TRANSLATION[self.aspects[aspect_id]])
        if 'OSOBA' in tag:
            tag = tag.replace('OSOBA', self.persons[person_id])
        if 'PRZYPADEK' in tag:
            tag = tag.replace('PRZYPADEK', self.cases[case_id])
        tags = tags or [tag]
        for tag in tags:
            if self.data['commonness']:
                cv = self.cv_table[cv_id] if cv_id else ''
                acc.append((entry, lexeme_id, form, tag, cv))
            else:
                acc.append((entry, lexeme_id, form, tag))

    def export_part(self, cursor, tt, nested, attr_clauses, cell_c, tt_c, acc):
        if not nested:
            params = self.params_part + flatten(self.data['magic_qualifiers'])
            query = """
select distinct l.id, haslo, %(select)s %(clas_field)s
    %(table_joins)s
    %(clas_join)s
    left join dictionary_lexemeav refl
        on (l.id = refl.lexeme_id and %(refl)s)
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    and variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s and
    %(attr_clauses)s"""
        else:
            params = self.params_part
            query = """
select distinct g.id, g.haslo as haslo, %(select)s %(clas_field)s
    %(table_joins)s
    %(clas_join)s
    join odsylacze on l.id=l_id_od
    join leksemy g on (l_id_do=g.id and g.usuniety = false)
    left join dictionary_lexemeav refl
        on (g.id = refl.lexeme_id and %(refl)s)
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    and typods_id in (%(crtype_ids)s) and
    variant_id=%%s and l.status<>'cand' and l.usuniety = false and
    %(attr_clauses)s"""
        query = query % {
            'vocabs': self.vocabs_placeholders,
            'antivocabs': self.antivocabs_clause,
            'x_qual': self.qualifier_clauses,
            'magic': self.magic_qualifier_clauses,
            'crtype_ids': ', '.join(str(id) for id in self.crtype_ids),
            'clas_field': ', classification_value_id'
            if self.data['commonness'] else '',
            'select': self.select,
            'table_joins': self.table_joins,
            'table_clause': self.table_clause,
            'clas_join':
                '''left outer join dictionary_lexemecv wkl
                    on (wkl.lexeme_id=l.id and
                        wkl.classification_value_id in (%s))'''
                % ', '.join(str(id) for id in self.cv_ids)
                if self.data['commonness'] else '',
            'refl':
                'refl.attribute_value_id in (%s)'
                % ', '.join(str(id) for id in self.refls),
            'attr_clauses': ' and '.join(attr_clauses)
        }
        #if tt.name == 'gerundia':
        #    print >>sys.stderr, query
        #    print >>sys.stderr, [tt.id] + params + list(tt_c + cell_c + cell_c)
        cursor.execute(
            query, [tt.id] + params + list(tt_c + cell_c + cell_c))
        for row in cursor:
            self.export_row(row, acc)

    def export(self):
        """Run the whole export and write the result to ``self.output_file``.

        For every table template of the configured variant, and for every
        combination of template-level and cell-level attribute values
        returned by ``attr_clauses_combinations``, one query is executed
        through ``export_part``.  The collected tuples are then written as
        tab-separated lines: lexeme id, form, entry, tag and, in commonness
        mode, the classification label.  Progress messages and the duration
        of every query and of the output phase go to ``sys.stderr``.

        The database cursor is closed even when a query or the conversion of
        a row raises.
        """
        cursor = connection.cursor()
        try:
            tts = TableTemplate.objects.filter(
                variant_id=self.data['variant']).prefetch_related(
                    'attributes__values', 'cell_attributes__values',
                    'parts_of_speech')
            export_data = []
            for tt in tts:
                uniprint(u'exporting table: %s' % tt.name, file=sys.stderr)
                attr_clauses, cell_attr_combinations, tt_attr_combinations = \
                    attr_clauses_combinations(tt)
                # Only the first part of speech of the template decides which
                # query variant is used.
                nested = tt.parts_of_speech.all()[0].symbol in self.NESTED_POS
                for tt_c in tt_attr_combinations:
                    for cell_c in cell_attr_combinations:
                        start = time.clock()
                        self.export_part(
                            cursor, tt, nested, attr_clauses, cell_c, tt_c,
                            export_data)
                        print >>sys.stderr, time.clock() - start
        finally:
            # Release the cursor on failure as well, not only on success.
            cursor.close()
        # NOTE: sorting of the collected rows is currently disabled:
        # uniprint(u'sorting', file=sys.stderr)
        # start = time.clock()
        # export_data.sort(cmp=lambda t1, t2: locale.strcoll(t1[2], t2[2]))
        # export_data.sort(key=operator.itemgetter(1))
        # export_data.sort(cmp=lambda t1, t2: locale.strcoll(t1[0], t2[0]))
        # print >>sys.stderr, time.clock() - start
        uniprint(u'outputting', file=sys.stderr)
        start = time.clock()
        for r in export_data:
            # r is (entry, lexeme_id, form, tag[, label]); the line starts
            # with lexeme id, form and entry, followed by the rest.
            uniprint(
                u'\t'.join((str(r[1]), r[2], r[0]) + r[3:]),
                file=self.output_file)
        print >>sys.stderr, time.clock() - start