lexeme_export.py 24.2 KB

Edit Raw Blame History Permalink

# -*- coding: utf-8 -*-
from datetime import datetime
import time
import locale
import sys

from django.db import connection
from django.db.models import Count
import operator
from common.util import debug, flatten, uniprint
from export.lexeme_form_query import attr_clauses_combinations, \
    EXPORT_FROM_CLAUSES, WHERE_CLAUSES
from dictionary.models import CrossReferenceType, ClassificationValue, \
    LexemeAttributeValue, Gender, TableTemplate, HomonymNumber, Lexeme, \
    LexemeAttribute, SavedExportData, CachedExport, Variant, Classification

# Switch the whole process to Polish locale rules at import time;
# LexemeExport.export() sorts entries and forms with locale.strcoll.
locale.setlocale(locale.LC_ALL, 'pl_PL.utf8')

# When true, spaces in classification labels and in qualifier strings are
# replaced with underscores in the exported data.
UNDERSCORES = True

# Batch size passed to CachedExport.objects.bulk_create() in export().
BATCH_SIZE = 10000

# Separator put between the qualifier labels of a single exported form.
QUALIFIER_SEPARATOR = '|'


# Header written by export_from_cache(). It is filled with
# (dict_id, copyright) and the result is then passed through
# datetime.strftime, so strftime directives in it get expanded.
HEADER_TEMPLATE = u"""#!DICT-ID %s
#<COPYRIGHT>
%s
#</COPYRIGHT>"""


class LexemeExport(object):
    """Export inflected forms of lexemes as (form, entry, tag, ...) rows.

    Rows are either written as tab-separated text (export(),
    export_from_cache()) or stored as CachedExport objects (export() with
    cache=True, cache_lexeme()).
    """

    # Forms tagged 'adjc' that keep that tag in export_row(); any other
    # 'adjc' form is exported with two 'adj:...' tags instead.
    ADJPREDYKATYWNE = [
        u'ciekaw',
        u'godzien',
        u'gotów',
        u'łaskaw',
        u'świadom',
        u'winien',
        u'zdrów',
        # doubtful:
        u'dłużen',
        u'miłościw',
        u'praw',
        u'wesół',
        u'żyw',
    ]

    # Maps a reflexivity attribute value to the suffix appended to the tag
    # in export_row() when the 'refl' setting is on.
    REFL_TRANSLATION = {
        u'—': 'nonrefl',
        u'się': 'refl',
        u'sobie': 'refl',
        u'się/sobie': 'refl',
        u'(się)': 'refl.nonrefl',
        u'(sobie)': 'refl.nonrefl',
    }

    # Maps an aspect attribute value to the text substituted for the
    # 'ASPEKT' placeholder of a tag template.
    ASPECT_TRANSLATION = {
        u'dk': 'perf',
        u'ndk': 'imperf',
        u'dk/ndk': 'imperf.perf',
        u'dk/(ndk)': 'imperf.perf',
        u'ndk/dk': 'imperf.perf',
        u'ndk/(dk)': 'imperf.perf',
    }

    # Parts of speech exported through export_skr() instead of the SQL
    # queries of export_part().
    SKR_POS = ('skrl', 'skrw', 'skrf')

    # Attribute whose value export_skr() uses as the exported entry.
    # NOTE(review): this lookup runs when the class body is executed, i.e.
    # at import time - presumably a database query; confirm that importing
    # this module without a ready database is never needed.
    SKR_ATTR = LexemeAttribute.objects.get(name=u'rozwinięcie')

    # Parts of speech whose forms are exported under the entry of another
    # lexeme reached through a cross-reference; the value is the part of
    # speech used for the homonym letter lookup in homonym_entry().
    TRANSLATE_NESTED = {
        'adjcom': 'adj',
        'advcom': 'adv',
        'ger': 'v',
        'pact': 'v',
        'ppas': 'v',
        'appas': 'v',
    }

    def __init__(self, export_data_name=None, data=None, output_file=None,
                 print_debug=False, cache=False):
        """Collect the export settings and precompute SQL fragments.

        export_data_name -- name of a SavedExportData record whose
            get_data() result becomes the settings; when given, `data`
            is ignored.
        data -- dict overriding the built-in default settings (used only
            when export_data_name is not given).
        output_file -- file-like object for text output; sys.stdout when
            omitted.
        print_debug -- when true, export() reports progress and timings
            on stderr.
        cache -- when true, rows are built as CachedExport objects
            instead of plain tuples.
        """
        self.debug = print_debug
        self.cache = cache
        # Both are filled in by homonyms_init().
        self.homonym_entries = None
        self.homonyms_with_numbers = None

        if export_data_name:
            saved = SavedExportData.objects.get(name=export_data_name)
            self.data = saved.get_data()
        else:
            # Note: 'copyright' and 'dict_id' have no defaults here.
            settings = {
                'vocabs': ['SGJP'],
                'antivocabs': ['antyMorfeusz'],
                'variant': 'Morfeusz',
                'excluding_qualifiers': [],
                'magic_qualifiers': [],
                'refl': False,
                'commonness': True,
                'homonym_numbers': True,
                'form_qualifiers': True,
            }
            if data is not None:
                settings.update(data)
            self.data = settings
        self.output_file = output_file or sys.stdout

        # One SQL placeholder per vocabulary / anti-vocabulary.
        self.vocabs_placeholders = ', '.join(
            '%s' for _ in self.data['vocabs'])
        self.antivocabs_clause = ''
        if self.data['antivocabs']:
            antivocabs_placeholders = ', '.join(
                '%s' for _ in self.data['antivocabs'])
            self.antivocabs_clause = '''not exists (
          select * from leksemy_w_slownikach ls2 where ls2.l_id = l.id
          and ls2.slownik in (%s)) and''' % antivocabs_placeholders

        excluded = self.data['excluding_qualifiers']
        self.qualifier_clauses = ''.join(
            self.qualifier_clause(qualifier_id) for qualifier_id in excluded)
        # One copy of the parametrised clause per (pattern, qualifier) pair.
        self.magic_qualifier_clauses = ''.join(
            self.magic_qualifier_clause()
            for _pattern, _qualifier_id in self.data['magic_qualifiers'])

        self.crtype_ids = CrossReferenceType.objects.filter(
            symbol__in=['comadv', 'comadj', 'gerver', 'pactver', 'ppasver']
        ).values_list('id', flat=True)
        self.cv_ids = ClassificationValue.objects.filter(
            classification__name=u'pospolitość').values_list('id', flat=True)

        self.genders = dict(Gender.objects.values_list('id', 'symbol'))

        def attribute_values(attribute_name):
            # (id, value) pairs of all values of the named attribute.
            return LexemeAttributeValue.objects.filter(
                attribute__name=attribute_name).values_list('id', 'value')

        self.refls = dict(attribute_values(u'zwrotność'))
        self.aspects = dict(attribute_values(u'aspekt'))
        self.persons = dict(attribute_values(u'osoba'))
        self.cases = dict(attribute_values(u'przypadek'))
        self.numbers = dict(
            (value_id, value.replace('/', '.'))
            for value_id, value in attribute_values(u'liczba selektywna'))

        self.select = '''prefix||rdzen||zak||suffix, l.pos, tag_template,
            refl.attribute_value_id, o.gender_id,
            aspect.attribute_value_id, person.attribute_value_id,
            "case".attribute_value_id, number.attribute_value_id, hn.number
        '''

        self.qualifier_select = ''',
        (select string_agg(kwal, '%s')
        from kwalifikatory
        where id in ((
                select qualifier_id from kwalifikatory_leksemow
                where lexeme_id = l.id)
            union (
                select qualifier_id from kwalifikatory_odmieniasiow
                where lexemeinflectionpattern_id = o.id)
            union (
                select qualifier_id from kwalifikatory_zakonczen
                where ending_id = z.id)))
        ''' % QUALIFIER_SEPARATOR

        def value_condition(alias, value_ids):
            # "<alias>.attribute_value_id in (<ids>)" for a join condition.
            return '%s.attribute_value_id in (%s)' % (
                alias, ', '.join(str(value_id) for value_id in value_ids))

        # TODO: it would be smarter to include only the attributes that can
        # occur for the given part of speech.
        attribute_joins = '''
        join leksemy_w_slownikach ls on (ls.l_id = l.id)
        left join dictionary_lexemeav aspect
            on (l.id = aspect.lexeme_id and %(aspect)s)
        left join dictionary_lexemeav person
            on (l.id = person.lexeme_id and %(person)s)
        left join dictionary_lexemeav "case"
            on (l.id = "case".lexeme_id and %(case)s)
        left join dictionary_lexemeav number
            on (l.id = number.lexeme_id and %(number)s)
        ''' % {
            'aspect': value_condition('aspect', self.aspects),
            'person': value_condition('person', self.persons),
            'case': value_condition('"case"', self.cases),
            'number': value_condition('number', self.numbers),
        }
        self.table_joins = EXPORT_FROM_CLAUSES + attribute_joins
        self.table_clause = WHERE_CLAUSES

        # Query parameters shared by every export_part() call, in the
        # order: variant, vocabularies, anti-vocabularies.
        self.params_part = [self.data['variant']]
        self.params_part = (
            self.params_part + list(self.data['vocabs']) +
            list(self.data['antivocabs']))

        labels = ClassificationValue.objects.values_list('id', 'label')
        if UNDERSCORES:
            labels = (
                (value_id, label.replace(' ', '_'))
                for value_id, label in labels)
        self.cv_table = dict(labels)

    def queryset(self, pos_set=None, all_pos=False):
        """Return the Lexeme queryset matching the export settings.

        pos_set -- parts of speech to keep; when None (and all_pos is
            false) the parts of speech of the variant's table templates
            are used, minus the keys of TRANSLATE_NESTED.
        all_pos -- when true, no part-of-speech filter is applied.
        """
        antivocabs = self.data['antivocabs']
        result = Lexeme.objects.filter(
            vocabularies__id__in=self.data['vocabs'])
        if antivocabs:
            # Reuse the raw SQL clause with the table alias adapted to the
            # ORM query; the clause ends with "and", hence the "true".
            where_sql = self.antivocabs_clause.replace(' l.', ' leksemy.')
            result = result.extra(
                where=[where_sql + ' true'], params=antivocabs)
        result = result.exclude(status=Lexeme.STATUS_CANDIDATE)
        if not all_pos:
            if pos_set is None:
                variant_pos = TableTemplate.objects.filter(
                    variant_id=self.data['variant']).values_list(
                    'parts_of_speech', flat=True).distinct()
                pos_set = set(variant_pos) - set(self.TRANSLATE_NESTED)
            result = result.filter(part_of_speech_id__in=pos_set)
        return result.exclude(
            qualifiers__in=self.data['excluding_qualifiers'])

    @staticmethod
    def qualifier_clause(q_id):
        """Return the SQL condition excluding rows with qualifier q_id.

        The qualifier is checked on the lexeme (l), the inflection pattern
        (o) and the ending (z). The fragment ends with "and " so that it
        can be followed by further conditions. q_id must be a number; it
        is formatted with %d.
        """
        template = '''not exists (
        select * from kwalifikatory_leksemow where lexeme_id = l.id and
          qualifier_id = %(q)d) and not exists (
        select * from kwalifikatory_odmieniasiow where qualifier_id = %(q)d and
          lexemeinflectionpattern_id = o.id) and not exists (
        select * from kwalifikatory_zakonczen where qualifier_id = %(q)d and
          ending_id = z.id) and '''
        return template % dict(q=q_id)

    @staticmethod
    def magic_qualifier_clause():
        """Return the parametrised SQL condition for one magic qualifier.

        The fragment contains two query placeholders: a LIKE pattern for
        the tag and a qualifier id. It rejects rows whose tag matches the
        pattern when the lexeme carries that qualifier.
        """
        clause = '''and not (tag like %s and exists (
        select kw.id
        from kwalifikatory kw
          join kwalifikatory_leksemow kwl on kw.id = kwl.qualifier_id
        where kwl.lexeme_id = l.id and kw.id = %s)) '''
        return clause

    def homonym_entry(self, entry, pos, hn):
        """Return entry, disambiguated when it is shared by many lexemes.

        An entry found in self.homonym_entries gets the suffix
        ":<letter><number>". The letter comes from
        HomonymNumber.MORFEUSZ_LETTERS_REVERSE for the part of speech
        (after TRANSLATE_NESTED mapping). The number hn is used only when
        the (entry, pos) pair is itself repeated. Other entries are
        returned unchanged. Requires homonyms_init() to have been called.
        """
        if entry not in self.homonym_entries:
            return entry
        if (entry, pos) in self.homonyms_with_numbers:
            # Several lexemes share entry and part of speech: a homonym
            # number is required to tell them apart.
            assert hn
            number = hn
        else:
            number = ''
        letter_pos = self.TRANSLATE_NESTED.get(pos, pos)
        letter = HomonymNumber.MORFEUSZ_LETTERS_REVERSE.get(letter_pos)
        assert letter
        return entry + ':%s%s' % (letter, number)

    def export_row(self, row, acc):
        """Convert one raw row into export rows appended to acc.

        row -- sequence starting with: lexeme id, main lexeme id, entry,
            form, part of speech, tag template, reflexivity value id,
            gender id, aspect value id, person value id, case value id,
            number value id, homonym number. It is followed by the
            classification value id (when the 'commonness' setting is on)
            and then the qualifier string (when 'form_qualifiers' is on).
        acc -- list receiving CachedExport objects (cache mode) or tuples
            (entry, main_lexeme_id, form, tag[, commonness][, qualifiers]).
        """
        fixed_columns = 13
        (lexeme_id, main_lexeme_id, entry, form, pos, tag, refl_id,
         gender_id, aspect_id, person_id, case_id, number_id,
         hn) = row[:fixed_columns]

        next_column = fixed_columns
        cv = ''
        if self.data['commonness']:
            cv_id = row[next_column]
            next_column += 1
            if cv_id:
                cv = self.cv_table[cv_id]
        quals = ''
        if self.data['form_qualifiers']:
            quals = row[next_column] or ''
            if UNDERSCORES and quals:
                quals = quals.replace(' ', '_')

        if not self.cache and self.data['homonym_numbers']:
            entry = self.homonym_entry(entry, pos, hn)

        form = form.lstrip('+')  # inflected postfixes
        fixed_tags = None
        if tag == 'adja':
            form = form.rstrip('+')
        elif tag == 'adjc' and form not in self.ADJPREDYKATYWNE:
            fixed_tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]

        if self.data['refl'] and pos in ('v', 'pact', 'ger'):
            if refl_id in self.refls:
                tag += ':' + self.REFL_TRANSLATION[self.refls[refl_id]]
            else:
                debug(entry, u'Nieznana zwrotność: %s' % refl_id)

        # Placeholders of the tag template, in substitution order. The
        # values are computed lazily because the lookups fail for rows
        # that do not carry the corresponding attribute.
        substitutions = (
            ('RODZAJ', lambda: self.genders[gender_id]),
            ('ASPEKT',
             lambda: self.ASPECT_TRANSLATION[self.aspects[aspect_id]]),
            ('OSOBA', lambda: self.persons[person_id]),
            ('PRZYPADEK', lambda: self.cases[case_id]),
            ('LICZBA', lambda: self.numbers[number_id]),
            ('PL',
             lambda: '.p1.p2' if 'pl' in self.numbers[number_id] else ''),
        )
        for placeholder, replacement in substitutions:
            if placeholder in tag:
                tag = tag.replace(placeholder, replacement())

        for final_tag in fixed_tags or [tag]:
            if self.cache:
                acc.append(
                    CachedExport(
                        variant_id=self.data['variant'],
                        lexeme_id=lexeme_id,
                        form=form, entry=entry, tag=final_tag,
                        commonness=cv, qualifiers=quals,
                        main_lexeme_id=main_lexeme_id))
                continue
            output_row = (entry, main_lexeme_id, form, final_tag)
            if self.data['commonness']:
                output_row += (cv,)
            if self.data['form_qualifiers']:
                output_row += (quals,)
            acc.append(output_row)

    def export_part(self, cursor, tt, nested, attr_clauses, cell_c, tt_c, acc):
        if not nested:
            params = self.params_part + flatten(self.data['magic_qualifiers'])
            query = """
select distinct l.id, l.id, haslo, %(select)s %(clas_field)s %(qual_field)s
    %(table_joins)s
    %(clas_join)s
    left join dictionary_lexemeav refl
        on (l.id = refl.lexeme_id and %(refl)s)
    left join dictionary_homonymnumber hn
        on (l.id = hn.lexeme_id and hn.variant_id = %%s)
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    and l.status<>'cand' and not l.usuniety %(magic)s and %(attr_clauses)s
    """
        else:
            params = self.params_part
            query = """
select distinct l.id, g.id, g.haslo as haslo, %(select)s %(clas_field)s %(qual_field)s
    %(table_joins)s
    %(clas_join)s
    join odsylacze on l.id=l_id_od
    join leksemy g on (l_id_do=g.id and g.usuniety = false)
    left join dictionary_lexemeav refl
        on (g.id = refl.lexeme_id and %(refl)s)
    left join dictionary_homonymnumber hn
        on (g.id = hn.lexeme_id and hn.variant_id = %%s)
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    and typods_id in (%(crtype_ids)s) and
    l.status<>'cand' and not l.usuniety and %(attr_clauses)s
    """
        query = query % {
            'vocabs': self.vocabs_placeholders,
            'antivocabs': self.antivocabs_clause,
            'x_qual': self.qualifier_clauses,
            'magic': self.magic_qualifier_clauses,
            'crtype_ids': ', '.join(str(id) for id in self.crtype_ids),
            'clas_field':
                ', classification_value_id' if self.data['commonness'] else '',
            'qual_field':
                self.qualifier_select if self.data['form_qualifiers'] else '',
            'select': self.select,
            'table_joins': self.table_joins,
            'table_clause': self.table_clause,
            'clas_join':
                '''left outer join dictionary_lexemecv wkl
                    on (wkl.lexeme_id=l.id and
                        wkl.classification_value_id in (%s))'''
                % ', '.join(str(id) for id in self.cv_ids)
                if self.data['commonness'] else '',
            'refl':
                'refl.attribute_value_id in (%s)'
                % ', '.join(str(id) for id in self.refls),
            'attr_clauses': ' and '.join(attr_clauses)
        }
        # if tt.name == 'czasowniki':
        #     print >>sys.stderr, query
        #     print >>sys.stderr,
        #         [tt.id] + params + list(tt_c + cell_c + cell_c)
        cursor.execute(
            query, [tt.id] + params + list(tt_c + cell_c + cell_c))
        for row in cursor:
            self.export_row(row, acc)

    def export_skr(self, skr, acc):
        """Export an abbreviation lexeme as a single 'brev' row.

        skr -- abbreviation lexeme; its SKR_ATTR attribute value (with
            spaces replaced by underscores) becomes the exported entry
            and its own entry, without trailing dots, becomes the form.
        acc -- list receiving a CachedExport object (cache mode) or a
            tuple (entry, lexeme id, form, tag[, ''][, qualifiers]).

        Nothing is appended when the expansion is missing or when it
        contains any of the characters ",./12".
        """
        entry = skr.entry
        expansion = skr.attribute_value(self.SKR_ATTR)
        if not expansion:
            # cache_lexeme() shows attribute_value() may yield no value;
            # report and skip instead of failing on `.value`.
            debug(entry, u'Brak rozwinięcia skrótu')
            return
        rozw = expansion.value.replace(' ', '_')
        if any(char in rozw for char in ',./12'):
            return
        # Fetch qualifiers only for abbreviations that are exported, and
        # join them with the separator used for all other forms.
        quals = ''
        if self.data['form_qualifiers']:
            quals = QUALIFIER_SEPARATOR.join(
                skr.qualifiers.values_list('label', flat=True))
        pun = entry.endswith('.')
        skr_form = entry.rstrip('.')
        tag = 'brev:pun' if pun else 'brev:npun'
        if self.cache:
            acc.append(CachedExport(
                variant_id=self.data['variant'],
                lexeme_id=skr.id,
                form=skr_form, entry=rozw, tag=tag,
                commonness='', qualifiers=quals,
                main_lexeme_id=skr.id))
            return
        output_row = (rozw, skr.id, skr_form, tag)
        if self.data['commonness']:
            output_row += ('',)
        if self.data['form_qualifiers']:
            output_row += (quals,)
        acc.append(output_row)

    def homonyms_init(self):
        """Compute the sets used by homonym_entry().

        homonym_entries -- entries shared by more than one exported
            lexeme.
        homonyms_with_numbers -- (entry, part of speech) pairs shared by
            more than one exported lexeme.
        """
        def repeated(*fields):
            # Groups of exported lexemes with identical `fields`, limited
            # to groups of more than one lexeme.
            return (self.queryset().values(*fields)
                    .annotate(count=Count('pk')).filter(count__gt=1))

        shared_entries = repeated('entry').values_list('entry', flat=True)
        shared_pairs = repeated('entry', 'part_of_speech').values_list(
            'entry', 'part_of_speech')
        self.homonym_entries = set(shared_entries)
        self.homonyms_with_numbers = set(shared_pairs)

    def export(self):
        """Export all forms selected by the settings.

        Runs export_part() for every table template of the variant and
        every combination of attribute values, then adds abbreviations
        through export_skr().

        In cache mode the resulting CachedExport objects are stored with
        bulk_create. Otherwise the rows are sorted by entry, main lexeme
        id and form, and written to self.output_file as tab-separated
        lines: form, entry, tag and the optional columns.

        With self.debug set, progress and timings are printed to stderr.
        """
        start = start0 = 0
        if self.debug:
            start0 = time.time()
            uniprint(u'preparing aux data', file=sys.stderr)
        if self.data['homonym_numbers']:
            self.homonyms_init()
        tts = TableTemplate.objects.filter(
            variant_id=self.data['variant']).prefetch_related(
                'attributes__values', 'cell_attributes__values',
                'parts_of_speech')
        if self.debug:
            print >>sys.stderr, time.time() - start0
        export_data = []
        cursor = connection.cursor()
        # Release the cursor even when a query or a row conversion fails.
        try:
            for tt in tts:
                if self.debug:
                    uniprint(
                        u'exporting table: %s' % tt.name, file=sys.stderr)
                    start = time.time()
                attr_clauses, cell_attr_combinations, \
                    tt_attr_combinations = attr_clauses_combinations(tt)
                # Materialised because it is iterated once per tt_c.
                cell_attr_combinations = list(cell_attr_combinations)
                first_pos = tt.parts_of_speech.all()[0]
                nested = first_pos.symbol in self.TRANSLATE_NESTED
                for tt_c in tt_attr_combinations:
                    for cell_c in cell_attr_combinations:
                        self.export_part(
                            cursor, tt, nested, attr_clauses, cell_c, tt_c,
                            export_data)
                        if self.debug:
                            print >>sys.stderr, time.time() - start
                            start = time.time()
        finally:
            cursor.close()
        if self.debug:
            print >>sys.stderr, time.time() - start
            uniprint(u'exporting abbreviations', file=sys.stderr)
            start = time.time()
        for skr in self.queryset(pos_set=self.SKR_POS):
            self.export_skr(skr, export_data)
        if not self.cache:
            if self.debug:
                print >>sys.stderr, time.time() - start
                uniprint(u'sorting', file=sys.stderr)
                start = time.time()
            # Three stable sorts, least significant key first: the final
            # order is by entry, then main lexeme id, then form (entry and
            # form compared with the locale's collation rules).
            export_data.sort(cmp=lambda t1, t2: locale.strcoll(t1[2], t2[2]))
            export_data.sort(key=operator.itemgetter(1))
            export_data.sort(cmp=lambda t1, t2: locale.strcoll(t1[0], t2[0]))
        if self.debug:
            print >>sys.stderr, time.time() - start
            uniprint(u'outputting', file=sys.stderr)
            start = time.time()
        if self.cache:
            CachedExport.objects.bulk_create(export_data, batch_size=BATCH_SIZE)
        else:
            for r in export_data:
                # Output columns: form, entry, tag[, commonness][, quals];
                # the main lexeme id (r[1]) is only a sort key.
                uniprint(
                    u'\t'.join((r[2], r[0]) + r[3:]), file=self.output_file)
        if self.debug:
            print >>sys.stderr, time.time() - start
            print >>sys.stderr, 'Total:', time.time() - start0

    def export_from_cache(self):
        """Write the export to self.output_file using CachedExport rows.

        The output starts with HEADER_TEMPLATE filled with the 'dict_id'
        and 'copyright' settings and passed through strftime. It is
        followed by one tab-separated line per cached row: form, entry
        (with homonym marking when enabled), tag and the optional
        commonness and qualifier columns.

        Raises KeyError when the settings lack 'dict_id' or 'copyright'.
        """
        if self.data['homonym_numbers']:
            self.homonyms_init()
        lexeme_ids = tuple(
            self.queryset(all_pos=True).values_list('id', flat=True))
        cached_exports = CachedExport.objects.filter(
            variant_id=self.data['variant'], lexeme_id__in=lexeme_ids)\
            .order_by('entry', 'main_lexeme', 'form')
        # Fetch the homonym number of each row's lexeme for this variant.
        cached_exports = cached_exports.extra(
            select={
                'hn': '(select number from dictionary_homonymnumber h '
                      'where h.lexeme_id = dictionary_cachedexport.lexeme_id '
                      'and h.variant_id = %s)',
            },
            select_params=[self.data['variant']])
        # QuerySet methods return a new queryset: the result must be kept,
        # otherwise row.main_lexeme costs one extra query per row.
        cached_exports = cached_exports.select_related('main_lexeme')
        header = HEADER_TEMPLATE % (
            self.data['dict_id'], self.data['copyright'])
        uniprint(
            datetime.now().strftime(header.encode('utf-8')).decode('utf-8'),
            file=self.output_file)

        for row in cached_exports:
            main_lexeme = row.main_lexeme
            entry = row.entry
            if self.data['homonym_numbers']:
                entry = self.homonym_entry(
                    entry, main_lexeme.part_of_speech_id, row.hn)
            output_row = (row.form, entry, row.tag)
            if self.data['commonness']:
                output_row += (row.commonness,)
            if self.data['form_qualifiers']:
                output_row += (row.qualifiers,)
            uniprint(u'\t'.join(output_row), file=self.output_file)

    def cache_lexeme(self, lexeme):
        """Create the CachedExport rows of a single lexeme.

        Requires cache mode (self.cache). Abbreviations (SKR_POS) are
        handled by export_skr() and saved one by one. For other lexemes
        a raw row is built for every inflection pattern, table cell, main
        lexeme, ending and commonness value; each row goes through
        export_row() and the results are stored with one bulk_create.

        For parts of speech in TRANSLATE_NESTED the main lexemes are the
        targets of the lexeme's cross-references of the exported types;
        otherwise the lexeme is its own main lexeme.
        """
        assert self.cache
        variant = self.data['variant']
        if lexeme.part_of_speech_id in self.SKR_POS:
            acc = []
            self.export_skr(lexeme, acc)
            for cached_export in acc:
                cached_export.save()
            return
        commonness = Classification.objects.get(name=u'pospolitość')
        aspect = LexemeAttribute.objects.get(name=u'aspekt')
        person = LexemeAttribute.objects.get(name=u'osoba')
        case = LexemeAttribute.objects.get(name=u'przypadek')
        number = LexemeAttribute.objects.get(
            name=u'liczba selektywna')
        value_ids = []
        for attr in (aspect, person, case, number):
            value = lexeme.attribute_value(attr)
            value_ids.append(value.id if value else None)
        aspect_id, person_id, case_id, number_id = value_ids
        # A lexeme without a commonness value still yields one row.
        commonness_values = lexeme.classification_values(commonness)\
            .values_list('id', flat=True) or [None]
        hn = lexeme.get_variant_homonym(variant)
        acc = []
        lexeme_quals = set(lexeme.qualifiers.values_list('label', flat=True))
        # (id, entry) of the main lexemes. They do not depend on the
        # pattern or the cell, so they are resolved once, on first use,
        # instead of being queried again for every cell.
        main_lexemes = None
        for lip in lexeme.lexemeinflectionpattern_set.all():
            lip_quals = set(lip.qualifiers.values_list('label', flat=True))
            for cell in lip.cells(variant):
                if main_lexemes is None:
                    if lexeme.part_of_speech_id in self.TRANSLATE_NESTED:
                        main_lexeme_ids = lexeme.refs_to.filter(
                            type_id__in=self.crtype_ids).values_list(
                                'to_lexeme', flat=True)
                    else:
                        main_lexeme_ids = [lexeme.id]
                    main_lexemes = [
                        (main_id, Lexeme.objects.get(id=main_id).entry)
                        for main_id in main_lexeme_ids]
                # Forms and qualifiers depend only on the pattern, the
                # cell and the ending: compute them once per cell rather
                # than once per main lexeme.
                forms = []
                endings = lip.pattern.endings.filter(
                    base_form_label=cell.base_form_label)
                for ending in endings:
                    ending_quals = set(
                        ending.qualifiers.values_list('label', flat=True))
                    quals = QUALIFIER_SEPARATOR.join(
                        lexeme_quals | lip_quals | ending_quals)
                    form = '%s%s%s%s' % (
                        cell.prefix, lip.root, ending.string, cell.suffix)
                    forms.append((form, quals))
                for main_lexeme_id, main_entry in main_lexemes:
                    for form, quals in forms:
                        for cv in commonness_values:
                            row = (
                                lexeme.id,
                                main_lexeme_id,
                                main_entry,
                                form,
                                lexeme.part_of_speech_id,
                                cell.tag_template,
                                None,  # refl
                                lip.gender_id,
                                aspect_id,
                                person_id,
                                case_id,
                                number_id,
                                hn,
                            )
                            if self.data['commonness']:
                                row += (cv,)
                            if self.data['form_qualifiers']:
                                row += (quals,)
                            self.export_row(row, acc)
        CachedExport.objects.bulk_create(acc)


def refresh_export(lexeme):
    """Rebuild the cached export rows of a lexeme.

    Deletes every CachedExport row of the lexeme and then recreates the
    rows for each export variant, using only the lexeme's owner
    vocabulary and no qualifier-based exclusions.
    """
    def settings_for(variant_id):
        # Fresh dict (and fresh lists) for every exporter instance.
        return {
            'vocabs': [lexeme.owner_vocabulary_id],
            'antivocabs': [],
            'variant': variant_id,
            'excluding_qualifiers': [],
            'magic_qualifiers': [],
            'refl': False,
            'commonness': True,
            'homonym_numbers': True,
            'form_qualifiers': True,
        }

    CachedExport.objects.filter(lexeme=lexeme).delete()
    export_variants = Variant.objects.filter(type=Variant.TYPE_EXPORT)
    for variant in export_variants:
        exporter = LexemeExport(data=settings_for(variant.id), cache=True)
        exporter.cache_lexeme(lexeme)