# -*- coding: utf-8 -*-
# lexeme_export.py
import locale
import sys
import time
from datetime import date

from django.db import connection
from django.db.models import Count

from common.util import debug, flatten, uniprint
from export.lexeme_form_query import attr_clauses_combinations, \
    EXPORT_FROM_CLAUSES, WHERE_CLAUSES
from dictionary.models import CrossReferenceType, ClassificationValue, \
    LexemeAttributeValue, Gender, TableTemplate, HomonymNumber, Lexeme, \
    LexemeAttribute, SavedExportData


# Switch every locale category of the process to Polish (UTF-8) as a side
# effect of importing this module.
# NOTE(review): the only locale-dependent code visible in this file is the
# commented-out locale.strcoll sorting in LexemeExport.export -- confirm
# whether this call is still required.
locale.setlocale(locale.LC_ALL, 'pl_PL.utf8')

# When true, spaces are replaced with underscores in the classification value
# labels cached by LexemeExport.__init__ (cv_table) and in the form
# qualifiers emitted by LexemeExport.export_row.
UNDERSCORES = True


class LexemeExport(object):
    """Export lexeme forms as tab-separated rows.

    The export is configured by a settings dict (see ``__init__``).  Rows are
    collected with raw SQL queries per table template plus a pass over
    abbreviation lexemes, and ``export()`` writes them to ``output_file``.
    """
    # Forms that keep their plain 'adjc' tag in export_row(); any other form
    # tagged 'adjc' is emitted with two "adj:..." tags instead.
    # (The name suggests "predicative adjectives".)
    ADJPREDYKATYWNE = [
        u'ciekaw',
        u'godzien',
        u'gotów',
        u'łaskaw',
        u'świadom',
        u'winien',
        u'zdrów',
        # doubtful:
        u'dłużen',
        u'miłościw',
        u'praw',
        u'wesół',
        u'żyw',
    ]

    # Maps reflexivity attribute values (as cached in self.refls) to the
    # suffix appended to the tag by export_row() when data['refl'] is set.
    REFL_TRANSLATION = {
        u'—': 'nonrefl',
        u'się': 'refl',
        u'sobie': 'refl',
        u'się/sobie': 'refl',
        u'(się)': 'refl.nonrefl',
        u'(sobie)': 'refl.nonrefl',
    }

    # Maps aspect attribute values (as cached in self.aspects) to the text
    # substituted for the 'ASPEKT' placeholder of a tag in export_row().
    ASPECT_TRANSLATION = {
        u'dk': 'perf',
        u'ndk': 'imperf',
        u'dk/ndk': 'imperf.perf',
        u'dk/(ndk)': 'imperf.perf',
        u'ndk/dk': 'imperf.perf',
        u'ndk/(dk)': 'imperf.perf',
    }

    # Parts of speech exported by export_skr() (the abbreviations pass).
    SKR_POS = ('skrl', 'skrw', 'skrf')

    # Lexeme attribute whose value export_skr() emits as the entry of each
    # abbreviation.  It is fetched from the database when the class body is
    # executed, i.e. when this module is imported.
    SKR_ATTR = LexemeAttribute.objects.get(name=u'rozwinięcie')

    # Maps a part of speech to the "main" part of speech used by
    # export_row() for homonym lettering.  The keys are also removed from the
    # default part-of-speech set in queryset(), and export() uses the
    # "nested" query for table templates whose first part of speech is a key.
    TRANSLATE_NESTED = {
        'adjcom': 'adj',
        'advcom': 'adv',
        'ger': 'v',
        'pact': 'v',
        'ppas': 'v',
        'appas': 'v',
    }

    def __init__(self, export_data_name=None, data=None, output_file=None):
        """Prepare settings, lookup tables and SQL fragments for an export.

        export_data_name -- name of a SavedExportData record whose
            get_data() result becomes the settings; when given, ``data`` is
            ignored.
        data -- settings dict used when no name is given; a built-in default
            is used when it is None or empty.
        output_file -- file-like object receiving the exported rows;
            defaults to sys.stdout.

        Side effects: opens (and truncates) 'copyright.txt' in the current
        working directory and reads several lookup tables from the database.
        """
        def id_list(ids):
            # Comma-separated list of ids, ready to be put inside "in (...)".
            return ', '.join(str(pk) for pk in ids)

        def attribute_values(attr_name):
            # {value id: value} for the lexeme attribute with the given name.
            return dict(LexemeAttributeValue.objects.filter(
                attribute__name=attr_name).values_list('id', 'value'))

        # Both sets are filled in by export() when homonym numbers are on.
        self.homonym_entries = None
        self.homonyms_with_numbers = None

        if not export_data_name:
            self.data = data or {
                'vocabs': ['SGJP'],
                'antivocabs': ['antyMorfeusz'],
                'variant': 'Morfeusz',
                'excluding_qualifiers': [],
                'magic_qualifiers': [],
                'refl': False,
                'commonness': True,
                'homonym_numbers': True,
                'form_qualifiers': True,
                'copyright': u'',
            }
        else:
            saved = SavedExportData.objects.get(name=export_data_name)
            self.data = saved.get_data()
        settings = self.data

        self.output_file = output_file or sys.stdout
        self.copyright_file = open('copyright.txt', 'w')

        # One "%s" placeholder per vocabulary; the values travel separately
        # in params_part.
        self.vocabs_placeholders = ', '.join('%s' for _ in settings['vocabs'])

        self.antivocabs_clause = ''
        if settings['antivocabs']:
            placeholders = ', '.join('%s' for _ in settings['antivocabs'])
            # Ends with "and" so it can be followed by further conditions.
            self.antivocabs_clause = '''not exists (
          select * from leksemy_w_slownikach ls2 where ls2.l_id = l.id
          and ls2.slownik in (%s)) and''' % placeholders

        self.qualifier_clauses = ''.join(
            map(self.qualifier_clause, settings['excluding_qualifiers']))
        # One identical parametrised fragment per (pattern, qualifier) pair.
        self.magic_qualifier_clauses = ''.join(
            self.magic_qualifier_clause()
            for _pattern, _q_id in settings['magic_qualifiers'])

        # Cross-reference types followed by the "nested" export query.
        self.crtype_ids = CrossReferenceType.objects.filter(
            symbol__in=['comadv', 'comadj', 'gerver', 'pactver', 'ppasver']
        ).values_list('id', flat=True)

        self.cv_ids = ClassificationValue.objects.filter(
            classification__name=u'pospolitość').values_list('id', flat=True)

        # Lookup tables used by export_row() to fill in tag placeholders.
        self.genders = dict(Gender.objects.values_list('id', 'symbol'))
        self.refls = attribute_values(u'zwrotność')
        self.aspects = attribute_values(u'aspekt')
        self.persons = attribute_values(u'osoba')
        self.cases = attribute_values(u'przypadek')
        selective_numbers = LexemeAttributeValue.objects.filter(
            attribute__name=u'liczba selektywna').values_list('id', 'value')
        self.numbers = dict(
            (pk, value.replace('/', '.')) for pk, value in selective_numbers)

        self.select = '''prefix||rdzen||zak||suffix, l.pos, tag_template,
            refl.attribute_value_id, o.gender_id,
            aspect.attribute_value_id, person.attribute_value_id,
            "case".attribute_value_id, number.attribute_value_id, hn.number
        '''

        self.qualifier_select = ''',
        (select string_agg(kwal, '|')
        from kwalifikatory
        where id in ((
                select qualifier_id from kwalifikatory_leksemow
                where lexeme_id = l.id)
            union (
                select qualifier_id from kwalifikatory_odmieniasiow
                where lexemeinflectionpattern_id = o.id)
            union (
                select qualifier_id from kwalifikatory_zakonczen
                where ending_id = z.id)))
        '''
        self.commonness_select = ''',
        (select string_agg(nazwa, '|')
        from wartosci_klasyfikacji
        where id in (
            select classification_value_id from dictionary_lexemecv
            where lexeme_id = l.id and classification_value_id in (%s)
        ))''' % id_list(self.cv_ids)

        # TODO: it would be smarter to include only the attributes that can
        # actually occur for the given part of speech.
        join_conditions = {
            'aspect':
                'aspect.attribute_value_id in (%s)' % id_list(self.aspects),
            'person':
                'person.attribute_value_id in (%s)' % id_list(self.persons),
            'case':
                '"case".attribute_value_id in (%s)' % id_list(self.cases),
            'number':
                'number.attribute_value_id in (%s)' % id_list(self.numbers),
        }
        attribute_joins = '''
        join leksemy_w_slownikach ls on (ls.l_id = l.id)
        left join dictionary_lexemeav aspect
            on (l.id = aspect.lexeme_id and %(aspect)s)
        left join dictionary_lexemeav person
            on (l.id = person.lexeme_id and %(person)s)
        left join dictionary_lexemeav "case"
            on (l.id = "case".lexeme_id and %(case)s)
        left join dictionary_lexemeav number
            on (l.id = number.lexeme_id and %(number)s)
        ''' % join_conditions
        # Only the literal above is %-formatted; EXPORT_FROM_CLAUSES is
        # prepended untouched.
        self.table_joins = EXPORT_FROM_CLAUSES + attribute_joins

        self.table_clause = WHERE_CLAUSES

        # Query parameters shared by all export queries: variant first, then
        # the vocabularies, then the anti-vocabularies.
        shared_params = [settings['variant']]
        shared_params.extend(settings['vocabs'])
        shared_params.extend(settings['antivocabs'])
        self.params_part = shared_params

        labels = ClassificationValue.objects.values_list('id', 'label')
        if UNDERSCORES:
            labels = ((pk, label.replace(' ', '_')) for pk, label in labels)
        self.cv_table = dict(labels)

    def queryset(self, pos_set=None):
        """Return the Lexeme queryset selected by the export settings.

        pos_set -- parts of speech to keep.  When None, the parts of speech
            of the table templates of the configured variant are used, minus
            the keys of TRANSLATE_NESTED.

        Lexemes must belong to one of data['vocabs'], to none of
        data['antivocabs'], must not have a hidden status and must not carry
        any of data['excluding_qualifiers'].
        """
        settings = self.data
        result = Lexeme.objects.filter(
            vocabularies__id__in=settings['vocabs'])

        antivocabs = settings['antivocabs']
        if antivocabs:
            # Reuse the raw SQL clause built in __init__, pointing it at the
            # table name used here instead of the "l" alias.  The clause
            # ends with "and", hence the trailing "true".
            condition = self.antivocabs_clause.replace(' l.', ' leksemy.')
            result = result.extra(
                where=[condition + ' true'], params=antivocabs)

        result = result.exclude(status__in=Lexeme.HIDDEN_STATUSES)

        if pos_set is None:
            template_pos = TableTemplate.objects.filter(
                variant_id=settings['variant']).values_list(
                    'parts_of_speech', flat=True).distinct()
            pos_set = set(template_pos) - set(self.TRANSLATE_NESTED)

        return result.filter(part_of_speech_id__in=pos_set).exclude(
            qualifiers__in=settings['excluding_qualifiers'])

    @staticmethod
    def qualifier_clause(q_id):
        """Return an SQL condition rejecting rows that carry qualifier q_id.

        The fragment consists of three "not exists" subqueries, one each
        against kwalifikatory_leksemow (alias l), kwalifikatory_odmieniasiow
        (alias o) and kwalifikatory_zakonczen (alias z).  It ends with
        "and " so that further conditions can follow it directly.

        q_id is interpolated into the SQL text with %d, so it must be a
        number.
        """
        template = '''not exists (
        select * from kwalifikatory_leksemow where lexeme_id = l.id and
          qualifier_id = %(q)d) and not exists (
        select * from kwalifikatory_odmieniasiow where qualifier_id = %(q)d and
          lexemeinflectionpattern_id = o.id) and not exists (
        select * from kwalifikatory_zakonczen where qualifier_id = %(q)d and
          ending_id = z.id) and '''
        return template % dict(q=q_id)

    @staticmethod
    def magic_qualifier_clause():
        """Return the parametrised SQL fragment for one "magic" qualifier.

        The fragment starts with "and" and contains two "%s" placeholders,
        in this order: a pattern matched against the tag with "like", and
        the id of a lexeme qualifier.  A row is rejected when its tag
        matches the pattern and its lexeme carries that qualifier.
        """
        clause = '''and not (tag like %s and exists (
        select kw.id
        from kwalifikatory kw
          join kwalifikatory_leksemow kwl on kw.id = kwl.qualifier_id
        where kwl.lexeme_id = l.id and kw.id = %s)) '''
        return clause

    def export_row(self, row, acc):
        """Convert one SQL result row into output tuples appended to ``acc``.

        row -- sequence whose first twelve items are: lexeme id, entry,
            form, part of speech, tag template, the reflexivity, gender,
            aspect, person, case and number value ids, and the homonym
            number.  They are followed by the commonness column when
            data['commonness'] is set and then by the qualifiers column
            when data['form_qualifiers'] is set.
        acc -- list receiving tuples of the shape
            (entry, lexeme_id, form, tag[, commonness][, qualifiers]).

        Usually one tuple is appended; a form tagged 'adjc' that is not
        listed in ADJPREDYKATYWNE yields two tuples with fixed tags.
        """
        settings = self.data
        (lexeme_id, entry, form, pos, tag, refl_id, gender_id, aspect_id,
         person_id, case_id, number_id, hn) = row[:12]
        next_column = 12
        main_pos = self.TRANSLATE_NESTED.get(pos, pos)

        # Optional trailing columns, in the order the queries select them.
        if settings['commonness']:
            comm = row[next_column]
            next_column += 1
        quals = None
        if settings['form_qualifiers']:
            quals = row[next_column]
            if UNDERSCORES and quals:
                quals = quals.replace(' ', '_')

        # Disambiguate homonymous entries with a letter for the main part of
        # speech, plus the homonym number when that pair is itself ambiguous.
        if settings['homonym_numbers'] and entry in self.homonym_entries:
            if (entry, main_pos) not in self.homonyms_with_numbers:
                hn = ''
            else:
                assert hn
            letter = HomonymNumber.MORFEUSZ_LETTERS_REVERSE.get(main_pos)
            assert letter
            entry += ':%s%s' % (letter, hn)

        form = form.lstrip('+')  # inflected postfixes
        tags = None
        if tag == 'adja':
            form = form.rstrip('+')
        elif tag == 'adjc' and form not in self.ADJPREDYKATYWNE:
            tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]

        if settings['refl'] and pos in ('v', 'pact', 'ger'):
            if refl_id not in self.refls:
                debug(entry, u'Nieznana zwrotność: %s' % refl_id)
            else:
                tag += ':' + self.REFL_TRANSLATION[self.refls[refl_id]]

        # Placeholders are handled strictly in this order.  Each value is
        # looked up lazily, so a missing id only matters when its
        # placeholder actually occurs in the tag.
        substitutions = (
            ('RODZAJ', lambda: self.genders[gender_id]),
            ('ASPEKT',
             lambda: self.ASPECT_TRANSLATION[self.aspects[aspect_id]]),
            ('OSOBA', lambda: self.persons[person_id]),
            ('PRZYPADEK', lambda: self.cases[case_id]),
            ('LICZBA', lambda: self.numbers[number_id]),
            ('PL',
             lambda: '.p1.p2' if 'pl' in self.numbers[number_id] else ''),
        )
        for placeholder, lookup in substitutions:
            if placeholder in tag:
                tag = tag.replace(placeholder, lookup())

        # The optional columns are identical for every tag of this row.
        tail = ()
        if settings['commonness']:
            tail += (comm or '',)
        if settings['form_qualifiers']:
            tail += (quals or '',)

        for out_tag in tags or [tag]:
            acc.append((entry, lexeme_id, form, out_tag) + tail)

    def export_part(self, cursor, tt, nested, attr_clauses, cell_c, tt_c, acc):
        if not nested:
            params = self.params_part + flatten(self.data['magic_qualifiers'])
            query = """
select distinct l.id, haslo, %(select)s %(clas_field)s %(qual_field)s
    %(table_joins)s
    left join dictionary_lexemeav refl
        on (l.id = refl.lexeme_id and %(refl)s)
    left join dictionary_homonymnumber hn
        on (l.id = hn.lexeme_id and hn.variant_id = %%s)
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    and l.status not in ('cand', 'litt') and not l.usuniety %(magic)s
    and %(attr_clauses)s
    """
        else:
            params = self.params_part
            query = """
select distinct g.id, g.haslo as haslo, %(select)s %(clas_field)s %(qual_field)s
    %(table_joins)s
    join odsylacze on l.id=l_id_od
    join leksemy g on (l_id_do=g.id and g.usuniety = false)
    left join dictionary_lexemeav refl
        on (g.id = refl.lexeme_id and %(refl)s)
    left join dictionary_homonymnumber hn
        on (g.id = hn.lexeme_id and hn.variant_id = %%s)
where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
    and typods_id in (%(crtype_ids)s) and
    l.status not in ('cand', 'litt') and not l.usuniety and %(attr_clauses)s
    """
        query = query % {
            'vocabs': self.vocabs_placeholders,
            'antivocabs': self.antivocabs_clause,
            'x_qual': self.qualifier_clauses,
            'magic': self.magic_qualifier_clauses,
            'crtype_ids': ', '.join(str(id) for id in self.crtype_ids),
            'clas_field':
                self.commonness_select if self.data['commonness'] else '',
            'qual_field':
                self.qualifier_select if self.data['form_qualifiers'] else '',
            'select': self.select,
            'table_joins': self.table_joins,
            'table_clause': self.table_clause,
            'refl':
                'refl.attribute_value_id in (%s)'
                % ', '.join(str(id) for id in self.refls),
            'attr_clauses': ' and '.join(attr_clauses)
        }
        # if tt.name == 'czasowniki':
        #     print >>sys.stderr, query
        #     print >>sys.stderr,
        #         [tt.id] + params + list(tt_c + cell_c + cell_c)
        cursor.execute(
            query, [tt.id] + params + list(tt_c + cell_c + cell_c))
        for row in cursor:
            self.export_row(row, acc)

    def export_skr(self, acc):
        """Append output rows for abbreviation lexemes to ``acc``.

        Iterates over the lexemes whose part of speech is in SKR_POS.  The
        SKR_ATTR value of a lexeme, with spaces turned into underscores, is
        emitted as the entry; lexemes whose value contains any of the
        characters ",./12" are skipped.  The form is the lexeme entry
        without trailing dots, tagged 'brev:pun' when the entry ended with
        a dot and 'brev:npun' otherwise.

        The commonness column, when enabled, is always empty here; the
        qualifiers column holds the lexeme's qualifier labels joined
        with '|'.
        """
        for skr in self.queryset(pos_set=self.SKR_POS):
            entry = skr.entry
            expansion = skr.attribute_value(self.SKR_ATTR).value.replace(
                ' ', '_')
            if any(char in expansion for char in ',./12'):
                continue

            if entry.endswith('.'):
                tag = 'brev:pun'
            else:
                tag = 'brev:npun'
            output_row = (expansion, skr.id, entry.rstrip('.'), tag)
            if self.data['commonness']:
                output_row += ('',)
            if self.data['form_qualifiers']:
                labels = skr.qualifiers.values_list('label', flat=True)
                output_row += ('|'.join(labels),)
            acc.append(output_row)

    def export(self):
        """Run the whole export and write the rows to ``self.output_file``.

        Steps:
        1. When data['homonym_numbers'] is set, compute the entries shared
           by more than one lexeme (homonym_entries) and the
           (entry, part of speech) pairs shared by more than one lexeme
           (homonyms_with_numbers); export_row() uses both.
        2. For every table template of the configured variant, run
           export_part() for each combination of template and cell
           attribute values.
        3. Add the abbreviation rows with export_skr().
        4. Write the copyright text, if any, to ``self.copyright_file``, then
           write every row as tab-separated columns in the order: lexeme
           id, form, entry, tag and the optional columns.

        data['copyright'] is %-formatted with the named values ``year``,
        ``month`` and ``day`` of today's date.  Progress messages and
        timings are written to sys.stderr.
        """
        if self.data['homonym_numbers']:
            self.homonym_entries = set(
                self.queryset().values('entry').annotate(count=Count('pk'))
                    .filter(count__gt=1).values_list('entry', flat=True))
            self.homonyms_with_numbers = set(
                self.queryset().values('entry', 'part_of_speech')
                    .annotate(count=Count('pk')).filter(count__gt=1)
                    .values_list('entry', 'part_of_speech'))
        export_data = []
        cursor = connection.cursor()
        # Close the cursor even when one of the queries fails, so that an
        # aborted export does not leave it open.
        try:
            tts = TableTemplate.objects.filter(
                variant_id=self.data['variant']).prefetch_related(
                    'attributes__values', 'cell_attributes__values',
                    'parts_of_speech')
            for tt in tts:
                uniprint(u'exporting table: %s' % tt.name, file=sys.stderr)
                start = time.clock()
                attr_clauses, cell_attr_combinations, tt_attr_combinations = \
                    attr_clauses_combinations(tt)
                # Materialised because it is iterated once per tt_c below.
                cell_attr_combinations = list(cell_attr_combinations)
                # The first part of speech of the template decides which
                # query variant export_part() uses.
                nested = (tt.parts_of_speech.all()[0].symbol
                          in self.TRANSLATE_NESTED)
                for tt_c in tt_attr_combinations:
                    for cell_c in cell_attr_combinations:
                        self.export_part(
                            cursor, tt, nested, attr_clauses, cell_c, tt_c,
                            export_data)
                        # Time taken by this single query.
                        print >>sys.stderr, time.clock() - start
                        start = time.clock()
        finally:
            cursor.close()
        uniprint(u'exporting abbreviations', file=sys.stderr)
        start = time.clock()
        self.export_skr(export_data)
        print >>sys.stderr, time.clock() - start
        # NOTE: rows are not sorted; they are written in the order in which
        # they were collected.
        uniprint(u'outputting', file=sys.stderr)
        start = time.clock()
        today = date.today()
        copyright = self.data['copyright'] % {
            'year': today.year,
            'month': today.month,
            'day': today.day,
        }
        if copyright:
            uniprint(copyright, file=self.copyright_file)
        for r in export_data:
            # Collected tuples start with (entry, lexeme_id, form); the
            # output puts the lexeme id first and the entry third.
            uniprint(
                u'\t'.join((str(r[1]), r[2], r[0]) + r[3:]),
                file=self.output_file)
        print >>sys.stderr, time.clock() - start