# -*- coding: utf-8 -*-

import sys
from django.core.management.base import BaseCommand, CommandError
from django.db import connection
from common.util import debug, flatten
from dictionary.models import CrossReferenceType, ClassificationValue

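# Predicative (short) adjective forms that keep the adjc tag; any other
# adjc form is retagged as an ordinary adjective further down.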
ADJPREDYKATYWNE = [
  u'ciekaw',
  u'godzien',
  u'gotów',
  u'łaskaw',
  u'świadom',
  u'winien',
  u'zdrów',
# doubtful:
  u'dłużen',
  u'miłościw',
  u'praw',
  u'wesół',
  u'żyw',
]

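# Maps the reflexive-particle part of the headword ("się"/"sobie" variants)
# to the reflexivity marker appended to the tag when data['refl'] is set.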
SUFFIX_TRANSLATION = {
  u'': 'nonrefl',
  u' się': 'refl',
  u' sobie': 'refl',
  u' się/sobie': 'refl',
  u' się?': 'refl',
  u' (się)': 'refl.nonrefl',
  u' (sobie)': 'refl.nonrefl',
  u' (się)?': 'refl.nonrefl',
}

def qualifier_clause(q_id):
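  """Builds a SQL fragment excluding rows whose lexeme (l), inflection
  entry (o) or ending (z) carries the given qualifier. q_id is interpolated
  directly as an integer, so it never goes through the query parameters."""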
  return '''not exists (
    select * from kwalifikatory_leksemow where lexeme_id = l.id and
      qualifier_id = %(q)d) and not exists (
    select * from kwalifikatory_odmieniasiow where qualifier_id = %(q)d and
      lexemeinflectionpattern_id = o.id) and not exists (
    select * from kwalifikatory_zakonczen where qualifier_id = %(q)d and
      ending_id = z.id) and ''' % {'q': q_id}

def magic_qualifier_clause():
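  """Builds a SQL fragment with two %s placeholders (a tag pattern and a
  qualifier id); their values are supplied later as query parameters."""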
  return '''and not (tag like %s and exists (
    select kw.id
    from kwalifikatory kw
      join kwalifikatory_leksemow kwl on kw.id = kwl.qualifier_id
    where kwl.lexeme_id = l.id and kw.id = %s)) '''


def export_lexemes(data=None, output_file=None):
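  """Writes lexemes matching the criteria in `data` to `output_file` as
  tab-separated lines: form, entry, tag and (optionally) commonness.

  `data` keys: vocabs, antivocabs, variant, excluding_qualifiers,
  magic_qualifiers, refl, commonness (see the defaults below)."""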
  if not data:
    data = {
      'vocabs': ['PoliMorf'],
      'antivocabs': [],
      'variant': 'Morfeusz',
      'excluding_qualifiers': [],
      'magic_qualifiers': [],
      'refl': False,
      'commonness': False,
    }
  if output_file is None:
    output_file = sys.stdout
  vocabs_placeholders = ', '.join('%s' for v in data['vocabs'])
  if data['antivocabs']:
    antivocabs_placeholders = ', '.join('%s' for v in data['antivocabs'])
    antivocabs_clause = '''not exists (
      select * from leksemy_w_slownikach ls2 where ls2.l_id = l.id
      and ls2.slownik in (%s)) and''' % antivocabs_placeholders
  else:
    antivocabs_clause = ''
  qualifier_clauses = ''.join(
    qualifier_clause(q_id) for q_id in data['excluding_qualifiers'])
  magic_qualifier_clauses = ''.join(
    magic_qualifier_clause() for pattern, q_id in data['magic_qualifiers'])
  crtypes = ['comadv', 'comadj', 'gerver', 'pactver', 'ppasver']
  crtype_ids = CrossReferenceType.objects.filter(
    symbol__in=crtypes).values_list('pk', flat=True)
  cv_ids = ClassificationValue.objects.filter(
    classification__name=u'pospolitość').values_list('pk', flat=True)
  cursor = connection.cursor()
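  # Two SELECTs combined with UNION ALL: ordinary lexemes listed under their
  # own headword, and derived lexemes (adjcom, advcom, ger, pact, ppas) whose
  # headword is taken from the cross-referenced lexeme (g).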
  query = """
    select haslo, prefiks||rdzen||zak||sufiks, l.pos, ch.charfl, tag,
      l.id as leksem_id, haslosuf %(clas_field)s
    from leksemy l
      join leksemy_w_slownikach ls on (ls.l_id = l.id)
      join odmieniasie o on (o.l_id = l.id)
      join charfle ch on ch.id = o.charfl
      join wzory w on (o.w_id = w.id)
      join szablony_tabel s on (
        w.typ=s.wtyp
        and o.charfl=s.charfl)
      join klatki k on k.st_id = s.id
      join zakonczenia z on (o.w_id=z.w_id and k.efobaz=z.efobaz)
      join efobazy e on (e.id = k.efobaz)
      %(clas_join)s
    where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s
      l.status<>'cand' and wariant=%%s and
      l.pos in ('v','subst','osc','adj','adv', 'num','advndm','burk','comp',
                'conj','interj','prep','qub','ppron','pred') and
      l.usuniety = false %(magic)s
      --and haslo < 'b'
    union all
    -- require nesting when assigning headwords: adjcom, advcom, derivatives:
    select g.haslo as haslo, prefiks||rdzen||zak||sufiks, l.pos, ch.charfl, tag,
      l.id as leksem_id, g.haslosuf %(clas_field)s -- l.haslosuf?
    from leksemy l
      join leksemy_w_slownikach ls on (ls.l_id = l.id)
      join odsylacze on l.id=l_id_od
      join leksemy g on (l_id_do=g.id and g.usuniety = false)
      join odmieniasie o on l.id=o.l_id
      join charfle ch on ch.id = o.charfl
      join wzory w on (o.w_id = w.id)
      join szablony_tabel s on (
        w.typ=s.wtyp
        and o.charfl=s.charfl)
      join klatki k on k.st_id = s.id
      join zakonczenia z on (o.w_id=z.w_id and k.efobaz=z.efobaz)
      %(clas_join)s
    where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s
      typods_id in (%(crtype_ids)s) and
      l.pos in ('adjcom','advcom','ger','pact','ppas') and
      wariant=%%s and l.status<>'cand' and l.usuniety = false
      --and g.haslo < 'b'
    order by haslo, leksem_id
    """ % {
      'vocabs': vocabs_placeholders,
      'antivocabs': antivocabs_clause,
      'x_qual': qualifier_clauses,
      'magic': magic_qualifier_clauses,
      'crtype_ids': ', '.join(str(pk) for pk in crtype_ids), # ugly, but good enough here
      'clas_field': ', classificationvalue_id' if data['commonness'] else '',
      'clas_join':
        ('left outer join wartosci_klasyfikacji_lexemes wkl '
         'on (wkl.lexeme_id=l.id and wkl.classificationvalue_id in (%s))'
         % ', '.join(str(pk) for pk in cv_ids)) if data['commonness'] else '',
    }
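  # Positional parameters must follow the placeholder order in the query:
  # vocabs, antivocabs and the variant for each SELECT, plus a
  # (pattern, qualifier id) pair per magic qualifier in the first one.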
  params_part = (list(data['vocabs']) + list(data['antivocabs']) +
                 [data['variant']])
  params = params_part + flatten(data['magic_qualifiers']) + params_part
  cursor.execute(query, params)
  refl = data['refl']
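  # Resolve classification value ids to labels once, for the optional
  # commonness column in the output.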
  cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
  for row in cursor:
    if data['commonness']:
      entry, form, pos, _ic, tag, _id, suffix, cv_id = row
    else:
      entry, form, pos, _ic, tag, _id, suffix = row
    form = form.lstrip('+') # inflecting postfixes
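    # adja forms carry a trailing '+' marker; adjc forms outside the
    # predicative list get an ordinary adjective tag instead.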
    if tag == 'adja':
      form = form.rstrip('+')
    if tag == 'adjc':
      if form not in ADJPREDYKATYWNE:
        tag = "adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos"
    if refl and pos in ('v', 'pact', 'ger'):
      if suffix in SUFFIX_TRANSLATION:
        tag += ':' + SUFFIX_TRANSLATION[suffix]
      else:
        debug(entry, u'Unknown reflexivity: %s' % suffix)
    if data['commonness']:
      cv = cv_table[cv_id] if cv_id else ''
      output_file.write((u'%s\t%s\t%s\t%s\n' %
                         (form, entry, tag, cv)).encode('utf-8'))
    else:
      output_file.write((u'%s\t%s\t%s\n' %
                         (form, entry, tag)).encode('utf-8'))