import_data.py 21.1 KB

Edit Raw Blame History

#-*- coding:utf-8 -*-

import sqlite3
import datetime
from django.db import connection, transaction
from django.core.management.base import BaseCommand

from common.util import no_history
from dictionary.models import *

# Source SQLite file used when the command is run without a file name.
DEFAULT_DATABASE = 'data/sgjp.db'

MINI_MODE = False # for debugging: import only a limited subset of lexemes
# Value bound to the LIMIT placeholder of MINI_LEXEME_QUERY.
MINI_LEXEME_COUNT = 5000
# The %s is filled with the column list ('*' or 'nr') by the callers.
MINI_LEXEME_QUERY = "SELECT %s FROM leksemy WHERE pos IN ('v', 'ger', 'pact', 'ppas') ORDER BY haslo LIMIT ?"
#MINI_LEXEME_QUERY = "SELECT %s FROM leksemy l WHERE EXISTS (SELECT * FROM odmieniasie WHERE nr = l.nr AND charfl = 'm1') LIMIT ?"

# When True, import_tables() inserts cells with raw SQL instead of the ORM.
SQL_MODE = True

# batch_size handed to bulk_create().
BATCH_SIZE = 5000

# Symbol of the fallback lexical class for parts of speech that have no
# pattern-derived class.
OTHER = 'inne'
# Id of the vocabulary that all imported qualifiers are attached to.
DEFAULT_VOCAB = 'SGJP'

# Closed lexeme attributes created during the import.
# Each entry maps the attribute name to a 3-tuple:
#   ((parts of speech the attribute applies to,
#     inflection characteristic symbol it is limited to, or None),
#    allowed values,
#    (leksemy column holding the raw value,
#     converter from the raw column value to one of the allowed values))
ATTRS = {
  u'zwrotność': (
    (('v', 'ger', 'pact'), None),
    (u'—', u'się', u'(się)', u'sobie', u'(sobie)', u'się/sobie'),
    # new_lexemes() already treats haslosuf as possibly empty/NULL
    # (row['haslosuf'] or ''), so the converter must not call .strip() on
    # None; a missing suffix is mapped to the dash value like an empty one.
    ('haslosuf', lambda suf: (suf or '').strip(' ?') or u'—'),
  ),
  u'przechodniość': (
    (('v', 'pred'), None),
    ('iT', 'qT', 'T'),
    ('przechodniosc', lambda x: x),
  ),
  u'aspekt': (
    (('v', 'pred', 'ger', 'pact', 'ppas'), None),
    ('dk', 'ndk', 'ndk/dk', 'dk/ndk', 'ndk/(dk)', 'dk/(ndk)'),
    ('aspekt', lambda x: x),
  ),
  u'właściwy': (
    (('v', 'pred'), None),
    ('Q', '(Q)', ''),
    ('właściwy', lambda x: x),
  ),
  u'depr': (
    (('subst', 'skrs'), 'm1'),
    ('n', 'd', 'nd'),
    ('depr', lambda x: x),
  )
}

# Temporary lookup table: inflection characteristic symbol -> symbol of the
# base form label used for it (see new_inflection_characteristics()).
BASIC_FORM_LABELS = {
  '0-': '1',
  '3+': '1',
  'f':  'sg:nom',
  'm1': 'sg:nom',
  'm2': 'sg:nom',
  'm3': 'sg:nom',
  'n1': 'sg:nom',
  'n2': 'sg:nom',
  'p1': 'pl:nom:mo',
  'p2': 'pl:nom',
  'p3': 'pl:nom',
  'pri': 'sg:nom', # or pl; at this point it really cannot be decided.
  'sec': 'sg:nom',
}

# This is probably not the best solution...
# Part of speech symbol -> base form label symbol; in
# new_inflection_characteristics() an entry here overrides whatever was
# derived from BASIC_FORM_LABELS.
BASIC_FORM_LABELS_POS = {
  'v': '5',
  'ger': '11',
  'pact': '3',
  'ppas': '10',
  'appas': '10',
  'pred': '5',
}

class Command(BaseCommand):
  """Management command that deletes the dictionary data and re-imports it."""
  args = '<input db filename>'
  help = 'Imports initial data'

  def handle(self, db_name=DEFAULT_DATABASE, **options):
    """Run the full import from the SQLite file db_name."""
    importer = ImportData(db_name)
    importer.delete_and_import()

def get_cursor(db):
  """Open the SQLite database db and return a cursor on it.

  Rows fetched through the cursor are sqlite3.Row objects, so columns can be
  read both by name and by position.
  """
  source = sqlite3.connect(db)
  source.row_factory = sqlite3.Row
  cursor = source.cursor()
  return cursor

def bulk_create(model, objects):
  """Insert objects through model's default manager, BATCH_SIZE rows at a time."""
  manager = model.objects
  manager.bulk_create(objects, batch_size=BATCH_SIZE)

# Maps a model to the name of the ImportData method that single_import() runs
# for it. Names starting with 'new' produce objects that are handed to
# bulk_create(); names starting with 'import' are simply called.
# NOTE(review): 'import_endings', 'import_lexeme_inflection_patterns' and
# 'import_lexemes' do not name any ImportData method (that logic lives inside
# delete_and_import()), so single_import() cannot serve those models yet.
METHOD_NAMES = {
  # Was 'import_cross_references', which does not exist; the generator that
  # produces CrossReference objects is new_cross_references().
  CrossReference: 'new_cross_references',
  Ending: 'import_endings',
  LexemeInflectionPattern: 'import_lexeme_inflection_patterns',
  Lexeme: 'import_lexemes',
  PatternType: 'import_pattern_types',
  TableTemplate: 'import_tables',
  BaseFormLabel: 'new_base_form_labels',
  CrossReferenceType: 'new_cross_reference_types',
  InflectionCharacteristic: 'new_inflection_characteristics',
  LexemeAssociation: 'new_lexeme_associations',
  LexicalClass: 'new_lexical_classes',
  PartOfSpeech: 'new_parts_of_speech',
  Pattern: 'new_patterns',
  Qualifier: 'new_qualifiers',
  TableHeader: 'new_table_headers',
  Vocabulary: 'new_vocabularies',
}

class ImportData(object):
  def __init__(self, db):
    """Open both database cursors and start the import transaction.

    db -- path of the source SQLite file, passed on to get_cursor().
    """
    # NOTE(review): these three calls look like the legacy Django idiom for
    # taking manual control of the transaction; close() issues the matching
    # commit() and leave_transaction_management() -- confirm against the
    # Django version in use.
    transaction.commit_unless_managed()
    transaction.enter_transaction_management()
    transaction.managed()
    # Cursor on the Django database; import_tables() uses it for raw SQL.
    self.cursor = connection.cursor()
    # Cursor on the source SQLite database.
    self.sqlite_cursor = get_cursor(db)
    # NOTE(review): presumably switches off change-history recording for the
    # duration of the import -- confirm in common.util.
    no_history()

  def close(self):
    """Release the database resources and commit the import.

    Closes the Django cursor, the SQLite cursor and the SQLite connection the
    latter belongs to, then commits and leaves the transaction management
    entered in __init__(). The object must not be used afterwards.
    """
    self.cursor.close()
    # get_cursor() hands out only the cursor, so the connection it opened can
    # only be reached (and closed) through the cursor itself.
    sqlite_connection = self.sqlite_cursor.connection
    self.sqlite_cursor.close()
    sqlite_connection.close()
    transaction.commit()
    transaction.leave_transaction_management()


  def new_lexical_classes(self):
    """Yield the fallback class, then one LexicalClass per distinct wzory.pos."""
    yield LexicalClass(symbol=OTHER)
    rows = self.sqlite_cursor.execute('select distinct pos from wzory')
    for row in rows:
      symbol = row['pos']
      yield LexicalClass(symbol=symbol)

  def cache_lc(self):
    if 'lc' not in self.__dict__:
      self.lc = dict((lc.symbol, lc) for lc in LexicalClass.objects.all())

  def new_parts_of_speech(self):
    """Yield a PartOfSpeech for every row of klasygramatyczne.

    The lexical class is the pattern pos (wzory.pos) that the lexemes of the
    given part of speech inflect with; parts of speech without any such
    pattern fall back to the OTHER class.
    """
    join_query = (
      'select distinct wzory.pos, leksemy.pos from wzory '
      'natural join odmieniasie join leksemy on leksemy.nr = odmieniasie.nr')
    class_of_pos = {}
    for pattern_pos, lexeme_pos in self.sqlite_cursor.execute(join_query):
      # A later row for the same lexeme pos overwrites an earlier one.
      class_of_pos[lexeme_pos] = pattern_pos

    rows = self.sqlite_cursor.execute('SELECT pos FROM klasygramatyczne')
    for row in rows:
      symbol = row['pos']
      class_symbol = class_of_pos.get(symbol, OTHER)
      lexical_class = LexicalClass.objects.get(symbol=class_symbol)
      yield PartOfSpeech(symbol=symbol, lexical_class=lexical_class)

  def cache_pos(self):
    if 'pos' not in self.__dict__:
      self.pos = dict((pos.symbol, pos) for pos in PartOfSpeech.objects.all())

  def cache_lc_pos(self):
    if 'lc_pos' not in self.__dict__:
      self.lc_pos = dict(
        (pos.symbol, pos.lexical_class) for pos in PartOfSpeech.objects.all()
      )

  def new_base_form_labels(self):
    """Yield a BaseFormLabel per distinct efobaz in paradygmaty and zakonczenia."""
    union_query = """
      SELECT efobaz FROM paradygmaty
      UNION
      SELECT efobaz FROM zakonczenia
      """
    for row in self.sqlite_cursor.execute(union_query):
      symbol = row[0]
      yield BaseFormLabel(symbol=symbol)

  def cache_bfl(self):
    if 'bfls' not in self.__dict__:
      self.bfls = dict((bfl.symbol, bfl) for bfl in BaseFormLabel.objects.all())

  def new_inflection_characteristics(self):
    """Yield an InflectionCharacteristic per distinct (charfl, pos) pair.

    The basic form label is chosen, in order of precedence, from
    BASIC_FORM_LABELS_POS (by part of speech), from BASIC_FORM_LABELS (by a
    non-empty charfl), or as '1' for adj/adjcom with an empty charfl; in every
    other case the label with the empty symbol is used.
    """
    rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT charfl, pos FROM paradygmaty')
    for row in rows:
      charfl, pos = row['charfl'], row['pos']
      if pos in BASIC_FORM_LABELS_POS:
        label_symbol = BASIC_FORM_LABELS_POS[pos]
      elif charfl != '':
        label_symbol = BASIC_FORM_LABELS.get(charfl, '')
      elif pos in ('adj', 'adjcom'):
        label_symbol = '1'
      else:
        label_symbol = ''
      label = BaseFormLabel.objects.get(symbol=label_symbol)
      part_of_speech = PartOfSpeech.objects.get(pk=pos)
      yield InflectionCharacteristic(
        symbol=charfl, basic_form_label=label, part_of_speech=part_of_speech)

  def cache_ics(self):
    if 'ics' not in self.__dict__:
      self.ics = dict(
        ((ic.symbol, ic.part_of_speech.symbol), ic)
        for ic in InflectionCharacteristic.objects.all()
      )

  def new_vocabularies(self):
    """Yield a Vocabulary per distinct owner or using vocabulary id."""
    union_query = """
      SELECT slownik FROM leksemy
      UNION
      SELECT slownik_uz FROM slowniki_uzywajace
    """
    for row in self.sqlite_cursor.execute(union_query):
      vocabulary_id = row[0]
      yield Vocabulary(id=vocabulary_id)

  def cache_vocabs(self):
    if 'vocabs' not in self.__dict__:
      self.vocabs = dict((v.id, v) for v in Vocabulary.objects.all())

  def new_qualifiers(self):
    """Yield one Qualifier per distinct label found in the source database.

    Labels come from the '|'-separated okwal, zkwal and lkwal columns; every
    qualifier is attached to the DEFAULT_VOCAB vocabulary.
    """
    owner = Vocabulary.objects.get(id=DEFAULT_VOCAB)
    rows = self.sqlite_cursor.execute("""
      SELECT okwal FROM odmieniasie
      UNION
      SELECT zkwal FROM zakonczenia
      UNION
      SELECT lkwal FROM leksemy
      """)
    seen = set()
    for row in rows:
      packed_labels = row[0]
      if not packed_labels:
        continue
      for label in packed_labels.split('|'):
        if label in seen:
          continue
        seen.add(label)
        yield Qualifier(label=label, vocabulary=owner)

  def cache_qualifiers(self):
    if 'qual' not in self.__dict__:
      self.qual = dict((q.label, q) for q in Qualifier.objects.all())

  def create_attributes(self):
    """Create (or fetch) every attribute described in ATTRS with its values.

    Returns a dict: attribute name -> {allowed value -> LexemeAttributeValue}.
    """
    values_by_attribute = {}
    for attr_name, spec in ATTRS.iteritems():
      (poses, ic), values = spec[0], spec[1]
      attribute = LexemeAttribute.objects.get_or_create(
        name=attr_name, closed=True, required=True, takes_ic=bool(ic))[0]
      for pos in PartOfSpeech.objects.filter(symbol__in=poses):
        attribute.parts_of_speech.add(pos)
        matching_ics = InflectionCharacteristic.objects.filter(
          part_of_speech=pos, symbol=ic)
        for characteristic in matching_ics:
          attribute.inflection_characteristics.add(characteristic)
      value_objects = {}
      for value in values:
        value_objects[value] = LexemeAttributeValue.objects.get_or_create(
          value=value, attribute=attribute)[0]
      values_by_attribute[attr_name] = value_objects
    return values_by_attribute

  def new_lexemes(self):
    """Build, without saving, everything derived from the leksemy table.

    Returns a 5-tuple:
      lexemes            -- Lexeme objects,
      associations       -- LexemeAssociation with the owner vocabulary,
      classifications    -- LexemeCV objects,
      qualifier links    -- (lexeme id, Qualifier) pairs,
      attribute links    -- LexemeAV objects.
    In MINI_MODE only the lexemes selected by MINI_LEXEME_QUERY are read.
    """
    self.cache_qualifiers()
    if MINI_MODE:
      rows = self.sqlite_cursor.execute(
        MINI_LEXEME_QUERY % '*', (MINI_LEXEME_COUNT,))
    else:
      rows = self.sqlite_cursor.execute('SELECT * FROM leksemy')
    attr_values = self.create_attributes()
    now = datetime.datetime.now()
    cv_by_label = {}
    for classification_value in ClassificationValue.objects.all():
      cv_by_label[classification_value.label] = classification_value

    lexemes, associations, classifications = [], [], []
    qualifier_links, attribute_links = [], []
    for row in rows:
      lexeme_id = row['nr']
      vocabulary_id = row['slownik']
      part_of_speech = row['pos']
      # Lexemes owned by the 'zmiotki' vocabulary are candidates only.
      if vocabulary_id == 'zmiotki':
        status = 'cand'
      else:
        status = 'conf'
      classification = cv_by_label[row['pospolitosc']]
      lexeme = Lexeme(
        id=lexeme_id,
        entry=row['haslo'],
        entry_suffix=row['haslosuf'] or '', # historical leftover
        gloss=row['glosa'] or '',
        note=row['nota'] or '',
        pronunciation=row['wymowa'] or '',
        valence=row['łączliwość'] or '',
        part_of_speech_id=part_of_speech,
        source='SGJP',
        status=status,
        comment=row['komentarz'] or '',
        last_modified=now,
        owner_vocabulary_id=vocabulary_id,
      )
      lexemes.append(lexeme)
      associations.append(
        LexemeAssociation(lexeme_id=lexeme_id, vocabulary_id=vocabulary_id))
      classifications.append(
        LexemeCV(lexeme_id=lexeme_id, classification_value=classification))
      if row['lkwal']:
        for label in row['lkwal'].split('|'):
          qualifier_links.append((lexeme_id, self.qual[label]))
      for attr_name, (scope, values, (column, convert)) in ATTRS.iteritems():
        if part_of_speech not in scope[0]:
          continue
        raw_value = row[column]
        attr_value = attr_values[attr_name].get(convert(raw_value))
        if attr_value:
          attribute_links.append(
            LexemeAV(lexeme_id=lexeme_id, attribute_value=attr_value))
        elif raw_value:
          # A non-empty source value outside the closed set is only reported.
          print('unknown value of %s: %s' % (attr_name, raw_value))
    return (lexemes, associations, classifications, qualifier_links,
      attribute_links)

  def new_lexeme_associations(self):
    """Yield a LexemeAssociation per row of slowniki_uzywajace.

    In MINI_MODE only rows of the lexemes selected by MINI_LEXEME_QUERY are
    used.
    """
    self.cache_vocabs()
    query = 'SELECT * FROM slowniki_uzywajace'
    params = []
    if MINI_MODE:
      query = ('SELECT * FROM slowniki_uzywajace WHERE nr in (%s)'
        % (MINI_LEXEME_QUERY % 'nr'))
      params = [MINI_LEXEME_COUNT]
    for row in self.sqlite_cursor.execute(query, params):
      vocabulary = self.vocabs[row['slownik_uz']]
      yield LexemeAssociation(vocabulary=vocabulary, lexeme_id=row['nr'])

  def new_cross_reference_types(self):
    """Yield a CrossReferenceType per distinct (type, source pos, target pos).

    The combinations are taken from the cross-references actually present in
    odsylacze, joined with the lexemes on both ends and with typyodsylaczy.
    """
    rows = self.sqlite_cursor.execute(
      'select distinct l1.pos pos1, l2.pos pos2, t.* '
      'from odsylacze o join leksemy l1 on nrod=l1.nr '
      'join leksemy l2 on nrdo=l2.nr '
      'join typyodsylaczy t on t.typods=o.typods')
    for row in rows:
      source_pos = PartOfSpeech.objects.get(symbol=row['pos1'])
      target_pos = PartOfSpeech.objects.get(symbol=row['pos2'])
      yield CrossReferenceType(
        symbol=row['typods'],
        desc=row['naglowek'],
        index=row['kolejnosc'],
        from_pos=source_pos,
        to_pos=target_pos,
      )

  def new_cross_references(self):
    """Yield a CrossReference per complete row of odsylacze.

    In MINI_MODE both ends of the reference must belong to the lexemes
    selected by MINI_LEXEME_QUERY.
    """
    query = (
      'SELECT o.*, l1.pos pos1, l2.pos pos2 FROM odsylacze o '
      'JOIN leksemy l1 on nrod=l1.nr '
      'JOIN leksemy l2 on nrdo=l2.nr')
    params = []
    if MINI_MODE:
      query += (' WHERE nrod in (%(subq)s) and nrdo in (%(subq)s)'
        % {'subq': MINI_LEXEME_QUERY % 'nr'})
      params = [MINI_LEXEME_COUNT, MINI_LEXEME_COUNT]
    rows = self.sqlite_cursor.execute(query, params)

    type_by_key = {}
    for cr_type in CrossReferenceType.objects.all():
      key = (cr_type.symbol, cr_type.from_pos.symbol, cr_type.to_pos.symbol)
      type_by_key[key] = cr_type

    for row in rows:
      # incomplete cross-references do occur for 'asp'
      if not (row['nrod'] and row['nrdo']):
        continue
      cr_type = type_by_key[(row['typods'], row['pos1'], row['pos2'])]
      yield CrossReference(
        from_lexeme_id=row['nrod'], to_lexeme_id=row['nrdo'], type=cr_type)

  def import_pattern_types(self):
    """Create the PatternType rows found in paradygmaty and in wzory.

    Uses get_or_create, so a type present in both tables is stored once.
    """
    ensure_type = PatternType.objects.get_or_create

    self.cache_lc_pos()
    paradigm_rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT typr, pos FROM paradygmaty')
    for row in paradigm_rows:
      # paradygmaty.pos is a part of speech; map it to its lexical class.
      ensure_type(lexical_class=self.lc_pos[row['pos']], symbol=row['typr'])

    # stopgap needed because of the empty class 'skr'
    self.cache_lc()
    pattern_rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT typr, pos FROM wzory')
    for row in pattern_rows:
      # wzory.pos is looked up directly among the lexical class symbols.
      ensure_type(lexical_class=self.lc[row['pos']], symbol=row['typr'])

  def cache_ptypes(self):
    if 'ptypes' not in self.__dict__:
      self.ptypes = dict(
        ((pt.lexical_class.symbol, pt.symbol), pt)
        for pt in PatternType.objects.all()
      )

  def new_patterns(self):
    """Yield a Pattern with status 'temp' for every row of wzory."""
    self.cache_ptypes()
    rows = self.sqlite_cursor.execute('SELECT * FROM wzory')
    for row in rows:
      pattern_type = self.ptypes[(row['pos'], row['typr'])]
      fields = dict(
        name=row['wzor'],
        type=pattern_type,
        basic_form_ending=row['zakp'],
        example=row['przyklad'] or '',
        comment=row['wkomentarz'] or '',
        status='temp',
      )
      yield Pattern(**fields)

  def cache_patterns(self):
    if 'paterns' not in self.__dict__:
      self.patterns = dict((p.name, p) for p in Pattern.objects.all())

  def new_endings(self):
    """Build, without saving, the endings listed in zakonczenia.

    Returns (endings, ending_quals): a list of Ending objects and a list of
    (pattern, base form label, index, qualifier) tuples describing which
    qualifier goes on which ending. Rows with a NULL zak are skipped.
    """
    self.cache_qualifiers()
    self.cache_patterns()
    self.cache_bfl()
    endings, ending_quals = [], []
    rows = self.sqlite_cursor.execute('SELECT * FROM zakonczenia')
    for row in rows:
      if row['zak'] is None:
        continue
      pattern = self.patterns[row['wzor']]
      label = self.bfls[row['efobaz']]
      index = row['nrskl']
      endings.append(Ending(
        pattern=pattern, base_form_label=label, string=row['zak'],
        index=index))
      if row['zkwal']:
        for qualifier_label in row['zkwal'].split('|'):
          ending_quals.append(
            (pattern, label, index, self.qual[qualifier_label]))
    return endings, ending_quals

  def new_lexeme_inflection_patterns(self):
    """Build, without saving, the inflection patterns listed in odmieniasie.

    Returns (lips, lip_quals): a list of LexemeInflectionPattern objects and
    a list of (lexeme id, index, qualifier) tuples describing which qualifier
    goes on which of them. In MINI_MODE only the lexemes selected by
    MINI_LEXEME_QUERY are used.
    """
    self.cache_ics()
    self.cache_qualifiers()
    self.cache_patterns()
    # Both modes now read the same columns. The full import used to run
    # SELECT * over the join, which dragged every leksemy column (including a
    # second 'nr') through the loop although only leksemy.pos is read.
    query = (
      'SELECT o.*, l.pos FROM odmieniasie o '
      'JOIN leksemy l ON o.nr = l.nr')
    params = ()
    if MINI_MODE:
      query += ' WHERE l.nr IN (%s)' % (MINI_LEXEME_QUERY % 'nr')
      params = (MINI_LEXEME_COUNT,)
    result = self.sqlite_cursor.execute(query, params)
    lips = []
    lip_quals = []
    for row in result:
      lexeme_id = row['nr']
      index = row['oskl']
      lips.append(LexemeInflectionPattern(
        lexeme_id=lexeme_id,
        index=index,
        pattern=self.patterns[row['wzor']],
        inflection_characteristic=self.ics[(row['charfl'], row['pos'])],
        root=row['rdzen'],
      ))
      if row['okwal']:
        for qual in row['okwal'].split('|'):
          lip_quals.append((lexeme_id, index, self.qual[qual]))
    return lips, lip_quals

  def new_variants(self):
    """Yield a Variant per distinct wariant value in paradygmaty."""
    rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT wariant FROM paradygmaty')
    for row in rows:
      variant_id = row['wariant']
      yield Variant(id=variant_id)

  def new_table_templates(self):
    """Yield a TableTemplate per distinct (wariant, pos, typr, charfl)."""
    self.cache_ics()
    self.cache_ptypes()
    self.cache_lc_pos()
    rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT wariant, pos, typr, charfl FROM paradygmaty')
    for row in rows:
      pos = row['pos']
      # Pattern types are keyed by lexical class, not by part of speech.
      class_symbol = self.lc_pos[pos].symbol
      pattern_type = self.ptypes[(class_symbol, row['typr'])]
      characteristic = self.ics[(row['charfl'], pos)]
      yield TableTemplate(
        variant_id=row['wariant'],
        pattern_type=pattern_type,
        inflection_characteristic=characteristic)

  # this stays as it is, because the tables are going to change anyway
  def import_tables(self):
    """Import the cells of all inflection tables from paradygmaty.

    Every source row becomes a cell of the matching table template; rows with
    a non-empty 'row' column additionally get a table-cell record carrying
    their position and span. With SQL_MODE set the records are inserted with
    raw SQL through self.cursor, otherwise through the Cell and TableCell
    models.
    """
    self.cache_bfl()
    # Table templates keyed by (variant id, pattern type symbol,
    # inflection characteristic symbol, part of speech symbol).
    tt_table = dict(
      ((
         tt.variant.id,
         tt.pattern_type.symbol,
         tt.inflection_characteristic.symbol,
         tt.inflection_characteristic.part_of_speech.symbol,
       ), tt) for tt in TableTemplate.objects.all()
    )
    for row in self.sqlite_cursor.execute('SELECT * FROM paradygmaty'):
      # wariant is converted with unicode() before the lookup.
      # NOTE(review): presumably because Variant ids are strings on the
      # Django side -- confirm against the model.
      tt = tt_table[
        (unicode(row['wariant']), row['typr'], row['charfl'], row['pos'])]
      if not SQL_MODE:
        c = Cell(
          table_template=tt,
          base_form_label=BaseFormLabel.objects.get(symbol=row['efobaz']),
          tag=row['morf'],
          prefix=row['pref'],
          suffix=row['suf'],
          index=row['kskl'],
        )
        c.save()
        if row['row']:
          tc = TableCell(
            cell=c,
            row=row['row'],
            col=row['col'],
            rowspan=row['rowspan'],
            colspan=row['colspan'],
          )
          tc.save()
      else:
        efobaz_id = self.bfls[row['efobaz']].id
        self.cursor.execute(
          "INSERT INTO klatki (st_id, efobaz, tag, prefiks, sufiks, kind) "
          "VALUES (%s, %s, %s, %s, %s, %s)", [tt.pk, efobaz_id, row['morf'],
          row['pref'], row['suf'], row['kskl']])
        if row['row']:
          # Fetch the id of the cell inserted just above; this must run right
          # after that INSERT on the same cursor.
          # NOTE(review): currval() on a sequence looks PostgreSQL-specific --
          # confirm the target database.
          self.cursor.execute("select currval('klatki_id_seq')")
          last_id = self.cursor.fetchone()[0]
          self.cursor.execute(
            "INSERT INTO komorki_tabel (k_id, row, col, rowspan, colspan) "
            "VALUES (%s, %s, %s, %s, %s)", [last_id, row['row'],
            row['col'], row['rowspan'], row['colspan']])

  def new_table_headers(self):
    """Yield a TableHeader for every labelled, non-'b' row of naglowkiwierszy.

    Rows whose styl is 'b' or whose nagl is empty are skipped.

    Raises Exception when no table template matches a header row; the message
    now contains the offending row (it used to be passed as a second
    exception argument, leaving the '%s' placeholder unformatted).
    """
    for row in self.sqlite_cursor.execute('SELECT * FROM naglowkiwierszy'):
      if row['styl'] == 'b' or not row['nagl']:
        continue
      tts = TableTemplate.objects.filter(
        variant__id=row['wariant'], pattern_type__symbol=row['typr'],
        inflection_characteristic__symbol=row['charfl'],
        inflection_characteristic__part_of_speech__symbol=row['pos'])
      if not tts:
        # The one-element tuple keeps the dict from being taken as a mapping
        # of named format arguments.
        raise Exception('Brak szablonu dla nagłówka: %s' % (dict(row),))
      tt = tts.get()
      yield TableHeader(
        table_template=tt,
        row=row['row'],
        col=row['col'],
        rowspan=row['rowspan'],
        colspan=row['colspan'],
        label=row['nagl'],
        css_class=row['styl'],
      )

  def delete_and_import(self):
    """Delete the previously imported data and import everything anew.

    The steps run in a fixed order because later ones look up objects that
    earlier ones created. Progress is printed to standard output and the
    work is committed by close() at the very end.
    """
    # Models whose rows are wiped before the import.
    # NOTE(review): the order presumably puts dependent models before the
    # ones they reference -- confirm against the model definitions.
    models = (
      TableCell,
      Cell,
      TableTemplate,
      Variant,
      CrossReference,
      CrossReferenceType,
      LexemeAssociation,
      LexemeInflectionPattern,
      Lexeme,
      Ending,
      Pattern,
      PatternType,
      Qualifier,
      #Vocabulary,
      InflectionCharacteristic,
      BaseFormLabel,
      PartOfSpeech,
      LexicalClass,
    )
    print 'deleting old data...'
    for model in models:
      model.objects.all().delete()

    print 'importing lexical classes...'
    bulk_create(LexicalClass, self.new_lexical_classes())
    print 'importing parts of speech...'
    bulk_create(PartOfSpeech, self.new_parts_of_speech())
    print 'importing base form labels...'
    bulk_create(BaseFormLabel, self.new_base_form_labels())
    print 'importing inflection characteristics...'
    bulk_create(InflectionCharacteristic,
      self.new_inflection_characteristics())
    print 'importing vocabularies...'
    # Vocabularies are not deleted above, so they are saved one by one
    # instead of being bulk-created.
    for v in self.new_vocabularies():
      v.save()
    print 'importing qualifiers...'
    bulk_create(Qualifier, self.new_qualifiers())
    print 'importing pattern types...'
    self.import_pattern_types()
    print 'importing patterns...'
    bulk_create(Pattern, self.new_patterns())
    print 'importing endings...'
    endings, ending_quals = self.new_endings()
    bulk_create(Ending, endings)
    # Qualifiers are attached afterwards, re-fetching each saved ending.
    for pattern, bfl, index, q in ending_quals:
      Ending.objects.get(
        pattern=pattern, base_form_label=bfl, index=index).qualifiers.add(q)
    # NOTE(review): the two nested functions below presumably exist so that
    # their large intermediate lists go out of scope as soon as each step is
    # done; only lexeme_attrs is kept for the aspect-copying step.
    def import_lexemes():
      print 'importing lexemes...'
      (lexemes, lexeme_assoc, lexeme_cvs, lexeme_quals,
       lexeme_attrs) = self.new_lexemes()
      print 'creating...'
      bulk_create(Lexeme, lexemes)
      print 'associations...'
      bulk_create(LexemeAssociation, lexeme_assoc)
      print 'classifications...'
      bulk_create(LexemeCV, lexeme_cvs)
      print 'qualifiers...'
      for lexeme_id, q in lexeme_quals:
        q.lexeme_set.add(lexeme_id) #add
      print 'attributes...'
      bulk_create(LexemeAV, lexeme_attrs)
      return lexeme_attrs
    lexeme_attrs = import_lexemes()
    def import_lips():
      print 'importing lexeme inflection patterns...'
      lips, lip_quals = self.new_lexeme_inflection_patterns()
      print 'creating...'
      bulk_create(LexemeInflectionPattern, lips)
      print 'qualifiers...'
      for lexeme_id, index, q in lip_quals:
        LexemeInflectionPattern.objects.get(
          lexeme_id=lexeme_id, index=index).qualifiers.add(q)
    import_lips()
    print 'importing lexeme associations...'
    bulk_create(LexemeAssociation, self.new_lexeme_associations())
    print 'importing cross-reference types...'
    bulk_create(CrossReferenceType,
      self.new_cross_reference_types())
    print 'importing cross-references...'
    bulk_create(CrossReference, self.new_cross_references())
    print 'copying aspect values to derived lexemes...'
    # Every lexeme that has an 'aspekt' value passes it on to the lexemes it
    # points to through 'verger', 'verppas' and 'verpact' cross-references.
    aspect_vals = LexemeAttributeValue.objects.filter(attribute__name='aspekt')
    for lexeme_av in lexeme_attrs:
      if lexeme_av.attribute_value in aspect_vals:
        crs = CrossReference.objects.filter(
          from_lexeme__id=lexeme_av.lexeme_id,
          type__symbol__in=('verger', 'verppas', 'verpact'))
        for cr in crs:
          lexeme_av.attribute_value.add_lexeme(cr.to_lexeme)
    print 'importing variants...'
    bulk_create(Variant, self.new_variants())
    print 'importing table templates...'
    bulk_create(TableTemplate, self.new_table_templates())
    print 'importing tables...'
    self.import_tables()
    print 'importing table headers...'
    bulk_create(TableHeader, self.new_table_headers())
    print 'committing to database...'
    self.close()

  def single_import(self, model):
    """Run only the import step registered for model, then close().

    The step is looked up in METHOD_NAMES: the result of a 'new...' method is
    passed to bulk_create(), an 'import...' method is simply called.
    """
    step_name = METHOD_NAMES[model]
    if step_name.startswith('new'):
      produce = getattr(self, step_name)
      bulk_create(model, produce())
    elif step_name.startswith('import'):
      run = getattr(self, step_name)
      run()
    self.close()