import_data.py 18 KB

Edit Raw Blame History

#-*- coding:utf-8 -*-

import sqlite3
import datetime
from django.db import connection, transaction
from django.core.management.base import BaseCommand
from django.contrib.auth.models import User

from common.util import no_history
from dictionary.models import *

# SQLite source database used when no file name is given explicitly.
DEFAULT_DATABASE = 'data/sgjp.db'

MINI_MODE = False # for debugging
# In MINI_MODE the lexeme queries are limited to MINI_LEXEME_COUNT rows;
# %s receives the column list and ? the row limit.
MINI_LEXEME_COUNT = 500
MINI_LEXEME_QUERY = 'SELECT %s FROM leksemy LIMIT ?'

# NOTE: setting SQL_MODE = False is currently incomplete
# (True: rows are written with raw INSERTs; False: through the ORM models)
SQL_MODE = True

# symbol of the fallback lexical class for parts of speech without patterns
OTHER = 'inne'
# id of the vocabulary that imported qualifiers are attached to
DEFAULT_VOCAB = 'SGJP'

# temporary table
# maps an inflection characteristic (charfl) to its base form label entry
BASIC_FORM_LABELS = {
  '0-': '1',
  '3+': '1',
  'f':  'sg:nom',
  'm1': 'sg:nom',
  'm2': 'sg:nom',
  'm3': 'sg:nom',
  'n1': 'sg:nom',
  'n2': 'sg:nom',
  'p1': 'pl:nom:mo',
  'p2': 'pl:nom',
  'p3': 'pl:nom',
  'pri': 'sg:nom', # or pl; at this point it really cannot be decided.
  'sec': 'sg:nom',
}

# this is probably not the best solution...
# base form label entry by part of speech; takes precedence over the
# charfl-based choice in new_inflection_characteristics
BASIC_FORM_LABELS_POS = {
  'v': '5',
  'ger': '11',
  'pact': '3',
  'ppas': '10',
  'appas': '10',
  'pred': '5',
}

class Command(BaseCommand):
  """Management command that replaces the dictionary data with a fresh import."""

  args = '<input db filename>'
  help = 'Imports initial data'

  def handle(self, db_name=DEFAULT_DATABASE, **options):
    """Run the full delete-and-import cycle against the SQLite file `db_name`."""
    importer = ImportData(db_name)
    importer.delete_and_import()

def get_cursor(db):
  """Open the SQLite database `db` and return a cursor on it.

  Rows fetched through the cursor are sqlite3.Row objects, so columns can be
  read both by index and by name.
  """
  sqlite_conn = sqlite3.connect(db)
  sqlite_conn.row_factory = sqlite3.Row
  cursor = sqlite_conn.cursor()
  return cursor

# Maps each model to the name of the ImportData method that imports it.
# ImportData.single_import dispatches on the prefix: the result of a 'new_*'
# method is handed to bulk_create, an 'import_*' method is simply called.
METHOD_NAMES = {
  CrossReference: 'import_cross_references',
  Ending: 'import_endings',
  LexemeInflectionPattern: 'import_lexeme_inflection_patterns',
  Lexeme: 'import_lexemes',
  PatternType: 'import_pattern_types',
  TableTemplate: 'import_tables',
  BaseFormLabel: 'new_base_form_labels',
  CrossReferenceType: 'new_cross_reference_types',
  InflectionCharacteristic: 'new_inflection_characteristics',
  LexemeAssociation: 'new_lexeme_associations',
  LexicalClass: 'new_lexical_classes',
  PartOfSpeech: 'new_parts_of_speech',
  Pattern: 'new_patterns',
  Qualifier: 'new_qualifiers',
  TableHeader: 'new_table_headers',
  Vocabulary: 'new_vocabularies',
}

class ImportData(object):
  def __init__(self, db):
    """Open the database handles used by the import.

    db -- file name of the SQLite source database, passed to get_cursor().
    """
    # cursor on Django's database connection; target of the raw INSERTs
    self.cursor = connection.cursor()
    # cursor on the SQLite source; its rows can be read by column name
    self.sqlite_cursor = get_cursor(db)
    # NOTE(review): presumably switches off change-history tracking while
    # importing -- defined in common.util, confirm there.
    no_history()

  def close(self):
    """Release every database handle opened in __init__.

    Closes the Django cursor, the SQLite cursor and also the SQLite
    connection created by get_cursor(), which is reachable only through the
    cursor and would otherwise stay open. The SQLite side is released even
    if closing the Django cursor fails.
    """
    try:
      self.cursor.close()
    finally:
      # grab the connection first: it is only reachable via the cursor
      sqlite_connection = self.sqlite_cursor.connection
      self.sqlite_cursor.close()
      sqlite_connection.close()

  def new_lexical_classes(self):
    """Generate unsaved LexicalClass objects.

    The fallback class OTHER comes first, followed by one class for every
    distinct `pos` value in the `wzory` table.
    """
    yield LexicalClass(symbol=OTHER)
    pos_rows = self.sqlite_cursor.execute('select distinct pos from wzory')
    for pos_row in pos_rows:
      class_symbol = pos_row['pos']
      yield LexicalClass(symbol=class_symbol)

  def new_parts_of_speech(self):
    """Generate unsaved PartOfSpeech objects, one per row of `klasygramatyczne`.

    A part of speech gets the lexical class of the patterns its lexemes
    inflect by; parts of speech that never occur with a pattern fall back to
    the OTHER class.
    """
    pos_pairs = self.sqlite_cursor.execute(
        'select distinct wzory.pos, leksemy.pos from wzory '
        'natural join odmieniasie join leksemy on leksemy.nr = odmieniasie.nr')
    # both columns are called `pos`, so they are addressed by position:
    # column 0 is the pattern's pos, column 1 the lexeme's pos
    class_symbol_by_pos = dict((pair[1], pair[0]) for pair in pos_pairs)

    pos_rows = self.sqlite_cursor.execute('SELECT pos FROM klasygramatyczne')
    for pos_row in pos_rows:
      pos_symbol = pos_row['pos']
      class_symbol = class_symbol_by_pos.get(pos_symbol, OTHER)
      lexical_class = LexicalClass.objects.get(symbol=class_symbol)
      yield PartOfSpeech(symbol=pos_symbol, lexical_class=lexical_class)

  def new_base_form_labels(self):
    """Generate an unsaved BaseFormLabel for every distinct `efobaz` value.

    The values are collected from both `paradygmaty` and `zakonczenia`; the
    UNION removes duplicates.
    """
    label_rows = self.sqlite_cursor.execute("""
      SELECT efobaz FROM paradygmaty
      UNION
      SELECT efobaz FROM zakonczenia
      """)
    for label_row in label_rows:
      label_entry = label_row[0]
      yield BaseFormLabel(entry=label_entry)

  def new_inflection_characteristics(self):
    """Generate unsaved InflectionCharacteristic objects.

    One object is produced per distinct (charfl, pos) pair in `paradygmaty`.
    The basic form label is chosen as follows:
      * a pos listed in BASIC_FORM_LABELS_POS always wins;
      * otherwise an empty charfl gives '1' for adj/adjcom and '' for the rest;
      * otherwise BASIC_FORM_LABELS is consulted, defaulting to ''.
    """
    ic_rows = self.sqlite_cursor.execute(
            'SELECT DISTINCT charfl, pos FROM paradygmaty')
    for ic_row in ic_rows:
      charfl = ic_row['charfl']
      pos = ic_row['pos']
      if pos in BASIC_FORM_LABELS_POS:
        label_entry = BASIC_FORM_LABELS_POS[pos]
      elif charfl == '':
        if pos in ('adj', 'adjcom'):
          label_entry = '1'
        else:
          label_entry = ''
      else:
        label_entry = BASIC_FORM_LABELS.get(charfl, '')
      label = BaseFormLabel.objects.get(entry=label_entry)
      part_of_speech = PartOfSpeech.objects.get(pk=pos)
      yield InflectionCharacteristic(
        entry=charfl, basic_form_label=label, part_of_speech=part_of_speech)

  def cache_ics(self):
    """Cache all inflection characteristics in self.ics.

    The cache maps (characteristic entry, part-of-speech symbol) to the
    InflectionCharacteristic object. The key uses the characteristic's own
    `entry`, because that is the value (`charfl`) the importer looks it up
    with; keying by the basic form label entry would never match a `charfl`
    and would collapse characteristics that share a basic form label.
    """
    self.ics = dict(
      ((ic.entry, ic.part_of_speech.symbol), ic)
      for ic in InflectionCharacteristic.objects.all()
    )

  def new_vocabularies(self):
    """Generate an unsaved Vocabulary for every vocabulary id in the source.

    Ids come from the owning vocabulary of each lexeme (`leksemy.slownik`)
    and from `slowniki_uzywajace.slownik_uz`; the UNION removes duplicates.
    """
    vocabulary_rows = self.sqlite_cursor.execute("""
      SELECT slownik FROM leksemy
      UNION
      SELECT slownik_uz FROM slowniki_uzywajace
    """)
    for vocabulary_row in vocabulary_rows:
      vocabulary_id = vocabulary_row[0]
      yield Vocabulary(id = vocabulary_id)

  def new_qualifiers(self):
    """Generate unsaved Qualifier objects for every distinct qualifier label.

    Labels are stored '|'-separated in `odmieniasie.okwal`,
    `zakonczenia.zkwal` and `leksemy.lkwal`. Each label is yielded once and
    attached to the DEFAULT_VOCAB vocabulary.
    """
    default_vocabulary = Vocabulary.objects.get(id=DEFAULT_VOCAB)
    packed_rows = self.sqlite_cursor.execute("""
      SELECT okwal FROM odmieniasie
      UNION
      SELECT zkwal FROM zakonczenia
      UNION
      SELECT lkwal FROM leksemy
      """)
    seen_labels = set()
    for packed_row in packed_rows:
      packed_labels = packed_row[0]
      if not packed_labels:
        # NULL or empty column: nothing to split
        continue
      for label in packed_labels.split('|'):
        if label in seen_labels:
          continue
        seen_labels.add(label)
        yield Qualifier(label=label, vocabulary=default_vocabulary)

  def import_lexemes(self):
    """Copy the lexemes from the SQLite `leksemy` table with raw INSERTs.

    For every source row this writes the lexeme itself, its membership in
    the owning vocabulary, its classification value (looked up by the
    `pospolitosc` label) and one row per '|'-separated qualifier in `lkwal`.
    In MINI_MODE only MINI_LEXEME_COUNT lexemes are read.
    """
    if MINI_MODE:
      lexeme_rows = self.sqlite_cursor.execute(
        MINI_LEXEME_QUERY % '*',(MINI_LEXEME_COUNT,))
    else:
      lexeme_rows = self.sqlite_cursor.execute('SELECT * FROM leksemy')
    imported_at = datetime.datetime.now()
    cv_pk_by_label = dict(
      ClassificationValue.objects.values_list('label', 'pk'))

    insert_lexeme = (
      "INSERT INTO leksemy (id, haslo, haslosuf, glosa, nota, wymowa, hom, "
      "pos, zrodlo, status, komentarz, data_modyfikacji, slownik, usuniety) "
      "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    insert_membership = (
      "INSERT INTO leksemy_w_slownikach (l_id, slownik) "
      "VALUES (%s, %s)")
    insert_classification = (
      "INSERT INTO wartosci_klasyfikacji_lexemes (classificationvalue_id, "
      "lexeme_id) VALUES (%s, %s)")
    insert_qualifier = (
      "INSERT INTO kwalifikatory_leksemow (lexeme_id, "
      "qualifier_id) VALUES (%s, %s)")

    for lexeme in lexeme_rows:
      lexeme_id = lexeme['nr']
      vocabulary_id = lexeme['slownik']
      # lexemes of the 'zmiotki' vocabulary are candidates, all others confirmed
      if vocabulary_id == 'zmiotki':
        status = 'cand'
      else:
        status = 'conf'
      cv_pk = cv_pk_by_label[lexeme['pospolitosc']]
      self.cursor.execute(insert_lexeme, [
        lexeme_id, lexeme['haslo'], lexeme['haslosuf'] or '',
        lexeme['glosa'] or '', lexeme['nota'] or '', lexeme['wymowa'] or '',
        1, lexeme['pos'], 'SGJP', status, lexeme['komentarz'], imported_at,
        vocabulary_id, False])
      self.cursor.execute(insert_membership, [lexeme_id, vocabulary_id])
      self.cursor.execute(insert_classification, [cv_pk, lexeme_id])
      packed_qualifiers = lexeme['lkwal']
      if packed_qualifiers:
        for label in packed_qualifiers.split('|'):
          qualifier_pk = Qualifier.objects.get(label=label).pk
          self.cursor.execute(insert_qualifier, [lexeme_id, qualifier_pk])

  def new_lexeme_associations(self):
    """Generate unsaved LexemeAssociation objects from `slowniki_uzywajace`.

    Each source row links a lexeme (`nr`) with a vocabulary that uses it
    (`slownik_uz`). In MINI_MODE only rows of the MINI_LEXEME_COUNT imported
    lexemes are read.

    The lexeme is referenced by primary key through the `lexeme_id`
    attribute, so no Lexeme object has to be loaded. (`lexeme__id` is query
    lookup syntax and is rejected by the model constructor.)
    """
    if MINI_MODE:
      result = self.sqlite_cursor.execute(
        'SELECT * FROM slowniki_uzywajace WHERE nr in (%s)'
        % (MINI_LEXEME_QUERY % 'nr'), [MINI_LEXEME_COUNT])
    else:
      result = self.sqlite_cursor.execute('SELECT * FROM slowniki_uzywajace')
    vocab_table = dict(
      (v.id, v) for v in Vocabulary.objects.all()
    )
    for row in result:
      yield LexemeAssociation(
        vocabulary=vocab_table[row['slownik_uz']], lexeme_id=row['nr'])

  def new_cross_reference_types(self):
    """Generate unsaved CrossReferenceType objects.

    One type is produced for every distinct combination of cross-reference
    type and the parts of speech of the source and target lexemes that
    actually occurs in `odsylacze`.
    """
    type_rows = self.sqlite_cursor.execute(
      'select distinct l1.pos pos1, l2.pos pos2, t.* '
      'from odsylacze o join leksemy l1 on nrod=l1.nr '
      'join leksemy l2 on nrdo=l2.nr '
      'join typyodsylaczy t on t.typods=o.typods')
    for type_row in type_rows:
      type_symbol = type_row['typods']
      description = type_row['naglowek']
      position = type_row['kolejnosc']
      source_pos = PartOfSpeech.objects.get(symbol=type_row['pos1'])
      target_pos = PartOfSpeech.objects.get(symbol=type_row['pos2'])
      yield CrossReferenceType(
        symbol=type_symbol,
        desc=description,
        index=position,
        from_pos=source_pos,
        to_pos=target_pos,
      )

  def import_cross_references(self):
    """Import the cross-references from `odsylacze` into CrossReference.

    Like the other `import_*` methods this performs the insert itself, so
    callers simply invoke it. (A generator here would be created and thrown
    away by delete_and_import/single_import without importing anything.)

    The cross-reference type is selected by the type symbol together with
    the part-of-speech symbols of both lexemes. Lexemes are referenced by
    primary key via `from_lexeme_id` / `to_lexeme_id`. In MINI_MODE only
    references between the MINI_LEXEME_COUNT imported lexemes are read.
    """
    if MINI_MODE:
      result = self.sqlite_cursor.execute(
        'SELECT o.*, l1.pos pos1, l2.pos pos2 FROM odsylacze o '
        'JOIN leksemy l1 on nrod=l1.nr '
        'JOIN leksemy l2 on nrdo=l2.nr '
        'WHERE nrod in (%(subq)s) and nrdo in (%(subq)s)'
        % {'subq': MINI_LEXEME_QUERY % 'nr'},
        [MINI_LEXEME_COUNT, MINI_LEXEME_COUNT])
    else:
      result = self.sqlite_cursor.execute(
        'SELECT o.*, l1.pos pos1, l2.pos pos2 FROM odsylacze o '
        'JOIN leksemy l1 on nrod=l1.nr '
        'JOIN leksemy l2 on nrdo=l2.nr'
      )
    # keyed by symbols (plain strings), because the source rows carry the
    # part-of-speech symbols, not PartOfSpeech objects
    cr_type_table = dict(
      ((crt.symbol, crt.from_pos.symbol, crt.to_pos.symbol), crt)
      for crt in CrossReferenceType.objects.all()
    )
    cross_references = []
    for row in result:
      # incomplete cross-references do occur for 'asp'
      if row['nrod'] and row['nrdo']:
        cr_type = cr_type_table[(row['typods'], row['pos1'], row['pos2'])]
        cross_references.append(CrossReference(
            from_lexeme_id=row['nrod'], to_lexeme_id=row['nrdo'],
            type=cr_type))
    CrossReference.objects.bulk_create(cross_references)

  def import_pattern_types(self):
    """Create the PatternType rows (skipping ones that already exist).

    Pattern types are taken first from `paradygmaty`, where `pos` is a part
    of speech and is translated to its lexical class, and then from `wzory`,
    where `pos` already is a lexical class symbol.
    """
    class_by_pos = dict(
      (part.symbol, part.lexical_class)
      for part in PartOfSpeech.objects.all()
    )
    paradigm_rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT typr, pos FROM paradygmaty')
    for paradigm_row in paradigm_rows:
      PatternType.objects.get_or_create(
        lexical_class=class_by_pos[paradigm_row['pos']],
        entry=paradigm_row['typr'])

    # stopgap because of the empty class 'skr'
    class_by_symbol = dict(
      (lexical_class.symbol, lexical_class)
      for lexical_class in LexicalClass.objects.all()
    )
    pattern_rows = self.sqlite_cursor.execute(
      'SELECT DISTINCT typr, pos FROM wzory')
    for pattern_row in pattern_rows:
      PatternType.objects.get_or_create(
        lexical_class=class_by_symbol[pattern_row['pos']],
        entry=pattern_row['typr'])

  def new_patterns(self):
    """Generate an unsaved Pattern for every row of the `wzory` table.

    The pattern type is looked up by (lexical class symbol, type entry).
    Missing examples and comments become empty strings and every pattern
    gets the status 'temp'.
    """
    type_by_key = dict(
      ((pattern_type.lexical_class.symbol, pattern_type.entry), pattern_type)
      for pattern_type in PatternType.objects.all()
    )
    pattern_rows = self.sqlite_cursor.execute('SELECT * FROM wzory')
    for pattern_row in pattern_rows:
      type_key = (pattern_row['pos'], pattern_row['typr'])
      yield Pattern(
        name=pattern_row['wzor'],
        type=type_by_key[type_key],
        basic_form_ending=pattern_row['zakp'],
        example=pattern_row['przyklad'] or '',
        comment=pattern_row['wkomentarz'] or '',
        status='temp',
      )

  def import_endings(self):
    """Import the pattern endings from the SQLite `zakonczenia` table.

    Rows whose ending (`zak`) is NULL are skipped. With SQL_MODE the rows are
    written with raw INSERTs and the id of the new ending is read back from
    the `zakonczenia_id_seq` sequence; otherwise they are saved through the
    Ending model. In both modes the '|'-separated qualifiers in `zkwal` are
    attached only when the column is non-empty.
    """
    if SQL_MODE:
      pattern_pk_table = dict(Pattern.objects.values_list('name', 'pk'))
      bfl_table = dict(BaseFormLabel.objects.values_list('entry', 'pk'))
    for row in self.sqlite_cursor.execute('SELECT * FROM zakonczenia'):
      if row['zak'] is None:
        continue
      if not SQL_MODE:
        e = Ending(
          pattern=Pattern.objects.get(name=row['wzor']),
          base_form_label = BaseFormLabel.objects.get(entry=row['efobaz']),
          string = row['zak'],
          index = row['nrskl'],
        )
        e.save()
        # zkwal may be NULL/empty -- same guard as in the SQL branch
        if row['zkwal']:
          for qual in row['zkwal'].split('|'):
            e.qualifiers.add(Qualifier.objects.get(label=qual))
      else:
        pattern_pk = pattern_pk_table[row['wzor']]
        if pattern_pk:
          efobaz_id = bfl_table[row['efobaz']]
          self.cursor.execute(
            "INSERT INTO zakonczenia (w_id, efobaz, zind, zak) VALUES "
            "(%s, %s, %s, %s)",
            [pattern_pk, efobaz_id, row['nrskl'], row['zak']])
          if row['zkwal']:
            # id assigned to the ending inserted just above
            self.cursor.execute("select currval('zakonczenia_id_seq')")
            last_id = self.cursor.fetchone()[0]
            for qual in row['zkwal'].split('|'):
              q_id = Qualifier.objects.get(label=qual).pk
              self.cursor.execute(
                "INSERT INTO kwalifikatory_zakonczen (ending_id, qualifier_id) "
                "VALUES (%s, %s)", [last_id, q_id])

  def import_lexeme_inflection_patterns(self):
    """Import lexeme inflection patterns from the SQLite `odmieniasie` table.

    Every source row binds a lexeme (`nr`) to a pattern (`wzor`) with an
    inflection characteristic (`charfl`) and a root (`rdzen`). The
    characteristic is resolved by its entry together with the lexeme's part
    of speech. The '|'-separated qualifiers in `okwal` are attached to the
    new row. In MINI_MODE only rows of the MINI_LEXEME_COUNT imported
    lexemes are read.

    The characteristic lookup table is built here instead of being read from
    self.ics, which neither delete_and_import nor single_import populates.
    """
    if MINI_MODE:
      result = self.sqlite_cursor.execute(
        'SELECT * FROM odmieniasie WHERE nr IN (%s)' % (MINI_LEXEME_QUERY % 'nr'),
        (MINI_LEXEME_COUNT,))
    else:
      result = self.sqlite_cursor.execute('SELECT * FROM odmieniasie')
    # lexeme pk -> raw part_of_speech foreign key value
    pos_table = dict(Lexeme.objects.values_list('pk', 'part_of_speech'))
    pattern_pk_table = dict(Pattern.objects.values_list('name', 'pk'))
    # keyed by the raw foreign key so the keys match the values of pos_table
    ic_table = dict(
      ((ic.entry, ic.part_of_speech_id), ic)
      for ic in InflectionCharacteristic.objects.all()
    )
    for row in result:
      pos = pos_table[row['nr']]
      ic = ic_table[(row['charfl'], pos)]
      if not SQL_MODE:
        lip = LexemeInflectionPattern(
          lexeme_id=row['nr'],
          index=row['oskl'],
          pattern=Pattern.objects.get(name=row['wzor']),
          inflection_characteristic=ic,
          root=row['rdzen'],
        )
        lip.save()
        if row['okwal']:
          for qual in row['okwal'].split('|'):
            lip.qualifiers.add(Qualifier.objects.get(label=qual))
      else:
        pattern_pk = pattern_pk_table[row['wzor']]
        self.cursor.execute(
          "INSERT INTO odmieniasie (l_id, oind, w_id, charfl, rdzen) "
          "VALUES (%s, %s, %s, %s, %s) ", [row['nr'], row['oskl'], pattern_pk,
          ic.pk, row['rdzen']])
        if row['okwal']:
          # id assigned to the row inserted just above
          self.cursor.execute("select currval('odmieniasie_id_seq')")
          last_id = self.cursor.fetchone()[0]
          for qual in row['okwal'].split('|'):
            q_id = Qualifier.objects.get(label=qual).pk
            self.cursor.execute(
              "INSERT INTO kwalifikatory_odmieniasiow (lexemeinflectionpattern_id, "
              "qualifier_id) VALUES (%s, %s)", [last_id, q_id])

  def import_tables(self):
    """Import table templates and their cells from the `paradygmaty` table.

    For every source row the matching Variant and TableTemplate are fetched
    or created, then the cell is stored -- with a raw INSERT in SQL_MODE,
    through the Cell model otherwise. A table position (row, col, spans) is
    stored only for rows whose `row` column is set.
    """
    label_pk_by_entry = dict(BaseFormLabel.objects.values_list('entry', 'pk'))
    class_by_pos = dict(
      (part.symbol, part.lexical_class)
      for part in PartOfSpeech.objects.all()
    )
    for source in self.sqlite_cursor.execute('SELECT * FROM paradygmaty'):
      lexical_class = class_by_pos[source['pos']]
      variant = Variant.objects.get_or_create(id=source['wariant'])[0]
      pattern_type = PatternType.objects.get(
        entry=source['typr'], lexical_class=lexical_class)
      characteristic = InflectionCharacteristic.objects.get(
        entry=source['charfl'], part_of_speech__symbol=source['pos'])
      template = TableTemplate.objects.get_or_create(
        variant=variant,
        pattern_type=pattern_type,
        inflection_characteristic=characteristic)[0]

      if SQL_MODE:
        label_pk = label_pk_by_entry[source['efobaz']]
        self.cursor.execute(
          "INSERT INTO klatki (st_id, efobaz, tag, prefiks, sufiks, kind) "
          "VALUES (%s, %s, %s, %s, %s, %s)",
          [template.pk, label_pk, source['morf'], source['pref'],
           source['suf'], source['kskl']])
        if source['row']:
          # id assigned to the cell inserted just above
          self.cursor.execute("select currval('klatki_id_seq')")
          cell_pk = self.cursor.fetchone()[0]
          self.cursor.execute(
            "INSERT INTO komorki_tabel (k_id, row, col, rowspan, colspan) "
            "VALUES (%s, %s, %s, %s, %s)",
            [cell_pk, source['row'], source['col'], source['rowspan'],
             source['colspan']])
      else:
        cell = Cell(
          table_template=template,
          base_form_label=BaseFormLabel.objects.get(entry=source['efobaz']),
          tag=source['morf'],
          prefix=source['pref'],
          suffix=source['suf'],
          index=source['kskl'],
        )
        cell.save()
        if source['row']:
          table_cell = TableCell(
            cell=cell,
            row=source['row'],
            col=source['col'],
            rowspan=source['rowspan'],
            colspan=source['colspan'],
          )
          table_cell.save()

  def new_table_headers(self):
    """Generate unsaved TableHeader objects from the `naglowkiwierszy` table.

    Rows with style 'b' or without a label (`nagl`) are skipped. The header
    is attached to the table template matching the row's variant, pattern
    type, inflection characteristic and part of speech; a header is
    horizontal when its style is 'h'.

    Raises Exception when no table template matches a header row; the
    message contains the offending source row.
    """
    for row in self.sqlite_cursor.execute('SELECT * FROM naglowkiwierszy'):
      if row['styl'] == 'b' or not row['nagl']:
        continue
      tts = TableTemplate.objects.filter(
        variant__id=row['wariant'], pattern_type__entry=row['typr'],
        inflection_characteristic__entry=row['charfl'],
        inflection_characteristic__part_of_speech__symbol=row['pos'])
      if not tts:
        # interpolate the row into the message instead of passing it as a
        # second, never-formatted exception argument
        raise Exception('Brak szablonu dla nagłówka: %s' % (dict(row),))
      tt = tts.get()
      yield TableHeader(
        table_template=tt,
        row=row['row'],
        col=row['col'],
        rowspan=row['rowspan'],
        colspan=row['colspan'],
        label=row['nagl'],
        horizontal=row['styl'] == 'h',
      )

  def delete_and_import(self):
    transaction.commit_unless_managed()
    transaction.enter_transaction_management()
    transaction.managed()
    models = (
      TableCell,
      Cell,
      TableTemplate,
      CrossReference,
      CrossReferenceType,
      LexemeAssociation,
      LexemeInflectionPattern,
      Lexeme,
      Ending,
      Pattern,
      PatternType,
      Qualifier,
      Vocabulary,
      InflectionCharacteristic,
      BaseFormLabel,
      PartOfSpeech,
      LexicalClass,
    )
    print 'deleting old data...'
    for model in models:
      model.objects.all().delete()

    print 'importing lexical classes...'
    LexicalClass.objects.bulk_create(self.new_lexical_classes())
    print 'importing parts of speech'
    PartOfSpeech.objects.bulk_create(self.new_parts_of_speech())
    print 'importing base form labels'
    BaseFormLabel.objects.bulk_create(self.new_base_form_labels())
    print 'importing inflection characteristics'
    InflectionCharacteristic.objects.bulk_create(
      self.new_inflection_characteristics())
    print 'importing vocabularies...'
    Vocabulary.objects.bulk_create(self.new_vocabularies())
    print 'importing qualifiers...'
    Qualifier.objects.bulk_create(self.new_qualifiers())
    print 'importing lexemes...'
    self.import_lexemes()
    print 'importing lexeme associations...'
    LexemeAssociation.objects.bulk_create(self.new_lexeme_associations())
    print 'importing cross-reference types...'
    CrossReferenceType.objects.bulk_create(
      self.new_cross_reference_types())
    print 'importing cross-references...'
    self.import_cross_references()
    print 'importing pattern types...'
    self.import_pattern_types()
    print 'importing patterns...'
    Pattern.objects.bulk_create(self.new_patterns())
    print 'importing lexeme inflection patterns...'
    self.import_lexeme_inflection_patterns()
    print 'importing endings...'
    self.import_endings()
    print 'importing table templates...'
    self.import_tables()
    print 'importing table headers...'
    TableHeader.objects.bulk_create(self.new_table_headers())
    self.close()
    transaction.commit()
    transaction.leave_transaction_management()

  def single_import(self, model):
    """Import the data of a single model inside its own transaction.

    model -- a model class listed in METHOD_NAMES; a KeyError is raised for
             any other model.

    The import method is chosen through METHOD_NAMES: the objects produced
    by a `new_*` method are stored with bulk_create, an `import_*` method
    stores its data itself. On success the cursors are closed and the
    transaction is committed; on any failure it is rolled back and the error
    is re-raised. Transaction management is left in both cases.
    """
    transaction.commit_unless_managed()
    transaction.enter_transaction_management()
    transaction.managed()
    try:
      method_name = METHOD_NAMES[model]
      import_method = getattr(self, method_name)
      if method_name.startswith('new'):
        model.objects.bulk_create(import_method())
      elif method_name.startswith('import'):
        import_method()
      self.close()
      transaction.commit()
    except BaseException:
      # discard everything written so far, then let the error propagate
      transaction.rollback()
      raise
    finally:
      transaction.leave_transaction_management()

import sys
if __name__ == '__main__':
  # Script usage: import_data.py [<db file>] [-mini]
  # A trailing '-mini' switches on MINI_MODE and is removed from sys.argv;
  # the first remaining argument, if any, names the SQLite source database.
  cli_args = sys.argv
  if cli_args[-1] == '-mini':
    MINI_MODE = True
    cli_args.pop()
  db = cli_args[1] if len(cli_args) > 1 else DEFAULT_DATABASE
  ImportData(db).delete_and_import()