load_morfologik.py
5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#-*- coding:utf-8 -*-
import sys
import json
from django.db import connection, transaction
from django.db.models import Max
from django.core.management.base import BaseCommand, CommandError
from common.util import no_history, debug
from dictionary.models import Lexeme, Pattern, LexemeAssociation, \
LexemeInflectionPattern, PartOfSpeech, Vocabulary, InflectionCharacteristic, \
CrossReference, CrossReferenceType, ClassificationValue
# Id range reserved for lexemes created by this loader.
START_ID = 500000
END_ID = 1000000

# Resume numbering right after the highest id already used in the range;
# start fresh at START_ID when the range is still empty.
_max_used_id = Lexeme.objects.filter(
    pk__gte=START_ID, pk__lt=END_ID).aggregate(Max('id'))['id__max']
next_id = _max_used_id + 1 if _max_used_id else START_ID
class Command(BaseCommand):
    # CLI usage string (Polish: '<vocabulary name> <source name> <input file name>').
    # NOTE(review): the order documented here (vocab, source, file) contradicts
    # the parameter order of handle() below (input_file, vocab_name, source) —
    # confirm against actual invocations which ordering is correct.
    args = '<nazwa słownika> <nazwa źródła> <nazwa pliku wejściowego>'
    help = 'Load prepared lexeme data'

    def handle(self, input_file, vocab_name, source, **options):
        # Entry point: delegates all work to the module-level loader.
        load_morfologik(input_file, vocab_name, source)
# Module-level state shared by the loader functions below; filled in by
# load_morfologik().
source = None
vocab = None  # ugly, but reworking everything is not worth the effort

# Lookup tables built once at import time so the per-line processing
# below never has to query the database for these objects.
pos_table = dict(
    (pos.symbol, pos) for pos in PartOfSpeech.objects.all())
ic_table = dict(
    ((ic.entry, ic.part_of_speech.symbol), ic)
    for ic in InflectionCharacteristic.objects.all())
pattern_table = dict(
    (pat.name, pat) for pat in Pattern.objects.all())
def associate(l):
    """Attach lexeme *l* to the global target vocabulary.

    Emits a debug message when the association already existed, except for
    parts of speech ('ppas', 'pact', 'ger') where repeats are expected.
    """
    association, was_created = LexemeAssociation.objects.get_or_create(
        lexeme=l, vocabulary=vocab)
    if was_created:
        return
    if l.part_of_speech.symbol in ('ppas', 'pact', 'ger'):
        return
    debug(l.entry, u'wielokrotne przypisanie leksemu do słownika!')
def add_cr(l_from, l_to, symbol):
    """Create and save a cross-reference of type *symbol* from l_from to l_to.

    The cross-reference type is looked up by symbol together with both
    lexemes' parts of speech.
    """
    cr_type = CrossReferenceType.objects.get(
        symbol=symbol,
        from_pos=l_from.part_of_speech,
        to_pos=l_to.part_of_speech)
    reference = CrossReference(
        from_lexeme=l_from, to_lexeme=l_to, type=cr_type)
    reference.save()
def create_lexeme(entry, homonym_number, part_of_speech, status, comment,
                  commonness=None):
    """Create, save and vocabulary-associate a new lexeme; return it.

    Allocates ids from the module-global ``next_id`` counter (advanced on
    every call).  When *commonness* is given, the lexeme is also added to
    the matching ClassificationValue.
    """
    global next_id
    lexeme = Lexeme(
        id=next_id, entry=entry, homonym_number=homonym_number,
        part_of_speech=part_of_speech, source=source, status=status,
        comment=comment, owner_vocabulary=vocab)
    lexeme.save()
    if commonness:
        ClassificationValue.objects.get(label=commonness).lexemes.add(lexeme)
    associate(lexeme)
    next_id += 1
    return lexeme
def create_negated(l):
    """Create the "nie"-prefixed (negated) counterpart of lexeme *l*.

    Copies every inflection pattern with a "nie"-prefixed root, then links
    both lexemes with the adjnie/nieadj cross-references.
    """
    negated = create_lexeme(
        u"nie" + l.entry, l.homonym_number, l.part_of_speech,
        "cand" if l.status == "cand" else "desc", '')
    for lip in l.lexemeinflectionpattern_set.all():
        # Negated forms get the "0-" adjective characteristic unless the
        # source inflection already uses entry "0-".
        if lip.inflection_characteristic.entry == "0-":
            ic = lip.inflection_characteristic
        else:
            ic = ic_table[("0-", "adj")]
        LexemeInflectionPattern(
            lexeme=negated, index=lip.index, pattern=lip.pattern,
            root=u"nie" + lip.root,
            inflection_characteristic=ic).save()
    add_cr(l, negated, "adjnie")
    add_cr(negated, l, "nieadj")
def check_der(verb, pos, entry, patterns):
    """Find an existing derived lexeme matching *entry*, *pos* and *patterns*.

    A candidate matches when its first inflection pattern shares *verb*'s
    inflection characteristic and its pattern-name set equals *patterns*.
    Returns the first match (emitting a debug message when ambiguous), or
    None when nothing matches or *verb* has no inflection patterns.
    """
    verb_lips = verb.lexemeinflectionpattern_set.all()
    if not verb_lips:
        return None
    ic = verb_lips[0].inflection_characteristic.entry
    wanted_patterns = set(patterns)
    matched = []
    candidates = Lexeme.objects.filter(
        deleted=False, entry=entry, part_of_speech__symbol=pos,
        lexemeinflectionpattern__inflection_characteristic__entry=ic)
    for candidate in candidates:
        candidate_lips = candidate.lexemeinflectionpattern_set.all()
        if candidate_lips[0].inflection_characteristic.entry != ic:
            continue
        candidate_patterns = set(
            candidate.patterns.values_list('name', flat=True))
        if candidate_patterns == wanted_patterns:
            matched.append(candidate)
    if len(matched) > 1:
        debug(entry, u'niejednoznaczny derywat')
    return matched[0] if matched else None
def create_derived(l, pos, entry, patterns):
    """Reuse or create a lexeme derived from verb *l* and cross-link them.

    An existing derivative (per check_der) is merely associated with the
    vocabulary when needed; otherwise a fresh lexeme is created, copying
    the inflection patterns of *l* whose pattern names occur in *patterns*.
    """
    derived = check_der(l, pos, entry, patterns)
    if derived:
        if vocab not in derived.vocabularies.all():
            associate(derived)
    else:
        # copying homonym_number makes no sense, but what would?
        derived = create_lexeme(
            entry, l.homonym_number, pos_table[pos], l.status, u'')
        for lip in l.lexemeinflectionpattern_set.all():
            if lip.pattern.name not in patterns:
                continue
            # strip the "q" prefix to map the verb characteristic onto
            # the derived part of speech
            ic_entry = lip.inflection_characteristic.entry.lstrip("q")
            LexemeInflectionPattern(
                lexeme=derived, index=lip.index, pattern=lip.pattern,
                root=lip.root,
                inflection_characteristic=ic_table[(ic_entry, pos)]).save()
    add_cr(l, derived, "ver" + pos)
    add_cr(derived, l, pos + "ver")
def load_morfologik(filename, vocab_name, source_):
    """Load lexeme data from *filename* (one JSON object per line).

    Lines with lexeme source 'sgjp' only get associated with the target
    vocabulary; lines with source 'morfologik' create a new lexeme with
    its inflection patterns, derived lexemes and optional negation.
    Everything runs in one manually managed transaction, committed at
    the end (old Django transaction API).

    Raises ValueError on an unrecognized root 'type' in the input data.
    """
    global vocab, source
    vocab = Vocabulary.objects.get(id=vocab_name)
    source = source_
    transaction.commit_unless_managed()
    transaction.enter_transaction_management()
    transaction.managed(True)
    no_history()
    with open(filename) as file:
        for line in file:
            data = json.loads(line.decode('utf-8'))
            lexeme_data = data['lexeme']
            if lexeme_data['source'] == 'sgjp':
                # existing SGJP lexeme: just attach it to the vocabulary
                l = Lexeme.objects.get(pk=lexeme_data['id'])
                associate(l)
            elif lexeme_data['source'] == 'morfologik':
                l = create_lexeme(
                    lexeme_data['entry'], lexeme_data['homonym_number'],
                    pos_table[lexeme_data['part_of_speech']],
                    lexeme_data['status'], lexeme_data['comment'],
                    lexeme_data.get('commonness'))
                for lip_data in data['lips']:
                    pattern = pattern_table[lip_data['pattern']]
                    ic = ic_table[tuple(lip_data['ic'])]
                    root_data = lip_data['root']
                    if root_data['type'] == 'string':
                        root = root_data['root']
                    elif root_data['type'] == 'compute':
                        root = l.get_root(pattern, ic)
                    else:
                        # previously an unknown type left ``root`` unbound
                        # and crashed with an opaque NameError below
                        raise ValueError(
                            'unknown root type: %r' % root_data['type'])
                    LexemeInflectionPattern(
                        lexeme=l, index=lip_data['ind'], pattern=pattern,
                        root=root, inflection_characteristic=ic).save()
                if 'derived' in data:
                    for pos, entry, patterns in data['derived']:
                        create_derived(l, pos, entry, patterns)
                if 'negated' in data:
                    create_negated(l)
    transaction.commit()
    transaction.leave_transaction_management()