# import_warszawa.py 3.23 KB
#-*- coding:utf-8 -*-

import sys
from django.core.management.base import BaseCommand, CommandError
from common.util import debug
from dictionary.models import Lexeme, Vocabulary, LexemeAssociation, Pattern, \
  all_forms, InflectionCharacteristic, get_root
from dictionary.management.commands.import_morfologik import create_lexeme, \
  create_lip, print_data

class Command(BaseCommand):
  """Management command that imports Warsaw names from an input file."""
  args = '<input file name>'
  help = 'importuje nazwy warszawskie'

  def handle(self, filename, **options):
    """Open ``filename`` and feed the open file to ``import_warszawa``.

    The file is opened inside a ``with`` block so it is closed when the
    import finishes, including when the import raises.
    """
    with open(filename) as input_file:
      import_warszawa(input_file)


def inflection_characteristic(forms, pos):
  """Return the inflection characteristic symbol for a group of forms.

  Arguments:
  forms -- non-empty sequence of (form, tag) pairs
  pos -- part-of-speech symbol; only 'subst' and 'adj' are supported,
         because the Warsaw names contain nothing else

  Returns a string:
  * 'subst': 'm1' when the first tag contains 'depr' or ends with 'm1',
    otherwise the part of the first tag after its last colon;
  * 'adj': '' when any form is tagged 'adja', otherwise '0-'.

  Raises ValueError for any other ``pos`` (previously this surfaced as an
  UnboundLocalError), and IndexError when ``forms`` is empty or a noun tag
  has no colon.
  """
  first_tag = forms[0][1]
  if pos == 'subst':
    if 'depr' in first_tag or first_tag.endswith('m1'):
      return 'm1'
    # Otherwise the characteristic is the last colon-separated tag field.
    return first_tag.rsplit(':', 1)[1]
  if pos == 'adj':
    # '3+' forms do not occur in this data set.
    if any(tag == 'adja' for _form, tag in forms):
      return ''
    return '0-'
  # NOTE: the plain symbol is returned on purpose; the lookup of the
  # InflectionCharacteristic model object (entry=ic,
  # part_of_speech__symbol=pos) is disabled.
  raise ValueError('unsupported part of speech: %r' % (pos,))

def process_forms(forms, base, pos, patterns):
  """Emit import data for one lexeme unless an equivalent one already exists.

  Arguments:
  forms -- (form, tag) pairs; used here only to derive the inflection
           characteristic
  base -- entry used to look up existing lexemes
  pos -- part-of-speech symbol
  patterns -- set of pattern names; compared for equality with the set of
              pattern names attached to each existing lexeme

  An existing lexeme counts as equivalent when its list of inflection
  characteristic entries equals [ic] or ['3+'] and its pattern names equal
  ``patterns``. In that case nothing is emitted. Otherwise a lexeme and one
  lip per pattern are built and handed to ``print_data``; the status is
  'cand' when other lexemes share the entry and part of speech, 'desc'
  when none do.
  """
  ic = inflection_characteristic(forms, pos)
  # NOTE: checking the patterns against the generated forms used to happen
  # here; it was switched off after all patterns were verified to match.

  # Look for existing lexemes with the same base, pos, ic and patterns.
  candidates = Lexeme.objects.filter(entry=base, part_of_speech__symbol=pos)
  for lexeme in candidates:
    existing_lips = lexeme.lexemeinflectionpattern_set.all()
    existing_patterns = set(lip.pattern.name for lip in existing_lips)
    existing_ics = [lip.inflection_characteristic.entry
                    for lip in existing_lips]
    ic_matches = existing_ics == [ic] or existing_ics == ['3+']
    if ic_matches and existing_patterns == patterns:
      return  # already present, nothing to import

  if candidates:
    # Same entry and pos exist, but with different ic or patterns.
    status, comment = 'cand', u'z nazw warszawskich; rozbieżność'
  else:
    status, comment = 'desc', u'z nazw warszawskich'
  new_lips = [create_lip(name, None, position, ic, pos)
              for position, name in enumerate(patterns, 1)]
  print_data({
    'lexeme': create_lexeme(base, 1, pos, status, comment),
    'lips': new_lips,
  })

def import_warszawa(input_file):
  """Group the input records by id and process each group.

  ``input_file`` is any iterable of lines. Every non-blank line must hold
  seven tab-separated fields: id, lip index, part of speech, pattern, form,
  base and tag (a ValueError is raised by the unpacking otherwise). Byte
  strings are decoded as UTF-8; already-decoded text is used as is.

  Consecutive lines sharing an id form one group. For each group,
  ``process_forms`` receives the collected (form, tag) pairs, the base and
  part of speech taken from the group's first line, and the set of patterns
  seen in the group. Blank lines are skipped, and an input without any
  records results in no ``process_forms`` call at all.
  """
  last_id = None
  forms = None
  last_base = None
  last_pos = None
  patterns = None
  for line in input_file:
    record = line.strip()
    if not record:
      # A blank (e.g. trailing) line carries no data.
      continue
    if isinstance(record, bytes):
      record = record.decode('utf-8')
    w_id, _lip_ind, pos, pattern, form, base, tag = record.split('\t')
    if w_id != last_id:
      # A new id starts a new group; flush the previous one first.
      if last_id is not None:
        process_forms(forms, last_base, last_pos, patterns)
      last_id = w_id
      last_base = base
      last_pos = pos
      forms = []
      patterns = set()
    forms.append((form, tag))
    patterns.add(pattern)
  # Flush the final group, but only if at least one record was read.
  if last_id is not None:
    process_forms(forms, last_base, last_pos, patterns)