# -*- coding: utf-8 -*-

from django.core.management.base import BaseCommand
from common.util import debug, suffixes, cut_end
from dictionary.models import Lexeme, Pattern, InflectionCharacteristic, \
  Ending, BaseFormLabel
from dictionary.management.commands.import_morfologik import create_lexeme, \
  create_lip, print_data, find_minimal_sets, blacklist_filter, join_many, \
  join, print_forms

class Command(BaseCommand):
  args = '<input file name>'
  help = 'importuje leksemy z KIPI 1.0'

  def handle(self, filename, **options):
    with open(filename) as input_file:
      import_kipi(input_file)

DEBUG = False

COMMONNESS = {
  'geog': u'geograficzna',
  'imie': u'imię',
  'inna': u'własna',
  'nazw': u'nazwisko',
  'orga': u'organizacja',
  'posp': u'pospolita',
}

def inflection_characteristic(forms, pos):
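  """Derive the inflection characteristic symbol from the KIPI forms.

  For nouns the characteristic is the gender taken from the first form's
  tag ('m1' if that tag is depreciative or ends in m1); for adjectives the
  presence of an 'adja' form selects '' instead of '0-'.
  """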
  # KIPI contains only subst and adj
  tag = forms[0][1]
  if pos == 'subst':
    if 'depr' in tag or tag.endswith('m1'):
      ic = 'm1'
    else:
      ic = tag.rsplit(':', 1)[1]
  elif pos == 'adj':
    # the '3+' forms do not occur here
    if any(tag == 'adja' for form, tag in forms):
      ic = ''
    else:
      ic = '0-'
  return ic

# NOTE: the helpers below are largely copy-pasted code.
def get_basic_endings(lexical_class, ic):
  return Ending.objects.filter(
    base_form_label=ic.basic_form_label,
    pattern__type__lexical_class__symbol=lexical_class)

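# Precomputed map from (part of speech, inflection characteristic symbol)
# to the queryset of endings that can realize the basic form.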
basic_form_endings_dict = {}
for pos in ('adj', 'subst'):
  for ic in InflectionCharacteristic.objects.filter(part_of_speech__symbol=pos):
    basic_form_endings_dict[(pos, ic.symbol)] = get_basic_endings(pos, ic)

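# Base form labels that a complete KIPI paradigm is expected to attest,
# used below to prune noun patterns in basic_form_endings.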
sure_bfls_sg = tuple(
  BaseFormLabel.objects.filter(
    symbol__in=['sg:dat', 'sg:gen', 'sg:inst']).values_list('pk', flat=True))
sure_bfls_pl = tuple(
  BaseFormLabel.objects.filter(
    symbol__in=['pl:dat', 'pl:inst', 'pl:loc']).values_list('pk', flat=True))

def basic_form_endings(lexical_class, ic, basic_form, form_set):
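  """Return candidate basic-form endings for a lexeme with the given forms.

  For non-nouns this is a plain filter on the suffixes of the basic form.
  For nouns the candidate patterns are additionally pruned with a raw SQL
  condition: a pattern is kept only if all of its endings for the singular
  "sure" labels, or all for the plural ones, occur among the attested forms.
  """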
  if lexical_class != 'subst':
    return basic_form_endings_dict[(lexical_class, ic)].filter(
      string__in=suffixes(basic_form))
  else:
    # convoluted, but somewhat faster
    endings = basic_form_endings_dict[(lexical_class, ic)]
    new_endings = Ending.objects.none()
    for suf in suffixes(basic_form):
      root = cut_end(basic_form, suf)
      n = len(root)
      ending_strings = tuple(
        form[n:] for form in form_set if form.startswith(root))
      endings_part = endings.filter(string=suf)
      pattern_ids = endings_part.values_list('pattern', flat=True)
      patterns = Pattern.objects.filter(pk__in=pattern_ids).extra(
        where=["(id = '0000' or not exists "
               "(select id from zakonczenia where w_id = wzory.id "
               "and zak not in %s and efobaz in %s) or not exists "
               "(select id from zakonczenia where w_id = wzory.id "
               "and zak not in %s and efobaz in %s))"],
        params=[ending_strings, sure_bfls_sg, ending_strings, sure_bfls_pl])
      new_endings = new_endings | endings_part.filter(pattern__in=patterns)
    return new_endings

memoized_pattern_ics = {}

def bad_pattern_subst(pattern, ic):
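  """Reject noun patterns that clash with the inflection characteristic.

  A pattern is bad if no non-candidate lexeme already uses it with this
  characteristic, or if its gender class contradicts it (an m/n pattern
  with 'f', an f/m pattern with an 'n*' characteristic). Results are
  memoized per (pattern, characteristic) pair.
  """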
  if (pattern, ic) in memoized_pattern_ics:
    return memoized_pattern_ics[(pattern, ic)]
  if not pattern.lexemeinflectionpattern_set.filter(
       inflection_characteristic__symbol=ic).exclude(lexeme__status='cand'):
    ret = True
  elif pattern.type.symbol in 'mn' and ic == 'f':
    ret = True
  elif pattern.type.symbol in 'fm' and ic[0] == 'n':
    ret = True
  else:
    ret = False
  memoized_pattern_ics[(pattern, ic)] = ret
  return ret

memoized_good_endings = {}

def good_ending_set_subst(pattern, ic, root):
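  """Build the set of surface forms a noun pattern generates for a root.

  Endings that the characteristic makes impossible (personal-masculine
  pl:nom for non-m1/p1 lexemes, wrong-gender pl:gen variants, singular
  endings for plurale tantum) are dropped; the remaining ending strings
  are memoized per (pattern, characteristic).
  """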
  if (pattern, ic) in memoized_good_endings:
    good_endings = memoized_good_endings[(pattern, ic)]
    return set(root + e for e in good_endings)
  endings = pattern.endings
  if ic not in ('m1', 'p1'):
    endings = endings.exclude(base_form_label__symbol='pl:nom:mo')
  if ic[0] == 'p':
    endings = endings.filter(base_form_label__symbol__startswith='pl')
  else:
    for g in list(set('mfn') - set(ic[0])):
      endings = endings.exclude(
        base_form_label__symbol__startswith='pl:gen:' + g)
  if ic == 'p3':
    if pattern.type.symbol == 'f':
      endings = endings.exclude(base_form_label__symbol='pl:gen:m')
    elif pattern.type.symbol == 'n':
      endings = endings.exclude(base_form_label__symbol='pl:gen:n')
  good_endings = list(endings.values_list('string', flat=True))
  memoized_good_endings[(pattern, ic)] = good_endings
  return set(root + e for e in good_endings)

def good_ending_set(lexical_class, ic, pattern, root=''):
  if lexical_class != 'subst':
    return pattern.ending_set(root)
  else:
    return good_ending_set_subst(pattern, ic, root)

def relevant_subst(ending, ic):
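  """Decide whether an unattested noun form should count against a pattern.

  Forms ruled out by the characteristic (plain pl:nom for m1/p1, wrong
  gender, non-plural forms of plurale tantum, the mismatched p3 pl:gen
  variants, pl:nom:mo outside m1/p1) are treated as irrelevant.
  """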
  bfl = ending.base_form_label.symbol
  tag = bfl.split(':')
  pattern_type = ending.pattern.type.symbol
  return (not (ic in ('m1', 'p1') and bfl == 'pl:nom') and
          not (len(tag) >= 3 and ic[0] != 'p' and
               tag[2][0] != ic[0]) and
          not (ic[0] == 'p' and tag[0] != 'pl') and
          not (ic == 'p3' and bfl.startswith('pl:gen:') and (
                (pattern_type == 'n' and tag[2] == 'n') or
                (pattern_type == 'f' and tag[2] == 'm')
              )) and
          not (ic not in ('m1', 'p1') and bfl == 'pl:nom:mo'))

def relevant_adj(ending):
  tag = ending.base_form_label.symbol
  return tag not in ('0', '3+')

def relevant(lexical_class, ending, ic):
  if lexical_class == 'subst':
    return relevant_subst(ending, ic)
  elif lexical_class == 'adj':
    return relevant_adj(ending)

def find_patterns(basic_form, pos, ic, forms):
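  """Match the attested forms of a lexeme against the inflection patterns.

  A (pattern, root) pair is 'included' if every relevant form it generates
  is attested, 'including' if it covers all attested forms, and 'matching'
  if both hold. Returns ('match', matching, included, including) when an
  exact match exists; otherwise delegates to find_many_patterns and returns
  (type, pattern_sets, included, including).
  """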
  # find all included and including patterns
  form_set = set(form for form, tag in forms)
  ending_sets = {}
  included_patterns = set()
  including_patterns = set()
  matching_patterns = set()
  for basic_ending in basic_form_endings(pos, ic, basic_form, form_set):
    pattern = basic_ending.pattern
    if pos == 'subst' and bad_pattern_subst(pattern, ic):
      #print 'rejected:', pattern
      continue # are we fine with not noting forms rejected by the inflection characteristic?
    root = basic_form[:len(basic_form) - len(basic_ending.string)]
    ending_sets[pattern] = good_ending_set(pos, ic, pattern, root)
    including = form_set.issubset(ending_sets[pattern])
    bad_forms = set()
    for ending in pattern.endings.all():
      if relevant(pos, ending, ic):
        if root + ending.string not in form_set:
          bfl = ending.base_form_label.symbol
          #print pattern.name, root, ending.string, bfl
          bad_forms.add(root + ending.string)
    if not bad_forms:
      included_patterns.add((pattern, root))
      if including:
        matching_patterns.add((pattern, root))
    elif including:
      including_patterns.add(((pattern, root), tuple(bad_forms)))

  # not sure whether this is needed, but just in case
  included_patterns = list(included_patterns)
  including_patterns = list(including_patterns)
  matching_patterns = list(matching_patterns)
  if len(matching_patterns) > 0:
    if DEBUG:
      print u'dokładne wzory: %s' % join(matching_patterns)
    return 'match', matching_patterns, included_patterns, including_patterns
  # nothing matches, or several patterns have to be combined
  if DEBUG and len(including_patterns) > 0:
    print u'zawierające: %s' % join(p for p, b_f in including_patterns)
  if DEBUG and len(included_patterns) > 0:
    print u'zawarte: %s' % join(included_patterns)
  return find_many_patterns(
    pos, ic, form_set, basic_form, included_patterns, ending_sets) + (
    included_patterns, including_patterns)

def find_many_patterns(pos, ic, form_set, basic_form, included_patterns,
                       ending_sets):
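  """Try to cover the attested forms with a combination of included patterns.

  Patterns that are the only source of some form are necessary; if they
  already cover everything, that single set is returned, otherwise
  find_minimal_sets searches for minimal covering sets. Returns
  ('none', []) when some form is generated by no included pattern.
  """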
  necessary_patterns = set()
  missing_form = None
  for form in form_set:
    having = []
    for pattern, root in included_patterns:
      if form in ending_sets[pattern]:
        having.append((pattern, root))
    if len(having) == 1:
      necessary_patterns.add(having[0])
    if not having:
      missing_form = form
      break
  if missing_form:
    if DEBUG:
      print u"brak formy: %s" % missing_form
    return 'none', []
  covered_forms = set()
  for pattern, root in necessary_patterns:
    covered_forms |= ending_sets[pattern]
  if form_set.issubset(covered_forms):
    if DEBUG:
      print u"pokryte koniecznymi wzorami: %s" % join(necessary_patterns)
    return 'many', [list(necessary_patterns)]
  else:
    #for pattern, root in included_patterns:
    #  print pattern, ending_sets[pattern]
    minimal_sets = find_minimal_sets(
      form_set, covered_forms, necessary_patterns, included_patterns,
      ending_sets)
    return 'many', minimal_sets

def filter_patterns(filter, action_name, type, patterns, included, including,
                    lexical_class, form_set, entry, ic):
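  """Apply a pattern filter (here: the blacklist) to a matching result.

  The match is recomputed on the filtered patterns; if filtering would turn
  an existing 'match' or 'many' result into nothing, the previous patterns
  are restored and bad_patterns is set so the lexeme can be flagged.
  """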
  old_patterns = patterns
  old_included = included
  bad_patterns = False
  if type == 'many':
    if any(pattern_set != filter(pattern_set) for pattern_set in patterns):
      included = filter(included)
      ending_sets = {}
      for pattern, root in included:
        ending_sets[pattern] = good_ending_set(lexical_class, ic, pattern, root)
      type, patterns = find_many_patterns(
        lexical_class, ic, form_set, entry, included, ending_sets)
      if type != 'many':
        debug(entry, u'mnogie dopasowanie zepsute przez %s (%s)' %
                     (action_name, join_many(old_patterns)))
        type = 'many'
        patterns, included = old_patterns, old_included
        bad_patterns = True
  elif type == 'none':
    including_dict = dict(including)
    including = [(key, including_dict[key]) for key in filter(including_dict)]
  else: # type == 'match'
    patterns = filter(patterns)
    including_dict = dict(including)
    including = [(key, including_dict[key]) for key in filter(including_dict)]
    included = filter(included)
    if old_patterns and not patterns:
      ending_sets = {}
      for pattern, root in included:
        ending_sets[pattern] = good_ending_set(lexical_class, ic, pattern, root)
      type, patterns = find_many_patterns(
        lexical_class, ic, form_set, entry, included, ending_sets)
      if type == 'none':
        debug(entry, u'znikły wzory przez %s (%s)' %
                     (action_name, join(old_patterns)))
        type = 'match'
        patterns = old_patterns
        bad_patterns = True
  return type, patterns, included, including, bad_patterns

def process_forms(forms, base, pos, commonness):
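  """Produce import data for one lexeme unless it already exists.

  Finds matching patterns, applies the blacklist filter, picks the
  pattern(s) to use and, outside DEBUG mode, emits the lexeme together with
  its inflection patterns via print_data, recording ambiguous or rejected
  matches in the comment field.
  """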
  if Lexeme.objects.filter(entry=base).exists():
    return
  ic = inflection_characteristic(forms, pos)
  form_set = set(form for form, tag in forms)
  type, patterns, included, including = find_patterns(base, pos, ic, forms)
  type, patterns, included, including, bad_patterns = filter_patterns(
    blacklist_filter, u'czarną listę', type, patterns, included, including,
    pos, form_set, base, ic)
  # the patterns will not change from this point on
  if type == 'many':
    all_patterns = [p for pattern_set in patterns for p in pattern_set]
  else:
    all_patterns = patterns

  if type == 'none':
    debug(base, u'zawiera się w %s' % join(p for p, b_f in including))
    chosen = []
    fitting = including
    if pos == 'adj' and including:
      print_forms(forms, 'rzeczownik#')
      return
  elif type == 'match':
    patterns.sort(key=lambda p: p[0].name)
    fitting = patterns
    chosen = patterns[:1]
  elif type == 'many':
    chosen = patterns[0]
    if DEBUG:
      print u'zestawy wielu wzorów: %s' % join_many(patterns)
    fitting = patterns

  if not DEBUG:
    comments = [u'z Korpusu IPI 1.0']
    if commonness == u'własna' or type != 'match' or len(fitting) > 1:
      status = 'cand'
    else:
      status = 'desc'
    if bad_patterns:
      comments.append(u'Wzory z czarnej listy!')
      status = 'cand'
    if len(fitting) > 1 or (type == 'none' and fitting):
      if type == 'none':
        comments.append(u'Zawierające wzory:')
        for (pattern, root), bad_forms in fitting:
          comments.append(u'%s: %s' % (pattern.name, ', '.join(bad_forms)))
      elif type != 'many':
        comments.append(u'Pasujące wzory: %s' % join(fitting))
      else:
        comments.append(u'Pasujące zestawy wzorów: %s' % join_many(fitting))
    comment = '\n'.join(comments)
    lips = []
    for i, pattern in enumerate(chosen):
      lips.append(create_lip(pattern[0], pattern[1], i + 1, ic, pos))
    lexeme_data = create_lexeme(base, 1, pos, status, comment)
    lexeme_data['commonness'] = commonness
    data = {
      'lexeme': lexeme_data,
      'lips': lips,
    }
    print_data(data)

def import_kipi(input_file):
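  """Import lexemes from a tab-separated KIPI 1.0 dump.

  Every line carries form, base, commonness and tag; consecutive lines with
  the same (base, part of speech, commonness) key are grouped into a single
  lexeme and handed to process_forms.
  """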
  last_key = None
  forms = None
  for line in input_file:
    data = line.strip().decode('utf-8').split('\t')
    form, base, comm, tag = data
    pos = 'subst' if tag.startswith('subst') else 'adj' # startswith, not split, because of adja
    key = (base, pos, comm)
    if key != last_key:
      if last_key is not None:
        process_forms(forms, last_key[0], last_key[1], COMMONNESS[last_key[2]])
      last_key = key
      forms = []
    forms.append((form, tag))
  if last_key is not None:
    process_forms(forms, last_key[0], last_key[1], COMMONNESS[last_key[2]])