disamb.py 5.91 KB
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import sys
from django.db import connection, transaction
from django.core.management.base import BaseCommand, CommandError
from common.util import debug
from dictionary.util import expand_tag

class Command(BaseCommand):
  """Django management command that runs parse_file on one input file."""
  # Positional-argument description; the text means "<input file name>".
  args = '<nazwa pliku wejściowego>'
  # Help text; it means "Disambiguation of tags from Morfologik".
  help = 'Dezambiguacja tagów z Morfologika'

  def handle(self, input_file, **options):
    """Process input_file with parse_file; options are accepted but unused."""
    parse_file(input_file)


# NOTE(review): this name is rebound by the `base_tag` function defined
# further down in this module, so the dict is never reachable once the module
# has loaded; only commented-out code in this file treats `base_tag` as a
# dict.
base_tag = {}

# Maps the part of speech of a lexeme to the tag prefixes that disamb_lexeme
# accepts for its forms.  A part of speech missing from this mapping is
# matched against its own name only.
form_categories = {
  'adj': ['adj'],
  'verb': ['subst:ger', 'verb', 'pact', 'ppas', 'pant', 'pcon'],
  'subst': ['subst'],
  'adv': ['adv'],
}

def all_tags(form, filter_base=False):
  """Return the concatenated expand_tag() results for the tags of a form.

  form is a dict with parallel 'tags' and 'base' lists.  When filter_base is
  true, only tags whose 'base' flag is truthy are expanded.
  """
  expanded = []
  for candidate, is_base in zip(form['tags'], form['base']):
    if filter_base and not is_base:
      continue
    expanded.extend(expand_tag(candidate))
  return expanded

def base_tag(tag):
  """Tell whether tag is one that a base (dictionary) form may carry.

  The decision is made on the part of speech, i.e. the text before the first
  ':' in the tag:
    * 'subst:ger...' tags, 'refl', 'pact' and 'ppas' are never base tags;
    * 'num', 'subst' and 'adj' tags must contain 'nom';
    * 'verb' tags must contain 'inf' or 'winien';
    * every other tag is accepted.
  """
  part_of_speech = tag.split(':')[0]
  if tag.startswith('subst:ger'):
    return False
  if part_of_speech in ('refl', 'pact', 'ppas'):
    return False
  if part_of_speech == 'verb':
    return 'inf' in tag or 'winien' in tag
  if part_of_speech in ('num', 'subst', 'adj'):
    return 'nom' in tag
  return True

# Endings that make an adjective-tagged form of a verb lexeme a 'pact' form;
# any other adjective-tagged form of a verb lexeme is retagged as 'ppas'.
_PACT_ENDINGS = (u'cy', u'ca', u'ce', u'cą', u'cego', u'cej', u'cemu',
                 u'cym', u'cych', u'cymi')

def disamb_lexeme(forms):
  """Disambiguate the tags of all forms belonging to one lexeme.

  forms is a list of dicts {'form': surface form, 'tags': list of candidate
  tags, 'base': list of flags parallel to 'tags'}.  forms[0] is treated as
  the base form of the lexeme.  The dicts are modified in place: their 'tags'
  entry is replaced by the disambiguated list.

  Returns a pair (new_forms, other_lexemes):
    * new_forms - the forms kept for this lexeme; a form with no acceptable
      tag gets the single tag '<pos>:irreg';
    * other_lexemes - a list of one-form lexemes split off from the input
      (adverb forms found inside an adjective lexeme).

  Returns (None, None) when forms is empty, when the first tag of the base
  form is '-', when the part of speech of the base form is not unique, or
  when the gender of a noun cannot be determined.  All of these cases except
  the empty input are reported on stderr.
  """
  if not forms:
    # Nothing to disambiguate, e.g. two consecutive blank lines in the input.
    return None, None
  base_form_tags = (tag for tag, base
                     in zip(forms[0]['tags'], forms[0]['base'])
                     if base and base_tag(tag))
  possible_pos = set(tag.split(':')[0] for tag in base_form_tags)
  entry = forms[0]['form']
  if forms[0]['tags'][0] == '-':
    print >>sys.stderr, (u'%s: nierozpoznana forma podstawowa' %
                         entry).encode('utf-8')
    return None, None
  if len(possible_pos) == 1:
    pos = list(possible_pos)[0]
  else:
    print >>sys.stderr, (u'%s: niejednoznaczna część mowy' %
                         entry).encode('utf-8')
    return None, None
  # Tag prefixes accepted for this part of speech.
  cats = form_categories.get(pos, [pos])
  new_forms = []
  other_lexemes = []
  for form in forms:
    new_tags = []
    for tag, base in zip(form['tags'], form['base']):
      for cat in cats:
        if tag.startswith(cat) and base:
          new_tags.append(tag)
          break
      else: # no category accepted this tag
        if pos == 'verb':
          # Forms of a verb lexeme tagged as nouns or adjectives are
          # retagged: 'subst...' -> 'subst:ger...', 'adj...' -> 'pact...' or
          # 'ppas...'.  A tag may consist of several '+'-joined parts; the
          # decision is made on the first part and applied to all of them.
          parts = tag.split('+')
          fixed = None
          if parts[0].startswith('subst'):
            fixed = ['subst:ger' + part[len('subst'):] for part in parts]
          elif parts[0].startswith('adj'):
            if form['form'].endswith(_PACT_ENDINGS):
              start = 'pact'
            else:
              start = 'ppas'
            fixed = [start + part[len('adj'):] for part in parts]
          if fixed:
            new_tags.append('+'.join(fixed))
    if new_tags:
      form['tags'] = new_tags
      new_forms.append(form)
    else:
      if pos == 'adj' and 'adv' in (tag.split(':')[0] for tag in form['tags']):
        # An adverb listed under an adjective becomes a lexeme of its own.
        form['tags'] = [tag for tag in form['tags'] if tag.startswith('adv')]
        other_lexemes.append([form])
        # Build one more 'nie' than the base form starts with and report the
        # adverb when it does not start with that many.
        nie_prefix = ''
        while forms[0]['form'].startswith(nie_prefix):
          nie_prefix += 'nie'
        if not form['form'].startswith(nie_prefix):
          print >>sys.stderr, (u'advadj: %s %s' % (form['form'], forms[0]['form'])).encode('utf-8')
      else:
        form['tags'] = [pos + ':irreg']
        new_forms.append(form)
      #print >>sys.stderr, (u'odrzucona forma: %s %s [%s]' %
      #                     (form['form'], ', '.join(form['tags']), pos)).encode('utf-8')

#  if len(new_forms[0]['tags']) == 1:
#    if pos not in base_tag:
#      base_tag[pos] = set()
#    base_tag[pos].add(new_forms[0]['tags'][0])

  if pos == 'subst':
    # Disambiguate the gender (examples from the original note: niezguła,
    # sezamek).  The gender code is the first character of the last element
    # of each expanded tag.
    # NOTE(review): presumably expand_tag yields sequences of tag components
    # and 'i' stands for the synthetic ':irreg' tags assigned above - confirm
    # against dictionary.util.expand_tag.
    genders = set(tag[-1][0] for tag in all_tags(new_forms[0]))
    if len(genders) == 1:
      gender = list(genders)[0]
    else:
      # Retry using only the tags flagged as base tags.
      genders = set(tag[-1][0] for tag in all_tags(new_forms[0], filter_base=True))
      if len(genders) == 1:
        gender = list(genders)[0]
      else:
        # Keep the genders for which every form has at least one matching tag.
        good_genders = []
        for gender in genders:
          for form in new_forms:
            for tag in all_tags(form):
              if tag[-1][0] in (gender, 'i'):
                break # this form has a matching tag
            else: # this form has no matching tag
              break
          else: # every form matched
            good_genders.append(gender)
        if len(good_genders) != 1:
          print >> sys.stderr, (u'%s: nie da się ujednoznacznić rodzaju' %
                                entry).encode('utf-8')
          return None, None
        gender = good_genders[0]
    # The gender is known; keep only the tags that agree with it.
    for form in new_forms:
      good_tags = []
      for tag in all_tags(form):
        if tag[-1][0] in (gender, 'i') or (tag[-1] == 'depr' and gender == 'm'):
          good_tags.append(':'.join(tag))
      if good_tags:
        form['tags'] = good_tags
      else:
        form['tags'] = [pos + ':irreg']
  return new_forms, other_lexemes

def print_forms(forms):
  """Print one 'form<TAB>tag' line per tag of every form, then a blank line.

  Each line is encoded as UTF-8 before printing.
  """
  for entry in forms:
    for tag in entry['tags']:
      row = (entry['form'], tag)
      print ('%s\t%s' % row).encode('utf-8')
  print

def parse_file(path):
  """Read the file at path, disambiguate each lexeme and print the result.

  Every non-blank line must hold three tab-separated fields: form, base and
  tag.  Consecutive lines with the same form are merged into one dict
  {'form': ..., 'tags': [...], 'base': [...]}.  A blank line ends the current
  lexeme, which is passed to disamb_lexeme and, if that succeeds, printed
  with print_forms together with any lexemes split off from it.  Reading
  stops at the first line starting with 'Processed '.

  NOTE(review): forms still pending when the file ends, or when the
  'Processed ' line is reached, are not disambiguated - confirm that the
  input always terminates each lexeme with a blank line.
  """
  with open(path) as input_file:
    forms = []
    for line in input_file:
      line = line.decode('utf-8').rstrip('\n')
      if line.startswith('Processed '):
        break
      if line == '':
        # A blank line with nothing collected (leading or repeated blank
        # lines) has no lexeme to process; disamb_lexeme would otherwise be
        # handed an empty list.
        if forms:
          disambiguated, other_lexemes = disamb_lexeme(forms)
          if disambiguated:
            print_forms(disambiguated)
            for lexeme in other_lexemes:
              print_forms(lexeme)
        forms = []
      else:
        form, base, tag = line.split('\t')
        if not forms:
          # The first form of a lexeme is its entry (base form).
          entry = form
        if not forms or form != forms[-1]['form']:
          forms.append({'form': form, 'base': [], 'tags': []})
        forms[-1]['tags'].append(tag)
        # The tag counts as a base tag when the base column equals the entry,
        # when the tag is 'adv:comp', or when it is a plural noun tag
        # containing 'nom'.
        forms[-1]['base'].append(
          base == entry or tag == 'adv:comp'
          or (tag.startswith('subst:pl') and 'nom' in tag)) # ugly...
#  for pos, tags in base_tag.iteritems():
#    print ('base %s: %s' % (pos, ', '.join(tags))).encode('utf-8')

# Script entry point: the first command-line argument is the input file path.
if __name__ == '__main__':
  import sys  # sys is also imported at the top of this module
  parse_file(sys.argv[1])