# disamb.py 6.63 KB
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import sys
from django.core.management.base import BaseCommand
from common.util import debug, uniprint, uniopen
from dictionary.util import expand_tag


class Command(BaseCommand):
    # Management command wrapper: takes one positional argument, the path of
    # the input file, and hands it to ``parse_file``.
    args = '<nazwa pliku wejściowego>'
    help = 'Dezambiguacja tagów z Morfologika'

    def handle(self, input_file, **options):
        # ``options`` is accepted for the command interface but not used here.
        parse_file(input_file)


# NOTE(review): this name is rebound by the ``base_tag`` function defined
# later in this module, so the empty dict is never observable after import.
# Only commented-out diagnostics in this file still treat it as a dict.
base_tag = dict()

# Maps a lexeme's part of speech to the tag prefixes accepted for its forms.
# Parts of speech missing from this table accept only their own name as a
# prefix (see the ``.get(pos, [pos])`` lookup in ``disamb_lexeme``).
form_categories = dict(
    adj=['adj'],
    verb=['subst:ger', 'verb', 'pact', 'ppas', 'pant', 'pcon'],
    subst=['subst'],
    adv=['adv'],
)


def all_tags(form, filter_base=False):
    """Collect the expanded tags of every reading of ``form``.

    ``form`` is a dict with parallel ``'tags'`` and ``'base'`` lists.  Each
    tag is passed through ``expand_tag`` and the results are concatenated in
    input order.  When ``filter_base`` is true, readings whose ``'base'``
    flag is falsy are skipped.
    """
    expanded = []
    for tag, is_base in zip(form['tags'], form['base']):
        if filter_base and not is_base:
            continue
        expanded.extend(expand_tag(tag))
    return expanded


def base_tag(tag):
    """Tell whether ``tag`` is acceptable as the tag of a base form.

    The part of speech is the text before the first ``':'``:

    * tags starting with ``subst:ger`` and the ``refl``/``pact``/``ppas``
      parts of speech are never accepted;
    * ``verb`` tags must contain ``'inf'`` or ``'winien'``;
    * ``num``/``subst``/``adj`` tags must contain ``'nom'``;
    * every other tag is accepted.
    """
    if tag.startswith('subst:ger'):
        return False
    pos = tag.partition(':')[0]
    if pos in ('refl', 'pact', 'ppas'):
        return False
    if pos == 'verb':
        return 'inf' in tag or 'winien' in tag
    if pos in ('num', 'subst', 'adj'):
        return 'nom' in tag
    return True

def disamb_lexeme(forms):
    """Disambiguate the tags of the forms of a single lexeme.

    ``forms`` is a list of dicts with the keys ``'form'`` (the word form),
    ``'tags'`` (candidate tags) and ``'base'`` (flags zipped pairwise with
    ``'tags'``).  The first element is treated as the base form of the
    lexeme.  The dicts are modified in place: their ``'tags'`` lists are
    replaced with the disambiguated ones.

    Returns a pair ``(new_forms, other_lexemes)``:

    * ``new_forms`` -- the forms kept in this lexeme; a form for which no
      tag could be kept gets the single tag ``'<pos>:irreg'``;
    * ``other_lexemes`` -- a list of one-element form lists, one for every
      adverb form split off an adjective lexeme.

    Returns ``(None, None)`` when ``forms`` is empty, when the base form is
    unrecognised (its first tag is ``'-'``), when the part of speech of the
    base form is not unique, or when the gender of a noun cannot be
    determined.  All but the empty case are reported through ``debug``.
    """
    if not forms:
        # Nothing to disambiguate; avoids an IndexError on ``forms[0]``.
        return None, None
    # Form endings that turn a verb's 'adj' reading into 'pact'; any other
    # ending yields 'ppas'.
    pact_endings = (
        u'cy', u'ca', u'ce',
        u'cą', u'cego', u'cej', u'cemu', u'cym',
        u'cych', u'cymi')
    base_form_tags = (tag for tag, base
        in zip(forms[0]['tags'], forms[0]['base'])
        if base and base_tag(tag))
    possible_pos = set(tag.split(':')[0] for tag in base_form_tags)
    entry = forms[0]['form']
    if forms[0]['tags'][0] == '-':
        debug(entry, u'nierozpoznana forma podstawowa')
        return None, None
    if len(possible_pos) == 1:
        pos = list(possible_pos)[0]
    else:
        debug(entry, u'niejednoznaczna część mowy')
        return None, None
    cats = form_categories.get(pos, [pos])
    new_forms = []
    other_lexemes = []
    for form in forms:
        new_tags = []
        for tag, base in zip(form['tags'], form['base']):
            for cat in cats:
                if tag.startswith(cat) and base:
                    new_tags.append(tag)
                    break
            else:  # no category accepted this tag
                if pos == 'verb':
                    # Re-label noun/adjective readings of a verb form as
                    # gerund / participle tags; '+' separates tag parts.
                    parts = tag.split('+')
                    fixed = None
                    if parts[0].startswith('subst'):
                        fixed = ['subst:ger' + part[len('subst'):]
                                 for part in parts]
                    elif parts[0].startswith('adj'):
                        if form['form'].endswith(pact_endings):
                            start = 'pact'
                        else:
                            start = 'ppas'
                        fixed = [start + part[len('adj'):] for part in parts]
                    if fixed:
                        new_tags.append('+'.join(fixed))
        # NOTE(review): 'tags' is replaced below while 'base' is left as
        # is, so the two lists may no longer be parallel when ``all_tags``
        # zips them later (relevant for ``filter_base=True``) -- confirm
        # whether this is intended.
        if new_tags:
            form['tags'] = new_tags
            new_forms.append(form)
        else:
            if pos == 'adj' and 'adv' in (tag.split(':')[0] for tag in
                form['tags']):
                # An adverb listed under an adjective becomes a separate
                # single-form lexeme keeping only its 'adv' tags.
                form['tags'] = [adv_tag for adv_tag in form['tags'] if
                    adv_tag.startswith('adv')]
                other_lexemes.append([form])
                # Build the base form's leading run of 'nie' plus one more
                # 'nie'; report adverbs that do not start with it.
                nie_prefix = ''
                while forms[0]['form'].startswith(nie_prefix):
                    nie_prefix += 'nie'
                if not form['form'].startswith(nie_prefix):
                    debug(
                        u'advadj', '%s %s' % (form['form'], forms[0]['form']))
            else:
                form['tags'] = [pos + ':irreg']
                new_forms.append(form)
                # Disabled diagnostics, kept for reference:
                #debug(u'odrzucona forma', '%s %s [%s]' %
                #      (form['form'], ', '.join(form['tags']), pos))

                #  if len(new_forms[0]['tags']) == 1:
                #    if pos not in base_tag:
                #      base_tag[pos] = set()
                #    base_tag[pos].add(new_forms[0]['tags'][0])

    if pos == 'subst':
        # Disambiguate the gender (examples from the original author:
        # "niezguła", "sezamek").  The gender marker is the first character
        # of the last component of an expanded tag.
        genders = set(tag[-1][0] for tag in all_tags(new_forms[0]))
        if len(genders) == 1:
            gender = list(genders)[0]
        else:
            genders = set(
                tag[-1][0] for tag in all_tags(new_forms[0], filter_base=True))
            if len(genders) == 1:
                gender = list(genders)[0]
            else:
                # Keep the candidates that every form is compatible with.
                # NOTE(review): 'i' is accepted for any candidate --
                # presumably a gender-neutral marker; confirm against the
                # tagset.
                good_genders = []
                for candidate in genders:
                    for form in new_forms:
                        for tag in all_tags(form):
                            if tag[-1][0] in (candidate, 'i'):
                                break  # this form allows the candidate
                        else:  # no tag of this form allows the candidate
                            break
                    else:  # every form allows the candidate
                        good_genders.append(candidate)
                if len(good_genders) != 1:
                    debug(entry, u'nie da się ujednoznacznić rodzaju')
                    return None, None
                gender = good_genders[0]
        # The gender is known: keep only the tags compatible with it.
        for form in new_forms:
            good_tags = []
            for tag in all_tags(form):
                if tag[-1][0] in (gender, 'i') or (
                            tag[-1] == 'depr' and gender == 'm'):
                    good_tags.append(':'.join(tag))
            if good_tags:
                form['tags'] = good_tags
            else:
                form['tags'] = [pos + ':irreg']
    return new_forms, other_lexemes


def print_forms(forms):
    """Print the forms of one lexeme followed by a blank separator line.

    For every form dict in ``forms`` and every tag in its ``'tags'`` list,
    one ``<form>TAB<tag>`` line is written through ``uniprint``.
    """
    for form in forms:
        for tag in form['tags']:
            uniprint('%s\t%s' % (form['form'], tag))
    # A bare ``print`` emits the newline only under Python 2; under Python 3
    # it is a no-op expression.  ``print('')`` writes the blank line under
    # both interpreters.
    print('')


def _emit_lexeme(forms):
    """Disambiguate one collected lexeme and print whatever survives."""
    if not forms:
        # Consecutive or leading blank lines leave nothing to process.
        return
    disambiguated, other_lexemes = disamb_lexeme(forms)
    if disambiguated:
        print_forms(disambiguated)
        for lexeme in other_lexemes:
            print_forms(lexeme)


def parse_file(path):
    """Read an analysed word list from ``path`` and print disambiguated tags.

    Every non-empty line holds three tab-separated fields: form, base and
    tag.  An empty line ends the current lexeme; a line starting with
    ``'Processed '`` ends the input.  Consecutive lines with the same form
    are merged into one form dict with parallel ``'tags'`` and ``'base'``
    lists.  The form on the first line of a lexeme is its entry.

    A lexeme still pending when the input ends (no blank line before the
    end of the file or before the ``'Processed '`` line) is processed too.

    NOTE(review): the comparisons with ``''`` assume ``uniopen`` yields
    lines without their trailing newline -- confirm in ``common.util``.
    """
    forms = []
    entry = None
    for line in uniopen(path):
        if line.startswith('Processed '):
            break
        if line == '':
            _emit_lexeme(forms)
            forms = []
        else:
            form, base, tag = line.split('\t')
            if not forms:
                entry = form
            if not forms or form != forms[-1]['form']:
                forms.append({'form': form, 'base': [], 'tags': []})
            forms[-1]['tags'].append(tag)
            # A reading counts as "base" when its base equals the entry;
            # 'adv:comp' and nominative 'subst:pl' tags are accepted as
            # well (the original author marked this rule as an ugly hack).
            forms[-1]['base'].append(
                base == entry or tag == 'adv:comp'
                or (tag.startswith('subst:pl') and 'nom' in tag))
    # Flush the last lexeme if the input did not end with a blank line.
    _emit_lexeme(forms)

#  for pos, tags in base_tag.iteritems():
#      uniprint('base %s: %s' % (pos, ', '.join(tags)))

if __name__ == '__main__':
    # Script entry point: the first command-line argument is the input file.
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: %s <input file>' % sys.argv[0])
    parse_file(sys.argv[1])