# normalize.py 11.6 KB
# -*- coding: utf-8 -*-

import itertools

import settings
import nkjp
from wordnet.models import LexicalUnit, Relation


def expressions(expressions):
    """Generate alternative orthographic forms for multi-segment expressions.

    Iterates over ``expressions`` ordered by ``text``, skips entries with
    fewer than two segments, and collects every form produced by
    ``expression()`` whose text differs from the original ``orth_text``
    once spaces are ignored.

    When ``settings.NKJP_VALIDATION`` is set, a single NKJP connection is
    opened for the whole run and is always closed, even if generation
    raises part-way through.

    :param expressions: queryset-like object supporting ``order_by('text')``.
    :return: list of generated expression strings.
    """
    nkjp_connection = None
    if settings.NKJP_VALIDATION:
        nkjp_connection = nkjp.connect('nkjp1800x')

    orth_expressions = []
    try:
        for expr in expressions.order_by('text'):
            if expr.segments.count() < 2:
                continue
            no_spaced_orig_expr = expr.orth_text.replace(' ', '')
            for generated in expression(nkjp_connection, expr):
                # Variants differing from the original only in spacing are
                # not treated as new forms.
                if generated.replace(' ', '') != no_spaced_orig_expr:
                    orth_expressions.append(generated)
    finally:
        # Release the connection on the error path too.
        if nkjp_connection is not None:
            nkjp_connection.close()
    return orth_expressions


# @TODO: handle verbs with "się" when looking up synonyms
def expression(nkjp_connection, expression):
    """Return the set of normalized text variants of one expression.

    Single-segment expressions, and expressions for which no more than one
    form can be generated, yield an empty set. When
    ``settings.NKJP_VALIDATION`` is enabled a form is kept only if
    ``nkjp.contains`` accepts its query.

    :param nkjp_connection: connection passed through to ``nkjp.contains``
        (may be ``None`` when validation is disabled).
    :param expression: expression object exposing ``segments``.
    :return: set of normalized expression strings.
    """
    accepted = set()
    if expression.segments.count() <= 1:
        return accepted

    candidate_forms = generate_forms(expression)
    if len(candidate_forms) <= 1:
        return accepted

    for candidate in candidate_forms:
        if settings.NKJP_VALIDATION:
            query = get_normalized_expr_query(expression, candidate)
            if not nkjp.contains(nkjp_connection, query):
                continue
        accepted.add(get_normalized_expr_text(expression, candidate))
    return accepted


def generate_forms(expression):
    """Build all candidate segment combinations for an expression.

    Each segment (in ``position_in_expr`` order) contributes a list of
    alternatives:

    * if NKJP validation is on, or exactly one lexical unit exists for the
      segment base, the alternatives come from ``get_head_equivalents``
      (head segment) or ``get_synonymic_equivalents`` (other segments);
    * otherwise the segment is kept as its original ``orth`` only.

    The Cartesian product of these lists is then extended with the
    expressions returned by ``create_pariciple_expressions``.

    :param expression: expression object exposing ``segments``.
    :return: list of generated expressions (sequences of segment forms).
    """
    alternatives_per_segment = []
    for seg in expression.segments.order_by('position_in_expr'):
        # Evaluate the shared condition once; checking the setting first
        # skips the COUNT query entirely when validation is enabled.
        expandable = settings.NKJP_VALIDATION or get_lus(seg.base).count() == 1
        if not expandable:
            alternatives_per_segment.append([seg.orth])
        elif seg.is_head:
            alternatives_per_segment.append(get_head_equivalents(seg))
        else:
            alternatives_per_segment.append(get_synonymic_equivalents(seg))

    generated_expressions = list(itertools.product(*alternatives_per_segment))
    generated_expressions.extend(
        create_pariciple_expressions(generated_expressions, expression))
    return generated_expressions


def create_pariciple_expressions(generated_expressions, expression):
    """Create participle variants for relative clauses with u'który'.

    For every position where ``is_aff_construction`` or
    ``is_neg_construction`` matches, and where the segment following the
    comma has 'nom' among its case values, the generated expressions are
    passed to ``get_participle_expressions`` with ctag 'pact' and case
    'nom.voc'.

    :param generated_expressions: expressions produced so far.
    :param expression: expression object exposing ``base_text`` and
        ``segments``.
    :return: list of participle expressions (possibly empty).
    """
    if u'który' not in expression.base_text:
        return []

    collected = []
    original_segments = expression.segments.order_by('position_in_expr')
    for i, _ in enumerate(original_segments):
        # Offset from the comma to the first finite verb: 2 for
        # ", który <fin>", 3 for ", który nie <fin>".
        if is_aff_construction(original_segments, i):
            verb_offset = 2
        elif is_neg_construction(original_segments, i):
            verb_offset = 3
        else:
            continue

        pronoun_msd = original_segments[i + 1].msd
        gender = get_adj_gender(pronoun_msd)
        case = get_adj_case(pronoun_msd)
        if 'nom' not in case.split('.'):
            continue

        collected.extend(get_participle_expressions(
            generated_expressions, original_segments, i + verb_offset,
            gender, 'nom.voc', 'pact'))

    return collected


def is_aff_construction(segments, start):
    """Tell whether ``segments[start:]`` begins with ", który <fin>".

    :param segments: indexable collection exposing ``count()``.
    :param start: index of the candidate comma segment.
    :return: ``True`` when the comma, a segment with base u'który' and a
        segment tagged 'fin' follow each other; ``False`` otherwise.
    """
    if start + 2 >= segments.count():
        return False
    if segments[start].orth != u',':
        return False
    if segments[start + 1].base != u'który':
        return False
    return segments[start + 2].ctag == u'fin'


def is_neg_construction(segments, start):
    """Tell whether ``segments[start:]`` begins with ", który nie <fin>".

    A segment with base u'nie' that is tagged 'ppron3' does not count as
    the negation.

    :param segments: indexable collection exposing ``count()``.
    :param start: index of the candidate comma segment.
    :return: ``True`` when the four-segment pattern matches, else ``False``.
    """
    if start + 3 >= segments.count():
        return False
    if segments[start].orth != u',':
        return False
    if segments[start + 1].base != u'który':
        return False
    if segments[start + 2].base != u'nie':
        return False
    if segments[start + 2].ctag == u'ppron3':
        return False
    return segments[start + 3].ctag == u'fin'


def get_participle_expressions(orig_expressions, segments, start, gender, case, ctag):
    """Replace finite verbs of a relative clause with participles.

    Starting at ``start``, consecutive segments tagged 'interp', 'fin' or
    'conj' (or with base u'się' / u'nie') are scanned. For every 'fin'
    segment the first form returned by ``get_matching_forms`` is used as
    its replacement. Each expression in ``orig_expressions`` is copied and
    updated: verb positions receive the participle, and the positions of
    the clause opener (comma, pronoun, optional negation) or of a
    preceding u'nie' are set to ``None``.

    :param orig_expressions: iterable of expressions (sequences of forms).
    :param segments: indexable collection exposing ``count()``.
    :param start: index of the first finite verb of the clause.
    :param gender: gender value passed to ``get_matching_forms``.
    :param case: case value passed to ``get_matching_forms``.
    :param ctag: participle class requested from ``get_matching_forms``.
    :return: list of expressions (lists); empty when a verb has base
        u'być' or when no participle form can be generated for a verb.
    """
    total = segments.count()
    # One entry per finite verb: (position, participle, positions to blank).
    # Computed once here instead of once per expression.
    replacements = []

    iseg = start
    while iseg < total:
        seg = segments[iseg]
        if not (seg.ctag in ('interp', 'fin', 'conj') or seg.base in (u'się', u'nie')):
            break

        if seg.ctag == 'fin':
            number = get_fin_number(seg.msd)
            aspect = get_fin_aspect(seg.msd)

            if seg.base == u'być':
                return []

            negation = 'aff'
            # iseg > 0 only guards the look-behind index; callers always
            # pass start >= 2.
            if iseg > 0:
                previous = segments[iseg - 1]
                if previous.base == u'nie' and previous.ctag != u'ppron3':
                    negation = 'neg'

            possible_participles = get_matching_forms(seg.base, ctag, number, case,
                                                      gender, aspect, negation)
            if not possible_participles:
                # No participle can be generated for this verb, so no valid
                # participle expression exists (previously an IndexError).
                return []

            blanked = []
            if iseg == start:
                # Drop the clause opener: ", który" plus "nie" when negated.
                opener_length = 3 if negation == 'neg' else 2
                blanked.extend(range(iseg - opener_length, iseg))
            elif negation == 'neg':
                blanked.append(iseg - 1)

            if (ctag == 'ppas' and iseg + 1 < total and
                    segments[iseg + 1].base == u'się'):
                blanked.append(iseg + 1)

            replacements.append((iseg, possible_participles[0], blanked))
        iseg += 1

    participle_expressions = []
    for expr in orig_expressions:
        participle_expr = list(expr)
        for position, participle, blanked in replacements:
            participle_expr[position] = participle
            for index in blanked:
                participle_expr[index] = None
        participle_expressions.append(participle_expr)

    return participle_expressions


def get_adj_case(msd):
    """Return the second colon-separated field of ``msd``.

    Callers use it as the case value of an adjectival tag.
    Raises ``IndexError`` when ``msd`` has fewer than two fields.
    """
    return msd.split(':')[1]


def get_adj_gender(msd):
    """Return the third colon-separated field of ``msd``.

    Callers use it as the gender value of an adjectival tag.
    Raises ``IndexError`` when ``msd`` has fewer than three fields.
    """
    return msd.split(':')[2]


def get_fin_number(msd):
    """Return the first colon-separated field of ``msd``.

    Callers use it as the number value of a finite-verb tag.
    """
    number, _, _ = msd.partition(':')
    return number


def get_fin_aspect(msd):
    """Return the third colon-separated field of ``msd``.

    Callers use it as the aspect value of a finite-verb tag.
    Raises ``IndexError`` when ``msd`` has fewer than three fields.
    """
    return msd.split(':')[2]


def get_matching_forms(base, ctag, number, case, gender, aspect, negation):
    """List the forms of ``base`` whose tag matches the requested values.

    Forms are generated with ``settings.MORFEUSZ2``. A form matches when
    the first field of its tag equals ``ctag`` and fields 1-5 each share
    at least one alternative (see ``tags_match``) with ``number``,
    ``case``, ``gender``, ``aspect`` and ``negation`` respectively.

    :return: list of distinct ``orth`` values, in generation order.
    """
    morfeusz = settings.MORFEUSZ2
    requested = (number, case, gender, aspect, negation)

    matching_forms = []
    for form in morfeusz.generate(base.encode('utf8')):
        form_tags = form.getTag(morfeusz).split(':')
        if form_tags[0] != ctag:
            continue
        # Fields are compared lazily, left to right, stopping at the first
        # mismatch.
        if not all(tags_match(form_tags[position], value)
                   for position, value in enumerate(requested, 1)):
            continue
        if form.orth in matching_forms:
            continue
        matching_forms.append(form.orth)

    return matching_forms


def tags_match(tag1, tag2):
    """Return True when two dot-separated tag values share an alternative.

    Each argument is split on '.', and the result is whether the two sets
    of alternatives intersect.
    """
    return not set(tag1.split('.')).isdisjoint(tag2.split('.'))


def get_synonymic_equivalents(seg):
    """Return the segment's own form plus inflected forms of its synonyms.

    Synonyms of ``seg.base`` come from ``get_synonyms``; each is inflected
    by ``segment`` to match ``seg``.

    :return: list of distinct forms (order unspecified).
    """
    inflected_equivalents = [seg.orth]
    for synonym in get_synonyms(seg.base):
        inflected_equivalents.extend(segment(seg, synonym))
    return list(set(inflected_equivalents))


def get_head_equivalents(seg):
    """Return the head segment's form plus forms of related lexemes.

    Related bases are the synonyms of ``seg.base`` followed by bases
    reachable through the top-level 'hiperonimia' relation at depth 1
    (see ``get_by_relation``). Each base is inflected by ``segment`` to
    match ``seg``.

    :return: list of distinct forms (order unspecified).
    """
    synonym_bases = get_synonyms(seg.base)
    hypernymy = Relation.objects.get(parent=None, name='hiperonimia')
    related_bases = get_by_relation(seg.base, 1, hypernymy)

    inflected_equivalents = [seg.orth]
    for candidate in synonym_bases + related_bases:
        inflected_equivalents.extend(segment(seg, candidate))
    return list(set(inflected_equivalents))


def get_synonyms(base):
    """Collect single-word bases sharing a synset with ``base``.

    :return: list of distinct bases other than ``base`` itself, in the
        order they are encountered.
    """
    synonyms = []
    for synset in get_synsets(base):
        for lu in synset.lus.all():
            candidate = lu.base
            if candidate == base:
                continue
            # Multi-word lexical units are skipped.
            if len(candidate.split()) != 1:
                continue
            if candidate not in synonyms:
                synonyms.append(candidate)
    return synonyms


def get_synsets(base):
    """Return the distinct synsets of all lexical units with ``base``.

    :return: list of synsets in first-seen order.
    """
    unique_synsets = []
    for lu in get_lus(base):
        synset = lu.synset
        if synset in unique_synsets:
            continue
        unique_synsets.append(synset)
    return unique_synsets


def get_lus(base):
    """Return the ``LexicalUnit`` queryset filtered by ``base``."""
    matching_units = LexicalUnit.objects.filter(base=base)
    return matching_units


def get_by_relation(base, max_depth, relation):
    """Collect single-word bases from synsets related to ``base``.

    Synsets are gathered with ``get_related_synsets`` starting from the
    synsets of ``base``; the source synsets themselves are excluded.

    :param base: base form whose synsets are the starting points.
    :param max_depth: maximum traversal depth.
    :param relation: relation object used to filter links.
    :return: list of distinct bases other than ``base``, in first-seen
        order.
    """
    source_synsets = get_synsets(base)
    related_bases = []
    for synset in get_related_synsets(source_synsets, relation, max_depth):
        if synset in source_synsets:
            continue
        for lu in synset.lus.all():
            candidate = lu.base
            if candidate == base:
                continue
            # Multi-word lexical units are skipped.
            if len(candidate.split()) != 1:
                continue
            if candidate not in related_bases:
                related_bases.append(candidate)
    return related_bases


def get_related_synsets(sources, relation, max_depth):
    """Concatenate the ``get_related`` results for every source synset.

    :return: flat list of synsets; duplicates are not removed.
    """
    return [synset
            for source in sources
            for synset in get_related(source, relation, max_depth, 0)]


def get_related(source, relation, max_depth, depth):
    """Recursively collect synsets linked to ``source`` by ``relation``.

    The result always starts with ``source`` itself. Links are read from
    ``source.targets`` filtered by ``relation`` and followed through each
    link's ``parent`` until the depth limit is exceeded.

    :param depth: depth of the caller; incremented before the limit check.
    :return: list of visited synsets (may contain duplicates).
    """
    reached = [source]
    next_depth = depth + 1
    if next_depth > max_depth:
        return reached
    for link in source.targets.filter(relation=relation):
        reached += get_related(link.parent, relation, max_depth, next_depth)
    return reached


def get_normalized_expr_text(expression, form_segments):
    """Render generated segment forms as plain expression text.

    Forms are paired with the expression's segments in
    ``position_in_expr`` order. ``None`` forms are skipped; a form is
    preceded by a space unless its segment has ``has_nps`` set. Leading
    whitespace is stripped from the result.

    Uses the built-in ``zip`` rather than ``itertools.izip`` so the
    function also runs on Python 3.

    :param expression: expression object exposing ``segments``.
    :param form_segments: sequence of forms (strings or ``None``).
    :return: the assembled text.
    """
    expr_segments = expression.segments.order_by('position_in_expr')
    pieces = []
    for expr_seg, form_seg in zip(expr_segments, form_segments):
        if form_seg is None:
            continue
        pieces.append(form_seg if expr_seg.has_nps else ' %s' % form_seg)
    return ''.join(pieces).lstrip()


def get_normalized_expr_query(expression, form_segments):
    """Render generated segment forms as a corpus query string.

    Forms are paired with the expression's segments in
    ``position_in_expr`` order and ``None`` forms are skipped. Each form
    becomes one query token:

    * segments tagged 'interp' -> ``[orth="\\<form>"]`` (escaped);
    * forms containing a digit  -> ``[orth="<form>"]``;
    * anything else             -> ``<form>/i``.

    A token is preceded by a space unless its segment has ``has_nps``
    set; leading whitespace is stripped from the result.

    Uses the built-in ``zip`` rather than ``itertools.izip`` so the
    function also runs on Python 3.

    :param expression: expression object exposing ``segments``.
    :param form_segments: sequence of forms (strings or ``None``).
    :return: the assembled query.
    """
    expr_segments = expression.segments.order_by('position_in_expr')
    tokens = []
    for expr_seg, form_seg in zip(expr_segments, form_segments):
        if form_seg is None:
            continue

        if expr_seg.ctag == 'interp':
            token = u'[orth="\\%s"]' % form_seg
        elif any(char.isdigit() for char in form_seg):
            token = u'[orth="%s"]' % form_seg
        else:
            token = form_seg + '/i'

        tokens.append(token if expr_seg.has_nps else ' %s' % token)
    return ''.join(tokens).lstrip()


def segment(orig_seg, synonym):
    """Return the forms of ``synonym`` inflected like ``orig_seg``.

    Thin wrapper over ``generate_inflected_forms`` with the arguments in
    the opposite order.
    """
    inflected = generate_inflected_forms(synonym, orig_seg)
    return inflected


def generate_inflected_forms(synonym, orig_seg):
    """Inflect ``synonym`` with the tag of ``orig_seg``.

    The tag '<ctag>:<msd>' is resolved to an id through
    ``settings.MORFEUSZ2`` and passed to ``inflect``. A ``RuntimeError``
    raised while resolving or generating is treated as "no forms".

    :return: set of inflected forms (empty on ``RuntimeError``).
    """
    # NOTE: an earlier, disabled approach analysed orig_seg.orth and
    # inflected with the tag id of the matching interpretation instead.
    try:
        orig_tag = '%s:%s' % (orig_seg.ctag, orig_seg.msd)
        tag_id = settings.MORFEUSZ2.getIdResolver().getTagId(orig_tag)
        return set(inflect(synonym, tag_id))
    except RuntimeError:
        return set()


def inflect(base, tag_id):
    """Generate the forms of ``base`` for ``tag_id`` via Morfeusz2.

    :param base: base form; it is UTF-8 encoded before generation.
    :param tag_id: tag id passed to ``settings.MORFEUSZ2.generate``.
    :return: list of ``orth`` values of the generated forms.
    """
    generated = settings.MORFEUSZ2.generate(base.encode('utf8'), tag_id)
    orths = []
    for form in generated:
        orths.append(form.orth)
    return orths