# normalize.py
import itertools

import settings
import nkjp
from webapp.models import Segment
from wordnet.models import LexicalUnit, Relation

# TODO: add inversions, e.g. "intrygant kulkowy" >> "kulkowy intrygant" !!!
def expressions(expressions):
    """Collect normalized orthographic variants for a set of expressions.

    For each expression (ordered by text) the corpus-attested variants
    produced by ``expression()`` are gathered, plus the expression's own
    orthographic text when not already present.

    :param expressions: queryset of expression objects
    :return: list of orthographic variant strings
    """
    orth_expressions = []
    # BUG FIX: the original immediately overwrote the freshly opened
    # connection with None and then crashed on None.close(); keep the
    # connection alive and guarantee it is closed even on error.
    nkjp_connection = nkjp.connect('nkjp1800x')
    try:
        for expr in expressions.order_by('text'):
            orth_expressions.extend(expression(nkjp_connection, expr))
            if expr.orth_text not in orth_expressions:
                orth_expressions.append(expr.orth_text)
    finally:
        nkjp_connection.close()
    return orth_expressions


def expression(nkjp_connection, expression):
    """Generate corpus-attested normalized variants of one expression.

    Only multi-word expressions of 2-4 segments are normalized; each
    candidate form is kept only if the NKJP corpus contains it.

    :param nkjp_connection: open NKJP corpus connection
    :param expression: expression object exposing ``segments``
    :return: set of normalized expression strings (possibly empty)
    """
    normalized_expressions = []
    # Hoisted: count() issues a DB query — the original ran it twice.
    segment_count = expression.segments.count()
    if 1 < segment_count < 5:
        for form in generate_forms(expression):
            normalized_expression = get_normalized_expr_text(expression, form)
            normalized_expression_query = get_normalized_expr_query(expression, form)
            if nkjp.contains(nkjp_connection, normalized_expression_query):
                normalized_expressions.append(normalized_expression)
        # TODO: reflexive verbs still misbehave here, e.g. "stawic sie"
        #normalize_verbs(expression)
        # normalized_expressions.extend(normalize_head(nkjp_connection, expression))
    return set(normalized_expressions)


def generate_forms(expression):
    """Build every candidate segment combination for *expression*.

    Each segment contributes a list of alternative surface forms; the
    Cartesian product of those lists yields all candidate expressions.

    :return: list of tuples of surface forms, one tuple per candidate
    """
    alternatives = []
    for seg in expression.segments.order_by('position_in_expr'):
        if seg.is_verb() and seg.orth != u'jest':
            alternatives.append(get_verb_equivalents(seg))
        elif seg.is_head:  # and get_lus(seg.base).count() == 1:
            alternatives.append(get_head_equivalents(seg))
        elif seg.is_noun() and get_lus(seg.base).count() == 1:
            # NOTE(review): unambiguous non-head nouns reuse
            # get_verb_equivalents (synonyms only, no hypernyms) —
            # looks deliberate, but confirm it is not a copy-paste slip.
            alternatives.append(get_verb_equivalents(seg))
        else:
            # Anything else keeps its original surface form.
            alternatives.append([seg.orth])
    return list(itertools.product(*alternatives))


def get_verb_equivalents(seg):
    """Return the original orth of *seg* plus inflected synonym forms.

    Synonyms of the segment's base are inflected to the segment's own
    morphological form; duplicates are removed.
    """
    inflected = [seg.orth]
    for synonym in get_synonyms(seg.base):
        inflected.extend(segment(seg, synonym))
    return list(set(inflected))


def get_head_equivalents(seg):
    """Return the original orth of *seg* plus inflected equivalents.

    Equivalents are synonyms of the base plus bases reachable via one
    step of the 'hiperonimia' (hypernymy) relation, all inflected to the
    segment's own morphological form; duplicates are removed.
    """
    hypernymy = Relation.objects.get(parent=None, name='hiperonimia')
    equivalents = get_synonyms(seg.base) + get_by_relation(seg.base, 1, hypernymy)

    inflected = [seg.orth]
    for equivalent in equivalents:
        inflected.extend(segment(seg, equivalent))

    return list(set(inflected))


# def normalize_head(nkjp_connection, expression):
#     normalized_expressions = []
#     try:
#         head = expression.segments.get(is_head=True)
#         all_equivalents = []
#         head_lus = get_lus(head.base)
#         all_equivalents.extend(get_synonyms(head.base))
#         all_equivalents.extend(get_by_relation(head.base, 1,
#                                                Relation.objects.get(parent=None, name='hiperonimia')))
#         for synonym in all_equivalents:
#             synonym_forms = segment(head, synonym)
#             for form in synonym_forms:
#                 normalized_expression = get_normalized_expr_text(expression, form)
#                 normalized_expression_query = get_normalized_expr_query(expression, form)
#                 if head_lus.count() == 1 or nkjp.contains(nkjp_connection, normalized_expression_query):
#                     normalized_expressions.append(normalized_expression)
#     except Segment.DoesNotExist:
#         pass
#
#     return set(normalized_expressions)


def get_synonyms(base):
    """Collect single-word synonym bases of *base* from its synsets.

    The base itself and multi-word lexical units are skipped; order of
    first occurrence is preserved.
    """
    synonyms = []
    for synset in get_synsets(base):
        for lu in synset.lus.all():
            candidate = lu.base
            if candidate == base or len(candidate.split()) != 1:
                continue
            if candidate not in synonyms:
                synonyms.append(candidate)
    return synonyms


def get_synsets(base):
    """Return the distinct synsets of all lexical units with base *base*,
    preserving first-occurrence order."""
    synsets = []
    for lu in get_lus(base):
        synset = lu.synset
        if synset not in synsets:
            synsets.append(synset)
    return synsets


def get_lus(base):
    # All wordnet lexical units whose base form equals *base*.
    return LexicalUnit.objects.filter(base=base)


def get_by_relation(base, max_depth, relation):
    """Collect single-word bases reachable from *base* via *relation*.

    Traverses the relation closure up to *max_depth* steps; synsets that
    already contain *base* are excluded, as are *base* itself and
    multi-word lexical units.
    """
    related_bases = []
    source_synsets = get_synsets(base)
    for synset in get_related_synsets(source_synsets, relation, max_depth):
        if synset in source_synsets:
            continue
        for lu in synset.lus.all():
            candidate = lu.base
            if candidate == base or len(candidate.split()) != 1:
                continue
            if candidate not in related_bases:
                related_bases.append(candidate)
    return related_bases


def get_related_synsets(sources, relation, max_depth):
    """Flatten the *relation* closure (up to *max_depth*) of every synset
    in *sources* into one list; duplicates are not removed."""
    related = []
    for synset in sources:
        related += get_related(synset, relation, max_depth, 0)
    return related


def get_related(source, relation, max_depth, depth):
    """Recursively collect *source* and synsets linked to it via *relation*.

    Recursion stops once *depth* exceeds *max_depth*. Duplicates may
    appear when the relation graph converges; cycles are bounded only by
    the depth limit.

    :param source: starting synset
    :param relation: relation object used to filter outgoing links
    :param max_depth: maximum traversal depth
    :param depth: current depth (callers pass 0)
    :return: list of synsets, starting with *source*
    """
    depth += 1
    visited = [source]
    if depth > max_depth:
        return visited
    # FIX: dropped the unused enumerate() index from the original loop.
    for link in source.targets.filter(relation=relation):
        visited.extend(get_related(link.parent, relation, max_depth, depth))
    return visited


def get_normalized_expr_text(expression, form_segments):
    """Join *form_segments* into display text, honouring no-space flags.

    Segments whose ``has_nps`` flag is set are glued to the previous
    token; all others are preceded by a single space. The leading space
    of the first token is stripped.
    """
    parts = []
    expr_segments = expression.segments.order_by('position_in_expr')
    for expr_seg, orth in itertools.izip(expr_segments, form_segments):
        if not expr_seg.has_nps:
            parts.append(' ')
        parts.append(orth)
    return ''.join(parts).lstrip()


def get_normalized_expr_query(expression, form_segments):
    """Build an NKJP corpus query string for *form_segments*.

    Punctuation segments (ctag 'interp') are wrapped in an escaped
    ``[orth="\\..."]`` clause; every other token gets the
    case-insensitive ``/i`` suffix. Spacing follows the same
    ``has_nps`` rule as get_normalized_expr_text().
    """
    parts = []
    expr_segments = expression.segments.order_by('position_in_expr')
    for expr_seg, form_seg in itertools.izip(expr_segments, form_segments):
        if expr_seg.ctag == 'interp':
            token = u'[orth="\\%s"]' % form_seg
        else:
            token = form_seg + '/i'

        if not expr_seg.has_nps:
            parts.append(' ')
        parts.append(token)
    return ''.join(parts).lstrip()


def segment(orig_seg, synonym):
    # Thin wrapper: inflect *synonym* into the morphological form of
    # *orig_seg* (see generate_inflected_forms).
    return generate_inflected_forms(synonym, orig_seg)


def generate_inflected_forms(synonym, orig_seg):
    """Inflect *synonym* into the same morphological form as *orig_seg*.

    The segment's orth is analysed with Morfeusz2; for every analysis
    whose tag equals the segment's ``ctag:msd``, *synonym* is generated
    in that form.

    :param synonym: base form to inflect (unicode)
    :param orig_seg: segment whose morphology is to be matched
    :return: set of inflected surface forms (may be empty)
    """
    inflected = []
    # Hoisted out of the loop: the target tag is loop-invariant.
    target_tag = u'%s:%s' % (orig_seg.ctag, orig_seg.msd)
    orig_morf2_interps = settings.MORFEUSZ2.analyse(orig_seg.orth.encode('utf8'))
    for interp in orig_morf2_interps:
        if interp.getTag(settings.MORFEUSZ2) == target_tag:
            inflected.extend(inflect(synonym, interp.tagId))
    return set(inflected)


def inflect(base, tag_id):
    """Generate all surface forms of *base* for the Morfeusz2 *tag_id*."""
    encoded_base = base.encode('utf8')
    return [generated.orth
            for generated in settings.MORFEUSZ2.generate(encoded_base, tag_id)]