# normalize.py
import settings
import nkjp
from webapp.models import Segment
from wordnet.models import LexicalUnit, Relation


def expression(expression):
    """Return normalized variants of a multi-segment expression.

    Single-segment expressions yield an empty list; for longer ones the
    head-substitution normalization is applied.
    """
    if expression.segments.count() <= 1:
        return []
    # normalize_verbs(expression)
    return list(normalize_head(expression))


def normalize_head(expression):
    """Build normalized variants of *expression* by swapping its head segment
    for wordnet synonyms/hypernyms of the head lemma.

    A candidate text is kept only when the head lemma is unambiguous
    (exactly one lexical unit) or the candidate is attested in NKJP.
    Returns a set of normalized expression strings; empty when the
    expression has no head segment.
    """
    try:
        head = expression.segments.get(is_head=True)
    except Segment.DoesNotExist:
        return set()

    head_lus = get_lus(head.base)
    candidates = []
    candidates.extend(get_synonyms(head.base))
    # NOTE: hyponymy expansion is left disabled:
    # candidates.extend(get_by_relation(head.base, 1,
    #                                   Relation.objects.get(parent=None, name='hiponimia')))
    candidates.extend(get_by_relation(head.base, 1,
                                      Relation.objects.get(parent=None, name='hiperonimia')))

    normalized = set()
    for candidate in candidates:
        for form in segment(head, candidate):
            text = get_normalized_expr_text(expression, form)
            if head_lus.count() == 1 or nkjp.exists(text):
                normalized.add(text)
    return normalized


def get_synonyms(base):
    """Collect single-word lemmas that share a synset with *base*.

    The lemma *base* itself and multi-word lemmas are skipped; insertion
    order is preserved and duplicates are dropped.
    """
    collected = []
    for synset in get_synsets(base):
        for unit in synset.lus.all():
            lemma = unit.base
            if lemma == base or len(lemma.split()) != 1 or lemma in collected:
                continue
            collected.append(lemma)
    return collected


# A single sense is acceptable outright;
# otherwise check whether the candidate is attested in NKJP.
def get_synsets(base):
    """Return the distinct synsets of all lexical units with lemma *base*,
    in first-seen order."""
    distinct = []
    for unit in get_lus(base):
        synset = unit.synset
        if synset not in distinct:
            distinct.append(synset)
    return distinct


def get_lus(base):
    """Return a queryset of all lexical units whose lemma equals *base*."""
    return LexicalUnit.objects.filter(base=base)


def get_by_relation(base, max_depth, relation):
    """Collect single-word lemmas from synsets reachable from *base* via
    *relation* within *max_depth* hops.

    Synsets that already contain *base* (the sources) are excluded, as is
    the lemma *base* itself; order is first-seen, without duplicates.
    """
    origins = get_synsets(base)
    lemmas = []
    for synset in get_related_synsets(origins, relation, max_depth):
        if synset in origins:
            continue
        for unit in synset.lus.all():
            lemma = unit.base
            if lemma != base and len(lemma.split()) == 1 and lemma not in lemmas:
                lemmas.append(lemma)
    return lemmas


def get_related_synsets(sources, relation, max_depth):
    """Flatten the relation traversal over every source synset into one list
    (duplicates are possible when traversals overlap)."""
    return [synset
            for source in sources
            for synset in get_related(source, relation, max_depth, 0)]


def get_related(source, relation, max_depth, depth):
    """Depth-first traversal of *relation* links starting at *source*.

    Returns a list containing *source* followed by every synset reachable
    through up to *max_depth* link hops (callers pass depth=0). Duplicates
    are possible when the relation graph converges; callers de-duplicate
    as needed.
    """
    depth += 1
    visited = [source]
    if depth > max_depth:
        return visited
    # Plain iteration: the index from the original enumerate() was unused.
    for link in source.targets.filter(relation=relation):
        visited.extend(get_related(link.parent, relation, max_depth, depth))
    return visited


def get_normalized_expr_text(expression, new_head):
    """Render *expression* as text with its head segment replaced by *new_head*.

    Segments flagged ``has_nps`` (no preceding space) are glued to the
    previous token; all others are space-separated. Leading whitespace is
    stripped from the result.
    """
    pieces = []
    for seg in expression.segments.order_by('position_in_expr'):
        token = new_head if seg.is_head else seg.orth
        pieces.append(token if seg.has_nps else ' ' + token)
    return ''.join(pieces).lstrip()


def segment(orig_seg, synonym):
    """Inflect *synonym* to match the morphological readings of the surface
    form of *orig_seg*."""
    interps = settings.MORFEUSZ2.analyse(orig_seg.orth.encode('utf8'))
    return generate_inflected_forms(synonym, interps)


def generate_inflected_forms(synonym, morf2_interps):
    """Return the set of surface forms of *synonym* generated for each
    morphological interpretation's tag."""
    forms = set()
    for interp in morf2_interps:
        forms.update(inflect(synonym, interp.tagId))
    return forms


def inflect(base, tag_id):
    """Generate every surface form of lemma *base* carrying the
    morphosyntactic tag *tag_id*."""
    generated = settings.MORFEUSZ2.generate(base.encode('utf8'), tag_id)
    return [item.orth for item in generated]