# features.py
import numpy
import random

from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE


# mention features
def head_vec(mention):
    """Embedding of the mention's head lemma, as a plain Python list."""
    return [component for component in get_wv(W2V_MODEL, mention.head_base)]


def first_word_vec(mention):
    """Embedding of the mention's first token (looked up by base form)."""
    first = mention.words[0]
    return list(get_wv(W2V_MODEL, first['base']))


def last_word_vec(mention):
    """Embedding of the mention's last token (looked up by base form)."""
    last = mention.words[-1]
    return list(get_wv(W2V_MODEL, last['base']))


def first_after_vec(mention):
    """Embedding of the token right after the mention; zero vector if none."""
    context = mention.follow_context
    if not context:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[0]['base']))


def second_after_vec(mention):
    """Embedding of the second token after the mention; zero vector if none."""
    context = mention.follow_context
    if len(context) < 2:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[1]['base']))


def first_before_vec(mention):
    """Embedding of the token directly before the mention; zero vector if none."""
    context = mention.prec_context
    if not context:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[-1]['base']))


def second_before_vec(mention):
    """Embedding of the second token before the mention; zero vector if none."""
    context = mention.prec_context
    if len(context) < 2:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[-2]['base']))


def preceding_context_vec(mention):
    """Averaged embedding of every token preceding the mention, as a list."""
    return [c for c in get_context_vec(mention.prec_context, W2V_MODEL)]


def following_context_vec(mention):
    """Averaged embedding of every token following the mention, as a list."""
    return [c for c in get_context_vec(mention.follow_context, W2V_MODEL)]


def mention_vec(mention):
    """Averaged embedding of the mention's own tokens, as a list."""
    return [c for c in get_context_vec(mention.words, W2V_MODEL)]


def sentence_vec(mention):
    """Averaged embedding of the whole sentence containing the mention."""
    return [c for c in get_context_vec(mention.sentence, W2V_MODEL)]


# pair features
def distances_vec(ante, ana):
    """Pairwise distance features for an (antecedent, anaphor) pair.

    Returns a 23-element list: an 11-way one-hot word-distance bucket,
    an 11-way one-hot mention-distance bucket, and an intersection flag.
    """
    intersecting = pair_intersect(ante, ana)

    # Word-level distance, bucketed; bucket 0 when the mentions overlap.
    word_onehot = [0] * 11
    word_bucket = 0
    if intersecting != 1:
        word_bucket = get_distance_bucket(ana.start_in_words - ante.end_in_words - 1)
    word_onehot[word_bucket] = 1

    # Mention-level distance, bucketed; forced to the overflow bucket
    # whenever the word distance already overflowed.
    mention_onehot = [0] * 11
    mention_bucket = 0
    if intersecting != 1:
        mention_bucket = get_distance_bucket(ana.position_in_mentions - ante.position_in_mentions - 1)
    if word_bucket == 10:
        mention_bucket = 10
    mention_onehot[mention_bucket] = 1

    return word_onehot + mention_onehot + [intersecting]


def pair_intersect(ante, ana):
    """Return 1 if the two mentions share at least one token, else 0.

    Tokens are compared by their 'id' field. Builds a set of antecedent
    word ids once so the check is O(n + m) instead of the naive O(n * m)
    nested scan.
    """
    ante_ids = {word['id'] for word in ante.words}
    return int(any(word['id'] in ante_ids for word in ana.words))


def head_match(ante, ana):
    """1 when the head orths match case-insensitively, else 0."""
    return int(ante.head_orth.lower() == ana.head_orth.lower())


def exact_match(ante, ana):
    """1 when the full surface texts match case-insensitively, else 0."""
    return int(ante.text.lower() == ana.text.lower())


def base_match(ante, ana):
    """1 when the lemmatized texts match case-insensitively, else 0."""
    return int(ante.lemmatized_text.lower() == ana.lemmatized_text.lower())


# supporting functions
def get_wv(model, lemma, use_random_vec=True):
    """Return the word vector for *lemma* from *model*'s vocabulary.

    On a miss (unknown lemma -> KeyError, or an unusable model/vocab ->
    TypeError) returns a random vector when *use_random_vec* is true,
    otherwise None.

    The original computed the random fallback up front on EVERY call,
    wasting work and consuming RNG state even when the lookup succeeded;
    the fallback is now built lazily, only on a failed lookup.
    """
    try:
        return model.wv[lemma]
    except (KeyError, TypeError):
        return random_vec() if use_random_vec else None


def random_vec():
    """A W2V_SIZE-dim float32 vector with components uniform in [-0.25, 0.25]."""
    components = [random.uniform(-0.25, 0.25) for _ in range(W2V_SIZE)]
    return numpy.array(components, dtype=numpy.float32)


def get_context_vec(words, model):
    """Average the word vectors of *words* (dicts with a 'base' key).

    Returns a zero vector for an empty token list, and a random vector
    when none of the tokens could be resolved to an embedding.
    """
    vec = numpy.zeros(W2V_SIZE, dtype=numpy.float32)
    if len(words) == 0:
        return vec

    known = 0
    for word in words:
        word_vec = get_wv(model, word['base'], RANDOM_WORD_VECTORS)
        if word_vec is not None:
            vec += word_vec
            known += 1

    if known == 0:
        return random_vec()
    return vec / float(known)


def get_distance_bucket(distance):
    """Map a token/mention distance onto one of 11 buckets (0-10).

    Distances 0-4 map to themselves; larger distances fall into
    exponentially widening buckets 5-9; negative distances land in the
    overflow bucket 10.
    """
    if distance < 0:
        return 10
    if distance <= 4:
        return distance
    for bucket, upper_bound in ((5, 7), (6, 15), (7, 31), (8, 63)):
        if distance <= upper_bound:
            return bucket
    return 9