features.py
3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import numpy
import random
from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE
# mention features
def head_vec(mention):
    """Embedding of the mention head's base form, as a plain Python list."""
    vec = get_wv(W2V_MODEL, mention.head_base)
    return list(vec)
def first_word_vec(mention):
    """Embedding of the base form of the mention's first token, as a list."""
    vec = get_wv(W2V_MODEL, mention.words[0]['base'])
    return list(vec)
def last_word_vec(mention):
    """Embedding of the base form of the mention's last token, as a list."""
    vec = get_wv(W2V_MODEL, mention.words[-1]['base'])
    return list(vec)
def first_after_vec(mention):
    """Embedding of the first token after the mention; zeros if there is none."""
    context = mention.follow_context
    if not context:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[0]['base']))
def second_after_vec(mention):
    """Embedding of the second token after the mention; zeros if there is none."""
    context = mention.follow_context
    if len(context) < 2:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[1]['base']))
def first_before_vec(mention):
    """Embedding of the token immediately before the mention; zeros if none."""
    context = mention.prec_context
    if not context:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[-1]['base']))
def second_before_vec(mention):
    """Embedding of the second token before the mention; zeros if none."""
    context = mention.prec_context
    if len(context) < 2:
        return [0.0] * W2V_SIZE
    return list(get_wv(W2V_MODEL, context[-2]['base']))
def preceding_context_vec(mention):
    """Averaged embedding of all tokens preceding the mention, as a list."""
    vec = get_context_vec(mention.prec_context, W2V_MODEL)
    return list(vec)
def following_context_vec(mention):
    """Averaged embedding of all tokens following the mention, as a list."""
    vec = get_context_vec(mention.follow_context, W2V_MODEL)
    return list(vec)
def mention_vec(mention):
    """Averaged embedding of the mention's own tokens, as a list."""
    vec = get_context_vec(mention.words, W2V_MODEL)
    return list(vec)
def sentence_vec(mention):
    """Averaged embedding of the whole sentence containing the mention."""
    vec = get_context_vec(mention.sentence, W2V_MODEL)
    return list(vec)
# pair features
def distances_vec(ante, ana):
    """Distance features for an (antecedent, anaphor) pair.

    Returns a 23-element list: an 11-way one-hot bucket of the word
    distance, an 11-way one-hot bucket of the mention-index distance,
    and a final 0/1 flag for overlapping mentions. When the mentions
    overlap, both buckets stay at 0.
    """
    overlap = pair_intersect(ante, ana)

    word_bucket = 0
    mention_bucket = 0
    if overlap != 1:
        word_bucket = get_distance_bucket(
            ana.start_in_words - ante.end_in_words - 1)
        mention_bucket = get_distance_bucket(
            ana.position_in_mentions - ante.position_in_mentions - 1)
    # A maximal word distance forces the mention bucket to maximal too.
    if word_bucket == 10:
        mention_bucket = 10

    word_one_hot = [1 if i == word_bucket else 0 for i in range(11)]
    mention_one_hot = [1 if i == mention_bucket else 0 for i in range(11)]
    return word_one_hot + mention_one_hot + [overlap]
def pair_intersect(ante, ana):
    """Return 1 when the two mentions share at least one token id, else 0."""
    ana_ids = {word['id'] for word in ana.words}
    shared = any(word['id'] in ana_ids for word in ante.words)
    return 1 if shared else 0
def head_match(ante, ana):
    """Return 1 when the head orthographic forms match case-insensitively."""
    return int(ante.head_orth.lower() == ana.head_orth.lower())
def exact_match(ante, ana):
    """Return 1 when the full mention texts match case-insensitively."""
    return int(ante.text.lower() == ana.text.lower())
def base_match(ante, ana):
    """Return 1 when the lemmatized texts match case-insensitively."""
    return int(ante.lemmatized_text.lower() == ana.lemmatized_text.lower())
# supporting functions
def get_wv(model, lemma, use_random_vec=True):
    """Look up the embedding for *lemma* in *model*'s vocabulary.

    On a failed lookup (lemma out of vocabulary, or the model's vector
    store unusable) the fallback is a fresh random vector when
    ``use_random_vec`` is true, otherwise ``None``. Note the random
    fallback is drawn before the lookup, so the RNG advances on every
    call with ``use_random_vec=True``, hit or miss.
    """
    fallback = random_vec() if use_random_vec else None
    try:
        return model.wv[lemma]
    except (KeyError, TypeError):
        return fallback
def random_vec():
    """Fresh float32 vector of length W2V_SIZE, uniform in [-0.25, 0.25]."""
    values = [random.uniform(-0.25, 0.25) for _ in range(W2V_SIZE)]
    return numpy.asarray(values, dtype=numpy.float32)
def get_context_vec(words, model):
    """Mean embedding of the tokens in *words*.

    Tokens whose lookup fails (get_wv returns None) are skipped. If the
    token list is empty the result is the zero vector; if it is non-empty
    but no token contributed, a random vector is returned instead.
    """
    total = numpy.zeros(W2V_SIZE, dtype=numpy.float32)
    if not words:
        # Empty context deliberately stays the zero vector (no random fallback).
        return total
    contributing = 0
    for word in words:
        embedding = get_wv(model, word['base'], RANDOM_WORD_VECTORS)
        if embedding is not None:
            total += embedding
            contributing += 1
    if contributing == 0:
        return random_vec()
    return total / float(contributing)
def get_distance_bucket(distance):
    """Quantize a distance into one of 11 buckets (0-10).

    Distances 0-4 map to themselves; 5-7 -> 5, 8-15 -> 6, 16-31 -> 7,
    32-63 -> 8, and 64 or more -> 9. Negative distances (overlapping or
    out-of-order spans) map to the catch-all bucket 10.
    """
    if distance < 0:
        return 10
    if distance <= 4:
        return distance
    for bucket, upper_bound in ((5, 7), (6, 15), (7, 31), (8, 63)):
        if distance <= upper_bound:
            return bucket
    return 9