resolve.py
6.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import numpy
from corneferencer.resolvers import features, vectors
def siamese(text, threshold, neural_model):
    """Link each mention to the nearest earlier mention accepted by a two-input model.

    For every mention after the first, earlier mentions are visited from the
    closest one backwards. The first non-intersecting candidate whose model
    output is *below* ``threshold`` becomes the antecedent and the search
    for that mention stops.

    Args:
        text: object exposing ``mentions``, a sliceable sequence of mention
            objects with ``features`` and ``set`` attributes.
        threshold: upper bound (exclusive) on the model output for a link.
        neural_model: object whose ``predict`` accepts a two-element list of
            samples and returns an indexable result.
            NOTE(review): presumably a distance-style siamese network,
            since lower outputs are treated as matches - confirm.

    Returns:
        None. Mentions are updated in place through their ``set`` attribute.
    """
    next_set_id = 0
    for idx, ana in enumerate(text.mentions):
        if idx == 0:
            # The first mention has no possible antecedent.
            continue

        for ante in reversed(text.mentions[:idx]):
            if features.pair_intersect(ante, ana):
                continue

            shared = vectors.get_pair_features(ante, ana)
            # Each side of the network gets its own mention features
            # followed by the features describing the pair.
            ante_sample = numpy.asarray([[*ante.features, *shared]],
                                        dtype=numpy.float32)
            ana_sample = numpy.asarray([[*ana.features, *shared]],
                                       dtype=numpy.float32)

            score = neural_model.predict([ante_sample, ana_sample])[0]
            if score < threshold:
                if not ante.set:
                    # Antecedent is not in any set yet: open a fresh one.
                    ante.set = ana.set = 'set_%d' % next_set_id
                    next_set_id += 1
                else:
                    ana.set = ante.set
                break
# incremental resolve algorithm
def incremental(text, threshold, neural_model):
    """Link each mention to its best-scoring earlier mention.

    For every mention after the first, all earlier non-intersecting mentions
    are scored. The candidate with the highest score strictly above
    ``threshold`` is chosen; on equal scores the later candidate wins.

    Args:
        text: object exposing ``mentions``, a sliceable sequence of mention
            objects with a ``set`` attribute.
        threshold: lower bound (exclusive) on the model output for a link.
        neural_model: object whose ``predict`` accepts a one-row float32
            array and returns an indexable result.

    Returns:
        None. Mentions are updated in place through their ``set`` attribute.
    """
    next_set_id = 0
    for idx, ana in enumerate(text.mentions):
        if idx == 0:
            # Nothing precedes the first mention.
            continue

        top_score, chosen = 0.0, None
        for candidate in text.mentions[:idx]:
            if features.pair_intersect(candidate, ana):
                continue
            row = vectors.get_pair_vector(candidate, ana)
            score = neural_model.predict(
                numpy.asarray([row], dtype=numpy.float32))[0]
            # ``>=`` lets a later candidate replace an equally scored one.
            if score > threshold and score >= top_score:
                top_score, chosen = score, candidate

        if chosen is None:
            continue

        if not chosen.set:
            # Antecedent is not in any set yet: open a fresh one.
            chosen.set = ana.set = 'set_%d' % next_set_id
            next_set_id += 1
        else:
            ana.set = chosen.set
# all2all resolve algorithm
def all2all(text, threshold, neural_model):
    """Link every mention to its best-scoring later mention, merging sets.

    Each mention is compared with all mentions that follow it. Pairs that
    already share a non-empty set, or that intersect, are skipped. The
    later mention with the highest score strictly above ``threshold`` is
    linked to the current one.

    Args:
        text: object exposing ``mentions`` (sliceable sequence of mention
            objects with a ``set`` attribute), ``get_sets()`` (returns a
            mapping keyed by set id) and ``merge_sets(a, b)``.
        threshold: lower bound (exclusive) on the model output for a link.
        neural_model: object whose ``predict`` accepts a one-row float32
            array and returns an indexable result.

    Returns:
        None. Mentions and the mapping from ``get_sets()`` are updated in
        place.
    """
    last_set_id = 0
    known_sets = text.get_sets()
    for pos1, mnt1 in enumerate(text.mentions):
        top_score, best_link = 0.0, None

        # Only mentions located after ``mnt1`` are candidates.
        for mnt2 in text.mentions[pos1 + 1:]:
            same_set = mnt1.set and mnt2.set and mnt1.set == mnt2.set
            if same_set or features.pair_intersect(mnt1, mnt2):
                continue
            row = vectors.get_pair_vector(mnt1, mnt2)
            score = neural_model.predict(
                numpy.asarray([row], dtype=numpy.float32))[0]
            # Strict ``>`` keeps the earliest candidate on equal scores.
            if score > threshold and score > top_score:
                top_score, best_link = score, mnt2

        if best_link is None:
            continue

        if best_link.set and mnt1.set:
            # Both sides already belong to (different) sets.
            text.merge_sets(best_link.set, mnt1.set)
        elif best_link.set:
            mnt1.set = best_link.set
        elif mnt1.set:
            best_link.set = mnt1.set
        else:
            # Neither side has a set: pick the first unused identifier.
            new_id = 'set_%d' % last_set_id
            while new_id in known_sets:
                last_set_id += 1
                new_id = 'set_%d' % last_set_id
            best_link.set = new_id
            mnt1.set = new_id
            known_sets[new_id] = [best_link, mnt1]
# entity based resolve algorithm
def entity_based(text, threshold, neural_model):
    """Assign each mention to the best-fitting existing entity or a new one.

    Mentions are processed in order. Every mention after the first is
    matched against the entities built so far via ``get_best_set``; when no
    entity fits (and always for the first mention) a new entity is opened.
    Finally ``remove_singletons`` is called on the collected entities.

    Args:
        text: object exposing ``mentions``, an iterable of mention objects
            with a ``set`` attribute.
        threshold: forwarded to ``get_best_set``.
        neural_model: forwarded to ``get_best_set``.

    Returns:
        None. Mentions are updated in place through their ``set`` attribute.
    """
    entities = []
    next_set_id = 0
    for idx, ana in enumerate(text.mentions):
        # No lookup is performed for the very first mention.
        best_fit = (get_best_set(entities, ana, threshold, neural_model)
                    if idx > 0 else None)

        if best_fit is None:
            new_id = 'set_%d' % next_set_id
            entities.append({'set_id': new_id, 'mentions': [ana]})
            ana.set = new_id
            next_set_id += 1
        else:
            ana.set = best_fit['set_id']
            best_fit['mentions'].append(ana)

    remove_singletons(entities)
def get_best_set(sets, ana, threshold, neural_model):
    """Return the entity whose ``predict_set`` score for ``ana`` is highest.

    Args:
        sets: iterable of dicts, each with a ``'mentions'`` entry.
        ana: the mention to place.
        threshold: lower bound (exclusive) a score must exceed to qualify.
        neural_model: forwarded to ``predict_set``.

    Returns:
        The qualifying dict with the highest score (the later one on equal
        scores), or ``None`` when no entity scores above ``threshold``.
    """
    top_score, winner = 0.0, None
    for entity in sets:
        score = predict_set(entity['mentions'], ana, neural_model)
        # ``>=`` lets a later entity replace an equally scored one.
        if score > threshold and score >= top_score:
            top_score, winner = score, entity
    return winner
def predict_set(mentions, ana, neural_model):
    """Return the mean model score of ``ana`` paired with each given mention.

    Mentions that intersect with ``ana`` contribute a score of 0.0 but still
    count towards the denominator.

    Args:
        mentions: sized iterable of mention objects forming one entity.
        ana: the mention being compared against the entity.
        neural_model: object whose ``predict`` accepts a one-row float32
            array and returns an indexable result.

    Returns:
        The sum of the scores divided by ``len(mentions)``; 0.0 when
        ``mentions`` is empty (previously a ``ZeroDivisionError``).
    """
    count = len(mentions)
    if count == 0:
        # An empty entity gives no evidence for a link.
        return 0.0

    prediction_sum = 0.0
    for mnt in mentions:
        if features.pair_intersect(mnt, ana):
            # Intersecting pairs add nothing to the sum.
            continue
        pair_vec = vectors.get_pair_vector(mnt, ana)
        sample = numpy.asarray([pair_vec], dtype=numpy.float32)
        prediction_sum += neural_model.predict(sample)[0]
    return prediction_sum / float(count)
def remove_singletons(sets):
    """Clear the set id of every mention that is alone in its entity.

    Args:
        sets: iterable of dicts, each with a ``'mentions'`` list of objects
            carrying a ``set`` attribute.

    Returns:
        None. Entities are not removed from ``sets``; only the ``set``
        attribute of the lone mention is reset to ``''``.
    """
    for entity in sets:
        members = entity['mentions']
        if len(members) == 1:
            members[0].set = ''
# closest resolve algorithm
def closest(text, threshold, neural_model):
    """Link each mention to the nearest earlier mention the model accepts.

    For every mention after the first, earlier mentions are visited from the
    closest one backwards. The first non-intersecting candidate whose score
    is strictly above ``threshold`` becomes the antecedent and the search
    for that mention stops.

    Args:
        text: object exposing ``mentions``, a sliceable sequence of mention
            objects with a ``set`` attribute.
        threshold: lower bound (exclusive) on the model output for a link.
        neural_model: object whose ``predict`` accepts a one-row float32
            array and returns an indexable result.

    Returns:
        None. Mentions are updated in place through their ``set`` attribute.
    """
    next_set_id = 0
    for idx, ana in enumerate(text.mentions):
        if idx == 0:
            # The first mention has no possible antecedent.
            continue

        for ante in reversed(text.mentions[:idx]):
            if features.pair_intersect(ante, ana):
                continue

            row = vectors.get_pair_vector(ante, ana)
            score = neural_model.predict(
                numpy.asarray([row], dtype=numpy.float32))[0]
            if score > threshold:
                if not ante.set:
                    # Antecedent is not in any set yet: open a fresh one.
                    ante.set = ana.set = 'set_%d' % next_set_id
                    next_set_id += 1
                else:
                    ana.set = ante.set
                break