Commit 07ecdd418fac471bb15d7286c7187931b111c0d5
1 parent
c11a89c8
A neural network model of a binary classifier for mention pairs
Showing
1 changed file
with
285 additions
and
0 deletions
mention-pair-classifier.ipynb
0 → 100644
1 | +{ | ||
2 | + "cells": [ | ||
3 | + { | ||
4 | + "cell_type": "code", | ||
5 | + "execution_count": null, | ||
6 | + "metadata": { | ||
7 | + "collapsed": false, | ||
8 | + "deletable": true, | ||
9 | + "editable": true | ||
10 | + }, | ||
11 | + "outputs": [], | ||
12 | + "source": [ | ||
13 | + "from keras.models import Model\n", | ||
14 | + "from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization\n", | ||
15 | + "from keras.optimizers import SGD, Adam\n", | ||
16 | + "import numpy as np" | ||
17 | + ] | ||
18 | + }, | ||
19 | + { | ||
20 | + "cell_type": "markdown", | ||
21 | + "metadata": { | ||
22 | + "deletable": true, | ||
23 | + "editable": true | ||
24 | + }, | ||
25 | + "source": [ | ||
26 | + "# Data preparation" | ||
27 | + ] | ||
28 | + }, | ||
29 | + { | ||
30 | + "cell_type": "code", | ||
31 | + "execution_count": null, | ||
32 | + "metadata": { | ||
33 | + "collapsed": true, | ||
34 | + "deletable": true, | ||
35 | + "editable": true | ||
36 | + }, | ||
37 | + "outputs": [], | ||
38 | + "source": [ | ||
39 | + "filename = 'input_data.csv'\n", | ||
40 | + "raw_data = open(filename, 'rt')\n", | ||
41 | + "data = np.loadtxt(raw_data, delimiter= '\\t')" | ||
42 | + ] | ||
43 | + }, | ||
44 | + { | ||
45 | + "cell_type": "code", | ||
46 | + "execution_count": null, | ||
47 | + "metadata": { | ||
48 | + "collapsed": false, | ||
49 | + "deletable": true, | ||
50 | + "editable": true | ||
51 | + }, | ||
52 | + "outputs": [], | ||
53 | + "source": [ | ||
54 | + "print data.shape" | ||
55 | + ] | ||
56 | + }, | ||
57 | + { | ||
58 | + "cell_type": "markdown", | ||
59 | + "metadata": { | ||
60 | + "deletable": true, | ||
61 | + "editable": true | ||
62 | + }, | ||
63 | + "source": [ | ||
64 | + "Our dataset consists of ~466K examples (pairs of mentions), each example described by 1126 features. Labels say whether a pair belongs to the same cluster (1) or not (0)." | ||
65 | + ] | ||
66 | + }, | ||
67 | + { | ||
68 | + "cell_type": "code", | ||
69 | + "execution_count": null, | ||
70 | + "metadata": { | ||
71 | + "collapsed": false, | ||
72 | + "deletable": true, | ||
73 | + "editable": true | ||
74 | + }, | ||
75 | + "outputs": [], | ||
76 | + "source": [ | ||
77 | + "size_of_dataset = len(data)\n", | ||
78 | + "number_of_features = 1126\n", | ||
79 | + "\n", | ||
80 | + "X = data[:,0:1126]\n", | ||
81 | + "Y = data[:,1126] #last column consists of labels\n" | ||
82 | + ] | ||
83 | + }, | ||
84 | + { | ||
85 | + "cell_type": "markdown", | ||
86 | + "metadata": { | ||
87 | + "deletable": true, | ||
88 | + "editable": true | ||
89 | + }, | ||
90 | + "source": [ | ||
91 | + "Now let's split data into trainig and test set (90/10)" | ||
92 | + ] | ||
93 | + }, | ||
94 | + { | ||
95 | + "cell_type": "code", | ||
96 | + "execution_count": null, | ||
97 | + "metadata": { | ||
98 | + "collapsed": true, | ||
99 | + "deletable": true, | ||
100 | + "editable": true | ||
101 | + }, | ||
102 | + "outputs": [], | ||
103 | + "source": [ | ||
104 | + "np.random.seed(999) #seed fixed for reproducibility\n", | ||
105 | + "mask = np.random.rand(size_of_dataset) < 0.9 #array of boolean variables\n", | ||
106 | + "\n", | ||
107 | + "training_set = X[mask]\n", | ||
108 | + "training_labels = Y[mask]\n", | ||
109 | + "\n", | ||
110 | + "test_set = X[~mask]\n", | ||
111 | + "test_labels = Y[~mask]" | ||
112 | + ] | ||
113 | + }, | ||
114 | + { | ||
115 | + "cell_type": "markdown", | ||
116 | + "metadata": { | ||
117 | + "deletable": true, | ||
118 | + "editable": true | ||
119 | + }, | ||
120 | + "source": [ | ||
121 | + "# Neural network configuration" | ||
122 | + ] | ||
123 | + }, | ||
124 | + { | ||
125 | + "cell_type": "code", | ||
126 | + "execution_count": null, | ||
127 | + "metadata": { | ||
128 | + "collapsed": false, | ||
129 | + "deletable": true, | ||
130 | + "editable": true | ||
131 | + }, | ||
132 | + "outputs": [], | ||
133 | + "source": [ | ||
134 | + "inputs = Input(shape=(number_of_features,))\n", | ||
135 | + "output_from_1st_layer = Dense(1000, activation='relu')(inputs)\n", | ||
136 | + "output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)\n", | ||
137 | + "output_from_1st_layer = BatchNormalization()(output_from_1st_layer)\n", | ||
138 | + "output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)\n", | ||
139 | + "output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)\n", | ||
140 | + "output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)\n", | ||
141 | + "output = Dense(1, activation='sigmoid')(output_from_2nd_layer)\n", | ||
142 | + "\n", | ||
143 | + "model = Model(inputs, output)\n", | ||
144 | + "model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])" | ||
145 | + ] | ||
146 | + }, | ||
147 | + { | ||
148 | + "cell_type": "markdown", | ||
149 | + "metadata": {}, | ||
150 | + "source": [ | ||
151 | + "# Training" | ||
152 | + ] | ||
153 | + }, | ||
154 | + { | ||
155 | + "cell_type": "code", | ||
156 | + "execution_count": null, | ||
157 | + "metadata": { | ||
158 | + "collapsed": false, | ||
159 | + "deletable": true, | ||
160 | + "editable": true | ||
161 | + }, | ||
162 | + "outputs": [], | ||
163 | + "source": [ | ||
164 | + "model.fit(training_set, training_labels, batch_size=256, nb_epoch=25)" | ||
165 | + ] | ||
166 | + }, | ||
167 | + { | ||
168 | + "cell_type": "markdown", | ||
169 | + "metadata": { | ||
170 | + "collapsed": false, | ||
171 | + "deletable": true, | ||
172 | + "editable": true | ||
173 | + }, | ||
174 | + "source": [ | ||
175 | + "# Evaluation" | ||
176 | + ] | ||
177 | + }, | ||
178 | + { | ||
179 | + "cell_type": "code", | ||
180 | + "execution_count": null, | ||
181 | + "metadata": { | ||
182 | + "collapsed": false, | ||
183 | + "deletable": true, | ||
184 | + "editable": true, | ||
185 | + "scrolled": true | ||
186 | + }, | ||
187 | + "outputs": [], | ||
188 | + "source": [ | ||
189 | + "scores = model.evaluate(test_set, test_labels)\n", | ||
190 | + "print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))" | ||
191 | + ] | ||
192 | + }, | ||
193 | + { | ||
194 | + "cell_type": "markdown", | ||
195 | + "metadata": {}, | ||
196 | + "source": [ | ||
197 | + "# Playing with the model" | ||
198 | + ] | ||
199 | + }, | ||
200 | + { | ||
201 | + "cell_type": "markdown", | ||
202 | + "metadata": { | ||
203 | + "deletable": true, | ||
204 | + "editable": true | ||
205 | + }, | ||
206 | + "source": [ | ||
207 | + "You can save the weights of the model to a file and later recreate the model without training by model.load_weights(\"my_weights.h5\")" | ||
208 | + ] | ||
209 | + }, | ||
210 | + { | ||
211 | + "cell_type": "code", | ||
212 | + "execution_count": null, | ||
213 | + "metadata": { | ||
214 | + "collapsed": true, | ||
215 | + "deletable": true, | ||
216 | + "editable": true | ||
217 | + }, | ||
218 | + "outputs": [], | ||
219 | + "source": [ | ||
220 | + "model.save_weights(\"my_weights.h5\")" | ||
221 | + ] | ||
222 | + }, | ||
223 | + { | ||
224 | + "cell_type": "markdown", | ||
225 | + "metadata": {}, | ||
226 | + "source": [ | ||
227 | + "To have predictions for a test set we do" | ||
228 | + ] | ||
229 | + }, | ||
230 | + { | ||
231 | + "cell_type": "code", | ||
232 | + "execution_count": null, | ||
233 | + "metadata": { | ||
234 | + "collapsed": false, | ||
235 | + "deletable": true, | ||
236 | + "editable": true | ||
237 | + }, | ||
238 | + "outputs": [], | ||
239 | + "source": [ | ||
240 | + "predictions = model.predict(test_set)" | ||
241 | + ] | ||
242 | + }, | ||
243 | + { | ||
244 | + "cell_type": "markdown", | ||
245 | + "metadata": {}, | ||
246 | + "source": [ | ||
247 | + "and for a single example" | ||
248 | + ] | ||
249 | + }, | ||
250 | + { | ||
251 | + "cell_type": "code", | ||
252 | + "execution_count": null, | ||
253 | + "metadata": { | ||
254 | + "collapsed": false | ||
255 | + }, | ||
256 | + "outputs": [], | ||
257 | + "source": [ | ||
258 | + "single_example = test_set[4:5,:] #example number 5 from the test set\n", | ||
259 | + "prediction = model.predict(single_example)\n", | ||
260 | + "print '%.8f' % prediction[0]" | ||
261 | + ] | ||
262 | + } | ||
263 | + ], | ||
264 | + "metadata": { | ||
265 | + "kernelspec": { | ||
266 | + "display_name": "Python 2", | ||
267 | + "language": "python", | ||
268 | + "name": "python2" | ||
269 | + }, | ||
270 | + "language_info": { | ||
271 | + "codemirror_mode": { | ||
272 | + "name": "ipython", | ||
273 | + "version": 2 | ||
274 | + }, | ||
275 | + "file_extension": ".py", | ||
276 | + "mimetype": "text/x-python", | ||
277 | + "name": "python", | ||
278 | + "nbconvert_exporter": "python", | ||
279 | + "pygments_lexer": "ipython2", | ||
280 | + "version": "2.7.10" | ||
281 | + } | ||
282 | + }, | ||
283 | + "nbformat": 4, | ||
284 | + "nbformat_minor": 2 | ||
285 | +} |