Commit 87c8a5217622d56975dd60990bde7a6e7ebd9ea5
1 parent
07ecdd41
10-fold cross validation of the model
Showing
1 changed file
with
207 additions
and
0 deletions
cross_validation.ipynb
0 → 100644
1 | +{ | ||
2 | + "cells": [ | ||
3 | + { | ||
4 | + "cell_type": "code", | ||
5 | + "execution_count": null, | ||
6 | + "metadata": { | ||
7 | + "collapsed": false, | ||
8 | + "deletable": true, | ||
9 | + "editable": true | ||
10 | + }, | ||
11 | + "outputs": [], | ||
12 | + "source": [ | ||
13 | + "from keras.models import Model\n", | ||
14 | + "from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization\n", | ||
15 | + "from keras.optimizers import SGD, Adam\n", | ||
16 | + "import numpy as np\n", | ||
17 | + "from sklearn.model_selection import StratifiedKFold" | ||
18 | + ] | ||
19 | + }, | ||
20 | + { | ||
21 | + "cell_type": "markdown", | ||
22 | + "metadata": { | ||
23 | + "deletable": true, | ||
24 | + "editable": true | ||
25 | + }, | ||
26 | + "source": [ | ||
27 | + "# Preparing data" | ||
28 | + ] | ||
29 | + }, | ||
30 | + { | ||
31 | + "cell_type": "code", | ||
32 | + "execution_count": null, | ||
33 | + "metadata": { | ||
34 | + "collapsed": true, | ||
35 | + "deletable": true, | ||
36 | + "editable": true | ||
37 | + }, | ||
38 | + "outputs": [], | ||
39 | + "source": [ | ||
40 | + "filename = 'input_data.csv'\n", | ||
41 | + "raw_data = open(filename, 'rt')\n", | ||
42 | + "data = np.loadtxt(raw_data, delimiter= '\\t')" | ||
43 | + ] | ||
44 | + }, | ||
45 | + { | ||
46 | + "cell_type": "code", | ||
47 | + "execution_count": null, | ||
48 | + "metadata": { | ||
49 | + "collapsed": false, | ||
50 | + "deletable": true, | ||
51 | + "editable": true | ||
52 | + }, | ||
53 | + "outputs": [], | ||
54 | + "source": [ | ||
55 | + "print data.shape" | ||
56 | + ] | ||
57 | + }, | ||
58 | + { | ||
59 | + "cell_type": "markdown", | ||
60 | + "metadata": { | ||
61 | + "deletable": true, | ||
62 | + "editable": true | ||
63 | + }, | ||
64 | + "source": [ | ||
65 | + "Our dataset consists of ~466K examples (pairs of mentions), each described by 1126 features. The label, stored in the last column, indicates whether the two mentions in a pair belong to the same cluster (1) or not (0)." | ||
66 | + ] | ||
67 | + }, | ||
68 | + { | ||
69 | + "cell_type": "code", | ||
70 | + "execution_count": null, | ||
71 | + "metadata": { | ||
72 | + "collapsed": true, | ||
73 | + "deletable": true, | ||
74 | + "editable": true | ||
75 | + }, | ||
76 | + "outputs": [], | ||
77 | + "source": [ | ||
78 | + "size_of_dataset = 466852\n", | ||
79 | + "number_of_features = 1126\n", | ||
80 | + "\n", | ||
81 | + "X = data[:,0:1126]\n", | ||
82 | + "Y = data[:,1126] #last column consists of labels\n" | ||
83 | + ] | ||
84 | + }, | ||
85 | + { | ||
86 | + "cell_type": "markdown", | ||
87 | + "metadata": { | ||
88 | + "deletable": true, | ||
89 | + "editable": true | ||
90 | + }, | ||
91 | + "source": [ | ||
92 | + "# 10-fold cross validation of the neural network model" | ||
93 | + ] | ||
94 | + }, | ||
95 | + { | ||
96 | + "cell_type": "code", | ||
97 | + "execution_count": null, | ||
98 | + "metadata": { | ||
99 | + "collapsed": false, | ||
100 | + "deletable": true, | ||
101 | + "editable": true | ||
102 | + }, | ||
103 | + "outputs": [], | ||
104 | + "source": [ | ||
105 | + "seed = 1\n", | ||
106 | + "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n", | ||
107 | + "cvscores = []\n", | ||
108 | + "precision_scores = []\n", | ||
109 | + "recall_scores = []\n", | ||
110 | + "f1_scores = []\n", | ||
111 | + "\n", | ||
112 | + "for train, test in kfold.split(X, Y):\n", | ||
113 | + "\n", | ||
114 | + " inputs = Input(shape=(number_of_features,))\n", | ||
115 | + " output_from_1st_layer = Dense(1000, activation='relu')(inputs)\n", | ||
116 | + " output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)\n", | ||
117 | + " output_from_1st_layer = BatchNormalization()(output_from_1st_layer)\n", | ||
118 | + " output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)\n", | ||
119 | + " output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)\n", | ||
120 | + " output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)\n", | ||
121 | + " output = Dense(1, activation='sigmoid')(output_from_2nd_layer)\n", | ||
122 | + "\n", | ||
123 | + " model = Model(inputs, output)\n", | ||
124 | + " model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])\n", | ||
125 | + " model.fit(X[train], Y[train], batch_size=256, nb_epoch=25)\n", | ||
126 | + " \n", | ||
127 | + " # evaluate the model\n", | ||
128 | + " scores = model.evaluate(X[test], Y[test])\n", | ||
129 | + " print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n", | ||
130 | + " cvscores.append(scores[1] * 100)\n", | ||
131 | + "\n", | ||
132 | + " #calculate other metrics: precision, recall, f1\n", | ||
133 | + " predictions = model.predict(X[test])\n", | ||
134 | + " true_positives = 0.0\n", | ||
135 | + " false_positives = 0.0\n", | ||
136 | + " true_negatives = 0.0\n", | ||
137 | + " false_negatives = 0.0\n", | ||
138 | + "\n", | ||
139 | + " for i in range(len(X[test])):\n", | ||
140 | + " if (predictions[i]<0.5 and Y[test][i]==0): true_negatives += 1 \n", | ||
141 | + " if (predictions[i]<0.5 and Y[test][i]==1): false_negatives += 1\n", | ||
142 | + " if (predictions[i]>=0.5 and Y[test][i]==1): true_positives += 1\n", | ||
143 | + " if (predictions[i]>=0.5 and Y[test][i]==0): false_positives += 1 \n", | ||
144 | + " \n", | ||
145 | + " precision = true_positives/(true_positives+false_positives)\n", | ||
146 | + " recall = true_positives/(true_positives+false_negatives)\n", | ||
147 | + " f1 = 2*(precision*recall)/(precision+recall)\n", | ||
148 | + "\n", | ||
149 | + " precision_scores.append(precision)\n", | ||
150 | + " recall_scores.append(recall)\n", | ||
151 | + " f1_scores.append(f1)\n", | ||
152 | + "\n", | ||
153 | + " print ('Precision: ' + repr(precision))\n", | ||
154 | + " print ('Recall: ' + repr(recall))\n", | ||
155 | + " print ('F1: ' + repr(f1))" | ||
156 | + ] | ||
157 | + }, | ||
158 | + { | ||
159 | + "cell_type": "markdown", | ||
160 | + "metadata": { | ||
161 | + "collapsed": false, | ||
162 | + "deletable": true, | ||
163 | + "editable": true | ||
164 | + }, | ||
165 | + "source": [ | ||
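166 | + "# Summary\n\nThe cell below prints the mean (+/- standard deviation) over the 10 folds for, in order: accuracy, precision, recall and F1. Accuracy is on a 0-100 scale; precision, recall and F1 are fractions in [0, 1], even though they are printed with a % sign." | ||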
167 | + ] | ||
168 | + }, | ||
169 | + { | ||
170 | + "cell_type": "code", | ||
171 | + "execution_count": null, | ||
172 | + "metadata": { | ||
173 | + "collapsed": true, | ||
174 | + "deletable": true, | ||
175 | + "editable": true | ||
176 | + }, | ||
177 | + "outputs": [], | ||
178 | + "source": [ | ||
179 | + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(cvscores), np.std(cvscores)))\n", | ||
180 | + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(precision_scores), np.std(precision_scores)))\n", | ||
181 | + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(recall_scores), np.std(recall_scores)))\n", | ||
182 | + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(f1_scores), np.std(f1_scores)))" | ||
183 | + ] | ||
184 | + } | ||
185 | + ], | ||
186 | + "metadata": { | ||
187 | + "kernelspec": { | ||
188 | + "display_name": "Python 2", | ||
189 | + "language": "python", | ||
190 | + "name": "python2" | ||
191 | + }, | ||
192 | + "language_info": { | ||
193 | + "codemirror_mode": { | ||
194 | + "name": "ipython", | ||
195 | + "version": 2 | ||
196 | + }, | ||
197 | + "file_extension": ".py", | ||
198 | + "mimetype": "text/x-python", | ||
199 | + "name": "python", | ||
200 | + "nbconvert_exporter": "python", | ||
201 | + "pygments_lexer": "ipython2", | ||
202 | + "version": "2.7.10" | ||
203 | + } | ||
204 | + }, | ||
205 | + "nbformat": 4, | ||
206 | + "nbformat_minor": 2 | ||
207 | +} |