Commit 87c8a5217622d56975dd60990bde7a6e7ebd9ea5

Authored by Paweł Morawiecki
1 parent 07ecdd41

10-fold cross validation of the model

Showing 1 changed file with 207 additions and 0 deletions
cross_validation.ipynb 0 → 100644
  1 +{
  2 + "cells": [
  3 + {
  4 + "cell_type": "code",
  5 + "execution_count": null,
  6 + "metadata": {
  7 + "collapsed": false,
  8 + "deletable": true,
  9 + "editable": true
  10 + },
  11 + "outputs": [],
  12 + "source": [
  13 + "from keras.models import Model\n",
  14 + "from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization\n",
  15 + "from keras.optimizers import SGD, Adam\n",
  16 + "import numpy as np\n",
  17 + "from sklearn.model_selection import StratifiedKFold"
  18 + ]
  19 + },
  20 + {
  21 + "cell_type": "markdown",
  22 + "metadata": {
  23 + "deletable": true,
  24 + "editable": true
  25 + },
  26 + "source": [
  27 + "# Preparing data"
  28 + ]
  29 + },
  30 + {
  31 + "cell_type": "code",
  32 + "execution_count": null,
  33 + "metadata": {
  34 + "collapsed": true,
  35 + "deletable": true,
  36 + "editable": true
  37 + },
  38 + "outputs": [],
  39 + "source": [
  40 + "filename = 'input_data.csv'\n",
  41 + "raw_data = open(filename, 'rt')\n",
  42 + "data = np.loadtxt(raw_data, delimiter= '\\t')"
  43 + ]
  44 + },
  45 + {
  46 + "cell_type": "code",
  47 + "execution_count": null,
  48 + "metadata": {
  49 + "collapsed": false,
  50 + "deletable": true,
  51 + "editable": true
  52 + },
  53 + "outputs": [],
  54 + "source": [
  55 + "print data.shape"
  56 + ]
  57 + },
  58 + {
  59 + "cell_type": "markdown",
  60 + "metadata": {
  61 + "deletable": true,
  62 + "editable": true
  63 + },
  64 + "source": [
  65 + "Our dataset consists of ~466K examples (pairs of mentions), each example described by 1126 features. Labels say whether a pair belongs to the same cluster (1) or not (0)."
  66 + ]
  67 + },
  68 + {
  69 + "cell_type": "code",
  70 + "execution_count": null,
  71 + "metadata": {
  72 + "collapsed": true,
  73 + "deletable": true,
  74 + "editable": true
  75 + },
  76 + "outputs": [],
  77 + "source": [
  78 + "size_of_dataset = 466852\n",
  79 + "number_of_features = 1126\n",
  80 + "\n",
  81 + "X = data[:,0:1126]\n",
  82 + "Y = data[:,1126] #last column consists of labels\n"
  83 + ]
  84 + },
  85 + {
  86 + "cell_type": "markdown",
  87 + "metadata": {
  88 + "deletable": true,
  89 + "editable": true
  90 + },
  91 + "source": [
  92 + "# 10-fold cross validation of the neural network model"
  93 + ]
  94 + },
  95 + {
  96 + "cell_type": "code",
  97 + "execution_count": null,
  98 + "metadata": {
  99 + "collapsed": false,
  100 + "deletable": true,
  101 + "editable": true
  102 + },
  103 + "outputs": [],
  104 + "source": [
  105 + "seed = 1\n",
  106 + "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n",
  107 + "cvscores = []\n",
  108 + "precision_scores = []\n",
  109 + "recall_scores = []\n",
  110 + "f1_scores = []\n",
  111 + "\n",
  112 + "for train, test in kfold.split(X, Y):\n",
  113 + "\n",
  114 + " inputs = Input(shape=(number_of_features,))\n",
  115 + " output_from_1st_layer = Dense(1000, activation='relu')(inputs)\n",
  116 + " output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)\n",
  117 + " output_from_1st_layer = BatchNormalization()(output_from_1st_layer)\n",
  118 + " output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)\n",
  119 + " output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)\n",
  120 + " output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)\n",
  121 + " output = Dense(1, activation='sigmoid')(output_from_2nd_layer)\n",
  122 + "\n",
  123 + " model = Model(inputs, output)\n",
  124 + " model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])\n",
  125 + " model.fit(X[train], Y[train], batch_size=256, nb_epoch=25)\n",
  126 + " \n",
  127 + " # evaluate the model\n",
  128 + " scores = model.evaluate(X[test], Y[test])\n",
  129 + " print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n",
  130 + " cvscores.append(scores[1] * 100)\n",
  131 + "\n",
  132 + " #calculate other metrics: precision, recall, f1\n",
  133 + " predictions = model.predict(X[test])\n",
  134 + " true_positives = 0.0\n",
  135 + " false_positives = 0.0\n",
  136 + " true_negatives = 0.0\n",
  137 + " false_negatives = 0.0\n",
  138 + "\n",
  139 + " for i in range(len(X[test])):\n",
  140 + " if (predictions[i]<0.5 and Y[test][i]==0): true_negatives += 1 \n",
  141 + " if (predictions[i]<0.5 and Y[test][i]==1): false_negatives += 1\n",
  142 + " if (predictions[i]>=0.5 and Y[test][i]==1): true_positives += 1\n",
  143 + " if (predictions[i]>=0.5 and Y[test][i]==0): false_positives += 1 \n",
  144 + " \n",
  145 + " precision = true_positives/(true_positives+false_positives)\n",
  146 + " recall = true_positives/(true_positives+false_negatives)\n",
  147 + " f1 = 2*(precision*recall)/(precision+recall)\n",
  148 + "\n",
  149 + " precision_scores.append(precision)\n",
  150 + " recall_scores.append(recall)\n",
  151 + " f1_scores.append(f1)\n",
  152 + "\n",
  153 + " print ('Precision: ' + repr(precision))\n",
  154 + " print ('Recall: ' + repr(recall))\n",
  155 + " print ('F1: ' + repr(f1))"
  156 + ]
  157 + },
  158 + {
  159 + "cell_type": "markdown",
  160 + "metadata": {
  161 + "collapsed": false,
  162 + "deletable": true,
  163 + "editable": true
  164 + },
  165 + "source": [
  166 + "# Summary"
  167 + ]
  168 + },
  169 + {
  170 + "cell_type": "code",
  171 + "execution_count": null,
  172 + "metadata": {
  173 + "collapsed": true,
  174 + "deletable": true,
  175 + "editable": true
  176 + },
  177 + "outputs": [],
  178 + "source": [
  179 + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(cvscores), np.std(cvscores)))\n",
  180 + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(precision_scores), np.std(precision_scores)))\n",
  181 + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(recall_scores), np.std(recall_scores)))\n",
  182 + "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(f1_scores), np.std(f1_scores)))"
  183 + ]
  184 + }
  185 + ],
  186 + "metadata": {
  187 + "kernelspec": {
  188 + "display_name": "Python 2",
  189 + "language": "python",
  190 + "name": "python2"
  191 + },
  192 + "language_info": {
  193 + "codemirror_mode": {
  194 + "name": "ipython",
  195 + "version": 2
  196 + },
  197 + "file_extension": ".py",
  198 + "mimetype": "text/x-python",
  199 + "name": "python",
  200 + "nbconvert_exporter": "python",
  201 + "pygments_lexer": "ipython2",
  202 + "version": "2.7.10"
  203 + }
  204 + },
  205 + "nbformat": 4,
  206 + "nbformat_minor": 2
  207 +}
... ...