10-fold cross validation of the model (87c8a521) | Commits | cothec / neural4coref

Browse Code »

Commit 87c8a5217622d56975dd60990bde7a6e7ebd9ea5

Authored by Paweł Morawiecki 8 years ago

1 parent 07ecdd41

master

10-fold cross validation of the model

Inline Side-by-side

Showing 1 changed file with 207 additions and 0 deletions

cross_validation.ipynb 0 → 100644

View file @87c8a52

		1	+{
		2	+ "cells": [
		3	+ {
		4	+ "cell_type": "code",
		5	+ "execution_count": null,
		6	+ "metadata": {
		7	+ "collapsed": false,
		8	+ "deletable": true,
		9	+ "editable": true
		10	+ },
		11	+ "outputs": [],
		12	+ "source": [
		13	+ "from keras.models import Model\n",
		14	+ "from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization\n",
		15	+ "from keras.optimizers import SGD, Adam\n",
		16	+ "import numpy as np\n",
		17	+ "from sklearn.model_selection import StratifiedKFold"
		18	+ ]
		19	+ },
		20	+ {
		21	+ "cell_type": "markdown",
		22	+ "metadata": {
		23	+ "deletable": true,
		24	+ "editable": true
		25	+ },
		26	+ "source": [
		27	+ "# Preparing data"
		28	+ ]
		29	+ },
		30	+ {
		31	+ "cell_type": "code",
		32	+ "execution_count": null,
		33	+ "metadata": {
		34	+ "collapsed": true,
		35	+ "deletable": true,
		36	+ "editable": true
		37	+ },
		38	+ "outputs": [],
		39	+ "source": [
		40	+ "filename = 'input_data.csv'\n",
		41	+ "raw_data = open(filename, 'rt')\n",
		42	+ "data = np.loadtxt(raw_data, delimiter= '\\t')"
		43	+ ]
		44	+ },
		45	+ {
		46	+ "cell_type": "code",
		47	+ "execution_count": null,
		48	+ "metadata": {
		49	+ "collapsed": false,
		50	+ "deletable": true,
		51	+ "editable": true
		52	+ },
		53	+ "outputs": [],
		54	+ "source": [
		55	+ "print data.shape"
		56	+ ]
		57	+ },
		58	+ {
		59	+ "cell_type": "markdown",
		60	+ "metadata": {
		61	+ "deletable": true,
		62	+ "editable": true
		63	+ },
		64	+ "source": [
		65	+ "Our dataset consists of ~466K examples (pairs of mentions), each example described by 1126 features. Labels say whether a pair belongs to the same cluster (1) or not (0)."
		66	+ ]
		67	+ },
		68	+ {
		69	+ "cell_type": "code",
		70	+ "execution_count": null,
		71	+ "metadata": {
		72	+ "collapsed": true,
		73	+ "deletable": true,
		74	+ "editable": true
		75	+ },
		76	+ "outputs": [],
		77	+ "source": [
		78	+ "size_of_dataset = 466852\n",
		79	+ "number_of_features = 1126\n",
		80	+ "\n",
		81	+ "X = data[:,0:1126]\n",
		82	+ "Y = data[:,1126] #last column consists of labels\n"
		83	+ ]
		84	+ },
		85	+ {
		86	+ "cell_type": "markdown",
		87	+ "metadata": {
		88	+ "deletable": true,
		89	+ "editable": true
		90	+ },
		91	+ "source": [
		92	+ "# 10-fold cross validation of the neural network model"
		93	+ ]
		94	+ },
		95	+ {
		96	+ "cell_type": "code",
		97	+ "execution_count": null,
		98	+ "metadata": {
		99	+ "collapsed": false,
		100	+ "deletable": true,
		101	+ "editable": true
		102	+ },
		103	+ "outputs": [],
		104	+ "source": [
		105	+ "seed = 1\n",
		106	+ "kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)\n",
		107	+ "cvscores = []\n",
		108	+ "precision_scores = []\n",
		109	+ "recall_scores = []\n",
		110	+ "f1_scores = []\n",
		111	+ "\n",
		112	+ "for train, test in kfold.split(X, Y):\n",
		113	+ "\n",
		114	+ " inputs = Input(shape=(number_of_features,))\n",
		115	+ " output_from_1st_layer = Dense(1000, activation='relu')(inputs)\n",
		116	+ " output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)\n",
		117	+ " output_from_1st_layer = BatchNormalization()(output_from_1st_layer)\n",
		118	+ " output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)\n",
		119	+ " output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)\n",
		120	+ " output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)\n",
		121	+ " output = Dense(1, activation='sigmoid')(output_from_2nd_layer)\n",
		122	+ "\n",
		123	+ " model = Model(inputs, output)\n",
		124	+ " model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])\n",
		125	+ " model.fit(X[train], Y[train], batch_size=256, nb_epoch=25)\n",
		126	+ " \n",
		127	+ " # evaluate the model\n",
		128	+ " scores = model.evaluate(X[test], Y[test])\n",
		129	+ " print(\"%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n",
		130	+ " cvscores.append(scores[1] * 100)\n",
		131	+ "\n",
		132	+ " #calculate other metrics: precision, recall, f1\n",
		133	+ " predictions = model.predict(X[test])\n",
		134	+ " true_positives = 0.0\n",
		135	+ " false_positives = 0.0\n",
		136	+ " true_negatives = 0.0\n",
		137	+ " false_negatives = 0.0\n",
		138	+ "\n",
		139	+ " for i in range(len(X[test])):\n",
		140	+ "  if (predictions[i]<0.5 and Y[test][i]==0): true_negatives += 1 \n",
		141	+ "  if (predictions[i]<0.5 and Y[test][i]==1): false_negatives += 1\n",
		142	+ "  if (predictions[i]>=0.5 and Y[test][i]==1): true_positives += 1\n",
		143	+ "  if (predictions[i]>=0.5 and Y[test][i]==0): false_positives += 1 \n",
		144	+ " \n",
		145	+ " precision = true_positives/(true_positives+false_positives)\n",
		146	+ " recall = true_positives/(true_positives+false_negatives)\n",
		147	+ " f1 = 2*(precision*recall)/(precision+recall)\n",
		148	+ "\n",
		149	+ " precision_scores.append(precision)\n",
		150	+ " recall_scores.append(recall)\n",
		151	+ " f1_scores.append(f1)\n",
		152	+ "\n",
		153	+ " print ('Precision: ' + repr(precision))\n",
		154	+ " print ('Recall: ' + repr(recall))\n",
		155	+ " print ('F1: ' + repr(f1))"
		156	+ ]
		157	+ },
		158	+ {
		159	+ "cell_type": "markdown",
		160	+ "metadata": {
		161	+ "collapsed": false,
		162	+ "deletable": true,
		163	+ "editable": true
		164	+ },
		165	+ "source": [
		166	+ "# Summary"
		167	+ ]
		168	+ },
		169	+ {
		170	+ "cell_type": "code",
		171	+ "execution_count": null,
		172	+ "metadata": {
		173	+ "collapsed": true,
		174	+ "deletable": true,
		175	+ "editable": true
		176	+ },
		177	+ "outputs": [],
		178	+ "source": [
		179	+ "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(cvscores), np.std(cvscores)))\n",
		180	+ "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(precision_scores), np.std(precision_scores)))\n",
		181	+ "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(recall_scores), np.std(recall_scores)))\n",
		182	+ "print(\"%.2f%% (+/- %.2f%%)\" % (np.mean(f1_scores), np.std(f1_scores)))"
		183	+ ]
		184	+ }
		185	+ ],
		186	+ "metadata": {
		187	+ "kernelspec": {
		188	+ "display_name": "Python 2",
		189	+ "language": "python",
		190	+ "name": "python2"
		191	+ },
		192	+ "language_info": {
		193	+ "codemirror_mode": {
		194	+ "name": "ipython",
		195	+ "version": 2
		196	+ },
		197	+ "file_extension": ".py",
		198	+ "mimetype": "text/x-python",
		199	+ "name": "python",
		200	+ "nbconvert_exporter": "python",
		201	+ "pygments_lexer": "ipython2",
		202	+ "version": "2.7.10"
		203	+ }
		204	+ },
		205	+ "nbformat": 4,
		206	+ "nbformat_minor": 2
		207	+}