{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import sklearn\n", "import os\n", "from sklearn import preprocessing\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.cross_validation import train_test_split" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>feat1</th>\n", " <th>feat2</th>\n", " <th>feat3</th>\n", " <th>feat4</th>\n", " <th>class</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>5.1</td>\n", " <td>3.5</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4.9</td>\n", " <td>3.0</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.3</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4.6</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5.0</td>\n", " <td>3.6</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " feat1 feat2 feat3 feat4 class\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading data into pandas dataframe\n", "DATA_DIR = 'data'\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),\n", " sep=','\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# encoding the class to integers\n", "X = df.iloc[:, :-1].values\n", "Y = df.iloc[:, -1].values\n", "# encode the class with integers\n", "le = preprocessing.LabelEncoder()\n", "Y = le.fit_transform(Y)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(105, 4) (45, 4)\n" ] } ], "source": [ "# ideal practice is to use test as 20% - 30% of training data\n", "# defined by test_size in train_test_split()\n", "# random_state is required to avoid sequential biasness in the data distribution\n", "def data_split(X, Y):\n", " X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.30, random_state = 10)\n", " return X_train, X_test, Y_train, Y_test\n", "\n", "X_train, X_test, Y_train, Y_test = data_split(X, Y)\n", "print X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# this class takes care for scaling the features to the scale of 0-1\n", "# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which \n", "# also has the range from 0-1\n", "class Normalizer:\n", "\n", " def __init__(self):\n", " self.sc = StandardScaler()\n", " \n", " def scale(self, X, dtype):\n", " if dtype=='train':\n", " XX = self.sc.fit_transform(X)\n", " elif dtype=='test':\n", " XX = self.sc.transform(X)\n", " else:\n", " return None\n", " return XX" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "norm = Normalizer()\n", "X_train = norm.scale(X_train, 'train')\n", "X_test = norm.scale(X_test, 'test')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model 1 (Logistic)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8666666666666667\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "# train the model\n", "classifier = LogisticRegression()\n", "model = classifier.fit(X_train, Y_train)\n", "predictions_lr = model.predict_proba(X_test)\n", "print sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_lr, axis=1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model 2 (Decision Tree)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.9777777777777777\n" ] } ], "source": [ "from sklearn import tree\n", "# train the model\n", "classifier = tree.DecisionTreeClassifier()\n", "model = classifier.fit(X_train, Y_train)\n", "predictions_dtree = model.predict_proba(X_test)\n", "print sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_dtree, axis=1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model 3 (KNN)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.9555555555555556\n" ] } ], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "knn = KNeighborsClassifier(n_neighbors=3)\n", "model = knn.fit(X_train, Y_train)\n", "predictions_knn = model.predict_proba(X_test)\n", "print sklearn.metrics.accuracy_score(Y_test, np.argmax(predictions_knn, axis=1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Meta Model (Ensemble)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class Ensemble(object):\n", " \"\"\"\n", " Implements averaging voting ensemble technique\n", " Each model is given equal weight\n", " \"\"\"\n", " def __init__(self, samples=None, classes=None, classifiers=None):\n", " self.classes = classes\n", " self.samples = samples\n", " self.classifiers = classifiers\n", " \n", " def mixmatch(self, predictions):\n", " if not self.classifiers:\n", " self.classifiers = len(predictions)\n", " \n", " if not self.samples:\n", " self.samples = len(predictions[0])\n", " \n", " if not self.classes:\n", " self.classes = len(predictions[0][0])\n", " \n", " final_pred = np.array([0]*self.classes)\n", " for s in range(self.samples):\n", " s_pred = np.array([0]*self.classes)\n", " for c in range(self.classifiers):\n", " pred = predictions[c][s]\n", " s_pred = np.vstack((s_pred, pred))\n", " s_pred = s_pred[1:, :]\n", " s_pred_avg = np.average(s_pred, axis=0)\n", " final_pred = np.vstack((final_pred, s_pred_avg))\n", " return final_pred[1:, :]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0\n" ] } ], "source": [ "ensemble = Ensemble(45, 3, 3)\n", "pred = np.argmax(ensemble.mixmatch([predictions_lr, predictions_dtree, predictions_knn]), axis=1)\n", "print sklearn.metrics.accuracy_score(Y_test, pred)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }