# %% [markdown]
# # Imports

# %%
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# `sklearn.cross_validation` has been deprecated since scikit-learn 0.18 in
# favour of `sklearn.model_selection` and is announced for removal in 0.20.
# Prefer the new module; fall back to the old one only on installations that
# pre-date it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Record the library versions this notebook was run with. The parenthesised
# single-argument form prints the same text on Python 2 and Python 3.
print(pd.__version__)
print(np.__version__)

# %%
DATA_DIR = "../data"

# Keep the frame exactly as it was read from disk under its own name, so the
# preview below always matches what is in the file and later cells can be
# re-run without reloading the CSV.
glass_raw = pd.read_csv(
    os.path.abspath(os.path.join(DATA_DIR, "day6/glass.csv")),
)
glass_raw.head(5)

# %%
# we will first drop the id column; since it is of no use to us
# remember axis=1 (column); axis=0 (row); default value of axis is 0
#
# The drop is deliberately NOT done in place: building `df` from `glass_raw`
# makes this cell idempotent (an in-place drop fails on a second run because
# the 'id' label is already gone) and leaves the frame previewed above intact.
df = glass_raw.drop(['id'], axis=1)
# %%
df.columns

# %%
# checking for any NaN value in the dataset across any of the remaining columns in our df
df.isnull().sum()

# %%
# total classes for prediction
df['class'].unique()

# %%
# Every column except the last is a feature; the last column is the label.
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

# %% [markdown]
# ## Data Split

# %%
def data_split(X, Y, test_size=0.2, validation_size=0.2, random_state=10):
    """Split features and labels into train, validation and test subsets.

    The split is done in two stages: a test subset is carved off first, then
    a validation subset is carved off what remains.

    Parameters
    ----------
    X : array-like
        Feature rows.
    Y : array-like
        Labels, one per row of ``X``.
    test_size : float, optional
        Share of all rows held out for the final test (a common choice is
        20%-30% of the available data). Passed to ``train_test_split``.
    validation_size : float, optional
        Share of the rows remaining after the test split that is held out
        for validation. With the defaults this is 0.2 of the remaining 80%.
    random_state : int, optional
        Seed handed to both ``train_test_split`` calls. It fixes the shuffle
        so that the same rows land in the same subset on every run; it is the
        shuffling itself, not the seed, that guards against any ordering
        present in the source data.

    Returns
    -------
    tuple
        ``(X_train, X_validation, X_test, Y_train, Y_validation, Y_test)``.
    """
    # Stage 1: hold out the test rows.
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    # Stage 2: split the remainder into the actual training and validation rows.
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X_rest, Y_rest, test_size=validation_size, random_state=random_state)
    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test

# %%
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = data_split(X, Y)

# %%
print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)

# %% [markdown]
# ## Tuning of K using validation

# %%
n = [5, 6, 7, 8, 9, 10, 11, 12]
# Start below any possible score so the first candidate always registers and
# `n_final` is guaranteed to be set to one of the candidates before the Test
# section uses it (starting at 0 left it undefined if every score was 0.0).
maximum = float('-inf')
n_final = None
for neighbor in n:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, Y_train)
    score = knn.score(X_validation, Y_validation)
    # Strict comparison: on a tie the smaller, earlier K is kept.
    if score > maximum:
        maximum = score
        n_final = neighbor
        # Only improvements over the best score so far are reported.
        print('Neighbour: {} - Score: {}'.format(neighbor, maximum))

# %% [markdown]
# ## Test

# %%
# Refit with the K selected on the validation subset and report the score on
# the held-out test subset.
knn = KNeighborsClassifier(n_neighbors=n_final)
knn.fit(X_train, Y_train)
knn.score(X_test, Y_test)