{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.20.3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn import preprocessing\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.cross_validation import train_test_split\n",
    "\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>b</td>\n",
       "      <td>30.83</td>\n",
       "      <td>0.000</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>1.25</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>1</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>00202</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>58.67</td>\n",
       "      <td>4.460</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>q</td>\n",
       "      <td>h</td>\n",
       "      <td>3.04</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>00043</td>\n",
       "      <td>560</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>24.50</td>\n",
       "      <td>0.500</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>q</td>\n",
       "      <td>h</td>\n",
       "      <td>1.50</td>\n",
       "      <td>t</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>00280</td>\n",
       "      <td>824</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>27.83</td>\n",
       "      <td>1.540</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>3.75</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>5</td>\n",
       "      <td>t</td>\n",
       "      <td>g</td>\n",
       "      <td>00100</td>\n",
       "      <td>3</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>20.17</td>\n",
       "      <td>5.625</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>1.71</td>\n",
       "      <td>t</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>s</td>\n",
       "      <td>00120</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15\n",
       "0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  00202    0  +\n",
       "1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g  00043  560  +\n",
       "2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  00280  824  +\n",
       "3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  00100    3  +\n",
       "4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  00120    0  +"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reading data to pandas dataframe\n",
    "DATA_DIR = '../data'\n",
    "\n",
    "df = pd.read_table(\n",
    "                    os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),\n",
    "                    sep=',',\n",
    "                    header=None\n",
    "                )\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(690, 16)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     0\n",
       "1     0\n",
       "2     0\n",
       "3     0\n",
       "4     0\n",
       "5     0\n",
       "6     0\n",
       "7     0\n",
       "8     0\n",
       "9     0\n",
       "10    0\n",
       "11    0\n",
       "12    0\n",
       "13    0\n",
       "14    0\n",
       "15    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# checking for NaN in the entire df\n",
    "df.isnull().sum()\n",
    "\n",
    "# None of the columns have missing values in them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      object\n",
       "1      object\n",
       "2     float64\n",
       "3      object\n",
       "4      object\n",
       "5      object\n",
       "6      object\n",
       "7     float64\n",
       "8      object\n",
       "9      object\n",
       "10      int64\n",
       "11     object\n",
       "12     object\n",
       "13     object\n",
       "14      int64\n",
       "15     object\n",
       "dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# object in pandas means string; we need to convert all to numerical\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prune rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# but here is something we found unusual; a '?' we will prune out all the rows \n",
    "# where in any of the column if '?' exists\n",
    "\n",
    "# figuring out all the columns where '?' exists\n",
    "for columns in range(16):\n",
    "    if '?' in df[columns].unique().tolist():\n",
    "        df = df[df[columns]!='?']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(653, 16)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# new data after removing the rows having '?' in them\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transform datapoints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# we will now encode the objects to float dtype for features\n",
    "# 0, 3, 4, 5, 6, 8, 9, 11\n",
    "for column in [0, 3, 4, 5, 6, 8, 9, 11, 12]:\n",
    "    possible_values = df[column].unique().tolist()\n",
    "    encoded_inp = {v:idx for idx, v in enumerate(possible_values)}\n",
    "    df[column].replace(encoded_inp, inplace=True)\n",
    "\n",
    "# we will now encode the objects to float dtype for target\n",
    "# 15\n",
    "encoded_inp = {'+':1, '-':0}\n",
    "df[15].replace(encoded_inp, inplace=True)\n",
    "\n",
    "# we will not convert the remaining object dtypes to float dtype\n",
    "# 1, 13; i am not sure what 13 column is about; will drop it for now\n",
    "df.drop([13], axis = 1, inplace = True)\n",
    "df[1] = df[1].astype(float)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Sep. features and target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = df.iloc[:, :-1].values\n",
    "Y = df.iloc[:, -1].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/Test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# ideal practice is to use test as 20% - 30% of training data\n",
    "# defined by test_size in train_test_split()\n",
    "# random_state is required to avoid sequential biasness in the data distribution\n",
    "def data_split(X, Y):\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)\n",
    "    return X_train, X_test, Y_train, Y_test\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = data_split(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((522, 14), (131, 14))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class DecisionTrees(object):\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.classifier = DecisionTreeClassifier(random_state=10)\n",
    "\n",
    "    def train(self, X_train, Y_train):\n",
    "        model = self.classifier.fit(X_train, Y_train)\n",
    "        return model\n",
    "    \n",
    "    def predict(self, model, X_test):\n",
    "        return model.predict(X_test)\n",
    "    \n",
    "    def evaluate(self, Y_test, Y_pred):\n",
    "        return accuracy_score(Y_test, Y_pred)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "81.67938931297711\n"
     ]
    }
   ],
   "source": [
    "# train the model and tuning depth paramater of the classifier over validation set\n",
    "dtree = DecisionTrees()\n",
    "model_dtree = dtree.train(X_train, Y_train)\n",
    "predictions = dtree.predict(model_dtree, X_test)\n",
    "print dtree.evaluate(Y_test, predictions)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}