{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.20.3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import confusion_matrix\n",
    "\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>f1</th>\n",
       "      <th>f2</th>\n",
       "      <th>f3</th>\n",
       "      <th>f4</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3.62160</td>\n",
       "      <td>8.6661</td>\n",
       "      <td>-2.8073</td>\n",
       "      <td>-0.44699</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.54590</td>\n",
       "      <td>8.1674</td>\n",
       "      <td>-2.4586</td>\n",
       "      <td>-1.46210</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.86600</td>\n",
       "      <td>-2.6383</td>\n",
       "      <td>1.9242</td>\n",
       "      <td>0.10645</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3.45660</td>\n",
       "      <td>9.5228</td>\n",
       "      <td>-4.0112</td>\n",
       "      <td>-3.59440</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.32924</td>\n",
       "      <td>-4.4552</td>\n",
       "      <td>4.5718</td>\n",
       "      <td>-0.98880</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        f1      f2      f3       f4  target\n",
       "0  3.62160  8.6661 -2.8073 -0.44699       0\n",
       "1  4.54590  8.1674 -2.4586 -1.46210       0\n",
       "2  3.86600 -2.6383  1.9242  0.10645       0\n",
       "3  3.45660  9.5228 -4.0112 -3.59440       0\n",
       "4  0.32924 -4.4552  4.5718 -0.98880       0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reading data into a dataframe\n",
    "DATA_DIR = 'data'\n",
    "\n",
    "df = pd.read_csv(\n",
    "                    os.path.abspath(os.path.join(DATA_DIR, 'day9/banknote_authentication.csv'))\n",
    "                )\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1372, 5)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns)\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "f1        0\n",
       "f2        0\n",
       "f3        0\n",
       "f4        0\n",
       "target    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# look for NaN values\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    762\n",
       "1    610\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's figure out the distribution of target variable\n",
    "# there are many options to handle this imbalance; of which one is to add false data to class 1; secondly we can \n",
    "# delete records from 0 to make it equal to 1\n",
    "# we will leave it for now\n",
    "df['target'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = df.iloc[:, :-1].values\n",
    "Y = df.iloc[:, -1].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# ideal practice is to use test as 20% - 30% of training data\n",
    "# defined by test_size in train_test_split()\n",
    "# random_state is required to avoid sequential biasness in the data distribution\n",
    "def data_split(X, Y):\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.3, random_state = 10)\n",
    "    return X_train, X_test, Y_train, Y_test\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = data_split(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((960, 4), (412, 4))"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Define"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class SVMModel:\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.classifier = SVC()\n",
    "\n",
    "    def train(self, X_train, Y_train):\n",
    "        model = self.classifier.fit(X_train, Y_train)\n",
    "        return model\n",
    "    \n",
    "    def predict(self, model, X_test):\n",
    "        return model.predict(X_test)\n",
    "    \n",
    "    def evaluate(self, Y_test, Y_pred, measure):\n",
    "        if measure=='matrix':\n",
    "            cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1])\n",
    "            return cm\n",
    "        elif measure=='accuracy':\n",
    "            return accuracy_score(Y_test, Y_pred)*100\n",
    "        else: return None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# train the model\n",
    "svm = SVMModel()\n",
    "model = svm.train(X_train, Y_train)\n",
    "predictions = svm.predict(model, X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[226   0]\n",
      " [  0 186]]\n",
      "\n",
      "100.0\n"
     ]
    }
   ],
   "source": [
    "# evaluating the model\n",
    "print svm.evaluate(Y_test, predictions, 'matrix')\n",
    "print \n",
    "print svm.evaluate(Y_test, predictions, 'accuracy')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}