{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.19.1\n",
      "1.14.2\n",
      "0.20.3\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import sklearn\n",
    "import os\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn import preprocessing\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "print sklearn.__version__\n",
    "print np.__version__\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading and Describe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>feat1</th>\n",
       "      <th>feat2</th>\n",
       "      <th>feat3</th>\n",
       "      <th>feat4</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>Iris-setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   feat1  feat2  feat3  feat4        class\n",
       "0    5.1    3.5    1.4    0.2  Iris-setosa\n",
       "1    4.9    3.0    1.4    0.2  Iris-setosa\n",
       "2    4.7    3.2    1.3    0.2  Iris-setosa\n",
       "3    4.6    3.1    1.5    0.2  Iris-setosa\n",
       "4    5.0    3.6    1.4    0.2  Iris-setosa"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "DATA_DIR = '../data'\n",
    "df = pd.read_table(\n",
    "                    os.path.abspath(os.path.join(DATA_DIR, 'day1/iris.csv')),\n",
    "                    sep=','\n",
    "                  )\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "class\n",
       "Iris-setosa        50\n",
       "Iris-versicolor    50\n",
       "Iris-virginica     50\n",
       "Name: class, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examples per class\n",
    "df.groupby('class')['class'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# feat1/2/3/4 are considered as input (features) to the model, whereas class is considered as output of the model\n",
    "X = df.iloc[:, :-1].values\n",
    "Y = df.iloc[:, -1].values\n",
    "# encode the class with integers\n",
    "le = preprocessing.LabelEncoder()\n",
    "Y = le.fit_transform(Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train/Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# ideal practice is to use test as 20% - 30% of training data\n",
    "# defined by test_size in train_test_split()\n",
    "# random_state is required to avoid sequential biasness in the data distribution\n",
    "def data_split(X, Y):\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)\n",
    "    return X_train, X_test, Y_train, Y_test\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = data_split(X, Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((120, 4), (30, 4))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Normalizer Class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# this class takes care for scaling the features to the scale of 0-1\n",
    "# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which \n",
    "# also has the range from 0-1\n",
    "class Normalizer:\n",
    "\n",
    "    def __init__(self):\n",
    "        self.sc = StandardScaler()\n",
    "    \n",
    "    def scale(self, X, dtype):\n",
    "        if dtype=='train':\n",
    "            XX = self.sc.fit_transform(X)\n",
    "        elif dtype=='test':\n",
    "            XX = self.sc.transform(X)\n",
    "        else:\n",
    "            return None\n",
    "        return XX"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Logistic Model Class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class LogisticModel:\n",
    "    \n",
    "    def __init__(self):\n",
    "        self.classifier = LogisticRegression()\n",
    "\n",
    "    def train(self, X_train, Y_train):\n",
    "        model = self.classifier.fit(X_train, Y_train)\n",
    "        return model\n",
    "    \n",
    "    def predict(self, model, X_test):\n",
    "        return model.predict(X_test)\n",
    "    \n",
    "    def evaluate(self, Y_test, Y_pred, measure):\n",
    "        if measure=='matrix':\n",
    "            cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1, 2])\n",
    "            return cm\n",
    "        elif measure=='accuracy':\n",
    "            return accuracy_score(Y_test, Y_pred)*100\n",
    "        else: return None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Starting to Train and Predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# cap to range of 0-1\n",
    "norm = Normalizer()\n",
    "X_train = norm.scale(X_train, 'train')\n",
    "X_test = norm.scale(X_test, 'test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# train the model\n",
    "logit = LogisticModel()\n",
    "model = logit.train(X_train, Y_train)\n",
    "predictions = logit.predict(model, X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluating"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[10  0  0]\n",
      " [ 0 10  3]\n",
      " [ 0  0  7]]\n",
      "\n",
      "90.0\n"
     ]
    }
   ],
   "source": [
    "print logit.evaluate(Y_test, predictions, 'matrix')\n",
    "print \n",
    "print logit.evaluate(Y_test, predictions, 'accuracy')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#                iris  versicolor  verginica\n",
    "# iris       -    10      10          0\n",
    "# versicolor -    0       10          3\n",
    "# verginica  -    0       0           7\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}