{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## K-Fold Cross Validation\n",
    "Testing accuracy for just once doesn't account for the variance in the data and might give misleading results. K-Fold validation randomly selects one of $k$ parts of the data set then tests the accuracy on the same. After required number of iterations, the accuracy is averaged"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Age</th>\n",
       "      <th>EstimatedSalary</th>\n",
       "      <th>Purchased</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15624510</td>\n",
       "      <td>Male</td>\n",
       "      <td>19</td>\n",
       "      <td>19000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15810944</td>\n",
       "      <td>Male</td>\n",
       "      <td>35</td>\n",
       "      <td>20000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15668575</td>\n",
       "      <td>Female</td>\n",
       "      <td>26</td>\n",
       "      <td>43000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15603246</td>\n",
       "      <td>Female</td>\n",
       "      <td>27</td>\n",
       "      <td>57000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>15804002</td>\n",
       "      <td>Male</td>\n",
       "      <td>19</td>\n",
       "      <td>76000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    User ID  Gender  Age  EstimatedSalary  Purchased\n",
       "0  15624510    Male   19            19000          0\n",
       "1  15810944    Male   35            20000          0\n",
       "2  15668575  Female   26            43000          0\n",
       "3  15603246  Female   27            57000          0\n",
       "4  15804002    Male   19            76000          0"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "df = pd.read_csv('Social_Network_Ads.csv')\n",
    "X = df.iloc[:, 2:4]   # Using 1:2 as indices will give us np array of dim (10, 1)\n",
    "y = df.iloc[:, 4]\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "X_sca = StandardScaler()\n",
    "X = X_sca.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.82\n"
     ]
    }
   ],
   "source": [
    "from __future__ import division\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "\n",
    "kfold_cv = KFold(n_splits=10)\n",
    "correct = 0\n",
    "total = 0\n",
    "for train_indices, test_indices in kfold_cv.split(X):\n",
    "    X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], \\\n",
    "                                        y[train_indices], y[test_indices]\n",
    "    clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)\n",
    "    correct += accuracy_score(y_test, clf.predict(X_test))\n",
    "    total += 1\n",
    "print(\"Accuracy: {0:.2f}\".format(correct/total))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC #support vector classifier\n",
    "clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.90322581  0.90322581  0.77419355  0.87096774  0.77419355  0.86206897\n",
      "  0.82758621  0.68965517  0.79310345  0.89655172]\n",
      "0.829477196885\n",
      "0.0671935884472\n"
     ]
    }
   ],
   "source": [
    "# applying k-fold cross validation\n",
    "from sklearn.model_selection import cross_val_score\n",
    "accuracies = cross_val_score(clf, X_train, y_train, cv=10)\n",
    "print accuracies\n",
    "print accuracies.mean()\n",
    "print accuracies.std()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Leave one out cross validation\n",
    "\n",
    "Another type of cross validation is leave one out cross validation. Out of the $n$ samples, one of them is left out and the model is trained on other samples. When K in KFold validation is equal to the number of samples then K-Fold validation is same as leave one out cross validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Age</th>\n",
       "      <th>EstimatedSalary</th>\n",
       "      <th>Purchased</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15624510</td>\n",
       "      <td>Male</td>\n",
       "      <td>19</td>\n",
       "      <td>19000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15810944</td>\n",
       "      <td>Male</td>\n",
       "      <td>35</td>\n",
       "      <td>20000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15668575</td>\n",
       "      <td>Female</td>\n",
       "      <td>26</td>\n",
       "      <td>43000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15603246</td>\n",
       "      <td>Female</td>\n",
       "      <td>27</td>\n",
       "      <td>57000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>15804002</td>\n",
       "      <td>Male</td>\n",
       "      <td>19</td>\n",
       "      <td>76000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    User ID  Gender  Age  EstimatedSalary  Purchased\n",
       "0  15624510    Male   19            19000          0\n",
       "1  15810944    Male   35            20000          0\n",
       "2  15668575  Female   26            43000          0\n",
       "3  15603246  Female   27            57000          0\n",
       "4  15804002    Male   19            76000          0"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "df = pd.read_csv('Social_Network_Ads.csv')\n",
    "X = df.iloc[:, 2:4]   # Using 1:2 as indices will give us np array of dim (10, 1)\n",
    "y = df.iloc[:, 4]\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "X_sca = StandardScaler()\n",
    "X = X_sca.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.84\n"
     ]
    }
   ],
   "source": [
    "from __future__ import division\n",
    "from sklearn.model_selection import LeaveOneOut\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "loo_cv = LeaveOneOut()\n",
    "correct = 0\n",
    "total = 0\n",
    "for train_indices, test_indices in loo_cv.split(X):\n",
    "#     uncomment these lines to print splits\n",
    "#     print(\"Train Indices: {}...\".format(train_indices[:4]))\n",
    "#     print(\"Test Indices: {}...\".format(test_indices[:4]))\n",
    "#     print(\"Training SVC model using this configuration\")\n",
    "    X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], \\\n",
    "                                        y[train_indices], y[test_indices]\n",
    "    clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)\n",
    "    correct += accuracy_score(y_test, clf.predict(X_test))\n",
    "    total += 1\n",
    "print(\"Accuracy: {0:.2f}\".format(correct/total))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stratified KFold\n",
    "\n",
    "Kfold validation does not preserve the split of the output variable while splitting the data in k-folds. Imagine training a Naive Bayes classifier using KFold validation using 10 samples where 5 are positive and 5 are negative. Since KFold randomly selects the split imagine splitting it in an unfortunate way -- 1 split contains all positive samples and 1 contains all negative. Naive Bayes classifier will calculate the prior probabilities and find it to be 100% i.e. the model will think the output is always positive which is obviously wrong. To tackle this scenario we use Stratified split, what it would essentially do is preserve the split in the original dataset in training set, that is, if the original dataset has 50% positive and 50% negative outputs then the training set will also have 50% positive and 50% negative outputs. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>User ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Age</th>\n",
       "      <th>EstimatedSalary</th>\n",
       "      <th>Purchased</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15624510</td>\n",
       "      <td>Male</td>\n",
       "      <td>19</td>\n",
       "      <td>19000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>15810944</td>\n",
       "      <td>Male</td>\n",
       "      <td>35</td>\n",
       "      <td>20000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>15668575</td>\n",
       "      <td>Female</td>\n",
       "      <td>26</td>\n",
       "      <td>43000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>15603246</td>\n",
       "      <td>Female</td>\n",
       "      <td>27</td>\n",
       "      <td>57000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>15804002</td>\n",
       "      <td>Male</td>\n",
       "      <td>19</td>\n",
       "      <td>76000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    User ID  Gender  Age  EstimatedSalary  Purchased\n",
       "0  15624510    Male   19            19000          0\n",
       "1  15810944    Male   35            20000          0\n",
       "2  15668575  Female   26            43000          0\n",
       "3  15603246  Female   27            57000          0\n",
       "4  15804002    Male   19            76000          0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "df = pd.read_csv('Social_Network_Ads.csv')\n",
    "X = df.iloc[:, 2:4]   # Using 1:2 as indices will give us np array of dim (10, 1)\n",
    "y = df.iloc[:, 4]\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "X_sca = StandardScaler()\n",
    "X = X_sca.fit_transform(X)"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "from __future__ import division\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "strat_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)\n",
    "correct = 0\n",
    "total = 0\n",
    "for train_indices, test_indices in strat_cv.split(X, y):\n",
    "#     uncomment these lines to print splits\n",
    "#     print(\"Train Indices: {}...\".format(train_indices[:4]))\n",
    "#     print(\"Test Indices: {}...\".format(test_indices[:4]))\n",
    "#     print(\"Training SVC model using this configuration\")\n",
    "    X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], \\\n",
    "                                        y[train_indices], y[test_indices]\n",
    "    clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)\n",
    "    correct += accuracy_score(y_test, clf.predict(X_test))\n",
    "    total += 1\n",
    "print(\"Accuracy: {0:.2f}\".format(correct/total))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validating Time Series data\n",
    "Time series data is data associated with a time frame, for instance stock prices. The motivation is to predict stock price for future given the data from previous data. If we were to use any splitting techniques from above we would end up predicting past from future (due to random nature from splitting) which shouldn't be permitted, we should always predict future from past. This can be achieved using TimeSeriesSplit\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.08485204  0.84689345]\n",
      " [ 0.02834187  0.68234029]\n",
      " [ 0.36309891  0.07100943]\n",
      " [ 0.66955444  0.88070583]\n",
      " [ 0.28241451  0.56733126]\n",
      " [ 0.30521588  0.73973179]\n",
      " [ 0.0566575   0.96430919]\n",
      " [ 0.53957399  0.05946202]\n",
      " [ 0.11530205  0.16625273]\n",
      " [ 0.89429006  0.83914383]]\n",
      "[ 0.97006781  0.81953045  0.50522986  0.88384404  0.30715333  0.9750431\n",
      "  0.68943093  0.74947717  0.93600522  0.33118984]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "import numpy as np\n",
    "\n",
    "X = np.random.rand(10, 2)\n",
    "y = np.random.rand(10)\n",
    "print(X)\n",
    "print(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train indices: [0 1 2] Test indices: [3]\n",
      "Train indices: [0 1 2 3] Test indices: [4]\n",
      "Train indices: [0 1 2 3 4] Test indices: [5]\n",
      "Train indices: [0 1 2 3 4 5] Test indices: [6]\n",
      "Train indices: [0 1 2 3 4 5 6] Test indices: [7]\n",
      "Train indices: [0 1 2 3 4 5 6 7] Test indices: [8]\n",
      "Train indices: [0 1 2 3 4 5 6 7 8] Test indices: [9]\n"
     ]
    }
   ],
   "source": [
    "tss = TimeSeriesSplit(n_splits=7)\n",
    "\n",
    "for train_indices, test_indices in tss.split(X):\n",
    "    print(\"Train indices: {0} Test indices: {1}\".format(train_indices, test_indices))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}