# %%
# All notebook imports live in this single cell so a fresh kernel can
# Restart & Run All without hunting for dependencies further down.
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# `IPython.core.display` is a deprecated import path; `IPython.display` is the public one.
from IPython.display import display, HTML
from imblearn.datasets import make_imbalance
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split

sns.set()
# Plain-Python spelling of the `%matplotlib inline` cell magic.
get_ipython().run_line_magic('matplotlib', 'inline')

# %%
# Seed shared by every randomised split in this notebook.
nb_seed = 1234

# %%
CSV_PATH = os.path.join('data', 'example', 'mammography.csv')
df = pd.read_csv(CSV_PATH, encoding='latin1')
df.head()

# %%
df.shape

# %%
fig, ax = plt.subplots(figsize=(6,4))
# Pass the column as `x=`: newer seaborn releases no longer accept it positionally.
sns.countplot(x=df['class'], ax=ax)
# Trailing `;` keeps the return value's repr out of the cell output.
ax.set(xlabel='Class', title='Class Distribution');

# %%
target = df['class']
target.value_counts()

# %%
# Labels are stored as quoted strings. "'-1'" (the majority label in the counts
# above) becomes 1 and every other label becomes 0.
# Built-in `int` replaces `np.int`, which NumPy deprecated and later removed.
y = (target == "'-1'").astype(int)
X = df.iloc[:, :-1]
X.shape

# %%
X_train, X_test, y_train, y_test = train_test_split(X.values,
                                                    y.values,
                                                    test_size=0.3,
                                                    random_state=nb_seed)

# %% [markdown]
# The testing data will be held out for validation at the end.

# %%
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10,4))
# Titles are set on each Axes explicitly. `plt.title` only targets the current
# axes (the last one created), so calling it twice left the training panel untitled.
sns.countplot(x=y_train, ax=ax0)
ax0.set(xlabel='Class', title='Training Class Distribution')
sns.countplot(x=y_test, ax=ax1)
ax1.set(xlabel='Class', title='Holdout Class Distribution');

# %%
print('Class 0 makes up {:2.3f}% of the {} data'
      .format(y_train[y_train == 0].shape[0] * 100 / y_train.shape[0], 'Model Building'))
print('Class 0 makes up {:2.3f}% of the {} data'
      .format(y_test[y_test == 0].shape[0] * 100 / y_test.shape[0], 'Holdout'))

# %% [markdown]
# From these printouts, we see that the model building and holdout data have
# slightly different class distributions. Modeling is based on the assumption
# that the data is being produced by a fairly non-random, approximately
# consistent process. From this assumption, we treat the training data as a
# representative sample of data produced by this process. If we use a biased
# training set, then our model will probably be slightly biased.
# %% [markdown]
# ## Using Stratification to produce comparable model-building and holdout data

# %%
# `stratify=y` makes the split preserve the class proportions of `y` in both parts.
X_train, X_test, y_train, y_test = train_test_split(X.values,
                                                    y.values,
                                                    stratify=y,
                                                    test_size=0.3,
                                                    random_state=nb_seed)

# %% [markdown]
# The testing data will be held out for validation at the end.

# %%
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10,4))
# Titles are set on each Axes explicitly. `plt.title` only targets the current
# axes (the last one created), so calling it twice left the training panel untitled.
# The labels go in as `x=` because newer seaborn no longer accepts them positionally.
sns.countplot(x=y_train, ax=ax0)
ax0.set(xlabel='Class', title='Training Class Distribution')
sns.countplot(x=y_test, ax=ax1)
ax1.set(xlabel='Class', title='Holdout Class Distribution');

# %%
print('Class 0 makes up {:2.3f}% of the {} data'
      .format(y_train[y_train == 0].shape[0] * 100 / y_train.shape[0], 'Model Building'))
print('Class 0 makes up {:2.3f}% of the {} data'
      .format(y_test[y_test == 0].shape[0] * 100 / y_test.shape[0], 'Holdout'))

# %% [markdown]
# From these distribution plots, we see that train_test_split with stratify
# enabled produced comparably imbalanced datasets, and from the printout, we
# see that the minority class makes up 2.325% of the data in both the model
# building and holdout data. That is as we expect.
# %% [markdown]
# ### Stratified

# %%
def stratified_kfold_check(n, X_train_, y_train_, nb_seed=nb_seed):
    """Print the class make-up of the training part of each stratified fold.

    Parameters
    ----------
    n : int
        Number of folds, passed to ``StratifiedKFold`` as ``n_splits``.
    X_train_ : array-like
        Feature rows. They are only handed to ``split``; no feature values
        are read here.
    y_train_ : array-like
        Labels used for stratification. Counts are reported for the label
        values 1 and 0.
    nb_seed : int, optional
        Seed for shuffling before the folds are cut. The default is the value
        the module-level ``nb_seed`` had when this function was defined.

    Returns
    -------
    None
        The function only prints; one three-line report per fold.
    """
    # Coerce once so positional indexing below also works for pandas/list input.
    labels = np.asarray(y_train_)
    # `random_state` is only honoured when `shuffle=True`. Without it the seed
    # is ignored, and recent scikit-learn releases reject the combination.
    skfolds = StratifiedKFold(n_splits=n, shuffle=True, random_state=nb_seed)
    for train_index, _ in skfolds.split(X_train_, labels):
        fold_labels = labels[train_index]
        print('Total class observations: {:4d}'
              .format(len(fold_labels)))
        print('Number of class {} observations: {:4d}'
              .format(1, int(np.count_nonzero(fold_labels == 1))))
        print('Number of class {} observations: {:4d}\n'
              .format(0, int(np.count_nonzero(fold_labels == 0))))

# %%
stratified_kfold_check(5, X_train, y_train)
# %%
stratified_kfold_check(7, X_train, y_train)

# %%
stratified_kfold_check(2, X_train, y_train)

# %%
def rep_stratified_kfold_check(n, reps, X_train_, y_train_, nb_seed=nb_seed):
    """Print the class make-up of each training fold over repeated stratified K-fold.

    Parameters
    ----------
    n : int
        Number of folds per repetition (``n_splits``).
    reps : int
        Number of repetitions (``n_repeats``).
    X_train_ : array-like
        Feature rows. They are only handed to ``split``; no feature values
        are read here.
    y_train_ : array-like
        Labels used for stratification. Counts are reported for the label
        values 1 and 0.
    nb_seed : int, optional
        Seed passed as ``random_state``. The default is the value the
        module-level ``nb_seed`` had when this function was defined.

    Returns
    -------
    None
        The function only prints; one three-line report per generated split.
    """
    # Coerce once so positional indexing below also works for pandas/list input.
    labels = np.asarray(y_train_)
    skfolds = RepeatedStratifiedKFold(n_splits=n, n_repeats=reps, random_state=nb_seed)
    # The held-out indices are not needed for the report, so they are discarded.
    for train_index, _ in skfolds.split(X_train_, labels):
        fold_labels = labels[train_index]
        print('Total class observations: {:4d}'
              .format(len(fold_labels)))
        print('Number of class {} observations: {:4d}'
              .format(1, int(np.count_nonzero(fold_labels == 1))))
        print('Number of class {} observations: {:4d}\n'
              .format(0, int(np.count_nonzero(fold_labels == 0))))

# %%
rep_stratified_kfold_check(3, 4, X_train, y_train)
class observations: 6710\n", "Number of class 1 observations: 6574\n", "Number of class 0 observations: 136\n" ] } ], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:py36]", "language": "python", "name": "conda-env-py36-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }