{ "cells": [
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Imports -- kept in a single cell so a fresh kernel can Restart & Run All\n",
  "import os\n",
  "\n",
  "import matplotlib.pyplot as plt\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "import seaborn as sns\n",
  "from imblearn.datasets import make_imbalance\n",
  "# display/HTML live in IPython.display; importing them from IPython.core.display is deprecated\n",
  "from IPython.display import HTML, display\n",
  "from sklearn.model_selection import (\n",
  "    RepeatedStratifiedKFold,\n",
  "    StratifiedKFold,\n",
  "    train_test_split,\n",
  ")" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Plot styling and inline rendering\n",
  "sns.set()\n",
  "%matplotlib inline\n",
  "\n",
  "# Seed passed as random_state to the splits below so they are repeatable\n",
  "nb_seed = 1234" ] },
 { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
attr1attr2attr3attr4attr5attr6class
00.2300205.072578-0.2760610.832444-0.3778660.480322'-1'
10.155491-0.1693900.670652-0.859553-0.377866-0.945723'-1'
2-0.784415-0.4436545.674705-0.859553-0.377866-0.945723'-1'
30.5460880.131415-0.456387-0.859553-0.377866-0.945723'-1'
4-0.102987-0.394994-0.1408160.979703-0.3778661.013566'-1'
\n", "
" ], "text/plain": [ " attr1 attr2 attr3 attr4 attr5 attr6 class\n", "0 0.230020 5.072578 -0.276061 0.832444 -0.377866 0.480322 '-1'\n", "1 0.155491 -0.169390 0.670652 -0.859553 -0.377866 -0.945723 '-1'\n", "2 -0.784415 -0.443654 5.674705 -0.859553 -0.377866 -0.945723 '-1'\n", "3 0.546088 0.131415 -0.456387 -0.859553 -0.377866 -0.945723 '-1'\n", "4 -0.102987 -0.394994 -0.140816 0.979703 -0.377866 1.013566 '-1'" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "CSV_PATH = os.path.join('data', 'example', 'mammography.csv')\n", "df = pd.read_csv(CSV_PATH, encoding='latin1')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(11183, 7)" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZIAAAEcCAYAAADtODJSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAGZ5JREFUeJzt3XuU3WV97/F3SOQoJoRERtCAIAG/XtGDcrFqEZSlQQ9wakGlItdqD1gvtFaQWxFYCl0UxcuxR6IkFstFqlBLNaXJsUetICoqRb9EIEASCYGZBAIVcpnzx+8Z2Awzyc482bMzM+/XWrNm7+/v9vyyJvPZz/P8fr+Z1N/fjyRJI7VNtxsgSRrbDBJJUhWDRJJUxSCRJFUxSCRJVQwSSVIVg0TjWkScExFf73Y7WkXEDRFxzBba1xsj4tct7++OiIO3xL7L/m6LiD/cUvvT+DSl2w2QakXE0cDHgJcCDwO3Ahdk5o/KKqN2s1REbAAeLcd8vLTlK5l59cA6mXnoZuxrz8y8a7h1MvMHwMuqGv3U8b4G3JeZZ7fs/5VbYt8a3+yRaEyLiFOBvwXOB54PvAj4EnB4l5rUD+ydmdsDAcwDvhARZ41wX8OKiMkj2Ke0xU3yznaNVRGxPbAMODYz/3GYdc4BZmfm+8v7q4E3Ac8GfgGcnJm3l2WHAn8D7AqsBi7JzL+NiOcBlwNvBDYAt2XmgcMc7xm9iIh4F/D3wAszsy8iFgFfz8yvRsRsYC7wGuAJ4N8y870R8f3SzsfKMU8EHij7+TxND2wB8FXg7zNz13Ksu4G/A44Bdga+DfyvzHwiIo4FTsrMNw1uL/AW4IvlWE8AizLz8LK/EzNzYURsC1wEHEkTctcAf5WZayPiwNK2S4BPAOuAMzLz8qH+nTS+2CPRWPZ64L/R/LJs1w3AbJrey8+AK1qWXQb8aelNvBJYWOp/AdwHPK9s98nNbOd1NMPI+w2x7Dzge5m5A7ALTUjQElSvysztM/Oa8n5nYAeantcHSm3wp8GjgUPKeQZwZsuywev2l+N9hebf4qJyvKF6dGeWc9gbeHV53brvnYFpwAuBk4AvRsT0IfajccYg0Vj2PODBzNzQ7gaZeXlmPpaZa4FPAa+OiGll8RPAKyJiWmauzsxbS30t8ALgxZm5PjN/uDmNzMx1wIPAzCEWrwV2i4hZmflEy7zOgEmD3q8HzsnMtZn5+DCH/HxmLs/MVcAFwHs30rzB+9+Yo4FzM/OhzHwIOJem5zPgCeC88m/0L8AamiDTOGeQaCx7CNgxItr6OY6IbSLiMxHx24hYBdxN84l8x7LKu4B3APdExKKIOKDULwLuBBaUbT+xOY2MiClAT2nvYB+n+X94c0T8KiKO38TuVpYQ3JilLa/voekhbAkvBO7dyL4fGhTqjwFTt9CxtRUzSDSW/Qfwe+CINtf/E+B/AAeXoaTdaT6RTwLIzJ9m5hE0v/SvA64u9Ucz8y8zc3bZ/tSIOGgz2nkETc/jJ4MXZOYDmfmBzJwF/BnwpYjYYyP7amdSc9eW17sBy8vrR4HtBhZExM6bue/lZX9D7VsTmJf/aszKzIfLZPoXI2I9zeTzWpr5gQMz87RBm0yluSS3LyKeC3ya8sszIp5FM4n8nbLfR2gmjImIdwC/ycw7aYZr1tEMMW1URMwADgUuBj6TmX1DrPPHwH9k5jJgFc1k98C+7wf2AIa9/HcYp0TEPwP/BZwOXFnqv6AZutsbSOAcnh4eK8rxhvMPwJkRcUt5fxawVd2jo+6wR6IxLTMvAU6lmfR9gGbo5WSGnoCfX5YvA24DBs9HHAPcXYa9PkDTgwHYC7ixhMsPgS9m5r8P06R+4BcR8TCwGDgB+EhmnjtonQH7AjeV9b8NfDgz7ynL/hqYHxG9JXDa0Q98gyZUf1u+LgDIzMU080L/BtwB/L9B286lCZreiPjHlv0NOB+4BfglTSjdMrDvjbRFE0BHL/+NiLnAO4EVmbl3qV1EMzzwOM248/GZ+XBZdjrNf7x1NP/5FpT624HP0gTf3My8sNR3p/m0NYPmCpxjysSmJGmUdLpH8jX
gbYNqC4BXZOZraD6xnQ4QES8HjqK5S3cOzVjxpDKR+oWyn1cA742Il5Z9XQhcnJlBMyxwYofPR5I0SEeDpDy+oW9Q7caWKzt+THPtPMBhwJWZuS4zl9CEzH7la3Fm3lOuVrmSp+5aPhi4tryeB/zPTp2LJGlo3Z4jOYHmBjGAWTQ3fQ1YVmqD60uBWeVu476WUFrKlrvMUZLUpq4FSUScAazNzH8opaFujOrfRH3wMif3JGmUdeXy3/LMn0NphqYGLOXp17/vQnON+iSax0E8rZ6ZD0bEDhGxTemVDKy/SevWre+fMsXn3UnSZhrySQijESRP6zmUK7D+CvjDQY94uB64IiIuoRnO2hO4mabXtGdE7Ab8DnhP+YLmWUhHAlcBx9LcRLZJfX2P1ZyPJE1IPT3Thqx3+vLfbwBvpnkm0gqaG6A+CWzLU4+L+HFmnlzWP53myqu1PPPy38/x1OW/nyn1F/PU5b8/B97XxuMjWLnyEYfAJGkz9fRMG7JHMiEfI2+QSNLmGy5Iun3VliRpjDNIJElVDBJJUhWDRJJUxSCRJFUxSCRJVQwSSVIVg0SSVMUgkSRVMUgkSVUMEklSFYNEklTFIJEkVTFIJElVDBJJUpWu/KndsWz9+vUsWXJXt5uhrdDuu+/B5Mn+CWdNPAbJZlqy5C5Ov/gqnju9p9tN0Vbk0dUr+fRfvJvZs/fqdlOkUWeQjMBzp/ew/cwXdLsZkrRVcI5EklTFIJEkVTFIJElVDBJJUhWDRJJUxSCRJFUxSCRJVQwSSVIVg0SSVMUgkSRV6egjUiJiLvBOYEVm7l1qM4CrgN2AJcBRmbm6LLsUmAM8ChyXmbeW+rHAGUA/cEFmzi/1fYDLgWcDN2TmRzt5PpKkZ+p0j+RrwNsG1U4DbszMABYCpwNExBxgdmbuBXwQ+HKpzwDOBvYF9gfOiYjpZV//GzgpM18CvCQiBh9LktRhHQ2SzPwB0DeofDgwr7yeV94P1OeX7W4CpkfETjRBtCAzV2fmKmAB8PaI2BmYlpk3l+3nA0d07GQkSUPqxhzJ8zNzBUBm3g88v9RnAfe1rLe01AbXl7XUlw6xviRpFG1Nk+2ThnjfP0SdTdQlSaOoG3+PZEVE7JSZK8rw1AOlvhTYtWW9XYDlpf7mQfVFG1l/k2bM2I4pU0b2l+z6+qaOaDuNfzNnTqWnZ1q3myGNutEIkkk8vfdwPXAccGH5fl1L/RTgqog4AFhVwuZ7wAVlgn0b4BDgtMxcFREPR8R+wE+A9wOXttOgvr7HRnwyvb1rRrytxrfe3jWsXPlIt5shdcxwH5Q6OrQVEd8AfkRzRdW9EXE88BngkIhI4C3lPZl5A3B3RPwW+Dvg5FLvA84DbgFuAs4tk+6UdeYCdwCLM/O7nTwfSdIzdbRHkplHD7PorcOs/6Fh6pfT3C8yuP5T4FUjbJ4kaQvYmibbJUljkEEiSapikEiSqhgkkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKmKQSJJqmKQSJKqGCSSpCoGiSSpikEiSapikEiSqhgkkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKmKQSJJqmKQSJKqGCSSpCoGiSSpikEiSapikEiSqhgkkqQqBokkqcqUbh04Ij4GnAhsAH4FHA+8ELgSmAH8DDgmM9dFxLbAfOC1wIPAuzPz3rKf04ETgHXARzJzwWifiyRNZF3pkUTEC4E/B/bJzL1pAu29wIXAxZkZwCqaoKF8783MvYDPAheV/bwcOAp4GTAH+FJETBrNc5Gkia6bQ1uTgedGxBTgOcBy4CDg2rJ8HnBEeX14eQ/wTeDg8vow4MrMXJeZS4DFwH6db7okaUBXgiQzlwMXA/cCy4DVNENZqzJzQ1ltKTCrvJ4F3Fe2XQ+sjoiZrfViWcs2kqRR0JU5kojYgaaXsRtNiFxDMzQ1WH/5PtRwVf9G6hs1Y8Z2TJkyub3GDtLXN3VE22n8mzlzKj0907rdDGn
UdWuy/a3AXZnZCxAR3wL+ANghIrYpvZJdaIa7oOmd7Aosj4jJwPTM7IuIgfqA1m2G1df32Igb3tu7ZsTbanzr7V3DypWPdLsZUscM90GpW0FyL3BARDwbeBx4C/AT4HnAkcBVwLHAdWX968v7m8ryhS31KyLiEpohrT2Bm0fpHCRJdG+O5GaaSfOfA7+gGaL6P8BpwKkRcQcwE5hbNpkL7BgRi4GPlvXIzNuBq4HbgRuAkzNzk0NbkqQtZ1J//8T7vbty5SMjPuk771zM+ZctZPuZL9iSTdIY93Dv7zjzpIOZPXuvbjdF6pienmlD3l7hne2SpCoGiSSpikEiSapikEiSqhgkkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKmKQSJJqmKQSJKqGCSSpCoGiSSpikEiSapikEiSqhgkkqQqBokkqYpBIkmqYpBIkqoYJJKkKm0FSUR8fojavC3fHEnSWDNlYwsj4jJgD+B1EfGKlkXPAqZ3smGSpLFho0ECnA/sDnwOOLelvg74dYfaJEkaQzYaJJm5BFgCvDoitqfphUwqi6cCvZ1snCRp67epHgkAEXE6cDrwUEu5n2bYS5I0gbUVJMBJwOzMXNnJxkiSxp52L/+9F4exJElDaLdHshj4QUQsAn4/UMzMT430wBExHbgMeCWwATgBuAO4CtiNZm7mqMxcXda/FJgDPAocl5m3lvqxwBk0Q20XZOb8kbZJkrT52u2RLAO+CzxOM9k+8FXjc8ANmfky4NXAb4DTgBszM4CFNPMyRMQcmqG1vYAPAl8u9RnA2cC+wP7AOSWgJEmjpK0eSWaeu+m12hcR04A3ZeZxZf/rgNURcThwYFltHrCIJlwOB+aXdW+KiOkRsRNwELCgpdeyAHg7Ta9GkjQK2r1qawPN0FGr5Zm56wiPuwfwYER8jaY3cgvwUWCnzFwBkJn3R8Tzy/qzgPtatl9aaoPry0pNkjRK2u2RPDkEFhHPAo4AXl953H2AUzLzloi4hKbnMTisBgweRptU1h1qeG24fTxpxoztmDJl8mY09yl9fVNHtJ3Gv5kzp9LTM63bzZBGXbuT7U/KzLXANRFxRsVxlwL3ZeYt5f21NEGyIiJ2yswVEbEz8EDL+q29n12A5aX+5kH1RZs6eF/fYyNueG/vmhFvq/Gtt3cNK1c+0u1mSB0z3Aeldoe23t/ydhLwCmDtSBtTguK+iHhJZt4BvAX4z/J1HHBh+X5d2eR64BTgqog4AFhV9vE94IIywb4NcAhNIEmSRkm7PZKDWl73Aw8C76489oeBK8pQ2V3A8cBk4OqIOIHm3pUjATLzhog4NCJ+S3P57/Gl3hcR59HMsfQD52bmqsp2SZI2w6T+/k1OKQBPzo0ETfjcVq60GpNWrnykvZMewp13Lub8yxay/cwXbMkmaYx7uPd3nHnSwcyevVe3myJ1TE/PtCFv+2j375G8luamxHnA14B7I2L/Ldc8SdJY1e7Q1qXAuzPzJoAyT/F5YL9ONUySNDa0e2f71IEQAcjMHwPP7kyTJEljSbtB0lvuOgcgIo7g6Y+UlyRNUO0ObX0A+E5EzOWpmwH/oGOtkiSNGe32SOYAj9E8lfcgYCVPvxFQkjRBtRskHwDekJmPZuYvgdcCf965ZkmSxop2g+RZwBMt75+gjWdaSZLGv3bnSL4NLIyIq2kC5F089fgSSdIE1laPJDM/QXMvSQCzgUsz86xONkySNDa0/fTfzPwm8M0OtkWSNAa1O0ciSdKQDBJJUhWDRJJUxSCRJFUxSCRJVQwSSVIVg0SSVMUgkSRVMUgkSVUMEklSFYNEklTFIJEkVTFIJElVDBJJUhWDRJJUxSCRJFUxSCRJVdr+C4mdEBHbALcASzPzsIjYHbgSmAH8DDgmM9dFxLbAfOC1wIPAuzPz3rKP04ETgHXARzJzweifiSRNXN3ukXwEuL3l/YXAxZkZwCrgxFI/EejNzL2AzwIXAUTEy4GjgJcBc4AvRcS
kUWq7JIkuBklE7AIcClzWUj4YuLa8ngccUV4fXt5D83fjDy6vDwOuzMx1mbkEWAzs18FmS5IG6WaP5BLg40A/QEQ8D+jLzA1l+VJgVnk9C7gPIDPXA6sjYmZrvVjWso0kaRR0ZY4kIt4BrMjMWyPizaU8qXy16m9ZNlj/RuobNWPGdkyZMrnN1j5dX9/UEW2n8W/mzKn09EzrdjOkUdetyfY3AIdFxKHAc4BpNHMf0yNim9Ir2QVYXtZfCuwKLI+IycD0zOyLiIH6gNZthtXX99iIG97bu2bE22p86+1dw8qVj3S7GVLHDPdBqStDW5n5ycx8UWbuAbwHWJiZ7wMWAUeW1Y4Friuvry/vKcsXttTfExHbRsSLgT2Bm0fjHCRJjW5ftTXYacCpEXEHMBOYW+pzgR0jYjHw0bIemXk7cDXNlV83ACdn5iaHtiRJW05X7yMByMzvA98vr+8G9h9incdpLvMdavtPA5/uZBslScPb2nokkqQxxiCRJFUxSCRJVQwSSVIVg0SSVMUgkSRVMUgkSVUMEklSFYNEklTFIJEkVTFIJElVDBJJUhWDRJJUxSCRJFUxSCRJVQwSSVIVg0SSVMUgkSRVMUgkSVUMEklSFYNEklTFIJEkVTFIJElVDBJJUhWDRJJUxSCRJFUxSCRJVQwSSVKVKd04aETsAswHdgbWA1/JzEsjYgZwFbAbsAQ4KjNXl20uBeYAjwLHZeatpX4scAbQD1yQmfNH+XQkaULrVo9kHXBqZr4ceD1wSkS8FDgNuDEzA1gInA4QEXOA2Zm5F/BB4MulPgM4G9gX2B84JyKmj/bJSNJE1pUgycz7B3oUmbkG+DWwC3A4MK+sNq+8p3yfX9a/CZgeETsBbwMWZObqzFwFLADePmonIknq/hxJROwOvAb4MbBTZq6AJmyA55fVZgH3tWy2tNQG15eVmiRplHQ1SCJiKvBN4COlZ9I/zKqThnjfP0SdjexDktQBXZlsB4iIKTQh8vXMvK6UV0TETpm5IiJ2Bh4o9aXAri2b7wIsL/U3D6ov2tSxZ8zYjilTJo+o3X19U0e0nca/mTOn0tMzrdvNkEZd14IE+Cpwe2Z+rqV2PXAccGH5fl1L/RTgqog4AFhVwuZ7wAVlgn0b4BCaCfuN6ut7bMSN7u1dM+JtNb719q5h5cpHut0MqWOG+6DUrct/3wD8CfCriPg5zXDUJ2kC5OqIOAG4FzgSIDNviIhDI+K3NJf/Hl/qfRFxHnBL2ce5ZdJdkjRKuhIkmflDYLixpbcOs82HhqlfDly+RRomSdpsXb9qS5I0thkkkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKmKQSJJqmKQSJKqGCSSpCoGiSSpikEiSapikEiSqhgkkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKmKQSJJqmKQSJKqGCSSpCoGiSSpikEiSapikEiSqhgkkqQqBokkqcqUbjdA0pazfv16liy5q9vN0FZo9933YPLkyR3Z97gIkoh4O/BZmh7W3My8sMtNkrpiyZK7OOuaTzF1x+273RRtRdY8+DDnHXk2s2fv1ZH9j/kgiYhtgC8AbwGWAz+JiOsy8zfdbZnUHVN33J7pO8/odjM0gYyHOZL9gMWZeU9mrgWuBA7vcpskacIYD0EyC7iv5f3SUpMkjYIxP7QFTBqi1t/JAz66emUnd68xaGv6mVjz4MPdboK2Mp3+mRgPQbIUeFHL+11o5kqG1dMzbajwaUtPzz4sumafkW4udVRPzz786wHf7nYzNMGMhyD5CbBnROwG/A54D/De7jZJkiaOMT9HkpnrgQ8BC4D/BK7MzF93t1WSNHFM6u/v6HSCJGmcG/M9EklSdxkkkqQqBokkqYpBorZExN3D1N8UET+NiLUR8Uct9d0iYtHotVATXevPaET8S0T0RcT1g9ZZFBEveubWqmGQqF3DXZVxD3AscMVmbCN1QuvP20XA+7rVkInGIFG7hrx1OzP
vzczbeGZorAd6O94q6SlP/oxm5iJgzRDrPETzs6ktaDzckKhRkJn7b+b6S4E/7lBzpGdo52c0M/2Z7AB7JJKkKvZItFki4nzgHUB/ZvrQMUkGiTZPZp4JnDnM4hE/DFPqgEn4MzkqfESKqkTE64BvATsAvwfuz8xXdbdVmugi4t+BAKbSTLCfmJn/2t1WjV8GiSSpipPtkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKmKNyRKHRQR04BPAwcCa4E+4C+B7YG/zsyDutg8aYuwRyJ1SERMAm6guSHu1eWRMueV2kx8zL7GCXskUuccBOyamecMFDLz/0bE8cC0gVpEHAicDzyH5gkBH8vMf4qIo4GPA+uAu2n+vkYPzd9+2Q7YAHw4M28epfORhmSPROqc/w7cOriYmd8FHmgpnULzCI/XAX9KEyrQ9F4Oycx9aYLkpcCJwD9l5n7A2cAbO9d8qT32SKTO2UDz/LFNOQZ4Z0QcBRxA83wogOuBH0XEt4BrM/OXETEVuDYi9gH+GfhCB9otbRZ7JFLn3AI841H7EXEBT38q7Q+Afcv6Ty7LzI8Bf0TzlyaviIijM/NHwMuB7wJHAd/p5AlI7fChjVIHRcQPgAXA+Zm5ISLeBnwV+BjwZ8C7gLuAnTLziYi4EHgPsAfwa+APM/P+iDiL5kqvDcCyzLw0InYFfpaZPaN/ZtJTHNqSOusw4LPAbRHxBPAgMAeYAZCZfRHxVeD2snwhzUT6tsBZwI0R8V/ACuA44NnAN8qE/TqaYTGpq+yRSJKqOEciSapikEiSqhgkkqQqBokkqYpBIkmqYpBIkqoYJJKkKgaJJKnK/wd8ZXPhifRAvAAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, ax = plt.subplots(figsize=(6,4))\n", "sns.countplot(df['class'], ax=ax)\n", "ax.set(xlabel='Class')\n", "plt.title('Class Distribution')" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'-1' 10923\n", "'1' 260\n", "Name: class, dtype: int64" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target = df['class']\n", "target.value_counts()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(11183, 6)" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y = (target == \"'-1'\").astype(np.int)\n", "X = df.iloc[:, :-1]\n", "X.shape" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X.values, \n", " y.values,\n", " test_size=0.3,\n", " random_state=nb_seed)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ 
"The testing data will be held out for validation at the end." ] }, { "cell_type": "code", "execution_count": 131, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAEcCAYAAACYg/MAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XuYXXV56PFvSASFhJjAGCgglNsr6KNcykU9PUKBSiwFjz0gYJEAKucAFaW1Ah5BRIrQgxf0WK0gBAoFxCrg4YFIybHFSwABxYKvKAQIMTEwk0CgIknm/LF+Y7bDnsmeZO/Za2a+n+fJk7Xevdbavz0Tfrz7d53U39+PJEmS6mmjbhdAkiRJQzNZkyRJqjGTNUmSpBozWZMkSaoxkzVJkqQaM1mTJEmqMZM1SVItRcQVEfHJYV5fExE7jmaZhhIR8yPixG6Xo1FEPBcRO7TpWWdFxD+W4+3Lz74tOUREbBcRz0bEpHY8bzya0u0CSJLGp4h4DDgpM+9siB0PvC8z/7gNb9GWhUIjYj5wdWZ+bZhrXgF8DDgW2BpYBtwJfDIzn2hHOVoVEW8r7/18CS0HfgD8fWbeO3BdZk5r8Vn/lJnbDXddZl44KLTeP/vB/y4y80lg8/V93kRgy5okabS1azX20WyJ+QZwGHA0MB14E/Aj4KBRLEOjpzJz88zcHNgf+Bnw7xFx4AifM4l1/D4iYvJ6llFtYsuaJKlrIuJ1wD8AewCLgLMz85Yhrv0I8GFgDfBxGpKMiNgc+CJwKFWL02WZeUF57Vxg58w8rpxvDzxG9f/ATwJ/DOwXEZ8DrszMDw5634OpkrJdMnNxCT9Xyt2snDsCX6VK6NYA84BTMvPZ8vpHgb+iak16qrw2PyL2Ab4E7Aq8AFyTmX+zrp9hKdO5ETETuAjYt7zPmvK5H42IdwB/D2wHrAA+C3wZuBXYOCKeKz/PXYGTgTcAvwH+HDgjIrZr/BlSJXknRcQnyvklmfmZ8r5XAE9m5jnl/HetdxFxFfBa4JaIWF1+/l+n/D4yc01EbF3K9l+AZ4CLM/Oy8qxzgd1L2f4b8DhwfGbet66f01hmy5okaTT9rjUsIqYAtwC3AT3AB4FrImKXwTdFxKHAGZSkCTh40CVfBKYBOwAHAO+NiBMaXh/cetQPkJn/C/h34LTSUvVBXu4g4O6GRG1dJgF/B2wF7AZsC3yifI5dgVOBvUur2NuBheW+zwOfy8zpwE7ADS2+34B/AfaKiFeV88bPfBnw/vKebwDuzMwXgNnA4sycVj7/knL94cANmflq4Nomz4Pq57xT+QxnRsSfDFO2gZ/3e4EngMPK+/3vJs++rlyzFXAk8HeDWgz/vJRpOtW/n/8zzPuOC7asSZI66VsRsarhfBOq7kOANwObZeZF5Xx+RHwbOIaqxaXRkcAVmfkwQGnROaYcbwQcBbypJCCPR8QlwHHAFW34DFsAv2r14sz8JfDLcvpMRHwWOKecrwY2Bt4QEc8MGu/2W2DniNgiM58B7h5hORdTJYqvBv6T3+8m/i3w+oh4MDNXAA+s41k/GGjhzMzfRESzaz6Rmb8Bflpa046hGkvXiqZd2KUF7y3A7Mx8CfhxRFxG9bucXy67KzNvL9dfDZze4nuOWbasSZI66YjMnDnwBzil4bWtgScHXf84sE2T5/zBoGsfbzjeEngFVWvMup6zPp6hKmtLIqInIv45IhZFxHLgn0oZBxK5D1G1tC2NiGtLtx/ASUAAP4uIBRHxZyMs5zZULVTLm7z2F8CfUSWy8yNi/3U8a/D
vZbB+qm7rAY9T/Y421NZAb0m6G5/d+Ltc0nD8AvDKds1Mratx/eEkSV033CSAxVRjqBq9lmoc12C/GnTt9qztOnsaeKnEGl8feM7zwKYNrw1OvNY14eEOYN+IaDUZuZBqrNobSjfiX9Lwc8jM68ps2IHyfrrEf5mZx2ZmD3AxcGNDl2Yr3gXcl5n/OfiFzPxRZr6Tqrv5JtZ2sQ712VuZBNL4+3gt1e8TNuznvRiYGRGbDXp2s38TE4bdoJKkblkAPB8Rfwt8hmpA+WGU8V2D3AB8rXR7Pc7abkXKoPQbgAvK0iBbUE1EuLhc8gDwt6WL7VngzEHPXgoMuV5bZv5rRHwH+GZE/E/gx8CrgPcAL2bmlYNumUbVuvVsRGwDfGTghTJmbRvge1Rdk7/rroyI9wC3Z+bTVJMA+qm6TZtpHPv3B8D7gROpxnP9nrLsyJHAtzPz2TKZYKBreimwRURsPjABokWTgI9HxAeofnYnUC1rAtXP+4yIuICq23twN+WSck9jl+kkgMxcFBHfBy4sE0qCqsXxWIY27tdns2VNktQpw7bOlDFJhwPvoGod+yJwXGY+Mvj+zLwN+BzV/+B/DvzroMd9kKpL7FHg36hmH15R7r0DuB74CXAP1aD0Rp8HjoyIZ8qM0Gb+O9XMyeupErEHgb2pWt0Gf9bzymvLy3t9o+G1Taha0pZRtSL1AGeX1w4F/iMinqWarfnuzPztEOXZuiwk+xzV2LbXA2/LzMafS2OZjgMeK92yH6Bq7SMzE/hn4NGI6I2IrYZ4v8H6ge8CvwC+QzVjc+C9r6b6WS+kmjxy3aB7P02V6PVGxBlNynoM8IdUP59vAB9vXKtviLKMa5P6+zv7GSPiw1RZ8Rqqf9wnUPVrXwfMAO6j+o9zVURsDFxF9Y/8aap/qE+U55xF9a1hFXB6Zs7raMElTWgRsQnV//Q3puqFuDEzzysDqd/G2paPOZn5k3LPpVSz654v8QdK/HiqBVX7gQsy86rR/jySxq6OtqyVptm/AvbKzDdSVXjHUK0Dc0lmBtU3j5PKLSdRDSzcheob1MXlObtTzfTZjaoi/JLbUkjqpMx8ETgwM/ekWgNsdkTsV17+m8zcMzP3akjUZgM7lfrrZKp1ooiIGVRddvsA+1GthzV9lD+OpDFsNLpBJwOblfV0XkXVrHkga5uF5wLvLMdHlHOAG4GBNVsOB67LzFWZuRB4hLLonyR1SsOMtE2ovmyuKefNviweQdUzQGYuAKZHxCyqNajmZeaKzFxOtUDqoR0tuKRxpaPJWllA8BKq6dRPUXUb3Acsz8yBSm8Ra6fkbkOZLpyZq4EVZUXm38WLp2jflGxJaioiNoqI+6kGRH8nM+8pL30qIh6IiEvK4G14eT01ULdZf0naIJ3uBn011bfN7anGqW1G1Y052MDAuWbfVvuHiUtSx2TmmtINui3V0g27A2dm5m5U3ZpbAB8tlw+upwb2XLT+krRBOt0NejDwaGb2lpayb1KtTPzqhgXstmXt2iyLKOu2lI1jp2dmX2O8yT1NrVq1up+qQvSPf/wzcf50RFnS4LvAoZm5tMReolodf2BIxlD11CKqdaIGx4dlHeYf/0y4P0Pq9DprTwD7R8QrgRep9le7h+rb6JFUU6CPp1qgD+Dmcr6gvH5nQ/yasmXHNsDOrGMbjr6+F4Z7WdI41NMzrW3PiogtgZcyc0VZmPRg4NMRsVVmLimTnN4J/LTccjPVno/Xl9Xhl2fm0oi4nWr9r+lUX5AP4eXrfL2MdZg0sQxXf3V6zNrdVBMF7qdaRHAS8I9UFdUZEfFzYCZwebnlcmDLiHiEajuOM8tzHqJaEPEhqnVuTsnMYbNQSdpAW1PtVfkA1RfI2zPzVqovjj+mqtO2AD4FUF57LCJ+AXyFsq1S6R04H7i3POe8MtFAklrS8XXWumXZsufG5weTNKSenmnjZkkf6zBpYhmu/nIHA0mSpBozWZMkSaoxkzVJkqQaM1mTJEm
qsU4v3SHV3urVq1m48NFuF0OD7LDDjkyePLnbxZCkrjNZ04S3cOGjfPzrn2Tqlpt3uygqVj79LOcfeQ477bRLt4siSV1nsiYBU7fcnOlbzeh2MSRJehnHrEmSJNWYyZokSVKN2Q0qSdIY5QSp+mrnJCmTNUmSxignSNVTuydJmaxJkjSGOUFq/HPMmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo11dJ21iNgVuB7oByYBOwIfB64u8e2BhcBRmbmi3HMpMBt4HpiTmQ+U+PHAx8qzLsjMqzpZdkmSpDroaMtaZv48M/fMzL2AvakSsG8CZwJ3ZGYAdwJnAUTEbGCnzNwFOBn4conPAM4B9gH2A86NiOmdLLskSVIdjGY36MHALzPzSeAIYG6Jzy3nlL+vAsjMBcD0iJgFvB2Yl5krMnM5MA84dBTLLkmS1BWjmay9G7i2HM/KzKUAmbkEeE2JbwM82XDPohIbHH+qxCRJksa1UdkbNCJeARwOfLSE+oe4dFKT8/4m8eGeIUkbLCI2Af4N2JiqrrwxM8+LiB2A64AZwH3AcZm5KiI2puoZ2Bt4Gnh3Zj5RnnUWcCKwCjg9M+eN9ueRNHaN1kbus4EfZebT5XxpRMzKzKURsRXw6xJfBGzXcN+2wOISP2BQfP5wbzhjxqZMmTK5HWXXONfXN7XbRVATM2dOpadnWtfePzNfjIgDM/OFiJgMfC8ibgPOAC7JzK9HxD8AJwFfKX/3ZuYuEfFu4GLg6IjYHTgK2I2q7rojInbJTL9wSmrJaCVrxwD/3HB+MzAHuKj8fVND/FTg+ojYH1heErrbgQvKpIKNgEOoJikMqa/vhXaWX+NYb+/KbhdBTfT2rmTZsudGdE+7k7vMHKhINqGqL/uBA6nqNKjG3J5LlawdUY4BbgS+UI4PB67LzFXAwoh4BNgXWNDWwkoatzo+Zi0iXkU1ueBfGsIXAYdERAIHAZ8GyMxbgcci4hdUld8pJd4HnA/cS1XBnVcmGkhSx0TERhFxP7AE+A7wS6ovkWvKJQPjaqFhbG1mrgZWRMRMHHMraQN1vGUtM/8T6BkU66VK4Jpdf9oQ8SuBK9tcPEkaUknK9oyIzamWHdqtyWUD3ZlDja1drzG3DuVQKxzGUV/tHMoxWt2gkjRmZeazEfFdYH/g1RGxUUnkBsbVwtoxt4vLGLfpmdkXEUONxR2WQznUCodx1NdIh3IMl9i53ZQkNRERWw4svt0wnOMhqslNR5bLjuf3x9weX46PpFrweyB+dERsHBF/COwM3N35TyBpvDBZk6TmtgbmR8QDVGNlby/jas8EzoiInwMzgcvL9ZcDW5YJBB8q15GZDwE3UCV6twKnOBNU0kjYDSpJTWTmg8BeTeKPUW17Nzj+ItUSHc2edSFwYbvLKGlisGVNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcamdPoNImI6cBnwBmANcCLwc+B6YHtgIXBUZq4o118KzAaeB+Zk5gMlfjzwMaAfuCAzr+p02SVJkrptNFrWPg/cmpm7AW8CfgacCdyRmQHcCZwFEBGzgZ0ycxfgZODLJT4DOAfYB9gPOLckgZIkSeNaR5O1iJgG/HFmXgGQmatKC9oRwNxy2dxyTvn7qnLtAmB6RMwC3g7My8wVmbkcmAcc2smyS5Ik1UGnu0F3BJ6OiCuoWtXuBT4EzMrMpQCZuSQiXlOu3wZ4suH+RSU2OP5UiUmSJI1rnU7WpgB7Aadm5r0R8VmqLtD+Ia6f1OS8v0mcYZ4BwIwZmzJlyuQRFlcTUV/f1G4XQU3MnDmVnp5p3S6GJHVdp5O1RcCTmXlvOf8GVbK2NCJmZebSiNgK+HXD9ds13L8tsLjEDxgUnz/cG/f1vbD
hpdeE0Nu7sttFUBO9vStZtuy5Ed1jcidpPOromLXS1flkROxaQgcB/wHcDMwpsTnATeX4ZuC9ABGxP7C8PON24JCImF4mGxxSYpIkSeNax5fuAD4IXBMRrwAeBU4AJgM3RMSJwBPAkQCZeWtEvCMifkG1dMcJJd4XEedTjXnrB84rEw0kSZLGtY4na5n5Y6olNwY7eIjrTxsifiVwZdsKJknDiIhtqWanbwWsBv4xM78QEecC72ft8I2zM/O2cs9ZVGtJrgJOz8x5JX4o8Dmq3ozLM/OiUf0wksa00WhZk6SxaBVwRmY+EBFTgR9FxHfKa5/JzM80XhwRuwFHAbtRjau9IyJ2oZog9UWqYSCLgXsi4qbM/NlofRBJY5vJmiQ1kZlLgCXleGVEPMzaJYOazVA/ArguM1cBCyPiEWDfcu0jmfk4QERcV641WZPUEvcGlaR1iIgdgD2ABSV0akQ8EBGXNeymMtR6kEOtHylJLbFlTZKGUbpAb6Qag7YyIr4EfDIz+yPiU8AlwPsYej3IZl+Kh10nElwrUq1xncj6audakSZrkjSEiJhClahdnZk3AWTmsoZLvgrcUo6HWidyEvDaJvFhuVakWuE6kfU10rUih0vsTNYkaWhfAx7KzM8PBCJiqzKeDeBdwE/L8c1UyxR9lqqbc2fgbqqWtZ0jYnvgV8DRwDGjVH5J44DJmiQ1ERFvBd4DPBgR91N1XZ4NHBsRewBrgIXAyQCZ+VBE3AA8BLwEnJKZ/cDqiDgNmMfapTseHu3PI2nsMlmTpCYy83tUC3gPdtsw91wIXNgkfhsQ7SudpInE2aCSJEk1ZrImSZJUYyZrkiRJNWayJkmSVGMma5IkSTVmsiZJklRjJmuSJEk1ZrImSZJUYyZrkiRJNWayJkmSVGMma5IkSTXW8b1BI2IhsIJq0+OXMnPfiJgBXA9sT7UR8lGZuaJcfykwG3gemJOZD5T48cDHqDZTviAzr+p02SVJkrptNFrW1gAHZOaemblviZ0J3JGZAdwJnAUQEbOBnTJzF+Bk4MslPgM4B9gH2A84NyKmj0LZJUmSumo0krVJTd7nCGBuOZ5bzgfiVwFk5gJgekTMAt4OzMvMFZm5HJgHHNrpgkuSJHXbaCRr/cDtEXFPRLyvxGZl5lKAzFwCvKbEtwGebLh3UYkNjj9VYpIkSeNax8esAW/JzCUR0QPMi4ikSuCamdTkvL9JnGGeAcCMGZsyZcrkERdWE09f39RuF0FNzJw5lZ6ead0uhiR1XceTtdJyRmYui4hvAfsCSyNiVmYujYitgF+XyxcB2zXcvi2wuMQPGBSfP9z79vW90J4PoHGvt3dlt4ugJnp7V7Js2XMjusfkTtJ41NFu0IjYNCKmluPNgD8FHgRuBuaUy+YAN5Xjm4H3luv3B5aX7tLbgUMiYnqZbHBIiUmSJI1rnR6zNgu4KyLuB34I3JKZ84CLqJKvBA4CPg2QmbcCj0XEL4CvAKeUeB9wPnAvsAA4r0w0kCRJGtc62g2amY8BezSJ9wIHD3HPaUPErwSubGPxJEmSas8dDCRJkmrMZE2SJKnGTNYkSZJqbDTWWZOkMScitqXaUWUrYDXw1cy81L2NJY02W9YkqblVwBmZuTvwZuDUiHgd7m0saZSZrElSE5m5ZKBlLDNXAg9TLcjt3saSRpXJmiStQ0TsQLUM0Q9xb2NJo8wxa5I0jLILy43A6Zm5MiI6vrcxuL+xWuPexvXVzv2NTdYkaQgRMYUqUbs6Mwe2xev43sbg/sZqjXsb19dI9zceLrGzG1SShvY14KHM/HxDzL2NJY0qW9YkqYmIeCvwHuDBsr9xP3A21d7GN0TEicATwJFQ7W0cEe8oexs/D5xQ4n0RMbC3cT/ubSxphEzWJKmJzPweMNSgMfc2ljRqWuoGjYgvNInNbXatJNWJ9ZeksW7YlrWIuAzYEfijiHh9w0uvAFzUUVJtWX9JGi/W1Q36KWAH4PPAeQ3xVVQLREp
SXVl/SRoXhk3WMnMh1d53b4qIzam+jQ6sGTQV6O1k4SRpfVl/SRovWppgEBFnUe1/90xDuJ+qi0GSasv6S9JY1+ps0PdRbVC8rJOFkaQOsP6SNKa1uijuE9hlIGlssv6SNKa12rL2CHBXRMwHfjMQzMxPtnJzRGxEtSDkosw8vGyKfB0wA7gPOC4zV0XExsBVwN7A08C7M/OJ8oyzgBOpBgefnpnzWiy7pIltg+ovSeq2VlvWngJuA16kGqA78KdVpwMPNZxfBFySmQEsB04q8ZOA3szcBfgccDFAROwOHAXsBswGvhQRI3l/SRPXhtZfktRVLbWsZeZ5676quYjYFngHcAFwRgn/CXBMOZ4LnAt8BTiiHEO1efLAYpaHA9dl5ipgYUQ8AuwLLFjfckmaGDak/pKkOmh1NugaqtlTjRZn5nYt3P5Z4COURSgjYgugLzPXlNcXAduU422AJwEyc3VErIiImSX+g4ZnPtVwjyQNaQPrL0nqulZb1n7XXRoRrwDeCbx5XfdFxJ8BSzPzgYg4oISbdUH0N7w2WP8w8SHNmLEpU6YMta2ftFZf39RuF0FNzJw5lZ6eaRv8nPWtvySpLka8kXtmvgR8PSI+1sLlbwUOj4h3AK8CplGNRZseERuV1rVtgcXl+kXAdsDiiJgMTM/MvogYiA9ovKepvr4XRvKxNIH19q7sdhHURG/vSpYte25E96wruRth/SVJtdBqN+h7G04nAa8HXlrXfZl5NnB2ecbbgL/OzL+MiOuBI4HrgeOBm8otN5fzBeX1Oxvi10TEZ6m6P3cG7m6l7JImtvWtvySpLlptWTuw4bifsqzGBrzvmcB1EXE+cD9weYlfDlxdJhA8AxwNkJkPRcQNVDNKXwJOycxhu0ElqWh3/SVJo6rVMWsnlLEeUe75aZmZ2bLM/C7w3XL8GLBfk2tepFqio9n9FwIXjuQ9Jakd9ZckdVNL66xFxN5UC0vOBa4AnoiIlyVbklQ31l+SxrpWu0EvpdpNYAFAROxPtQbavp0qmCS1ifWXpDGt1R0Mpg5UdACZ+UPglZ0pkiS1lfWXpDGt1WStNyKOGDiJiHdSTQCQpLqz/pI0prXaDfoB4NsRcTnV1Pd+4C0dK5UktY/1l6QxrdWWtdnAC8D2VNPglwEHdKhMktRO1l+SxrRWk7UPAG/NzOcz8yfA3sBfda5YktQ21l+SxrRWu0FfAfy24fy3rGNvTkmqifWuv0rX6WFUexy/scTOBd4P/LpcdnZm3lZeOws4EVgFnJ6Z80r8UKqt9jYCLs/Mizb0Q0maOFpN1r4F3Fl2EegH/oK1W0RJUp1tSP11BdUyH1cNin8mMz/TGIiI3agW9d6Nav/iOyJiF6pxcl8EDqLa0/ieiLgpM3+2np9H0gTTUjdoZn6Uaq2iAHYCLs3Mj3eyYJLUDhtSf2XmXUBfk5cmNYkdAVyXmasycyHVQrz7lj+PZObjZSP568q1ktSSVlvWyMwbgRs7WBZJ6ogO1F+nRsRxwL3AX2fmCmAb4AcN1zxVYpOAJxvii3BBXkkj0HKyJkkC4EvAJzOzPyI+BVwCvI/mrW39NO/BWOeYuRkzNmXKlMkbVFCNf319U7tdBA1h5syp9PRMa8uzTNYkaQQyc1nD6VeBW8rxImC7hte2pRqjNgl4bZP4sPr6XtiwgmpC6O1d2e0iaAi9vStZtuy5lq8fLrEzWZOk4U2iodUsIrbKzCXl9F3AT8vxzcA1EfFZqu7PnYG7qVrWdo6I7YFfAUcDx4xS2SWNAyZrkjSEiLiWagHdLSLiCeBc4MCI2ANYAywETgbIzIfKjNOHgJeAUzKzH1gdEacB81i7dMfDo/1ZJI1dJmuSNITMPLZJ+Iphrr8QuLBJ/Daq2aiSNGKt7mAgSZKkLjBZkyRJqjGTNUmSpBrr6Ji1iNgE+Ddg4/JeN2bmeRGxA9Uq3jOA+4DjMnNVRGxMta3L3sDTwLsz84nyrKZ77kmSJI1nHW1Zy8w
XgQMzc09gD2B2ROwHXARckpkBLAdOKrecBPRm5i5Umx5fDBARu7N2z73ZwJciotkClJIkSeNKx7tBM3NgZcdNqFrX+oEDgW+U+FzgneX4iHIO1dYwf1KOD6f5nnuSJEnjWseTtYjYKCLuB5YA3wF+CSzPzDXlkkVUC0hS/n4SIDNXAysiYmZjvHiq4R5JkqRxq+PrrJWkbM+I2Bz4JlVX5mAD++QNtbfeUPEhua+eWuXeevXUzn31JGksG7VFcTPz2Yj4LrA/8OqI2Kgkco375A3srbc4IiYD0zOzLyKG2nNvSO6rp1a5t149jXRfPRh+bz1JGqs62g0aEVtGxPRy/CrgYKqtWOYDR5bLjgduKsc3l3PK63c2xI+OiI0j4g9Zu+eeJEnSuNbpMWtbA/Mj4gFgAXB7Zt4KnAmcERE/B2YCl5frLwe2jIhHgA+V68jMh4CBPfduZe2ee5IkSeNaR7tBM/NBYK8m8ceA/ZrEX6RaoqPZs5ruuSdJkjSeuYOBJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNdXRvUEkayyLicuAwYGlmvrHEZgDXA9sDC4GjMnNFee1SYDbwPDAnMx8o8eOBjwH9wAWZedUofxRJY5gta5I0tCuAtw+KnQnckZkB3AmcBRARs4GdMnMX4GTgyyU+AzgH2AfYDzg3IqaPTvEljQcma5I0hMy8C+gbFD4CmFuO55bzgfhV5b4FwPSImEWV7M3LzBWZuRyYBxza6bJLGj9M1iRpZF6TmUsBMnMJ8JoS3wZ4suG6RSU2OP5UiUlSSxyzJkntManJeX+TOCU+rBkzNmXKlMntKJfGsb6+qd0ugoYwc+ZUenqmteVZJmuSNDJLI2JWZi6NiK2AX5f4ImC7huu2BRaX+AGD4vPX9SZ9fS+0p7Qa13p7V3a7CBpCb+9Kli17ruXrh0vs7AaVpOFN4vdbx24G5pTjOcBNDfH3AkTE/sDy0l16O3BIREwvkw0OKTFJaonJmiQNISKuBb4P7BoRT0TECcCnqZKvBA4q52TmrcBjEfEL4CvAKSXeB5wP3AssAM4rEw0kqSUd7QaNiG2pZkdtBawGvpqZl7pOkaSxIDOPHeKlg4e4/rQh4lcCV7anVJImmk63rK0CzsjM3YE3A6dGxOtwnSJJkqSWdDRZy8wlAy1jmbkSeJhqcK3rFEmSJLVg1MasRcQOwB7AD4FZrlMkSZK0bqOydEdETAVuBE7PzJURMdQaQ21bp8hOvvb/AAAIeUlEQVQ1itQq1ymqp3auUSRJY1nHk7WImEKVqF2dmQNT3Du+TpFrFKlVrlNUTyNdowiGX6dIksaq0egG/RrwUGZ+viHmOkWSJEkt6PTSHW8F3gM8GBH3U3Vdng1cBNwQEScCTwBHQrVOUUS8o6xT9DxwQon3RcTAOkX9uE6RJEmaIDqarGXm94ChBo65TpEkSdI6uIOBJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo1N6XYBJGksioiFwApgDfBSZu4bETOA64HtgYXAUZm5olx/KTAbeB6Yk5kPdKHYksYgW9Ykaf2sAQ7IzD0zc98SOxO4IzMDuBM4CyAiZgM7ZeYuwMnAl7tRYEljk8maJK2fSby8Dj0CmFuO55bzgfhVAJm5AJgeEbNGo5CSxr6OdoNGxOXAYcDSzHxjiY24myAijgc+BvQDF2TmVZ0styS1oB+4PSL6ga9k5mXArMxcCpCZSyLiNeXabYAnG+59qsSWjmaBJY1NnR6zdgXwBco3ymKgm+DiiPgoVTfBmY3dBBGxH1U3wf4luTsH2Ivqm+yPIuKmgQRPkrrkLSUh6wHmRURSJXDNTGoSG+p
aAGbM2JQpUyZvaBk1zvX1Te12ETSEmTOn0tMzrS3P6miylpl3RcT2g8JHAG8rx3OB+VQJ3O91E0TEQDfBgcC8hta3ecChVK1zktQVmbmk/L0sIr4F7AssjYhZmbk0IrYCfl0uXwRs13D7tsDi4Z7f1/dCB0qt8aa3d2W3i6Ah9PauZNmy51q+frjErhtj1l7T2E0ADNVNsKjEhuo+kKSuiIhNI2JqOd4M+FPgQeBmYE65bA5wUzm+GXhvuX5/YPlAPShJ61KnCQaDuwkmUXUTjLj7QJI6bBZwV0TcD/wQuCUz5wEXAYeULtGDgE8DZOatwGMR8QvgK8Ap3Sm2pLGoG+usjbSbYBFwwKD4/HW9ieM91CrHfNRTO8d7tFtmPgbs0STeCxw8xD2ndbpcksan0UjWJvH7rWMD3QQX8fJuglOB6xu7CSLiduCCiJhO1RJ4CNUYt2E53kOtcsxHPY10vAcMP+ZDksaqjnaDRsS1wPeBXSPiiYg4gapboOVugszsA84H7gUWAOdl5vJOlluSJKkuOj0b9NghXhpRN0FmXglc2Z5SSZIkjR11mmAgSZKkQUzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqbEq3C1AXq1evZuHCR7tdDA2yww47Mnny5G4XQ6o166/6sg5TO4ypZC0iDgU+R9UieHlmXtSuZy9c+ChnXXI9m03vadcjtYGeX7GMC//63ey00y7dLoq0way/Jh7rMLXLmEnWImIj4IvAQcBi4J6IuCkzf9au99hseg+bz9y6XY+TJMD6S9KGGUtj1vYFHsnMxzPzJeA64Igul0mSWmH9JWm9jaVkbRvgyYbzRSUmSXVn/SVpvY2ZblBgUpNYfzvf4PkVy9r5OG2g0fx9rHz62VF7L63bOPx9WH9NQKP1OxmH/72Mee3+nYylZG0R8NqG822pxn401dMzrVnlOKSenr2Y//W91rNoGst6evbiO/t/q9vF0Pg2ovoLRlaHWX9NXNZfE8NYStbuAXaOiO2BXwFHA8d0t0iS1BLrL0nrbcyMWcvM1cBpwDzgP4DrMvPh7pZKktbN+kvShpjU39/WYROSJElqozHTsiZJkjQRmaxJkiTVmMmaJElSjY2l2aBqQSf3H1R9RcTlwGHA0sx8Y7fLI60v67CJyTpseLasjSMN+w++HXg9cExEvK67pdIouYLq9y6NWdZhE5p12DBM1sYX9x+coDLzLqCv2+WQNpB12ARlHTY8k7Xxxf0HJY1l1mFSEyZr40vH9x+UpA6yDpOaMFkbX0a8/6Ak1Yh1mNSEs0HHF/cfnNgm0bxlQhorrMMmNuuwIdiyNo64/+DEFRHXAt8Hdo2IJyLihG6XSRop67CJyzpseO4NKkmSVGO2rEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo25KK5qLyKmARcCbwNeotrs92+AzYFPZOaBXSyeJA3J+kvtYMuaai0iJgG3As8Ab8rMvYDzS2wm7hsoqaasv9Qutqyp7g4EtsvMcwcCmfn/yurW0wZiEfE24FPAq4BXAx/OzFsi4ljgI8Aq4DHgL4Ee4BpgU2AN8MHMvHuUPo+kicP6S21hy5rqbk/ggcHBzLwN+HVD6FTgpMz8I+D9VBUfVN9iD8nMfagqu9cBJwG3ZOa+wDnAf+lc8SVNYNZfagtb1lR3a4DftHDdccBhEXEUsD8wtcRvBr4fEd8EvpGZP4mIqcA3ImIv4P8CX+xAuSXJ+kttYcua6u5eYK/BwYi4AJjUELoL2Kdc/7vXMvPDwLuAXuCaiDg2M78P7A7cBhwFfLuTH0DShGX9pbZwI3fVXkTcBcwDPpWZayLi7cDXgA8D/wP4C+BRYFZm/jYiLgKOBnYEHgb
+a2YuiYiPU83AWgM8lZmXRsR2wH2Z2TP6n0zSeGf9pXawG1RjweHA54CfRsRvgaeB2cAMgMzsi4ivAQ+V1++kGny7MfBx4I6I+E9gKTAHeCVwbRnku4qqC0KSOsH6SxvMljVJkqQac8yaJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1dj/B8YHrJlpTLs6AAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10,4))\n", "sns.countplot(y_train, ax=ax0)\n", "ax0.set(xlabel='Class')\n", "plt.title('Training Class Distribution')\n", "sns.countplot(y_test, ax=ax1)\n", "ax1.set(xlabel='Class')\n", "plt.title('Holdout Class Distribution')" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Class 0 makes up 2.261% of the Model Building data\n", "Class 0 makes up 2.474% of the Holdout data\n" ] } ], "source": [ "print('Class 0 makes up {:2.3f}% of the {} data'\n", " .format(y_train[y_train == 0].shape[0] * 100 / y_train.shape[0], 'Model Building'))\n", "print('Class 0 makes up {:2.3f}% of the {} data'\n", " .format(y_test[y_test == 0].shape[0] * 100 / y_test.shape[0], 'Holdout'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From these printouts, we see that the model building and holdout data have slightly different distributions. Modeling is based on the assumption that the data is being produced by a fairly non-random, approximately consistent process. From this assumption, we treat the training data as a representative sample of data produced by this process. If we use a biased training set, then our model will probably be slightly biased." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using Stratification to produce comparable model-building and holdout data" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X.values, \n", " y.values, \n", " stratify=y,\n", " test_size=0.3,\n", " random_state=nb_seed)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The testing data will be held out for validation at the end." ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmsAAAEcCAYAAACYg/MAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XuYXXV56PFvSASFhJjAGCgglNsr6KNcykU9PUKBSiwFjz0gYJEAKucAFaW1Ah5BRIrQgxf0WK0gBAoFxCrg4YFIybHFSwABxYKvKAQIMTEwk0CgIknm/LF+Y7bDnsmeZO/Za2a+n+fJk7Xevdbavz0Tfrz7d53U39+PJEmS6mmjbhdAkiRJQzNZkyRJqjGTNUmSpBozWZMkSaoxkzVJkqQaM1mTJEmqMZM1SVItRcQVEfHJYV5fExE7jmaZhhIR8yPixG6Xo1FEPBcRO7TpWWdFxD+W4+3Lz74tOUREbBcRz0bEpHY8bzya0u0CSJLGp4h4DDgpM+9siB0PvC8z/7gNb9GWhUIjYj5wdWZ+bZhrXgF8DDgW2BpYBtwJfDIzn2hHOVoVEW8r7/18CS0HfgD8fWbeO3BdZk5r8Vn/lJnbDXddZl44KLTeP/vB/y4y80lg8/V93kRgy5okabS1azX20WyJ+QZwGHA0MB14E/Aj4KBRLEOjpzJz88zcHNgf+Bnw7xFx4AifM4l1/D4iYvJ6llFtYsuaJKlrIuJ1wD8AewCLgLMz85Yhrv0I8GFgDfBxGpKMiNgc+CJwKFWL02WZeUF57Vxg58w8rpxvDzxG9f/ATwJ/DOwXEZ8DrszMDw5634OpkrJdMnNxCT9Xyt2snDsCX6VK6NYA84BTMvPZ8vpHgb+iak16qrw2PyL2Ab4E7Aq8AFyTmX+zrp9hKdO5ETETuAjYt7zPmvK5H42IdwB/D2wHrAA+C3wZuBXYOCKeKz/PXYGTgTcAvwH+HDgjIrZr/BlSJXknRcQnyvklmfmZ8r5XAE9m5jnl/HetdxFxFfBa4JaIWF1+/l+n/D4yc01EbF3K9l+AZ4CLM/Oy8qxzgd1L2f4b8DhwfGbet66f01hmy5okaTT9rjUsIqYAtwC3AT3AB4FrImKXwTdFxKHAGZSkCTh40CVfBKYBOwAHAO+NiBMaXh/cetQPkJn/C/h34LTSUvVBXu4g4O6GRG1dJgF/B2wF7AZsC3yifI5dgVOBvUur2NuBheW+zwOfy8zpwE7ADS2+34B/AfaKiFeV88bPfBnw/vKebwDuzMwXgNnA4sycVj7/knL94cANmflq4Nomz4Pq57xT+QxnRsSfDFO2gZ/3e4EngMPK+/3vJs++rlyzFXA
k8HeDWgz/vJRpOtW/n/8zzPuOC7asSZI66VsRsarhfBOq7kOANwObZeZF5Xx+RHwbOIaqxaXRkcAVmfkwQGnROaYcbwQcBbypJCCPR8QlwHHAFW34DFsAv2r14sz8JfDLcvpMRHwWOKecrwY2Bt4QEc8MGu/2W2DniNgiM58B7h5hORdTJYqvBv6T3+8m/i3w+oh4MDNXAA+s41k/GGjhzMzfRESzaz6Rmb8Bflpa046hGkvXiqZd2KUF7y3A7Mx8CfhxRFxG9bucXy67KzNvL9dfDZze4nuOWbasSZI66YjMnDnwBzil4bWtgScHXf84sE2T5/zBoGsfbzjeEngFVWvMup6zPp6hKmtLIqInIv45IhZFxHLgn0oZBxK5D1G1tC2NiGtLtx/ASUAAP4uIBRHxZyMs5zZULVTLm7z2F8CfUSWy8yNi/3U8a/DvZbB+qm7rAY9T/Y421NZAb0m6G5/d+Ltc0nD8AvDKds1Mratx/eEkSV033CSAxVRjqBq9lmoc12C/GnTt9qztOnsaeKnEGl8feM7zwKYNrw1OvNY14eEOYN+IaDUZuZBqrNobSjfiX9Lwc8jM68ps2IHyfrrEf5mZx2ZmD3AxcGNDl2Yr3gXcl5n/OfiFzPxRZr6Tqrv5JtZ2sQ712VuZBNL4+3gt1e8TNuznvRiYGRGbDXp2s38TE4bdoJKkblkAPB8Rfwt8hmpA+WGU8V2D3AB8rXR7Pc7abkXKoPQbgAvK0iBbUE1EuLhc8gDwt6WL7VngzEHPXgoMuV5bZv5rRHwH+GZE/E/gx8CrgPcAL2bmlYNumUbVuvVsRGwDfGTghTJmbRvge1Rdk7/rroyI9wC3Z+bTVJMA+qm6TZtpHPv3B8D7gROpxnP9nrLsyJHAtzPz2TKZYKBreimwRURsPjABokWTgI9HxAeofnYnUC1rAtXP+4yIuICq23twN+WSck9jl+kkgMxcFBHfBy4sE0qCqsXxWIY27tdns2VNktQpw7bOlDFJhwPvoGod+yJwXGY+Mvj+zLwN+BzV/+B/DvzroMd9kKpL7FHg36hmH15R7r0DuB74CXAP1aD0Rp8HjoyIZ8qM0Gb+O9XMyeupErEHgb2pWt0Gf9bzymvLy3t9o+G1Taha0pZRtSL1AGeX1w4F/iMinqWarfnuzPztEOXZuiwk+xzV2LbXA2/LzMafS2OZjgMeK92yH6Bq7SMzE/hn4NGI6I2IrYZ4v8H6ge8CvwC+QzVjc+C9r6b6WS+kmjxy3aB7P02V6PVGxBlNynoM8IdUP59vAB9vXKtviLKMa5P6+zv7GSPiw1RZ8Rqqf9wnUPVrXwfMAO6j+o9zVURsDFxF9Y/8aap/qE+U55xF9a1hFXB6Zs7raMElTWgRsQnV//Q3puqFuDEzzysDqd/G2paPOZn5k3LPpVSz654v8QdK/HiqBVX7gQsy86rR/jySxq6OtqyVptm/AvbKzDdSVXjHUK0Dc0lmBtU3j5PKLSdRDSzcheob1MXlObtTzfTZjaoi/JLbUkjqpMx8ETgwM/ekWgNsdkTsV17+m8zcMzP3akjUZgM7lfrrZKp1ooiIGVRddvsA+1GthzV9lD+OpDFsNLpBJwOblfV0XkXVrHkga5uF5wLvLMdHlHOAG4GBNVsOB67LzFWZuRB4hLLonyR1SsOMtE2ovmyuKefNviweQdUzQGYuAKZHxCyqNajmZeaKzFxOtUDqoR0tuKRxpaPJWllA8BKq6dRPUXUb3Acsz8yBSm8Ra6fkbkOZLpyZq4EVZUXm38WLp2jflGxJaioiNoqI+6kGRH8nM+8pL30qIh6IiEvK4G14eT01ULdZf0naIJ3uBn011bfN7anGqW1G1Y052MDAuWbfVvuHiUtSx2TmmtINui3V0g27A2dm5m5U3ZpbAB8tlw+upwb2XLT+krRBOt0NejDwaGb2lpayb1KtTPzqhgXstmXt2iyLKOu2lI1jp2dmX2O8yT1NrVq1up+qQvSPf/w
zcf50RFnS4LvAoZm5tMReolodf2BIxlD11CKqdaIGx4dlHeYf/0y4P0Pq9DprTwD7R8QrgRep9le7h+rb6JFUU6CPp1qgD+Dmcr6gvH5nQ/yasmXHNsDOrGMbjr6+F4Z7WdI41NMzrW3PiogtgZcyc0VZmPRg4NMRsVVmLimTnN4J/LTccjPVno/Xl9Xhl2fm0oi4nWr9r+lUX5AP4eXrfL2MdZg0sQxXf3V6zNrdVBMF7qdaRHAS8I9UFdUZEfFzYCZwebnlcmDLiHiEajuOM8tzHqJaEPEhqnVuTsnMYbNQSdpAW1PtVfkA1RfI2zPzVqovjj+mqtO2AD4FUF57LCJ+AXyFsq1S6R04H7i3POe8MtFAklrS8XXWumXZsufG5weTNKSenmnjZkkf6zBpYhmu/nIHA0mSpBozWZMkSaoxkzVJkqQaM1mTJEmqsU4v3SHV3urVq1m48NFuF0OD7LDDjkyePLnbxZCkrjNZ04S3cOGjfPzrn2Tqlpt3uygqVj79LOcfeQ477bRLt4siSV1nsiYBU7fcnOlbzeh2MSRJehnHrEmSJNWYyZokSVKN2Q0qSdIY5QSp+mrnJCmTNUmSxignSNVTuydJmaxJkjSGOUFq/HPMmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo11dJ21iNgVuB7oByYBOwIfB64u8e2BhcBRmbmi3HMpMBt4HpiTmQ+U+PHAx8qzLsjMqzpZdkmSpDroaMtaZv48M/fMzL2AvakSsG8CZwJ3ZGYAdwJnAUTEbGCnzNwFOBn4conPAM4B9gH2A86NiOmdLLskSVIdjGY36MHALzPzSeAIYG6Jzy3nlL+vAsjMBcD0iJgFvB2Yl5krMnM5MA84dBTLLkmS1BWjmay9G7i2HM/KzKUAmbkEeE2JbwM82XDPohIbHH+qxCRJksa1UdkbNCJeARwOfLSE+oe4dFKT8/4m8eGeIUkbLCI2Af4N2JiqrrwxM8+LiB2A64AZwH3AcZm5KiI2puoZ2Bt4Gnh3Zj5RnnUWcCKwCjg9M+eN9ueRNHaN1kbus4EfZebT5XxpRMzKzKURsRXw6xJfBGzXcN+2wOISP2BQfP5wbzhjxqZMmTK5HWXXONfXN7XbRVATM2dOpadnWtfePzNfjIgDM/OFiJgMfC8ibgPOAC7JzK9HxD8AJwFfKX/3ZuYuEfFu4GLg6IjYHTgK2I2q7rojInbJTL9wSmrJaCVrxwD/3HB+MzAHuKj8fVND/FTg+ojYH1heErrbgQvKpIKNgEOoJikMqa/vhXaWX+NYb+/KbhdBTfT2rmTZsudGdE+7k7vMHKhINqGqL/uBA6nqNKjG3J5LlawdUY4BbgS+UI4PB67LzFXAwoh4BNgXWNDWwkoatzo+Zi0iXkU1ueBfGsIXAYdERAIHAZ8GyMxbgcci4hdUld8pJd4HnA/cS1XBnVcmGkhSx0TERhFxP7AE+A7wS6ovkWvKJQPjaqFhbG1mrgZWRMRMHHMraQN1vGUtM/8T6BkU66VK4Jpdf9oQ8SuBK9tcPEkaUknK9oyIzamWHdqtyWUD3ZlDja1drzG3DuVQKxzGUV/tHMoxWt2gkjRmZeazEfFdYH/g1RGxUUnkBsbVwtoxt4vLGLfpmdkXEUONxR2WQznUCodx1NdIh3IMl9i53ZQkNRERWw4svt0wnOMhqslNR5bLjuf3x9weX46PpFrweyB+dERsHBF/COwM3N35TyBpvDBZk6TmtgbmR8QDVGNlby/jas8EzoiInwMzgcvL9ZcDW5YJBB8q15GZDwE3UCV6twKnOBNU0kjYDSpJTWTmg8BeTeKPUW17Nzj+ItUSHc2edSFwYbvLKGlisGVNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEm
SasxkTZIkqcamdPoNImI6cBnwBmANcCLwc+B6YHtgIXBUZq4o118KzAaeB+Zk5gMlfjzwMaAfuCAzr+p02SVJkrptNFrWPg/cmpm7AW8CfgacCdyRmQHcCZwFEBGzgZ0ycxfgZODLJT4DOAfYB9gPOLckgZIkSeNaR5O1iJgG/HFmXgGQmatKC9oRwNxy2dxyTvn7qnLtAmB6RMwC3g7My8wVmbkcmAcc2smyS5Ik1UGnu0F3BJ6OiCuoWtXuBT4EzMrMpQCZuSQiXlOu3wZ4suH+RSU2OP5UiUmSJI1rnU7WpgB7Aadm5r0R8VmqLtD+Ia6f1OS8v0mcYZ4BwIwZmzJlyuQRFlcTUV/f1G4XQU3MnDmVnp5p3S6GJHVdp5O1RcCTmXlvOf8GVbK2NCJmZebSiNgK+HXD9ds13L8tsLjEDxgUnz/cG/f1vbDhpdeE0Nu7sttFUBO9vStZtuy5Ed1jcidpPOromLXS1flkROxaQgcB/wHcDMwpsTnATeX4ZuC9ABGxP7C8PON24JCImF4mGxxSYpIkSeNax5fuAD4IXBMRrwAeBU4AJgM3RMSJwBPAkQCZeWtEvCMifkG1dMcJJd4XEedTjXnrB84rEw0kSZLGtY4na5n5Y6olNwY7eIjrTxsifiVwZdsKJknDiIhtqWanbwWsBv4xM78QEecC72ft8I2zM/O2cs9ZVGtJrgJOz8x5JX4o8Dmq3ozLM/OiUf0wksa00WhZk6SxaBVwRmY+EBFTgR9FxHfKa5/JzM80XhwRuwFHAbtRjau9IyJ2oZog9UWqYSCLgXsi4qbM/NlofRBJY5vJmiQ1kZlLgCXleGVEPMzaJYOazVA/ArguM1cBCyPiEWDfcu0jmfk4QERcV641WZPUEvcGlaR1iIgdgD2ABSV0akQ8EBGXNeymMtR6kEOtHylJLbFlTZKGUbpAb6Qag7YyIr4EfDIz+yPiU8AlwPsYej3IZl+Kh10nElwrUq1xncj6audakSZrkjSEiJhClahdnZk3AWTmsoZLvgrcUo6HWidyEvDaJvFhuVakWuE6kfU10rUih0vsTNYkaWhfAx7KzM8PBCJiqzKeDeBdwE/L8c1UyxR9lqqbc2fgbqqWtZ0jYnvgV8DRwDGjVH5J44DJmiQ1ERFvBd4DPBgR91N1XZ4NHBsRewBrgIXAyQCZ+VBE3AA8BLwEnJKZ/cDqiDgNmMfapTseHu3PI2nsMlmTpCYy83tUC3gPdtsw91wIXNgkfhsQ7SudpInE2aCSJEk1ZrImSZJUYyZrkiRJNWayJkmSVGMma5IkSTVmsiZJklRjJmuSJEk1ZrImSZJUYyZrkiRJNWayJkmSVGMma5IkSTXW8b1BI2IhsIJq0+OXMnPfiJgBXA9sT7UR8lGZuaJcfykwG3gemJOZD5T48cDHqDZTviAzr+p02SVJkrptNFrW1gAHZOaemblviZ0J3JGZAdwJnAUQEbOBnTJzF+Bk4MslPgM4B9gH2A84NyKmj0LZJUmSumo0krVJTd7nCGBuOZ5bzgfiVwFk5gJgekTMAt4OzMvMFZm5HJgHHNrpgkuSJHXbaCRr/cDtEXFPRLyvxGZl5lKAzFwCvKbEtwGebLh3UYkNjj9VYpIkSeNax8esAW/JzCUR0QPMi4ikSuCamdTkvL9JnGGeAcCMGZsyZcrkERdWE09f39RuF0FNzJw5lZ6ead0uhiR1XceTtdJyRmYui4hvAfsCSyNiVmYujYitgF+XyxcB2zXcvi2wuMQPGBSfP9z79vW90J4PoHGvt3dlt4ugJnp7V7Js2XMjusfkTtJ41NFu0IjYNCKmluPNgD8FHgRuBuaUy+YAN5Xjm4H3luv3B5aX7tLbgUMiYnqZbHBIiUmSJI1rnR6zNgu4KyLuB34I3JKZ84CLqJKvBA4CPg2QmbcCj0XEL4CvAKeUeB9wPnAvsAA4r0w0kCRJGtc62g2amY8BezSJ9wIHD3HPaUPErwSubGPxJEmSas8
dDCRJkmrMZE2SJKnGTNYkSZJqbDTWWZOkMScitqXaUWUrYDXw1cy81L2NJY02W9YkqblVwBmZuTvwZuDUiHgd7m0saZSZrElSE5m5ZKBlLDNXAg9TLcjt3saSRpXJmiStQ0TsQLUM0Q9xb2NJo8wxa5I0jLILy43A6Zm5MiI6vrcxuL+xWuPexvXVzv2NTdYkaQgRMYUqUbs6Mwe2xev43sbg/sZqjXsb19dI9zceLrGzG1SShvY14KHM/HxDzL2NJY0qW9YkqYmIeCvwHuDBsr9xP3A21d7GN0TEicATwJFQ7W0cEe8oexs/D5xQ4n0RMbC3cT/ubSxphEzWJKmJzPweMNSgMfc2ljRqWuoGjYgvNInNbXatJNWJ9ZeksW7YlrWIuAzYEfijiHh9w0uvAFzUUVJtWX9JGi/W1Q36KWAH4PPAeQ3xVVQLREpSXVl/SRoXhk3WMnMh1d53b4qIzam+jQ6sGTQV6O1k4SRpfVl/SRovWppgEBFnUe1/90xDuJ+qi0GSasv6S9JY1+ps0PdRbVC8rJOFkaQOsP6SNKa1uijuE9hlIGlssv6SNKa12rL2CHBXRMwHfjMQzMxPtnJzRGxEtSDkosw8vGyKfB0wA7gPOC4zV0XExsBVwN7A08C7M/OJ8oyzgBOpBgefnpnzWiy7pIltg+ovSeq2VlvWngJuA16kGqA78KdVpwMPNZxfBFySmQEsB04q8ZOA3szcBfgccDFAROwOHAXsBswGvhQRI3l/SRPXhtZfktRVLbWsZeZ5676quYjYFngHcAFwRgn/CXBMOZ4LnAt8BTiiHEO1efLAYpaHA9dl5ipgYUQ8AuwLLFjfckmaGDak/pKkOmh1NugaqtlTjRZn5nYt3P5Z4COURSgjYgugLzPXlNcXAduU422AJwEyc3VErIiImSX+g4ZnPtVwjyQNaQPrL0nqulZb1n7XXRoRrwDeCbx5XfdFxJ8BSzPzgYg4oISbdUH0N7w2WP8w8SHNmLEpU6YMta2ftFZf39RuF0FNzJw5lZ6eaRv8nPWtvySpLka8kXtmvgR8PSI+1sLlbwUOj4h3AK8CplGNRZseERuV1rVtgcXl+kXAdsDiiJgMTM/MvogYiA9ovKepvr4XRvKxNIH19q7sdhHURG/vSpYte25E96wruRth/SVJtdBqN+h7G04nAa8HXlrXfZl5NnB2ecbbgL/OzL+MiOuBI4HrgeOBm8otN5fzBeX1Oxvi10TEZ6m6P3cG7m6l7JImtvWtvySpLlptWTuw4bifsqzGBrzvmcB1EXE+cD9weYlfDlxdJhA8AxwNkJkPRcQNVDNKXwJOycxhu0ElqWh3/SVJo6rVMWsnlLEeUe75aZmZ2bLM/C7w3XL8GLBfk2tepFqio9n9FwIXjuQ9Jakd9ZckdVNL66xFxN5UC0vOBa4AnoiIlyVbklQ31l+SxrpWu0EvpdpNYAFAROxPtQbavp0qmCS1ifWXpDGt1R0Mpg5UdACZ+UPglZ0pkiS1lfWXpDGt1WStNyKOGDiJiHdSTQCQpLqz/pI0prXaDfoB4NsRcTnV1Pd+4C0dK5UktY/1l6QxrdWWtdnAC8D2VNPglwEHdKhMktRO1l+SxrRWk7UPAG/NzOcz8yfA3sBfda5YktQ21l+SxrRWu0FfAfy24fy3rGNvTkmqifWuv0rX6WFUexy/scTOBd4P/LpcdnZm3lZeOws4EVgFnJ6Z80r8UKqt9jYCLs/Mizb0Q0maOFpN1r4F3Fl2EegH/oK1W0RJUp1tSP11BdUyH1cNin8mMz/TGIiI3agW9d6Nav/iOyJiF6pxcl8EDqLa0/ieiLgpM3+2np9H0gTTUjdoZn6Uaq2iAHYCLs3Mj3eyYJLUDhtSf2XmXUBfk5cmNYkdAVyXmasycyHVQrz7lj+PZObjZSP568q1ktSSVlvWyMwbgRs7WBZJ6ogO1F+nRsRxwL3AX2fmCmAb4AcN1zxVYpOAJxvii3BBXkkj0HKyJkk
C4EvAJzOzPyI+BVwCvI/mrW39NO/BWOeYuRkzNmXKlMkbVFCNf319U7tdBA1h5syp9PRMa8uzTNYkaQQyc1nD6VeBW8rxImC7hte2pRqjNgl4bZP4sPr6XtiwgmpC6O1d2e0iaAi9vStZtuy5lq8fLrEzWZOk4U2iodUsIrbKzCXl9F3AT8vxzcA1EfFZqu7PnYG7qVrWdo6I7YFfAUcDx4xS2SWNAyZrkjSEiLiWagHdLSLiCeBc4MCI2ANYAywETgbIzIfKjNOHgJeAUzKzH1gdEacB81i7dMfDo/1ZJI1dJmuSNITMPLZJ+Iphrr8QuLBJ/Daq2aiSNGKt7mAgSZKkLjBZkyRJqjGTNUmSpBrr6Ji1iNgE+Ddg4/JeN2bmeRGxA9Uq3jOA+4DjMnNVRGxMta3L3sDTwLsz84nyrKZ77kmSJI1nHW1Zy8wXgQMzc09gD2B2ROwHXARckpkBLAdOKrecBPRm5i5Umx5fDBARu7N2z73ZwJciotkClJIkSeNKx7tBM3NgZcdNqFrX+oEDgW+U+FzgneX4iHIO1dYwf1KOD6f5nnuSJEnjWseTtYjYKCLuB5YA3wF+CSzPzDXlkkVUC0hS/n4SIDNXAysiYmZjvHiq4R5JkqRxq+PrrJWkbM+I2Bz4JlVX5mAD++QNtbfeUPEhua+eWuXeevXUzn31JGksG7VFcTPz2Yj4LrA/8OqI2Kgkco375A3srbc4IiYD0zOzLyKG2nNvSO6rp1a5t149jXRfPRh+bz1JGqs62g0aEVtGxPRy/CrgYKqtWOYDR5bLjgduKsc3l3PK63c2xI+OiI0j4g9Zu+eeJEnSuNbpMWtbA/Mj4gFgAXB7Zt4KnAmcERE/B2YCl5frLwe2jIhHgA+V68jMh4CBPfduZe2ee5IkSeNaR7tBM/NBYK8m8ceA/ZrEX6RaoqPZs5ruuSdJkjSeuYOBJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNdXRvUEkayyLicuAwYGlmvrHEZgDXA9sDC4GjMnNFee1SYDbwPDAnMx8o8eOBjwH9wAWZedUofxRJY5gta5I0tCuAtw+KnQnckZkB3AmcBRARs4GdMnMX4GTgyyU+AzgH2AfYDzg3IqaPTvEljQcma5I0hMy8C+gbFD4CmFuO55bzgfhV5b4FwPSImEWV7M3LzBWZuRyYBxza6bJLGj9M1iRpZF6TmUsBMnMJ8JoS3wZ4suG6RSU2OP5UiUlSSxyzJkntManJeX+TOCU+rBkzNmXKlMntKJfGsb6+qd0ugoYwc+ZUenqmteVZJmuSNDJLI2JWZi6NiK2AX5f4ImC7huu2BRaX+AGD4vPX9SZ9fS+0p7Qa13p7V3a7CBpCb+9Kli17ruXrh0vs7AaVpOFN4vdbx24G5pTjOcBNDfH3AkTE/sDy0l16O3BIREwvkw0OKTFJaonJmiQNISKuBb4P7BoRT0TECcCnqZKvBA4q52TmrcBjEfEL4CvAKSXeB5wP3AssAM4rEw0kqSUd7QaNiG2pZkdtBawGvpqZl7pOkaSxIDOPHeKlg4e4/rQh4lcCV7anVJImmk63rK0CzsjM3YE3A6dGxOtwnSJJkqSWdDRZy8wlAy1jmbkSeJhqcK3rFEmSJLVg1MasRcQOwB7AD4FZrlMkSZK0bqOydEdETAVuBE7PzJURMdQaQ21bp8hOvvb/AAAIeUlEQVQ1itQq1ymqp3auUSRJY1nHk7WImEKVqF2dmQNT3Du+TpFrFKlVrlNUTyNdowiGX6dIksaq0egG/RrwUGZ+viHmOkWSJEkt6PTSHW8F3gM8GBH3U3Vdng1cBNwQEScCTwBHQrVOUUS8o6xT9DxwQon3RcTAOkX9uE6RJEmaIDqarGXm94ChBo65TpEkSdI6uIOBJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVK
NmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo1N6XYBJGksioiFwApgDfBSZu4bETOA64HtgYXAUZm5olx/KTAbeB6Yk5kPdKHYksYgW9Ykaf2sAQ7IzD0zc98SOxO4IzMDuBM4CyAiZgM7ZeYuwMnAl7tRYEljk8maJK2fSby8Dj0CmFuO55bzgfhVAJm5AJgeEbNGo5CSxr6OdoNGxOXAYcDSzHxjiY24myAijgc+BvQDF2TmVZ0styS1oB+4PSL6ga9k5mXArMxcCpCZSyLiNeXabYAnG+59qsSWjmaBJY1NnR6zdgXwBco3ymKgm+DiiPgoVTfBmY3dBBGxH1U3wf4luTsH2Ivqm+yPIuKmgQRPkrrkLSUh6wHmRURSJXDNTGoSG+paAGbM2JQpUyZvaBk1zvX1Te12ETSEmTOn0tMzrS3P6miylpl3RcT2g8JHAG8rx3OB+VQJ3O91E0TEQDfBgcC8hta3ecChVK1zktQVmbmk/L0sIr4F7AssjYhZmbk0IrYCfl0uXwRs13D7tsDi4Z7f1/dCB0qt8aa3d2W3i6Ah9PauZNmy51q+frjErhtj1l7T2E0ADNVNsKjEhuo+kKSuiIhNI2JqOd4M+FPgQeBmYE65bA5wUzm+GXhvuX5/YPlAPShJ61KnCQaDuwkmUXUTjLj7QJI6bBZwV0TcD/wQuCUz5wEXAYeULtGDgE8DZOatwGMR8QvgK8Ap3Sm2pLGoG+usjbSbYBFwwKD4/HW9ieM91CrHfNRTO8d7tFtmPgbs0STeCxw8xD2ndbpcksan0UjWJvH7rWMD3QQX8fJuglOB6xu7CSLiduCCiJhO1RJ4CNUYt2E53kOtcsxHPY10vAcMP+ZDksaqjnaDRsS1wPeBXSPiiYg4gapboOVugszsA84H7gUWAOdl5vJOlluSJKkuOj0b9NghXhpRN0FmXglc2Z5SSZIkjR11mmAgSZKkQUzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqzGRNkiSpxkzWJEmSasxkTZIkqcZM1iRJkmrMZE2SJKnGTNYkSZJqbEq3C1AXq1evZuHCR7tdDA2yww47Mnny5G4XQ6o166/6sg5TO4ypZC0iDgU+R9UieHlmXtSuZy9c+ChnXXI9m03vadcjtYGeX7GMC//63ey00y7dLoq0way/Jh7rMLXLmEnWImIj4IvAQcBi4J6IuCkzf9au99hseg+bz9y6XY+TJMD6S9KGGUtj1vYFHsnMxzPzJeA64Igul0mSWmH9JWm9jaVkbRvgyYbzRSUmSXVn/SVpvY2ZblBgUpNYfzvf4PkVy9r5OG2g0fx9rHz62VF7L63bOPx9WH9NQKP1OxmH/72Mee3+nYylZG0R8NqG822pxn401dMzrVnlOKSenr2Y//W91rNoGst6evbiO/t/q9vF0Pg2ovoLRlaHWX9NXNZfE8NYStbuAXaOiO2BXwFHA8d0t0iS1BLrL0nrbcyMWcvM1cBpwDzgP4DrMvPh7pZKktbN+kvShpjU39/WYROSJElqozHTsiZJkjQRmaxJkiTVmMmaJElSjY2l2aBqQSf3H1R9RcTlwGHA0sx8Y7fLI60v67CJyTpseLasjSMN+w++HXg9cExEvK67pdIouYLq9y6NWdZhE5p12DBM1sYX9x+coDLzLqCv2+WQNpB12ARlHTY8k7Xxxf0HJY1l1mFSEyZr40vH9x+UpA6yDpOaMFkbX0a8/6Ak1Yh1mNSEs0HHF/cfnNgm0bxlQhorrMMmNuuwIdiyNo64/+DEFRHXAt8Hdo2IJyLihG6XSRop67CJyzpseO4NKkmSVGO2rEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1ZjJmiRJUo25KK5qLyKmARc
CbwNeotrs92+AzYFPZOaBXSyeJA3J+kvtYMuaai0iJgG3As8Ab8rMvYDzS2wm7hsoqaasv9Qutqyp7g4EtsvMcwcCmfn/yurW0wZiEfE24FPAq4BXAx/OzFsi4ljgI8Aq4DHgL4Ee4BpgU2AN8MHMvHuUPo+kicP6S21hy5rqbk/ggcHBzLwN+HVD6FTgpMz8I+D9VBUfVN9iD8nMfagqu9cBJwG3ZOa+wDnAf+lc8SVNYNZfagtb1lR3a4DftHDdccBhEXEUsD8wtcRvBr4fEd8EvpGZP4mIqcA3ImIv4P8CX+xAuSXJ+kttYcua6u5eYK/BwYi4AJjUELoL2Kdc/7vXMvPDwLuAXuCaiDg2M78P7A7cBhwFfLuTH0DShGX9pbZwI3fVXkTcBcwDPpWZayLi7cDXgA8D/wP4C+BRYFZm/jYiLgKOBnYEHgb+a2YuiYiPU83AWgM8lZmXRsR2wH2Z2TP6n0zSeGf9pXawG1RjweHA54CfRsRvgaeB2cAMgMzsi4ivAQ+V1++kGny7MfBx4I6I+E9gKTAHeCVwbRnku4qqC0KSOsH6SxvMljVJkqQac8yaJElSjZmsSZIk1ZjJmiRJUo2ZrEmSJNWYyZokSVKNmaxJkiTVmMmaJElSjZmsSZIk1dj/B8YHrJlpTLs6AAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(10,4))\n", "sns.countplot(y_train, ax=ax0)\n", "ax0.set(xlabel='Class')\n", "plt.title('Training Class Distribution')\n", "sns.countplot(y_test, ax=ax1)\n", "ax1.set(xlabel='Class')\n", "plt.title('Holdout Class Distribution')" ] }, { "cell_type": "code", "execution_count": 135, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Class 0 makes up 2.325% of the Model Building data\n", "Class 0 makes up 2.325% of the Holdout data\n" ] } ], "source": [ "print('Class 0 makes up {:2.3f}% of the {} data'\n", " .format(y_train[y_train == 0].shape[0] * 100 / y_train.shape[0], 'Model Building'))\n", "print('Class 0 makes up {:2.3f}% of the {} data'\n", " .format(y_test[y_test == 0].shape[0] * 100 / y_test.shape[0], 'Holdout'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From these distribution plots, we see that train_test_split with stratify enabled produced comparably imbalanced datasets, and from the printout, we see that the minority class makes up 2.325% of the data in both the model building and holdout data. That is as we expect." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Stratified" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [
 "def stratified_kfold_check(n, X_train_, y_train_, nb_seed=nb_seed):\n",
 "    \"\"\"Print class counts for the training part of each stratified fold.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    n : int\n",
 "        Number of folds (passed to StratifiedKFold as n_splits).\n",
 "    X_train_ : array-like\n",
 "        Feature matrix; it is only handed to split().\n",
 "    y_train_ : array-like\n",
 "        Class labels; the printout counts the values 1 and 0.\n",
 "    nb_seed : int\n",
 "        Seed controlling how samples are shuffled into folds.\n",
 "    \"\"\"\n",
 "    # random_state is only honoured when shuffle=True; scikit-learn >= 0.24\n",
 "    # raises a ValueError if a seed is passed while shuffle is left False.\n",
 "    skfolds = StratifiedKFold(n_splits=n, shuffle=True, random_state=nb_seed)\n",
 "    # split() yields positional indices, so index a plain ndarray rather than\n",
 "    # a possibly label-indexed pandas Series.\n",
 "    y_arr = np.asarray(y_train_)\n",
 "    for train_index, _ in skfolds.split(X_train_, y_arr):\n",
 "        y_train_folds = y_arr[train_index]\n",
 "        print('Total class observations: {:4d}'\n",
 "              .format(len(y_train_folds)))\n",
 "        print('Number of class {} observations: {:4d}'\n",
 "              .format(1, len(y_train_folds[y_train_folds == 1])))\n",
 "        print('Number of class {} observations: {:4d}\\n'\n",
 "              .format(0, len(y_train_folds[y_train_folds == 0])))"
] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total class observations: 6709\n", "Number of class 1 observations: 6553\n", "Number of class 0 observations: 156\n", "\n", "Total class observations: 6709\n", "Number of class 1 observations: 6553\n", "Number of class 0 observations: 156\n", "\n", "Total class observations: 6710\n", "Number of class 1 observations: 6554\n", "Number of class 0 observations: 156\n", "\n", "Total class observations: 6710\n", "Number of class 1 observations: 6554\n", "Number of class 0 observations: 156\n", "\n", "Total class observations: 6710\n", "Number of class 1 observations: 6554\n", "Number of class 0 observations: 156\n", "\n" ] } ], "source": [ "stratified_kfold_check(5, X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total class observations: 7188\n", "Number of class 1 observations: 7021\n", "Number of class 0 observations: 167\n", "\n", "Total class observations: 7188\n", "Number of class 1 
observations: 7021\n", "Number of class 0 observations: 167\n", "\n", "Total class observations: 7189\n", "Number of class 1 observations: 7022\n", "Number of class 0 observations: 167\n", "\n", "Total class observations: 7189\n", "Number of class 1 observations: 7022\n", "Number of class 0 observations: 167\n", "\n", "Total class observations: 7189\n", "Number of class 1 observations: 7022\n", "Number of class 0 observations: 167\n", "\n", "Total class observations: 7189\n", "Number of class 1 observations: 7022\n", "Number of class 0 observations: 167\n", "\n", "Total class observations: 7190\n", "Number of class 1 observations: 7022\n", "Number of class 0 observations: 168\n", "\n" ] } ], "source": [ "stratified_kfold_check(7, X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total class observations: 4193\n", "Number of class 1 observations: 4096\n", "Number of class 0 observations: 97\n", "\n", "Total class observations: 4194\n", "Number of class 1 observations: 4096\n", "Number of class 0 observations: 98\n", "\n" ] } ], "source": [ "stratified_kfold_check(2, X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [
 "def rep_stratified_kfold_check(n, reps, X_train_, y_train_, nb_seed=nb_seed):\n",
 "    \"\"\"Print class counts for the training part of every repeated stratified split.\n",
 "\n",
 "    One block of three lines is printed per split, i.e. n * reps blocks.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    n : int\n",
 "        Number of folds per repetition (n_splits).\n",
 "    reps : int\n",
 "        Number of repetitions (n_repeats).\n",
 "    X_train_ : array-like\n",
 "        Feature matrix; it is only handed to split().\n",
 "    y_train_ : array-like\n",
 "        Class labels; the printout counts the values 1 and 0.\n",
 "    nb_seed : int\n",
 "        Seed passed to RepeatedStratifiedKFold as random_state.\n",
 "    \"\"\"\n",
 "    skfolds = RepeatedStratifiedKFold(n_splits=n, n_repeats=reps, random_state=nb_seed)\n",
 "    # split() yields positional indices, so index a plain ndarray rather than\n",
 "    # a possibly label-indexed pandas Series.\n",
 "    y_arr = np.asarray(y_train_)\n",
 "    # Only the training labels are inspected; the feature and test slices the\n",
 "    # loop used to build were never read.\n",
 "    for train_index, _ in skfolds.split(X_train_, y_arr):\n",
 "        y_train_folds = y_arr[train_index]\n",
 "        print('Total class observations: {:4d}'\n",
 "              .format(len(y_train_folds)))\n",
 "        print('Number of class {} observations: {:4d}'\n",
 "              .format(1, len(y_train_folds[y_train_folds == 1])))\n",
 "        print('Number of class {} observations: {:4d}\\n'\n",
 "              .format(0, len(y_train_folds[y_train_folds == 0])))"
] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5592\n", "Number of class 1 observations: 5462\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5592\n", "Number of class 1 observations: 5462\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5592\n", "Number of class 1 observations: 5462\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5591\n", "Number of class 1 observations: 5461\n", "Number of class 0 observations: 130\n", "\n", "Total class observations: 5592\n", "Number of class 1 observations: 5462\n", "Number of class 0 observations: 130\n", "\n" ] } ], "source": [ "rep_stratified_kfold_check(3, 4, X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total 
class observations: 6710\n", "Number of class 1 observations: 6574\n", "Number of class 0 observations: 136\n" ] } ], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:py36]", "language": "python", "name": "conda-env-py36-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }