{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AddMissingIndicator\n",
    "\n",
    "\n",
    "AddMissingIndicator adds additional binary variables indicating missing data (thus, called missing indicators). The binary variables take the value 1 if the observation's value is missing, or 0 otherwise. AddMissingIndicator adds 1 binary variable per variable.\n",
    "\n",
    "**For this demonstration, we use the Ames House Prices dataset produced by Professor Dean De Cock:**\n",
    "\n",
    "[Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing\n",
    "Data as an End of Semester Regression Project, Journal of Statistics Education, Vol.19, No. 3](http://jse.amstat.org/v19n3/decock.pdf)\n",
    "\n",
    "The version of the dataset used in this notebook can be obtained from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.2.0'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make sure you are using this \n",
    "# Feature-engine version.\n",
    "\n",
    "import feature_engine\n",
    "\n",
    "feature_engine.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "from feature_engine.imputation import (\n",
    "    AddMissingIndicator,\n",
    "    MeanMedianImputer,\n",
    "    CategoricalImputer,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 81 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0   1          60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "1   2          20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "2   3          60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "3   4          70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "4   5          60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \\\n",
       "0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   \n",
       "1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   \n",
       "2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   \n",
       "3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   \n",
       "4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   \n",
       "\n",
       "  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0   2008        WD         Normal     208500  \n",
       "1   2007        WD         Normal     181500  \n",
       "2   2008        WD         Normal     223500  \n",
       "3   2006        WD        Abnorml     140000  \n",
       "4   2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 81 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download the data from Kaggle and store it\n",
    "# in the same folder as this notebook.\n",
    "\n",
    "data = pd.read_csv('houseprice.csv')\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1022, 79), (438, 79))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Separate the data into train and test sets.\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    data.drop(['Id', 'SalePrice'], axis=1),\n",
    "    data['SalePrice'],\n",
    "    test_size=0.3,\n",
    "    random_state=0,\n",
    ")\n",
    "\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add indicators\n",
    "\n",
    "We will add indicators to 4 variables with missing data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Alley          0.939335\n",
       "MasVnrType     0.004892\n",
       "LotFrontage    0.184932\n",
       "MasVnrArea     0.004892\n",
       "dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check missing data\n",
    "\n",
    "X_train[['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']].isnull().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AddMissingIndicator(variables=['Alley', 'MasVnrType', 'LotFrontage',\n",
       "                               'MasVnrArea'])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Start the imputer with the variables for which\n",
    "# we want indicators.\n",
    "\n",
    "imputer = AddMissingIndicator(\n",
    "    variables=['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea'],\n",
    ")\n",
    "\n",
    "imputer.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the variables for which missing \n",
    "# indicators will be added.\n",
    "\n",
    "imputer.variables_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Alley_na</th>\n",
       "      <th>MasVnrType_na</th>\n",
       "      <th>LotFrontage_na</th>\n",
       "      <th>MasVnrArea_na</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>682</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>960</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1384</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1100</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Alley_na  MasVnrType_na  LotFrontage_na  MasVnrArea_na\n",
       "64           1              0               1              0\n",
       "682          1              0               1              0\n",
       "960          1              0               0              0\n",
       "1384         1              0               0              0\n",
       "1100         1              0               0              0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the added indicators. They take the name of\n",
    "# the variable underscore na\n",
    "\n",
    "train_t = imputer.transform(X_train)\n",
    "test_t = imputer.transform(X_test)\n",
    "\n",
    "train_t[['Alley_na', 'MasVnrType_na', 'LotFrontage_na', 'MasVnrArea_na']].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Alley_na          0.939335\n",
       "MasVnrType_na     0.004892\n",
       "LotFrontage_na    0.184932\n",
       "MasVnrArea_na     0.004892\n",
       "dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Note that the original variables still have missing data.\n",
    "\n",
    "train_t[['Alley_na', 'MasVnrType_na', 'LotFrontage_na', 'MasVnrArea_na']].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Indicators plus imputation\n",
    "\n",
    "We normally add missing indicators and impute the original variables with the mean or median if the variable is numerical, or with the mode if the variable is categorical. So let's do that."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Alley           object\n",
       "MasVnrType      object\n",
       "LotFrontage    float64\n",
       "MasVnrArea     float64\n",
       "dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check variable types\n",
    "\n",
    "X_train[['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']].dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first 2 variables are categorical, so I will impute them with the most frequent category. The last variables are numerical, so I will impute with the median."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a pipeline with the imputation strategy\n",
    "\n",
    "pipe = Pipeline([\n",
    "    ('indicators', AddMissingIndicator(\n",
    "        variables=['Alley', 'MasVnrType',\n",
    "                   'LotFrontage', 'MasVnrArea'],\n",
    "    )),\n",
    "\n",
    "    ('imputer_num', MeanMedianImputer(\n",
    "        imputation_method='median',\n",
    "        variables=['LotFrontage', 'MasVnrArea'],\n",
    "    )),\n",
    "\n",
    "    ('imputer_cat', CategoricalImputer(\n",
    "        imputation_method='frequent',\n",
    "        variables=['Alley', 'MasVnrType'],\n",
    "    )),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('indicators',\n",
       "                 AddMissingIndicator(variables=['Alley', 'MasVnrType',\n",
       "                                                'LotFrontage', 'MasVnrArea'])),\n",
       "                ('imputer_num',\n",
       "                 MeanMedianImputer(variables=['LotFrontage', 'MasVnrArea'])),\n",
       "                ('imputer_cat',\n",
       "                 CategoricalImputer(imputation_method='frequent',\n",
       "                                    variables=['Alley', 'MasVnrType']))])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# With fit() the transformers learn the \n",
    "# required parameters.\n",
    "\n",
    "pipe.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can look into the attributes of the\n",
    "# different transformers.\n",
    "\n",
    "# Check the variables that will take indicators.\n",
    "pipe.named_steps['indicators'].variables_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LotFrontage': 69.0, 'MasVnrArea': 0.0}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the median values for the imputation.\n",
    "\n",
    "pipe.named_steps['imputer_num'].imputer_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Alley': 'Pave', 'MasVnrType': 'None'}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the mode values for the imputation.\n",
    "\n",
    "pipe.named_steps['imputer_cat'].imputer_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now, we transform the data.\n",
    "\n",
    "train_t = pipe.transform(X_train)\n",
    "test_t = pipe.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Alley</th>\n",
       "      <th>MasVnrType</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>Alley_na</th>\n",
       "      <th>MasVnrType_na</th>\n",
       "      <th>LotFrontage_na</th>\n",
       "      <th>MasVnrArea_na</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>Pave</td>\n",
       "      <td>BrkFace</td>\n",
       "      <td>69.0</td>\n",
       "      <td>573.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>682</th>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>69.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>960</th>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>50.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1384</th>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1100</th>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Alley MasVnrType  LotFrontage  MasVnrArea  Alley_na  MasVnrType_na  \\\n",
       "64    Pave    BrkFace         69.0       573.0         1              0   \n",
       "682   Pave       None         69.0         0.0         1              0   \n",
       "960   Pave       None         50.0         0.0         1              0   \n",
       "1384  Pave       None         60.0         0.0         1              0   \n",
       "1100  Pave       None         60.0         0.0         1              0   \n",
       "\n",
       "      LotFrontage_na  MasVnrArea_na  \n",
       "64                 1              0  \n",
       "682                1              0  \n",
       "960                0              0  \n",
       "1384               0              0  \n",
       "1100               0              0  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Lets' look at the transformed variables.\n",
    "\n",
    "# original variables plus indicators\n",
    "vars_ = ['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea',\n",
    "         'Alley_na', 'MasVnrType_na', 'LotFrontage_na', 'MasVnrArea_na']\n",
    "\n",
    "train_t[vars_].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Alley             0\n",
       "MasVnrType        0\n",
       "LotFrontage       0\n",
       "MasVnrArea        0\n",
       "Alley_na          0\n",
       "MasVnrType_na     0\n",
       "LotFrontage_na    0\n",
       "MasVnrArea_na     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# After the transformation, the variables do not\n",
    "# show missing data\n",
    "\n",
    "train_t[vars_].isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Automatically select the variables\n",
    "\n",
    "We have the option to add indicators to all variables in the dataset, or to all variables with missing data. AddMissingIndicator can select which variables to transform automatically.\n",
    "\n",
    "When the parameter `variables` is left to None and the parameter `missing_only` is left to True, the imputer add indicators to all variables with missing data.\n",
    "\n",
    "When the parameter `variables` is left to None and the parameter `missing_only` is switched to False, the imputer add indicators to all variables.\n",
    "\n",
    "It is good practice to use `missing_only=True` when we set `variables=None`, so that the transformer handles the imputation automatically in a meaningful way.\n",
    "\n",
    "### Automatically find variables with NA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AddMissingIndicator()"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# With missing_only=True, missing indicators will only be added\n",
    "# to those variables with missing data found during the fit method\n",
    "# in the train set\n",
    "\n",
    "\n",
    "imputer = AddMissingIndicator(\n",
    "    variables=None,\n",
    "    missing_only=True,\n",
    ")\n",
    "\n",
    "# finds variables with missing data\n",
    "imputer.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The original variables argument was None\n",
    "\n",
    "imputer.variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['LotFrontage',\n",
       " 'Alley',\n",
       " 'MasVnrType',\n",
       " 'MasVnrArea',\n",
       " 'BsmtQual',\n",
       " 'BsmtCond',\n",
       " 'BsmtExposure',\n",
       " 'BsmtFinType1',\n",
       " 'BsmtFinType2',\n",
       " 'Electrical',\n",
       " 'FireplaceQu',\n",
       " 'GarageType',\n",
       " 'GarageYrBlt',\n",
       " 'GarageFinish',\n",
       " 'GarageQual',\n",
       " 'GarageCond',\n",
       " 'PoolQC',\n",
       " 'Fence',\n",
       " 'MiscFeature']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# In variables_ we find the list of variables with NA\n",
    "# in the train set\n",
    "\n",
    "imputer.variables_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(imputer.variables_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We've got 19 variables with NA in the train set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1022, 79), (1022, 98))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# After transforming the dataset, we see more columns\n",
    "# corresponding to the missing indicators.\n",
    "\n",
    "train_t = imputer.transform(X_train)\n",
    "test_t = imputer.transform(X_test)\n",
    "\n",
    "X_train.shape, train_t.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>Electrical_na</th>\n",
       "      <th>FireplaceQu_na</th>\n",
       "      <th>GarageType_na</th>\n",
       "      <th>GarageYrBlt_na</th>\n",
       "      <th>GarageFinish_na</th>\n",
       "      <th>GarageQual_na</th>\n",
       "      <th>GarageCond_na</th>\n",
       "      <th>PoolQC_na</th>\n",
       "      <th>Fence_na</th>\n",
       "      <th>MiscFeature_na</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>NaN</td>\n",
       "      <td>9375</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>682</th>\n",
       "      <td>120</td>\n",
       "      <td>RL</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2887</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>HLS</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>960</th>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>50.0</td>\n",
       "      <td>7207</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1384</th>\n",
       "      <td>50</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9060</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1100</th>\n",
       "      <td>30</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>8400</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Bnk</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 98 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "64            60       RL          NaN     9375   Pave   NaN      Reg   \n",
       "682          120       RL          NaN     2887   Pave   NaN      Reg   \n",
       "960           20       RL         50.0     7207   Pave   NaN      IR1   \n",
       "1384          50       RL         60.0     9060   Pave   NaN      Reg   \n",
       "1100          30       RL         60.0     8400   Pave   NaN      Reg   \n",
       "\n",
       "     LandContour Utilities LotConfig  ... Electrical_na FireplaceQu_na  \\\n",
       "64           Lvl    AllPub    Inside  ...             0              1   \n",
       "682          HLS    AllPub    Inside  ...             0              0   \n",
       "960          Lvl    AllPub    Inside  ...             0              1   \n",
       "1384         Lvl    AllPub    Inside  ...             0              1   \n",
       "1100         Bnk    AllPub    Inside  ...             0              1   \n",
       "\n",
       "     GarageType_na GarageYrBlt_na GarageFinish_na GarageQual_na  \\\n",
       "64               0              0               0             0   \n",
       "682              0              0               0             0   \n",
       "960              1              1               1             1   \n",
       "1384             0              0               0             0   \n",
       "1100             0              0               0             0   \n",
       "\n",
       "      GarageCond_na  PoolQC_na  Fence_na  MiscFeature_na  \n",
       "64                0          1         0               1  \n",
       "682               0          1         1               1  \n",
       "960               1          1         1               1  \n",
       "1384              0          1         0               1  \n",
       "1100              0          1         1               1  \n",
       "\n",
       "[5 rows x 98 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Towards the right, we find the missing indicators.\n",
    "\n",
    "train_t.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add indicators to all variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AddMissingIndicator(missing_only=False)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can, in practice, set up the indicator to add\n",
    "# missing indicators to all variables\n",
    "\n",
    "imputer = AddMissingIndicator(\n",
    "    variables=None,\n",
    "    missing_only=False,\n",
    ")\n",
    "\n",
    "imputer.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "79"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the attribute variables_ now shows all variables\n",
    "# in the train set.\n",
    "\n",
    "len(imputer.variables_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1022, 79), (1022, 158))"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# After transforming the dataset,\n",
    "# we obtain double the number of columns\n",
    "\n",
    "train_t = imputer.transform(X_train)\n",
    "test_t = imputer.transform(X_test)\n",
    "\n",
    "X_train.shape, train_t.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Automatic imputation\n",
    "\n",
    "We can automatically impute missing data in numerical and categorical variables, letting the imputers  find out which variables to impute.\n",
    "\n",
    "We need to set the parameter variables to None in all imputers. None is the default value, so we can simply omit the parameter when initialising the transformers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a pipeline with the imputation strategy\n",
    "\n",
    "pipe = Pipeline([\n",
    "    \n",
    "    # add indicators to variables with NA\n",
    "    ('indicators', AddMissingIndicator(\n",
    "        missing_only=True,\n",
    "    )),\n",
    "\n",
    "    # impute all numerical variables with the median\n",
    "    ('imputer_num', MeanMedianImputer(\n",
    "        imputation_method='median',\n",
    "    )),\n",
    "\n",
    "    # impute all categorical variables with the mode\n",
    "    ('imputer_cat', CategoricalImputer(\n",
    "        imputation_method='frequent',\n",
    "    )),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('indicators', AddMissingIndicator()),\n",
       "                ('imputer_num', MeanMedianImputer()),\n",
       "                ('imputer_cat',\n",
       "                 CategoricalImputer(imputation_method='frequent'))])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# With fit() the transformers learn the \n",
    "# required parameters.\n",
    "\n",
    "pipe.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['LotFrontage',\n",
       " 'Alley',\n",
       " 'MasVnrType',\n",
       " 'MasVnrArea',\n",
       " 'BsmtQual',\n",
       " 'BsmtCond',\n",
       " 'BsmtExposure',\n",
       " 'BsmtFinType1',\n",
       " 'BsmtFinType2',\n",
       " 'Electrical',\n",
       " 'FireplaceQu',\n",
       " 'GarageType',\n",
       " 'GarageYrBlt',\n",
       " 'GarageFinish',\n",
       " 'GarageQual',\n",
       " 'GarageCond',\n",
       " 'PoolQC',\n",
       " 'Fence',\n",
       " 'MiscFeature']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can look into the attributes of the\n",
    "# different transformers.\n",
    "\n",
    "# Check the variables that will take indicators.\n",
    "pipe.named_steps['indicators'].variables_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'MSSubClass': 50.0,\n",
       " 'LotFrontage': 69.0,\n",
       " 'LotArea': 9536.0,\n",
       " 'OverallQual': 6.0,\n",
       " 'OverallCond': 5.0,\n",
       " 'YearBuilt': 1972.0,\n",
       " 'YearRemodAdd': 1993.0,\n",
       " 'MasVnrArea': 0.0,\n",
       " 'BsmtFinSF1': 386.0,\n",
       " 'BsmtFinSF2': 0.0,\n",
       " 'BsmtUnfSF': 486.5,\n",
       " 'TotalBsmtSF': 992.0,\n",
       " '1stFlrSF': 1095.0,\n",
       " '2ndFlrSF': 0.0,\n",
       " 'LowQualFinSF': 0.0,\n",
       " 'GrLivArea': 1479.0,\n",
       " 'BsmtFullBath': 0.0,\n",
       " 'BsmtHalfBath': 0.0,\n",
       " 'FullBath': 2.0,\n",
       " 'HalfBath': 0.0,\n",
       " 'BedroomAbvGr': 3.0,\n",
       " 'KitchenAbvGr': 1.0,\n",
       " 'TotRmsAbvGrd': 6.0,\n",
       " 'Fireplaces': 1.0,\n",
       " 'GarageYrBlt': 1979.0,\n",
       " 'GarageCars': 2.0,\n",
       " 'GarageArea': 477.0,\n",
       " 'WoodDeckSF': 0.0,\n",
       " 'OpenPorchSF': 25.0,\n",
       " 'EnclosedPorch': 0.0,\n",
       " '3SsnPorch': 0.0,\n",
       " 'ScreenPorch': 0.0,\n",
       " 'PoolArea': 0.0,\n",
       " 'MiscVal': 0.0,\n",
       " 'MoSold': 6.0,\n",
       " 'YrSold': 2008.0,\n",
       " 'LotFrontage_na': 0.0,\n",
       " 'Alley_na': 1.0,\n",
       " 'MasVnrType_na': 0.0,\n",
       " 'MasVnrArea_na': 0.0,\n",
       " 'BsmtQual_na': 0.0,\n",
       " 'BsmtCond_na': 0.0,\n",
       " 'BsmtExposure_na': 0.0,\n",
       " 'BsmtFinType1_na': 0.0,\n",
       " 'BsmtFinType2_na': 0.0,\n",
       " 'Electrical_na': 0.0,\n",
       " 'FireplaceQu_na': 0.0,\n",
       " 'GarageType_na': 0.0,\n",
       " 'GarageYrBlt_na': 0.0,\n",
       " 'GarageFinish_na': 0.0,\n",
       " 'GarageQual_na': 0.0,\n",
       " 'GarageCond_na': 0.0,\n",
       " 'PoolQC_na': 1.0,\n",
       " 'Fence_na': 1.0,\n",
       " 'MiscFeature_na': 1.0}"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the median values for the imputation.\n",
    "\n",
    "pipe.named_steps['imputer_num'].imputer_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'MSZoning': 'RL',\n",
       " 'Street': 'Pave',\n",
       " 'Alley': 'Pave',\n",
       " 'LotShape': 'Reg',\n",
       " 'LandContour': 'Lvl',\n",
       " 'Utilities': 'AllPub',\n",
       " 'LotConfig': 'Inside',\n",
       " 'LandSlope': 'Gtl',\n",
       " 'Neighborhood': 'NAmes',\n",
       " 'Condition1': 'Norm',\n",
       " 'Condition2': 'Norm',\n",
       " 'BldgType': '1Fam',\n",
       " 'HouseStyle': '1Story',\n",
       " 'RoofStyle': 'Gable',\n",
       " 'RoofMatl': 'CompShg',\n",
       " 'Exterior1st': 'VinylSd',\n",
       " 'Exterior2nd': 'VinylSd',\n",
       " 'MasVnrType': 'None',\n",
       " 'ExterQual': 'TA',\n",
       " 'ExterCond': 'TA',\n",
       " 'Foundation': 'PConc',\n",
       " 'BsmtQual': 'TA',\n",
       " 'BsmtCond': 'TA',\n",
       " 'BsmtExposure': 'No',\n",
       " 'BsmtFinType1': 'Unf',\n",
       " 'BsmtFinType2': 'Unf',\n",
       " 'Heating': 'GasA',\n",
       " 'HeatingQC': 'Ex',\n",
       " 'CentralAir': 'Y',\n",
       " 'Electrical': 'SBrkr',\n",
       " 'KitchenQual': 'TA',\n",
       " 'Functional': 'Typ',\n",
       " 'FireplaceQu': 'Gd',\n",
       " 'GarageType': 'Attchd',\n",
       " 'GarageFinish': 'Unf',\n",
       " 'GarageQual': 'TA',\n",
       " 'GarageCond': 'TA',\n",
       " 'PavedDrive': 'Y',\n",
       " 'PoolQC': 'Gd',\n",
       " 'Fence': 'MnPrv',\n",
       " 'MiscFeature': 'Shed',\n",
       " 'SaleType': 'WD',\n",
       " 'SaleCondition': 'Normal'}"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the mode values for the imputation.\n",
    "\n",
    "pipe.named_steps['imputer_cat'].imputer_dict_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now, we transform the data.\n",
    "\n",
    "train_t = pipe.transform(X_train)\n",
    "test_t = pipe.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MSSubClass        0\n",
       "MSZoning          0\n",
       "LotFrontage       0\n",
       "LotArea           0\n",
       "Street            0\n",
       "                 ..\n",
       "GarageQual_na     0\n",
       "GarageCond_na     0\n",
       "PoolQC_na         0\n",
       "Fence_na          0\n",
       "MiscFeature_na    0\n",
       "Length: 98, dtype: int64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We should see a complete case dataset\n",
    "\n",
    "train_t.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check\n",
    "\n",
    "[v for v in train_t.columns if train_t[v].isnull().sum() > 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "fenotebook",
   "language": "python",
   "name": "fenotebook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}