{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Missing value imputation: RandomSampleImputer\n", "\n", "\n", "The RandomSampleImputer extracts a random sample from the observations where data is available, and uses it to replace the missing values (NA). It is suitable for both numerical and categorical variables.\n", "\n", "There are several ways to set a seed to control the random sample extraction and make the imputation as reproducible as possible.\n", "\n", "\n", "**For this demonstration, we use the Ames House Prices dataset produced by Professor Dean De Cock:**\n", "\n", "[Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing\n", "Data as an End of Semester Regression Project, Journal of Statistics Education, Vol.19, No. 3](http://jse.amstat.org/v19n3/decock.pdf)\n", "\n", "The version of the dataset used in this notebook can be obtained from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Version" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'1.2.0'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make sure you are using this \n", "# Feature-engine version.\n", "\n", "import feature_engine\n", "\n", "feature_engine.__version__" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "from feature_engine.imputation import RandomSampleImputer" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Id | \n", "MSSubClass | \n", "MSZoning | \n", "LotFrontage | \n", "LotArea | \n", "Street | \n", "Alley | \n", "LotShape | \n", "LandContour | \n", "Utilities | \n", "... | \n", "PoolArea | \n", "PoolQC | \n", "Fence | \n", "MiscFeature | \n", "MiscVal | \n", "MoSold | \n", "YrSold | \n", "SaleType | \n", "SaleCondition | \n", "SalePrice | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "60 | \n", "RL | \n", "65.0 | \n", "8450 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2008 | \n", "WD | \n", "Normal | \n", "208500 | \n", "
1 | \n", "2 | \n", "20 | \n", "RL | \n", "80.0 | \n", "9600 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "5 | \n", "2007 | \n", "WD | \n", "Normal | \n", "181500 | \n", "
2 | \n", "3 | \n", "60 | \n", "RL | \n", "68.0 | \n", "11250 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "9 | \n", "2008 | \n", "WD | \n", "Normal | \n", "223500 | \n", "
3 | \n", "4 | \n", "70 | \n", "RL | \n", "60.0 | \n", "9550 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2006 | \n", "WD | \n", "Abnorml | \n", "140000 | \n", "
4 | \n", "5 | \n", "60 | \n", "RL | \n", "84.0 | \n", "14260 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "... | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "12 | \n", "2008 | \n", "WD | \n", "Normal | \n", "250000 | \n", "
5 rows × 81 columns
\n", "\n", " | Alley | \n", "MasVnrType | \n", "LotFrontage | \n", "MasVnrArea | \n", "
---|---|---|---|---|
64 | \n", "NaN | \n", "BrkFace | \n", "NaN | \n", "573.0 | \n", "
682 | \n", "NaN | \n", "None | \n", "NaN | \n", "0.0 | \n", "
960 | \n", "NaN | \n", "None | \n", "50.0 | \n", "0.0 | \n", "
1384 | \n", "NaN | \n", "None | \n", "60.0 | \n", "0.0 | \n", "
1100 | \n", "NaN | \n", "None | \n", "60.0 | \n", "0.0 | \n", "
\n", " | MSSubClass | \n", "MSZoning | \n", "LotFrontage | \n", "LotArea | \n", "Street | \n", "Alley | \n", "LotShape | \n", "LandContour | \n", "Utilities | \n", "LotConfig | \n", "... | \n", "ScreenPorch | \n", "PoolArea | \n", "PoolQC | \n", "Fence | \n", "MiscFeature | \n", "MiscVal | \n", "MoSold | \n", "YrSold | \n", "SaleType | \n", "SaleCondition | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
64 | \n", "60 | \n", "RL | \n", "NaN | \n", "9375 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "GdPrv | \n", "NaN | \n", "0 | \n", "2 | \n", "2009 | \n", "WD | \n", "Normal | \n", "
682 | \n", "120 | \n", "RL | \n", "NaN | \n", "2887 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "HLS | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "11 | \n", "2008 | \n", "WD | \n", "Normal | \n", "
960 | \n", "20 | \n", "RL | \n", "50.0 | \n", "7207 | \n", "Pave | \n", "NaN | \n", "IR1 | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2010 | \n", "WD | \n", "Normal | \n", "
1384 | \n", "50 | \n", "RL | \n", "60.0 | \n", "9060 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "MnPrv | \n", "NaN | \n", "0 | \n", "10 | \n", "2009 | \n", "WD | \n", "Normal | \n", "
1100 | \n", "30 | \n", "RL | \n", "60.0 | \n", "8400 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Bnk | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "1 | \n", "2009 | \n", "WD | \n", "Normal | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
763 | \n", "60 | \n", "RL | \n", "82.0 | \n", "9430 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "180 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "7 | \n", "2009 | \n", "WD | \n", "Normal | \n", "
835 | \n", "20 | \n", "RL | \n", "60.0 | \n", "9600 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "2 | \n", "2010 | \n", "WD | \n", "Normal | \n", "
1216 | \n", "90 | \n", "RM | \n", "68.0 | \n", "8930 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "4 | \n", "2010 | \n", "WD | \n", "Normal | \n", "
559 | \n", "120 | \n", "RL | \n", "NaN | \n", "3196 | \n", "Pave | \n", "NaN | \n", "Reg | \n", "Lvl | \n", "AllPub | \n", "Inside | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "10 | \n", "2006 | \n", "WD | \n", "Normal | \n", "
684 | \n", "60 | \n", "RL | \n", "58.0 | \n", "16770 | \n", "Pave | \n", "NaN | \n", "IR2 | \n", "Lvl | \n", "AllPub | \n", "CulDSac | \n", "... | \n", "0 | \n", "0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "6 | \n", "2010 | \n", "WD | \n", "Normal | \n", "
1022 rows × 79 columns
\n", "