{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.impute import SimpleImputer\n", "\n", "from feature_engine.wrappers import SklearnTransformerWrapper" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", "

5 rows × 81 columns

\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n", "\n", " YrSold SaleType SaleCondition SalePrice \n", "0 2008 WD Normal 208500 \n", "1 2007 WD Normal 181500 \n", "2 2008 WD Normal 223500 \n", "3 2006 WD Abnorml 140000 \n", "4 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load house prices data set from Kaggle\n", "\n", "data = pd.read_csv('houseprice.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((1022, 79), (438, 79))" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's separate into training and testing set\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " data.drop(['Id', 'SalePrice'], axis=1),\n", " data['SalePrice'],\n", " test_size=0.3,\n", " random_state=0)\n", "\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LotFrontage 0.184932\n", "MasVnrArea 0.004892\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[['LotFrontage', 'MasVnrArea']].isnull().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SimpleImputer\n", "\n", "### Mean imputation" ] }, { "cell_type": "code", "execution_count": 5, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "SklearnTransformerWrapper(transformer=SimpleImputer(),\n", " variables=['LotFrontage', 'MasVnrArea'])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imputer = SklearnTransformerWrapper(\n", " transformer = SimpleImputer(strategy='mean'),\n", " variables = ['LotFrontage', 'MasVnrArea'],\n", ")\n", "\n", "imputer.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 69.66866747, 103.55358899])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can find the mean values within the parameters of the\n", "# simple imputer\n", "\n", "imputer.transformer_.statistics_" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LotFrontage 0.0\n", "MasVnrArea 0.0\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# replace missing values with the means learned from the train set\n", "\n", "X_train = imputer.transform(X_train)\n", "X_test = imputer.transform(X_test)\n", "\n", "X_train[['LotFrontage', 'MasVnrArea']].isnull().mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Frequent category imputation" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AlleyMasVnrTypeBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinType2ElectricalFireplaceQuGarageTypeGarageFinishGarageQualGarageCondPoolQCFenceMiscFeature
0NaNBrkFaceGdTANoGLQUnfSBrkrNaNAttchdRFnTATANaNNaNNaN
1NaNNoneGdTAGdALQUnfSBrkrTAAttchdRFnTATANaNNaNNaN
2NaNBrkFaceGdTAMnGLQUnfSBrkrTAAttchdRFnTATANaNNaNNaN
3NaNNoneTAGdNoALQUnfSBrkrGdDetchdUnfTATANaNNaNNaN
4NaNBrkFaceGdTAAvGLQUnfSBrkrTAAttchdRFnTATANaNNaNNaN
\n", "
" ], "text/plain": [ " Alley MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 \\\n", "0 NaN BrkFace Gd TA No GLQ Unf \n", "1 NaN None Gd TA Gd ALQ Unf \n", "2 NaN BrkFace Gd TA Mn GLQ Unf \n", "3 NaN None TA Gd No ALQ Unf \n", "4 NaN BrkFace Gd TA Av GLQ Unf \n", "\n", " Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond PoolQC \\\n", "0 SBrkr NaN Attchd RFn TA TA NaN \n", "1 SBrkr TA Attchd RFn TA TA NaN \n", "2 SBrkr TA Attchd RFn TA TA NaN \n", "3 SBrkr Gd Detchd Unf TA TA NaN \n", "4 SBrkr TA Attchd RFn TA TA NaN \n", "\n", " Fence MiscFeature \n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cols = [c for c in data.columns if data[c].dtypes=='O' and data[c].isnull().sum()>0]\n", "data[cols].head()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SklearnTransformerWrapper(transformer=SimpleImputer(strategy='most_frequent'),\n", " variables=['Alley', 'MasVnrType', 'BsmtQual',\n", " 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',\n", " 'BsmtFinType2', 'Electrical',\n", " 'FireplaceQu', 'GarageType',\n", " 'GarageFinish', 'GarageQual', 'GarageCond',\n", " 'PoolQC', 'Fence', 'MiscFeature'])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "imputer = SklearnTransformerWrapper(\n", " transformer=SimpleImputer(strategy='most_frequent'),\n", " variables=cols,\n", ")\n", "\n", "# find the most frequent category\n", "imputer.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Pave', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd',\n", " 'Attchd', 'Unf', 'TA', 'TA', 'Gd', 'MnPrv', 'Shed'], dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can find the most frequent 
values within the parameters of the\n", "# simple imputer\n", "\n", "imputer.transformer_.statistics_" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Alley 0.0\n", "MasVnrType 0.0\n", "BsmtQual 0.0\n", "BsmtCond 0.0\n", "BsmtExposure 0.0\n", "BsmtFinType1 0.0\n", "BsmtFinType2 0.0\n", "Electrical 0.0\n", "FireplaceQu 0.0\n", "GarageType 0.0\n", "GarageFinish 0.0\n", "GarageQual 0.0\n", "GarageCond 0.0\n", "PoolQC 0.0\n", "Fence 0.0\n", "MiscFeature 0.0\n", "dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# replace missing values with the most frequent category learned from the train set\n", "\n", "X_train = imputer.transform(X_train)\n", "X_test = imputer.transform(X_test)\n", "\n", "X_train[cols].isnull().mean()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AlleyMasVnrTypeBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinType2ElectricalFireplaceQuGarageTypeGarageFinishGarageQualGarageCondPoolQCFenceMiscFeature
529PaveNoneTATANoRecUnfSBrkrTAAttchdRFnTATAGdMnPrvShed
491PaveNoneTATANoBLQRecFuseATAAttchdUnfTATAGdMnPrvShed
459PaveBrkCmnTATANoLwQUnfSBrkrTADetchdUnfTATAGdMnPrvShed
279PaveBrkFaceGdTANoBLQUnfSBrkrTAAttchdFinTATAGdMnPrvShed
655PaveBrkFaceTATANoUnfUnfSBrkrGdDetchdUnfTATAGdMnPrvShed
\n", "
" ], "text/plain": [ " Alley MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 \\\n", "529 Pave None TA TA No Rec Unf \n", "491 Pave None TA TA No BLQ Rec \n", "459 Pave BrkCmn TA TA No LwQ Unf \n", "279 Pave BrkFace Gd TA No BLQ Unf \n", "655 Pave BrkFace TA TA No Unf Unf \n", "\n", " Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond \\\n", "529 SBrkr TA Attchd RFn TA TA \n", "491 FuseA TA Attchd Unf TA TA \n", "459 SBrkr TA Detchd Unf TA TA \n", "279 SBrkr TA Attchd Fin TA TA \n", "655 SBrkr Gd Detchd Unf TA TA \n", "\n", " PoolQC Fence MiscFeature \n", "529 Gd MnPrv Shed \n", "491 Gd MnPrv Shed \n", "459 Gd MnPrv Shed \n", "279 Gd MnPrv Shed \n", "655 Gd MnPrv Shed " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test[cols].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "fenotebook", "language": "python", "name": "fenotebook" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }