{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import OrdinalEncoder\n", "\n", "from feature_engine.wrappers import SklearnTransformerWrapper\n", "from feature_engine.encoding import RareLabelEncoder" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", "

5 rows × 81 columns

\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n", "\n", " YrSold SaleType SaleCondition SalePrice \n", "0 2008 WD Normal 208500 \n", "1 2007 WD Normal 181500 \n", "2 2008 WD Normal 223500 \n", "3 2006 WD Abnorml 140000 \n", "4 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the dataset from Kaggle\n", "\n", "data = pd.read_csv('houseprice.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((1022, 79), (438, 79))" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's separate into training and testing set\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " data.drop(['Id', 'SalePrice'], axis=1),\n", " data['SalePrice'],\n", " test_size=0.3,\n", " random_state=0,\n", ")\n", "\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## OrdinalEncoder" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "cols = ['Alley',\n", " 'MasVnrType',\n", " 'BsmtQual',\n", " 'BsmtCond',\n", " 'BsmtExposure',\n", " 'BsmtFinType1',\n", " 'BsmtFinType2',\n", " 'Electrical',\n", " 'FireplaceQu',\n", " 'GarageType',\n", " 'GarageFinish',\n", " 'GarageQual']" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# let's remove rare labels to avoid errors when encoding\n", "\n", "rare_label_enc = RareLabelEncoder(n_categories=2, variables=cols)\n", "\n", "X_train = rare_label_enc.fit_transform(X_train.fillna('Missing'))\n", "X_test = rare_label_enc.transform(X_test.fillna('Missing'))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SklearnTransformerWrapper(transformer=OrdinalEncoder(),\n", " variables=['Alley', 'MasVnrType', 'BsmtQual',\n", " 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',\n", " 'BsmtFinType2', 'Electrical',\n", " 'FireplaceQu', 'GarageType',\n", " 'GarageFinish', 'GarageQual'])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# now let's replace categories by integers\n", "\n", "encoder = SklearnTransformerWrapper(\n", " transformer = OrdinalEncoder(),\n", " variables = cols,\n", ")\n", "\n", "encoder.fit(X_train)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[array(['Missing', 'Rare'], dtype=object),\n", " array(['BrkFace', 'None', 'Rare', 'Stone'], dtype=object),\n", " array(['Ex', 'Gd', 'Rare', 'TA'], dtype=object),\n", " array(['Rare', 'TA'], dtype=object),\n", " array(['Av', 'Gd', 'Mn', 'No', 'Rare'], dtype=object),\n", " array(['ALQ', 'BLQ', 'GLQ', 'Rare', 'Rec', 'Unf'], dtype=object),\n", " array(['Rare', 'Unf'], dtype=object),\n", " array(['FuseA', 'Rare', 'SBrkr'], dtype=object),\n", " array(['Gd', 'Missing', 'Rare', 'TA'], dtype=object),\n", " array(['Attchd', 'BuiltIn', 'Detchd', 'Missing', 'Rare'], dtype=object),\n", " array(['Fin', 'Missing', 'RFn', 'Unf'], dtype=object),\n", " array(['Missing', 'Rare', 'TA'], dtype=object)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can navigate to the parameters of the sklearn transformer\n", "# like this:\n", "\n", "encoder.transformer_.categories_" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Alley 0.0\n", "MasVnrType 0.0\n", "BsmtQual 0.0\n", "BsmtCond 0.0\n", "BsmtExposure 0.0\n", "BsmtFinType1 0.0\n", "BsmtFinType2 0.0\n", "Electrical 0.0\n", "FireplaceQu 0.0\n", "GarageType 0.0\n", "GarageFinish 0.0\n", "GarageQual 0.0\n", "dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# encode categories\n", "\n", "X_train = encoder.transform(X_train)\n", "X_test = encoder.transform(X_test)\n", "\n", "X_train[cols].isnull().mean()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AlleyMasVnrTypeBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinType2ElectricalFireplaceQuGarageTypeGarageFinishGarageQual
5290.02.03.01.03.04.01.02.03.00.02.02.0
4910.01.03.01.03.01.00.00.03.00.03.02.0
4590.02.03.01.03.03.01.02.03.02.03.02.0
2790.00.01.01.03.01.01.02.03.00.00.02.0
6550.00.03.01.03.05.01.02.01.02.03.02.0
\n", "
" ], "text/plain": [ " Alley MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 \\\n", "529 0.0 2.0 3.0 1.0 3.0 4.0 \n", "491 0.0 1.0 3.0 1.0 3.0 1.0 \n", "459 0.0 2.0 3.0 1.0 3.0 3.0 \n", "279 0.0 0.0 1.0 1.0 3.0 1.0 \n", "655 0.0 0.0 3.0 1.0 3.0 5.0 \n", "\n", " BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish \\\n", "529 1.0 2.0 3.0 0.0 2.0 \n", "491 0.0 0.0 3.0 0.0 3.0 \n", "459 1.0 2.0 3.0 2.0 3.0 \n", "279 1.0 2.0 3.0 0.0 0.0 \n", "655 1.0 2.0 1.0 2.0 3.0 \n", "\n", " GarageQual \n", "529 2.0 \n", "491 2.0 \n", "459 2.0 \n", "279 2.0 \n", "655 2.0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test[cols].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "fenotebook", "language": "python", "name": "fenotebook" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }