{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_selection import (\n", " f_regression,\n", " SelectKBest,\n", " SelectFromModel,\n", ")\n", "\n", "from sklearn.linear_model import Lasso\n", "\n", "from feature_engine.wrappers import SklearnTransformerWrapper" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n", "

5 rows × 81 columns

\n", "
" ], "text/plain": [ " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", "0 1 60 RL 65.0 8450 Pave NaN Reg \n", "1 2 20 RL 80.0 9600 Pave NaN Reg \n", "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", "\n", " LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold \\\n", "0 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "1 Lvl AllPub ... 0 NaN NaN NaN 0 5 \n", "2 Lvl AllPub ... 0 NaN NaN NaN 0 9 \n", "3 Lvl AllPub ... 0 NaN NaN NaN 0 2 \n", "4 Lvl AllPub ... 0 NaN NaN NaN 0 12 \n", "\n", " YrSold SaleType SaleCondition SalePrice \n", "0 2008 WD Normal 208500 \n", "1 2007 WD Normal 181500 \n", "2 2008 WD Normal 223500 \n", "3 2006 WD Abnorml 140000 \n", "4 2008 WD Normal 250000 \n", "\n", "[5 rows x 81 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load dataset\n", "\n", "data = pd.read_csv('houseprice.csv')\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((1022, 79), (438, 79))" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's separate into training and testing set\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " data.drop(['Id', 'SalePrice'], axis=1),\n", " data['SalePrice'],\n", " test_size=0.3,\n", " random_state=0,\n", ")\n", "\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select K Best" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['MSSubClass',\n", " 'LotFrontage',\n", " 'LotArea',\n", " 'OverallQual',\n", " 'OverallCond',\n", " 'YearBuilt',\n", " 'YearRemodAdd',\n", " 'MasVnrArea',\n", " 'BsmtFinSF1',\n", " 'BsmtFinSF2',\n", " 'BsmtUnfSF',\n", " 'TotalBsmtSF',\n", " '1stFlrSF',\n", " '2ndFlrSF',\n", " 'LowQualFinSF',\n", " 'GrLivArea',\n", " 'BsmtFullBath',\n", " 'BsmtHalfBath',\n", " 'FullBath',\n", " 'HalfBath',\n", " 'BedroomAbvGr',\n", " 'KitchenAbvGr',\n", " 'TotRmsAbvGrd',\n", " 'Fireplaces',\n", " 'GarageYrBlt',\n", " 'GarageCars',\n", " 'GarageArea',\n", " 'WoodDeckSF',\n", " 'OpenPorchSF',\n", " 'EnclosedPorch',\n", " '3SsnPorch',\n", " 'ScreenPorch',\n", " 'PoolArea',\n", " 'MiscVal',\n", " 'MoSold',\n", " 'YrSold']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# variables to evaluate:\n", "\n", "cols = [var for var in X_train.columns if X_train[var].dtypes !='O']\n", "\n", "cols" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SklearnTransformerWrapper(transformer=SelectKBest(k=5,\n", " score_func=),\n", " variables=['MSSubClass', 'LotFrontage', 'LotArea',\n", " 'OverallQual', 'OverallCond', 'YearBuilt',\n", " 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',\n", " 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',\n", " '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',\n", " 'GrLivArea', 'BsmtFullBath',\n", " 'BsmtHalfBath', 'FullBath', 'HalfBath',\n", " 'BedroomAbvGr', 'KitchenAbvGr',\n", " 'TotRmsAbvGrd', 'Fireplaces',\n", " 'GarageYrBlt', 'GarageCars', 'GarageArea',\n", " 'WoodDeckSF', 'OpenPorchSF',\n", " 'EnclosedPorch', ...])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's use select K best to select the best k variables\n", "\n", "selector = SklearnTransformerWrapper(\n", " transformer = SelectKBest(f_regression, k=5),\n", " variables = cols)\n", "\n", "selector.fit(X_train.fillna(0), y_train)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3, 11, 15, 25, 26], dtype=int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "selector.transformer_.get_support(indices=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['LotArea', 'Neighborhood', 'HouseStyle', 'MasVnrArea', 'ExterQual'], dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# selecteed features\n", "\n", "X_train.columns[selector.transformer_.get_support(indices=True)]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# the transformer returns the selected variables from the list\n", "# we passed to the transformer PLUS the remaining variables \n", "# in the dataframe that were not examined\n", "\n", "X_train_t = selector.transform(X_train.fillna(0))\n", "X_test_t = selector.transform(X_test.fillna(0))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LotAreaNeighborhoodHouseStyleMasVnrAreaExterQualMSZoningStreetAlleyLotShapeLandContour...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
52932668Crawfor1Story0.0GdRLPave0IR1Lvl...AttchdRFnTATAY000WDAlloca
4919490NAmes1.5Fin0.0TARLPave0RegLvl...AttchdUnfTATAY0MnPrv0WDNormal
4597015BrkSide1.5Fin161.0TARLPave0IR1Bnk...DetchdUnfTATAY000WDNormal
27910005ClearCr2Story299.0TARLPave0RegLvl...AttchdFinTATAY000WDNormal
6551680BrDale2Story381.0TARMPave0RegLvl...DetchdUnfTATAY000WDFamily
\n", "

5 rows × 48 columns

\n", "
" ], "text/plain": [ " LotArea Neighborhood HouseStyle MasVnrArea ExterQual MSZoning Street \\\n", "529 32668 Crawfor 1Story 0.0 Gd RL Pave \n", "491 9490 NAmes 1.5Fin 0.0 TA RL Pave \n", "459 7015 BrkSide 1.5Fin 161.0 TA RL Pave \n", "279 10005 ClearCr 2Story 299.0 TA RL Pave \n", "655 1680 BrDale 2Story 381.0 TA RM Pave \n", "\n", " Alley LotShape LandContour ... GarageType GarageFinish GarageQual \\\n", "529 0 IR1 Lvl ... Attchd RFn TA \n", "491 0 Reg Lvl ... Attchd Unf TA \n", "459 0 IR1 Bnk ... Detchd Unf TA \n", "279 0 Reg Lvl ... Attchd Fin TA \n", "655 0 Reg Lvl ... Detchd Unf TA \n", "\n", " GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "529 TA Y 0 0 0 WD Alloca \n", "491 TA Y 0 MnPrv 0 WD Normal \n", "459 TA Y 0 0 0 WD Normal \n", "279 TA Y 0 0 0 WD Normal \n", "655 TA Y 0 0 0 WD Family \n", "\n", "[5 rows x 48 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_t.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SelectFromModel" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "SklearnTransformerWrapper(transformer=SelectFromModel(estimator=Lasso(alpha=10000,\n", " random_state=0)),\n", " variables=['MSSubClass', 'LotFrontage', 'LotArea',\n", " 'OverallQual', 'OverallCond', 'YearBuilt',\n", " 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',\n", " 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',\n", " '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',\n", " 'GrLivArea', 'BsmtFullBath',\n", " 'BsmtHalfBath', 'FullBath', 'HalfBath',\n", " 'BedroomAbvGr', 'KitchenAbvGr',\n", " 'TotRmsAbvGrd', 'Fireplaces',\n", " 'GarageYrBlt', 'GarageCars', 'GarageArea',\n", " 'WoodDeckSF', 'OpenPorchSF',\n", " 'EnclosedPorch', ...])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's select the best variables according to Lasso\n", "\n", "lasso = Lasso(alpha=10000, random_state=0)\n", "\n", "sfm = SelectFromModel(lasso, prefit=False)\n", "\n", "selector = SklearnTransformerWrapper(\n", " transformer = sfm,\n", " variables = cols)\n", "\n", "selector.fit(X_train.fillna(0), y_train)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 26,\n", " 27, 28, 29, 30, 31, 32, 33], dtype=int64)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "selector.transformer_.get_support(indices=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "24" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(selector.transformer_.get_support(indices=True))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "36" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(cols)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# the transformer returns the selected variables from the list\n", "# we passed to the transformer PLUS the remaining variables \n", "# in the dataframe that were not examined\n", "\n", "X_train_t = selector.transform(X_train.fillna(0))\n", "X_test_t = selector.transform(X_test.fillna(0))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MSSubClassMSZoningLotFrontageLotAreaAlleyLotShapeLandContourUtilitiesLotConfigLandSlope...GarageTypeGarageFinishGarageQualGarageCondPavedDrivePoolQCFenceMiscFeatureSaleTypeSaleCondition
52920RL0.0326680IR1LvlAllPubCulDSacGtl...AttchdRFnTATAY000WDAlloca
49150RL79.094900RegLvlAllPubInsideGtl...AttchdUnfTATAY0MnPrv0WDNormal
45950RL0.070150IR1BnkAllPubCornerGtl...DetchdUnfTATAY000WDNormal
27960RL83.0100050RegLvlAllPubInsideGtl...AttchdFinTATAY000WDNormal
655160RM21.016800RegLvlAllPubInsideGtl...DetchdUnfTATAY000WDFamily
\n", "

5 rows × 67 columns

\n", "
" ], "text/plain": [ " MSSubClass MSZoning LotFrontage LotArea Alley LotShape LandContour \\\n", "529 20 RL 0.0 32668 0 IR1 Lvl \n", "491 50 RL 79.0 9490 0 Reg Lvl \n", "459 50 RL 0.0 7015 0 IR1 Bnk \n", "279 60 RL 83.0 10005 0 Reg Lvl \n", "655 160 RM 21.0 1680 0 Reg Lvl \n", "\n", " Utilities LotConfig LandSlope ... GarageType GarageFinish GarageQual \\\n", "529 AllPub CulDSac Gtl ... Attchd RFn TA \n", "491 AllPub Inside Gtl ... Attchd Unf TA \n", "459 AllPub Corner Gtl ... Detchd Unf TA \n", "279 AllPub Inside Gtl ... Attchd Fin TA \n", "655 AllPub Inside Gtl ... Detchd Unf TA \n", "\n", " GarageCond PavedDrive PoolQC Fence MiscFeature SaleType SaleCondition \n", "529 TA Y 0 0 0 WD Alloca \n", "491 TA Y 0 MnPrv 0 WD Normal \n", "459 TA Y 0 0 0 WD Normal \n", "279 TA Y 0 0 0 WD Normal \n", "655 TA Y 0 0 0 WD Family \n", "\n", "[5 rows x 67 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_t.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "fenotebook", "language": "python", "name": "fenotebook" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }