In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import (
    f_regression,
    SelectKBest,
    SelectFromModel,
)

from sklearn.linear_model import Lasso

from feature_engine.wrappers import SklearnTransformerWrapper

In [2]:
# load dataset

data = pd.read_csv('houseprice.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# let's separate into training and testing set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((1022, 79), (438, 79))

## Select K Best

In [4]:
# variables to evaluate:

cols = [var for var in X_train.columns if X_train[var].dtypes !='O']

cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [5]:
# let's use select K best to select the best k variables

selector = SklearnTransformerWrapper(
    transformer = SelectKBest(f_regression, k=5),
    variables = cols)

selector.fit(X_train.fillna(0), y_train)

SklearnTransformerWrapper(transformer=SelectKBest(k=5,
                                                  score_func=<function f_regression at 0x0000007EFF7D7F70>),
                          variables=['MSSubClass', 'LotFrontage', 'LotArea',
                                     'OverallQual', 'OverallCond', 'YearBuilt',
                                     'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                                     'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                                     'GrLivArea', 'BsmtFullBath',
                                     'BsmtHalfBath', 'FullBath', 'HalfBath',
                                     'BedroomAbvGr', 'KitchenAbvGr',
                                     'TotRmsAbvGrd', 'Fireplaces',
                                     'GarageYrBlt', 'GarageCars', 'GarageArea',
                                     'WoodDeckSF', 'OpenPorchSF',
                 

In [6]:
selector.transformer_.get_support(indices=True)

array([ 3, 11, 15, 25, 26], dtype=int64)

In [7]:
# selecteed features

X_train.columns[selector.transformer_.get_support(indices=True)]

Index(['LotArea', 'Neighborhood', 'HouseStyle', 'MasVnrArea', 'ExterQual'], dtype='object')

In [8]:
# the transformer returns the selected variables from the list
# we passed to the transformer PLUS the remaining variables 
# in the dataframe that were not examined

X_train_t = selector.transform(X_train.fillna(0))
X_test_t = selector.transform(X_test.fillna(0))

In [9]:
X_test_t.head()

Unnamed: 0,LotArea,Neighborhood,HouseStyle,MasVnrArea,ExterQual,MSZoning,Street,Alley,LotShape,LandContour,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
529,32668,Crawfor,1Story,0.0,Gd,RL,Pave,0,IR1,Lvl,...,Attchd,RFn,TA,TA,Y,0,0,0,WD,Alloca
491,9490,NAmes,1.5Fin,0.0,TA,RL,Pave,0,Reg,Lvl,...,Attchd,Unf,TA,TA,Y,0,MnPrv,0,WD,Normal
459,7015,BrkSide,1.5Fin,161.0,TA,RL,Pave,0,IR1,Bnk,...,Detchd,Unf,TA,TA,Y,0,0,0,WD,Normal
279,10005,ClearCr,2Story,299.0,TA,RL,Pave,0,Reg,Lvl,...,Attchd,Fin,TA,TA,Y,0,0,0,WD,Normal
655,1680,BrDale,2Story,381.0,TA,RM,Pave,0,Reg,Lvl,...,Detchd,Unf,TA,TA,Y,0,0,0,WD,Family


## SelectFromModel

In [10]:
# let's select the best variables according to Lasso

lasso = Lasso(alpha=10000, random_state=0)

sfm = SelectFromModel(lasso, prefit=False)

selector = SklearnTransformerWrapper(
    transformer = sfm,
    variables = cols)

selector.fit(X_train.fillna(0), y_train)

SklearnTransformerWrapper(transformer=SelectFromModel(estimator=Lasso(alpha=10000,
                                                                      random_state=0)),
                          variables=['MSSubClass', 'LotFrontage', 'LotArea',
                                     'OverallQual', 'OverallCond', 'YearBuilt',
                                     'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                                     'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                                     'GrLivArea', 'BsmtFullBath',
                                     'BsmtHalfBath', 'FullBath', 'HalfBath',
                                     'BedroomAbvGr', 'KitchenAbvGr',
                                     'TotRmsAbvGrd', 'Fireplaces',
                                     'GarageYrBlt', 'GarageCars', 'GarageArea',
                                     'WoodDeckSF', 'OpenPorchSF',
          

In [11]:
selector.transformer_.get_support(indices=True)

array([ 0,  1,  2,  3,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 24, 26,
       27, 28, 29, 30, 31, 32, 33], dtype=int64)

In [12]:
len(selector.transformer_.get_support(indices=True))

24

In [13]:
len(cols)

36

In [14]:
# the transformer returns the selected variables from the list
# we passed to the transformer PLUS the remaining variables 
# in the dataframe that were not examined

X_train_t = selector.transform(X_train.fillna(0))
X_test_t = selector.transform(X_test.fillna(0))

In [15]:
X_test_t.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
529,20,RL,0.0,32668,0,IR1,Lvl,AllPub,CulDSac,Gtl,...,Attchd,RFn,TA,TA,Y,0,0,0,WD,Alloca
491,50,RL,79.0,9490,0,Reg,Lvl,AllPub,Inside,Gtl,...,Attchd,Unf,TA,TA,Y,0,MnPrv,0,WD,Normal
459,50,RL,0.0,7015,0,IR1,Bnk,AllPub,Corner,Gtl,...,Detchd,Unf,TA,TA,Y,0,0,0,WD,Normal
279,60,RL,83.0,10005,0,Reg,Lvl,AllPub,Inside,Gtl,...,Attchd,Fin,TA,TA,Y,0,0,0,WD,Normal
655,160,RM,21.0,1680,0,Reg,Lvl,AllPub,Inside,Gtl,...,Detchd,Unf,TA,TA,Y,0,0,0,WD,Family
