# model based feature select

In [None]:
! pip install numpy
! pip install pandas
! pip install sklearn

In [1]:
import warnings
warnings.filterwarnings('ignore')
import hourse_price_preprocessor as hpp
import os
import numpy as np
import pandas as pd

+ Load data

In [2]:
DATA_DIR = "data/house_price/"
TEST_FILENAME = "test.csv"
TRAIN_FILENAME = "train.csv"

test_file = os.path.join(DATA_DIR, TEST_FILENAME)
train_file = os.path.join(DATA_DIR, TRAIN_FILENAME)

X_train, X_test, y_train, test_id_idx = hpp.get_train_test_split_dataset(train_file, test_file)

In [3]:
X_train.shape, y_train.shape, X_test.shape, test_id_idx.shape

((1460, 67), (1460,), (1459, 67), (1459,))

In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

+ SelectFromModel with RandomFerestRegressor

In [5]:
select = SelectFromModel(estimator=RandomForestRegressor(n_estimators=200),
 threshold="median")

select.fit(X_train, y_train)

# transform training set
X_train_selected = select.transform(X_train)

In [6]:
X_train_selected.shape

(1460, 34)

+ get cross validation score
+ model based feature selection is useful to research which features are important.
 - Accordingly, Removing unimport features sometimes doesn't affect performance.

In [7]:
# CV score of selected data set
np.mean(cross_val_score(RandomForestRegressor(n_estimators=1000),
 X_train_selected,
 y_train,
 scoring="r2"))

0.8451998805867299

In [8]:
# CV score of full data set
np.mean(cross_val_score(RandomForestRegressor(n_estimators=1000),
 X_train,
 y_train,
 scoring="r2"))

0.8453123482733651

### get selected features and importance rankings

In [9]:
X_test_selected = select.transform(X_test)
lr = RandomForestRegressor(n_estimators=1000)
lr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
 max_features='auto', max_leaf_nodes=None,
 min_impurity_decrease=0.0, min_impurity_split=None,
 min_samples_leaf=1, min_samples_split=2,
 min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
 oob_score=False, random_state=None, verbose=0, warm_start=False)

In [10]:
select.get_support()

array([False, False, False, False, False, True, False, False, False,
 False, False, False, False, False, False, False, False, False,
 False, True, True, True, True, True, False, False, False,
 False, False, False, True, False, False, False, True, True,
 True, True, True, True, True, True, True, True, True,
 False, True, True, False, True, True, True, True, True,
 True, True, True, True, True, True, True, False, True,
 False, False, True, False])

In [11]:
lr.feature_importances_

array([5.86635253e-04, 6.35226018e-04, 3.13873874e-05, 1.76446103e-05,
 2.63455817e-04, 1.66475815e-03, 2.48867436e-04, 7.71798903e-05,
 2.38984037e-04, 1.69252390e-05, 3.07788155e-04, 3.05799961e-04,
 1.17085864e-04, 1.16593143e-05, 5.62596401e-05, 7.95842376e-06,
 3.22227354e-05, 1.97501373e-04, 7.64795432e-04, 3.15306021e-03,
 1.57051068e-03, 6.69520234e-03, 1.84039792e-02, 5.29243793e-03,
 1.01574917e-03, 2.09889878e-06, 8.71862017e-05, 1.00176116e-03,
 6.39519127e-04, 1.01269643e-03, 1.03724146e-03, 1.04118417e-03,
 3.44272071e-04, 4.41193308e-04, 8.44429188e-03, 1.73231906e-02,
 6.26555657e-02, 3.46818684e-02, 8.78382531e-03, 3.61008523e-02,
 1.55159707e-03, 8.04525688e-03, 6.97136401e-02, 4.21627003e-02,
 2.50655836e-02, 1.89168035e-04, 2.34987429e-01, 1.85382910e-03,
 4.74007615e-04, 7.56489778e-03, 2.63247269e-03, 4.53145595e-03,
 1.99927331e-03, 7.69932058e-03, 8.73998630e-03, 8.06918925e-03,
 3.19444427e-01, 1.93271512e-02, 5.68989321e-03, 7.71485291e-03,
 1.10370882e-03, 5.

In [12]:
np.flip(np.argsort(lr.feature_importances_), axis = 0)

array([56, 46, 42, 36, 43, 39, 37, 44, 57, 22, 35, 38, 54, 34, 55, 41, 59,
 53, 49, 21, 58, 23, 51, 19, 50, 65, 52, 47, 62, 5, 20, 40, 60, 31,
 30, 24, 29, 27, 18, 66, 28, 1, 0, 61, 63, 48, 33, 32, 10, 11, 4,
 6, 8, 17, 45, 64, 12, 26, 7, 14, 16, 2, 3, 9, 13, 15, 25],
 dtype=int64)