{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model-based feature selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The PyPI distribution is named scikit-learn; \"sklearn\" is only the import name\n",
    "# (the \"sklearn\" PyPI alias is deprecated and fails to install).\n",
    "! pip install numpy\n",
    "! pip install pandas\n",
    "! pip install scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "# NOTE(review): this silences every warning for the rest of the notebook.\n",
    "warnings.filterwarnings('ignore')\n",
    "import hourse_price_preprocessor as hpp\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = \"data/house_price/\"\n",
    "TEST_FILENAME = \"test.csv\"\n",
    "TRAIN_FILENAME = \"train.csv\"\n",
    "\n",
    "test_file = os.path.join(DATA_DIR, TEST_FILENAME)\n",
    "train_file = os.path.join(DATA_DIR, TRAIN_FILENAME)\n",
    "\n",
    "X_train, X_test, y_train, test_id_idx = hpp.get_train_test_split_dataset(train_file, test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1460, 67), (1460,), (1459, 67), (1459,))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, y_train.shape, X_test.shape, test_id_idx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.tree import DecisionTreeRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ SelectFromModel with RandomForestRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# threshold=\"median\" keeps the features whose importance reaches the median importance\n",
    "select = SelectFromModel(estimator=RandomForestRegressor(n_estimators=200),\n",
    "                         threshold=\"median\")\n",
    "\n",
    "select.fit(X_train, y_train)\n",
    "\n",
    "# transform training set\n",
    "X_train_selected = select.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1460, 34)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_selected.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ get cross validation score\n",
    "+ Model-based feature selection is useful for finding out which features are important.\n",
    "  - Accordingly, removing unimportant features sometimes doesn't affect performance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8451998805867299"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CV score of selected data set\n",
    "np.mean(cross_val_score(RandomForestRegressor(n_estimators=1000),\n",
    "                        X_train_selected,\n",
    "                        y_train,\n",
    "                        scoring=\"r2\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8453123482733651"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CV score of full data set\n",
    "np.mean(cross_val_score(RandomForestRegressor(n_estimators=1000),\n",
    "                        X_train,\n",
    "                        y_train,\n",
    "                        scoring=\"r2\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### get selected features and importance rankings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       " max_features='auto', max_leaf_nodes=None,\n",
       " min_impurity_decrease=0.0, min_impurity_split=None,\n",
       " min_samples_leaf=1, min_samples_split=2,\n",
       " min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,\n",
       " oob_score=False, random_state=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_selected = select.transform(X_test)\n",
    "# A separate 1000-tree forest fitted on all features; it is not the 200-tree forest\n",
    "# inside `select`, so its importances need not match the ones used for selection.\n",
    "full_forest = RandomForestRegressor(n_estimators=1000)\n",
    "full_forest.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([False, False, False, False, False, True, False, False, False,\n",
       " False, False, False, False, False, False, False, False, False,\n",
       " False, True, True, True, True, True, False, False, False,\n",
       " False, False, False, True, False, False, False, True, True,\n",
       " True, True, True, True, True, True, True, True, True,\n",
       " False, True, True, False, True, True, True, True, True,\n",
       " True, True, True, True, True, True, True, False, True,\n",
       " False, False, True, False])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# boolean mask over the input columns: True where `select` kept the feature\n",
    "select.get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5.86635253e-04, 6.35226018e-04, 3.13873874e-05, 1.76446103e-05,\n",
       " 2.63455817e-04, 1.66475815e-03, 2.48867436e-04, 7.71798903e-05,\n",
       " 2.38984037e-04, 1.69252390e-05, 3.07788155e-04, 3.05799961e-04,\n",
       " 1.17085864e-04, 1.16593143e-05, 5.62596401e-05, 7.95842376e-06,\n",
       " 3.22227354e-05, 1.97501373e-04, 7.64795432e-04, 3.15306021e-03,\n",
       " 1.57051068e-03, 6.69520234e-03, 1.84039792e-02, 5.29243793e-03,\n",
       " 1.01574917e-03, 2.09889878e-06, 8.71862017e-05, 1.00176116e-03,\n",
       " 6.39519127e-04, 1.01269643e-03, 1.03724146e-03, 1.04118417e-03,\n",
       " 3.44272071e-04, 4.41193308e-04, 8.44429188e-03, 1.73231906e-02,\n",
       " 6.26555657e-02, 3.46818684e-02, 8.78382531e-03, 3.61008523e-02,\n",
       " 1.55159707e-03, 8.04525688e-03, 6.97136401e-02, 4.21627003e-02,\n",
       " 2.50655836e-02, 1.89168035e-04, 2.34987429e-01, 1.85382910e-03,\n",
       " 4.74007615e-04, 7.56489778e-03, 2.63247269e-03, 4.53145595e-03,\n",
       " 1.99927331e-03, 7.69932058e-03, 8.73998630e-03, 8.06918925e-03,\n",
       " 3.19444427e-01, 1.93271512e-02, 5.68989321e-03, 7.71485291e-03,\n",
       " 1.10370882e-03, 5.76839664e-04, 1.79836600e-03, 4.91730645e-04,\n",
       " 1.62542868e-04, 2.34836091e-03, 7.52498271e-04])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "full_forest.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([56, 46, 42, 36, 43, 39, 37, 44, 57, 22, 35, 38, 54, 34, 55, 41, 59,\n",
       " 53, 49, 21, 58, 23, 51, 19, 50, 65, 52, 47, 62, 5, 20, 40, 60, 31,\n",
       " 30, 24, 29, 27, 18, 66, 28, 1, 0, 61, 63, 48, 33, 32, 10, 11, 4,\n",
       " 6, 8, 17, 45, 64, 12, 26, 7, 14, 16, 2, 3, 9, 13, 15, 25],\n",
       " dtype=int64)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# column indices ordered from most to least important\n",
    "np.argsort(full_forest.feature_importances_)[::-1]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}