{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# model based feature select"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install numpy\n",
    "! pip install pandas\n",
    "! pip install sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import hourse_price_preprocessor as hpp\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = \"data/house_price/\"\n",
    "TEST_FILENAME = \"test.csv\"\n",
    "TRAIN_FILENAME = \"train.csv\"\n",
    "\n",
    "test_file = os.path.join(DATA_DIR, TEST_FILENAME)\n",
    "train_file = os.path.join(DATA_DIR, TRAIN_FILENAME)\n",
    "\n",
    "X_train, X_test, y_train, test_id_idx = hpp.get_train_test_split_dataset(train_file, test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1460, 67), (1460,), (1459, 67), (1459,))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape, y_train.shape, X_test.shape, test_id_idx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.tree import DecisionTreeRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ SelectFromModel with RandomFerestRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "select  = SelectFromModel(estimator=RandomForestRegressor(n_estimators=200),\n",
    "                          threshold=\"median\")\n",
    "\n",
    "select.fit(X_train, y_train)\n",
    "\n",
    "# transform training set\n",
    "X_train_selected = select.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1460, 34)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_selected.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ get cross validation score\n",
    "+ model based feature selection is useful to research which features are important.\n",
    "    - Accordingly, Removing unimport features sometimes doesn't affect performance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8451998805867299"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CV score of selected data set\n",
    "np.mean(cross_val_score(RandomForestRegressor(n_estimators=1000),\n",
    "                        X_train_selected,\n",
    "                        y_train,\n",
    "                        scoring=\"r2\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8453123482733651"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# CV score of full data set\n",
    "np.mean(cross_val_score(RandomForestRegressor(n_estimators=1000),\n",
    "                        X_train,\n",
    "                        y_train,\n",
    "                        scoring=\"r2\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### get selected features and importance rankings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "           max_features='auto', max_leaf_nodes=None,\n",
       "           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "           min_samples_leaf=1, min_samples_split=2,\n",
       "           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,\n",
       "           oob_score=False, random_state=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_selected = select.transform(X_test)\n",
    "lr = RandomForestRegressor(n_estimators=1000)\n",
    "lr.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([False, False, False, False, False,  True, False, False, False,\n",
       "       False, False, False, False, False, False, False, False, False,\n",
       "       False,  True,  True,  True,  True,  True, False, False, False,\n",
       "       False, False, False,  True, False, False, False,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True,  True,  True,\n",
       "       False,  True,  True, False,  True,  True,  True,  True,  True,\n",
       "        True,  True,  True,  True,  True,  True,  True, False,  True,\n",
       "       False, False,  True, False])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "select.get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([5.86635253e-04, 6.35226018e-04, 3.13873874e-05, 1.76446103e-05,\n",
       "       2.63455817e-04, 1.66475815e-03, 2.48867436e-04, 7.71798903e-05,\n",
       "       2.38984037e-04, 1.69252390e-05, 3.07788155e-04, 3.05799961e-04,\n",
       "       1.17085864e-04, 1.16593143e-05, 5.62596401e-05, 7.95842376e-06,\n",
       "       3.22227354e-05, 1.97501373e-04, 7.64795432e-04, 3.15306021e-03,\n",
       "       1.57051068e-03, 6.69520234e-03, 1.84039792e-02, 5.29243793e-03,\n",
       "       1.01574917e-03, 2.09889878e-06, 8.71862017e-05, 1.00176116e-03,\n",
       "       6.39519127e-04, 1.01269643e-03, 1.03724146e-03, 1.04118417e-03,\n",
       "       3.44272071e-04, 4.41193308e-04, 8.44429188e-03, 1.73231906e-02,\n",
       "       6.26555657e-02, 3.46818684e-02, 8.78382531e-03, 3.61008523e-02,\n",
       "       1.55159707e-03, 8.04525688e-03, 6.97136401e-02, 4.21627003e-02,\n",
       "       2.50655836e-02, 1.89168035e-04, 2.34987429e-01, 1.85382910e-03,\n",
       "       4.74007615e-04, 7.56489778e-03, 2.63247269e-03, 4.53145595e-03,\n",
       "       1.99927331e-03, 7.69932058e-03, 8.73998630e-03, 8.06918925e-03,\n",
       "       3.19444427e-01, 1.93271512e-02, 5.68989321e-03, 7.71485291e-03,\n",
       "       1.10370882e-03, 5.76839664e-04, 1.79836600e-03, 4.91730645e-04,\n",
       "       1.62542868e-04, 2.34836091e-03, 7.52498271e-04])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lr.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([56, 46, 42, 36, 43, 39, 37, 44, 57, 22, 35, 38, 54, 34, 55, 41, 59,\n",
       "       53, 49, 21, 58, 23, 51, 19, 50, 65, 52, 47, 62,  5, 20, 40, 60, 31,\n",
       "       30, 24, 29, 27, 18, 66, 28,  1,  0, 61, 63, 48, 33, 32, 10, 11,  4,\n",
       "        6,  8, 17, 45, 64, 12, 26,  7, 14, 16,  2,  3,  9, 13, 15, 25],\n",
       "      dtype=int64)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.flip(np.argsort(lr.feature_importances_), axis = 0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}