{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Univariate feature selection example\n",
    "\n",
    "Selects the highest-scoring features of the house-price training set with `SelectPercentile` and compares the cross-validated $R^2$ of a linear regression with and without the selection step."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install into the interpreter that runs this kernel (a bare `pip` may belong to another environment).\n",
    "# The distribution that provides the `sklearn` module is published as `scikit-learn`.\n",
    "import sys\n",
    "!{sys.executable} -m pip install numpy pandas scikit-learn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import hourse_price_preprocessor as hpp\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_selection import SelectKBest # keep a fixed number of features\n",
    "from sklearn.feature_selection import SelectPercentile # keep a fixed percentage of features\n",
    "from sklearn.feature_selection import f_regression # univariate F-test for regression targets\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_DIR = \"data/house_price/\"\n",
    "TEST_FILENAME = \"test.csv\"\n",
    "TRAIN_FILENAME = \"train.csv\"\n",
    "\n",
    "test_file = os.path.join(DATA_DIR, TEST_FILENAME)\n",
    "train_file = os.path.join(DATA_DIR, TRAIN_FILENAME)\n",
    "\n",
    "X_train, X_test, y_train, test_id_idx = hpp.get_train_test_split_dataset(train_file, test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1460, 67) (1460,) (1459, 67) (1459,)\n"
     ]
    }
   ],
   "source": [
    "print(X_train.shape, y_train.shape, X_test.shape, test_id_idx.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PERCENTILE = 55  # percentage of features kept by the selector\n",
    "\n",
    "# The model below is a regression (LinearRegression scored with R^2), so rank the\n",
    "# features with the regression F-test; the selector's default score function\n",
    "# (f_classif) is intended for classification labels.\n",
    "select = SelectPercentile(score_func=f_regression, percentile=PERCENTILE)\n",
    "select.fit(X_train, y_train)\n",
    "\n",
    "# transform training set\n",
    "X_train_selected = select.transform(X_train)\n",
    "print(X_train_selected.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cross-validation score of the selected-feature model and of the original model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ selected-feature model (the selector is re-fitted inside every cross-validation fold)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the selector inside the pipeline so it only ever sees the training part of\n",
    "# each fold; selecting on the full training set first would let the held-out\n",
    "# targets influence which features are kept and inflate the score.\n",
    "selected_model = make_pipeline(\n",
    "    SelectPercentile(score_func=f_regression, percentile=PERCENTILE),\n",
    "    LinearRegression(),\n",
    ")\n",
    "np.mean(cross_val_score(selected_model, X_train, y_train, scoring=\"r2\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ original model (all features)\n",
    "\n",
    "The stored result below is a hugely negative $R^2$: on the held-out folds the unselected model does far worse than always predicting the mean."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-2.009795549320303e+20"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(cross_val_score(LinearRegression(), X_train, y_train, scoring=\"r2\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "+ boolean mask of the selected features (from the selector fitted on the full training set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "select.get_support()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}