{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Univariate Single Performance\n", "\n", "- Train an ML model on each individual feature\n", "- Determine the performance of each model\n", "- Select the features whose model performance is above a certain threshold" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor\n", "from sklearn.metrics import roc_auc_score, mean_squared_error\n", "\n", "from feature_engine.selection import SelectBySingleFeaturePerformance" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(50000, 109)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | var_1 | \n", "var_2 | \n", "var_3 | \n", "var_4 | \n", "var_5 | \n", "var_6 | \n", "var_7 | \n", "var_8 | \n", "var_9 | \n", "var_10 | \n", "... | \n", "var_100 | \n", "var_101 | \n", "var_102 | \n", "var_103 | \n", "var_104 | \n", "var_105 | \n", "var_106 | \n", "var_107 | \n", "var_108 | \n", "var_109 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "4.532710 | \n", "3.280834 | \n", "17.982476 | \n", "4.404259 | \n", "2.349910 | \n", "0.603264 | \n", "2.784655 | \n", "0.323146 | \n", "12.009691 | \n", "0.139346 | \n", "... | \n", "2.079066 | \n", "6.748819 | \n", "2.941445 | \n", "18.360496 | \n", "17.726613 | \n", "7.774031 | \n", "1.473441 | \n", "1.973832 | \n", "0.976806 | \n", "2.541417 | \n", "
1 | \n", "5.821374 | \n", "12.098722 | \n", "13.309151 | \n", "4.125599 | \n", "1.045386 | \n", "1.832035 | \n", "1.833494 | \n", "0.709090 | \n", "8.652883 | \n", "0.102757 | \n", "... | \n", "2.479789 | \n", "7.795290 | \n", "3.557890 | \n", "17.383378 | \n", "15.193423 | \n", "8.263673 | \n", "1.878108 | \n", "0.567939 | \n", "1.018818 | \n", "1.416433 | \n", "
2 | \n", "1.938776 | \n", "7.952752 | \n", "0.972671 | \n", "3.459267 | \n", "1.935782 | \n", "0.621463 | \n", "2.338139 | \n", "0.344948 | \n", "9.937850 | \n", "11.691283 | \n", "... | \n", "1.861487 | \n", "6.130886 | \n", "3.401064 | \n", "15.850471 | \n", "14.620599 | \n", "6.849776 | \n", "1.098210 | \n", "1.959183 | \n", "1.575493 | \n", "1.857893 | \n", "
3 | \n", "6.020690 | \n", "9.900544 | \n", "17.869637 | \n", "4.366715 | \n", "1.973693 | \n", "2.026012 | \n", "2.853025 | \n", "0.674847 | \n", "11.816859 | \n", "0.011151 | \n", "... | \n", "1.340944 | \n", "7.240058 | \n", "2.417235 | \n", "15.194609 | \n", "13.553772 | \n", "7.229971 | \n", "0.835158 | \n", "2.234482 | \n", "0.946170 | \n", "2.700606 | \n", "
4 | \n", "3.909506 | \n", "10.576516 | \n", "0.934191 | \n", "3.419572 | \n", "1.871438 | \n", "3.340811 | \n", "1.868282 | \n", "0.439865 | \n", "13.585620 | \n", "1.153366 | \n", "... | \n", "2.738095 | \n", "6.565509 | \n", "4.341414 | \n", "15.893832 | \n", "11.929787 | \n", "6.954033 | \n", "1.853364 | \n", "0.511027 | \n", "2.599562 | \n", "0.811364 | \n", "
5 rows × 109 columns
\n", "\n", " | Id | \n", "MSSubClass | \n", "LotFrontage | \n", "LotArea | \n", "OverallQual | \n", "OverallCond | \n", "YearBuilt | \n", "YearRemodAdd | \n", "MasVnrArea | \n", "BsmtFinSF1 | \n", "... | \n", "WoodDeckSF | \n", "OpenPorchSF | \n", "EnclosedPorch | \n", "3SsnPorch | \n", "ScreenPorch | \n", "PoolArea | \n", "MiscVal | \n", "MoSold | \n", "YrSold | \n", "SalePrice | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "60 | \n", "65.0 | \n", "8450 | \n", "7 | \n", "5 | \n", "2003 | \n", "2003 | \n", "196.0 | \n", "706 | \n", "... | \n", "0 | \n", "61 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "2 | \n", "2008 | \n", "208500 | \n", "
1 | \n", "2 | \n", "20 | \n", "80.0 | \n", "9600 | \n", "6 | \n", "8 | \n", "1976 | \n", "1976 | \n", "0.0 | \n", "978 | \n", "... | \n", "298 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "5 | \n", "2007 | \n", "181500 | \n", "
2 | \n", "3 | \n", "60 | \n", "68.0 | \n", "11250 | \n", "7 | \n", "5 | \n", "2001 | \n", "2002 | \n", "162.0 | \n", "486 | \n", "... | \n", "0 | \n", "42 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "9 | \n", "2008 | \n", "223500 | \n", "
3 | \n", "4 | \n", "70 | \n", "60.0 | \n", "9550 | \n", "7 | \n", "5 | \n", "1915 | \n", "1970 | \n", "0.0 | \n", "216 | \n", "... | \n", "0 | \n", "35 | \n", "272 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "2 | \n", "2006 | \n", "140000 | \n", "
4 | \n", "5 | \n", "60 | \n", "84.0 | \n", "14260 | \n", "8 | \n", "5 | \n", "2000 | \n", "2000 | \n", "350.0 | \n", "655 | \n", "... | \n", "192 | \n", "84 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "12 | \n", "2008 | \n", "250000 | \n", "
5 rows × 38 columns
\n", "