{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 07- Feature Selection\n", "by [Alejandro Correa Bahnsen](albahnsen.com/)\n", "\n", "version 0.2, May 2016\n", "\n", "## Part of the class [Machine Learning for Security Informatics](https://github.com/albahnsen/ML_SecurityInformatics)\n", "\n", "\n", "This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US). Special thanks goes to [Kevin Markham](https://github.com/justmarkham)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing & Cross-validation (review)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " Survived Pclass \\\n", "PassengerId \n", "1 0 3 \n", "2 1 1 \n", "3 1 3 \n", "4 1 1 \n", "5 0 3 \n", "\n", " Name Sex Age \\\n", "PassengerId \n", "1 Braund, Mr. Owen Harris male 22.0 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", "3 Heikkinen, Miss. Laina female 26.0 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", "5 Allen, Mr. William Henry male 35.0 \n", "\n", " SibSp Parch Ticket Fare Cabin Embarked \n", "PassengerId \n", "1 1 0 A/5 21171 7.2500 NaN S \n", "2 1 0 PC 17599 71.2833 C85 C \n", "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", "4 1 0 113803 53.1000 C123 S \n", "5 0 0 373450 8.0500 NaN S " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import zipfile\n", "with zipfile.ZipFile('../datasets/titanic.csv.zip', 'r') as z:\n", " f = z.open('titanic.csv')\n", " titanic = pd.read_csv(f, sep=',', index_col=0)\n", "titanic.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "titanic.Age.fillna(titanic.Age.median(), inplace=True)\n", "titanic.loc[titanic.Embarked.isnull(), 'Embarked'] = titanic.Embarked.mode().values" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "titanic['Sex_Female'] = titanic.Sex.map({'male':0, 'female':1})\n", "embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked')\n", "titanic = pd.concat([titanic, embarked_dummies], axis=1)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "titanic['Age2'] = titanic['Age'] ** 2\n", "titanic['Age3'] = titanic['Age'] ** 3" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.cross_validation import cross_val_score\n", "logreg = LogisticRegression(C=1e9)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 10.000000\n", "mean 0.716350\n", "std 0.082241\n", "min 0.611111\n", "25% 0.630150\n", "50% 0.735955\n", "75% 0.772472\n", "max 0.829545\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "features = ['Pclass', 'Age', 'Age2', 'Age3', 'Parch', 'SibSp', 'Sex_Female', 'Embarked_C', 'Embarked_Q', 'Embarked_S'] \n", "X = titanic[list(features)]\n", "y = titanic['Survived']\n", "pd.Series(cross_val_score(logreg, X, y, cv=10, scoring='accuracy')).describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing features with low variance\n", "\n", "VarianceThreshold is a simple baseline approach to feature selection. It removes all features whose variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e. features that have the same value in all samples.\n", "\n", "As an example, suppose that we have a dataset with boolean features, and we want to remove all features that are either one or zero (on or off) in more than 80% of the samples. 
Boolean features are Bernoulli random variables, and the variance of such variables is given by\n", "$$\\mathrm{Var}[X] = p(1 - p)$$\n", "so we can select using the threshold .8 * (1 - .8):" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([ 6.98230591e-01, 1.69322249e+02, 8.01186989e+05,\n", " 3.31010522e+09, 6.48999031e-01, 1.21467827e+00,\n", " 2.28218083e-01, 1.53000261e-01, 7.89513794e-02,\n", " 1.99362373e-01]),\n", " array([ True, True, True, True, True, True, True, False, False, True], dtype=bool))" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import VarianceThreshold\n", "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n", "sel.fit(X)\n", "sel.variances_, sel.get_support()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Embarked_C' 'Embarked_Q']\n" ] } ], "source": [ "X_sel = sel.transform(X)\n", "features_sel = np.array(features)[sel.get_support()]\n", "print(np.array(features)[~sel.get_support()])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 10.000000\n", "mean 0.701693\n", "std 0.082188\n", "min 0.611111\n", "25% 0.617978\n", "50% 0.709613\n", "75% 0.758427\n", "max 0.820225\n", "dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(cross_val_score(logreg, X_sel, y, cv=10, scoring='accuracy')).describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Univariate feature selection\n", "\n", "Univariate feature selection works by selecting the best features based on univariate statistical tests. It can be seen as a preprocessing step to an estimator. Scikit-learn exposes feature selection routines as objects that implement the transform method:\n", "\n", "* SelectKBest removes all but the k highest scoring features\n", "* SelectPercentile removes all but a user-specified highest scoring percentage of features\n", "using common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe.\n", "* GenericUnivariateSelect allows to perform univariate feature\n", "selection with a configurable strategy. 
This allows selecting the best univariate selection strategy with a hyper-parameter search estimator." ] },
{ "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, False, True, True, True, True, False, True], dtype=bool)" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import SelectKBest\n", "\n", "sel = SelectKBest(k=8)\n", "sel.fit(X, y)\n", "sel.get_support()" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Age3' 'Embarked_Q']\n" ] } ], "source": [ "print(np.array(features)[~sel.get_support()])" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Pclass' 'Age' 'Age2' 'Parch' 'SibSp' 'Sex_Female' 'Embarked_C'\n", " 'Embarked_S']\n" ] } ], "source": [ "print(np.array(features)[sel.get_support()])" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 10.000000\n", "mean 0.804803\n", "std 0.026880\n", "min 0.766667\n", "25% 0.786517\n", "50% 0.793258\n", "75% 0.828652\n", "max 0.842697\n", "dtype: float64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_sel = sel.transform(X)\n", "pd.Series(cross_val_score(logreg, X_sel, y, cv=10, scoring='accuracy')).describe()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "There is still the question of how to select the parameter k. One way to tune it is sketched below." ] },
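{ "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch of one way to pick k: treat it as a hyper-parameter of a SelectKBest + logistic regression pipeline and tune it with a cross-validated grid search. This cell is an added illustration; the pipeline, the parameter grid, and the use of GridSearchCV are assumptions rather than part of the original analysis." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Added sketch: tune k by grid search over a SelectKBest -> LogisticRegression pipeline.\n", "# (On scikit-learn >= 0.18, import GridSearchCV from sklearn.model_selection instead.)\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.grid_search import GridSearchCV\n", "\n", "pipe = Pipeline([('select', SelectKBest()), ('clf', LogisticRegression(C=1e9))])\n", "param_grid = {'select__k': list(range(1, len(features) + 1))}\n", "grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy')\n", "grid.fit(X, y)\n", "grid.best_params_, grid.best_score_" ] },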
{ "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ True, False, False, False, True, False, True, True, False, True], dtype=bool)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import SelectPercentile, f_classif\n", "\n", "sel = SelectPercentile(f_classif, percentile=50)\n", "sel.fit(X, y)\n", "sel.get_support()" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Age' 'Age2' 'Age3' 'SibSp' 'Embarked_Q']\n" ] } ], "source": [ "print(np.array(features)[~sel.get_support()])" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Pclass' 'Parch' 'Sex_Female' 'Embarked_C' 'Embarked_S']\n" ] } ], "source": [ "print(np.array(features)[sel.get_support()])" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 10.000000\n", "mean 0.777797\n", "std 0.021426\n", "min 0.741573\n", "25% 0.766667\n", "50% 0.774004\n", "75% 0.786517\n", "max 0.820225\n", "dtype: float64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_sel = sel.transform(X)\n", "pd.Series(cross_val_score(logreg, X_sel, y, cv=10, scoring='accuracy')).describe()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Recursive feature elimination\n", "\n", "Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and weights are assigned to each one of them. Then, features whose absolute weights are the smallest are pruned from the current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.\n", "\n", "RFECV performs RFE in a cross-validation loop to find the optimal number of features." ] },
{ "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.feature_selection import RFE" ] },
{ "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "RFE(estimator=LogisticRegression(C=1000000000.0, class_weight=None, dual=False,\n", " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", " multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n", " solver='liblinear', tol=0.0001, verbose=0, warm_start=False),\n", " estimator_params=None, n_features_to_select=6, step=1, verbose=0)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sel = RFE(estimator=logreg, n_features_to_select=6)\n", "sel.fit(X, y)" ] },
{ "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ True, False, False, False, False, True, True, True, True, True], dtype=bool)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sel.get_support()" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Age' 'Age2' 'Age3' 'Parch']\n" ] } ], "source": [ "print(np.array(features)[~sel.get_support()])" ] },
{ "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Pclass' 'SibSp' 'Sex_Female' 'Embarked_C' 'Embarked_Q' 'Embarked_S']\n" ] } ], "source": [ "print(np.array(features)[sel.get_support()])" ] },
{ "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 10.000000\n", "mean 0.784526\n", "std 0.017122\n", "min 0.764045\n", "25% 0.771023\n", "50% 0.786517\n", "75% 0.788296\n", "max 0.820225\n", "dtype: float64" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_sel = sel.transform(X)\n", "pd.Series(cross_val_score(logreg, X_sel, y, cv=10, scoring='accuracy')).describe()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" }, "name": "_merged" }, "nbformat": 4, "nbformat_minor": 0 }