{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import necessary dependencies and settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "np.set_printoptions(suppress=True)\n",
    "pt = np.get_printoptions()['threshold']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Threshold based methods"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Limiting features in bag of word based models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=0.85, max_features=2000, min_df=0.1,\n",
       "        ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "        strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "        tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "cv = CountVectorizer(min_df=0.1, max_df=0.85, max_features=2000)\n",
    "cv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variance based thresholding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gen 1</th>\n",
       "      <th>Gen 2</th>\n",
       "      <th>Gen 3</th>\n",
       "      <th>Gen 4</th>\n",
       "      <th>Gen 5</th>\n",
       "      <th>Gen 6</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gen 1  Gen 2  Gen 3  Gen 4  Gen 5  Gen 6\n",
       "0      1      0      0      0      0      0\n",
       "1      1      0      0      0      0      0\n",
       "2      1      0      0      0      0      0\n",
       "3      1      0      0      0      0      0\n",
       "4      1      0      0      0      0      0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('datasets/Pokemon.csv')\n",
    "poke_gen = pd.get_dummies(df['Generation'])\n",
    "poke_gen.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VarianceThreshold(threshold=0.15)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_selection import VarianceThreshold\n",
    "\n",
    "vt = VarianceThreshold(threshold=.15)\n",
    "vt.fit(poke_gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gen 1</th>\n",
       "      <th>Gen 2</th>\n",
       "      <th>Gen 3</th>\n",
       "      <th>Gen 4</th>\n",
       "      <th>Gen 5</th>\n",
       "      <th>Gen 6</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>select_feature</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variance</th>\n",
       "      <td>0.164444</td>\n",
       "      <td>0.114944</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.128373</td>\n",
       "      <td>0.163711</td>\n",
       "      <td>0.0919937</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Gen 1     Gen 2 Gen 3     Gen 4     Gen 5      Gen 6\n",
       "select_feature      True     False  True     False      True      False\n",
       "variance        0.164444  0.114944  0.16  0.128373  0.163711  0.0919937"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({'variance': vt.variances_,\n",
    "              'select_feature': vt.get_support()},\n",
    "            index=poke_gen.columns).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gen 1</th>\n",
       "      <th>Gen 3</th>\n",
       "      <th>Gen 5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Gen 1  Gen 3  Gen 5\n",
       "0      1      0      0\n",
       "1      1      0      0\n",
       "2      1      0      0\n",
       "3      1      0      0\n",
       "4      1      0      0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head()\n",
    "poke_gen_subset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Statistical Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature set shape: (569, 30)\n",
      "Response class shape: (569,)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "bc_data = load_breast_cancer()\n",
    "bc_features = pd.DataFrame(bc_data.data, columns=bc_data.feature_names)\n",
    "bc_classes = pd.DataFrame(bc_data.target, columns=['IsMalignant'])\n",
    "\n",
    "# build featureset and response class labels \n",
    "bc_X = np.array(bc_features)\n",
    "bc_y = np.array(bc_classes).T[0]\n",
    "print('Feature set shape:', bc_X.shape)\n",
    "print('Response class shape:', bc_y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature set data [shape: (569, 30)]\n",
      "[[  17.99   10.38  122.8  ...,    0.27    0.46    0.12]\n",
      " [  20.57   17.77  132.9  ...,    0.19    0.28    0.09]\n",
      " [  19.69   21.25  130.   ...,    0.24    0.36    0.09]\n",
      " ..., \n",
      " [  16.6    28.08  108.3  ...,    0.14    0.22    0.08]\n",
      " [  20.6    29.33  140.1  ...,    0.26    0.41    0.12]\n",
      " [   7.76   24.54   47.92 ...,    0.      0.29    0.07]] \n",
      "\n",
      "Feature names:\n",
      "['mean radius' 'mean texture' 'mean perimeter' 'mean area'\n",
      " 'mean smoothness' 'mean compactness' 'mean concavity'\n",
      " 'mean concave points' 'mean symmetry' 'mean fractal dimension'\n",
      " 'radius error' 'texture error' 'perimeter error' 'area error'\n",
      " 'smoothness error' 'compactness error' 'concavity error'\n",
      " 'concave points error' 'symmetry error' 'fractal dimension error'\n",
      " 'worst radius' 'worst texture' 'worst perimeter' 'worst area'\n",
      " 'worst smoothness' 'worst compactness' 'worst concavity'\n",
      " 'worst concave points' 'worst symmetry' 'worst fractal dimension'] \n",
      "\n",
      "Predictor Class label data [shape: (569,)]\n",
      "[0 0 0 ..., 0 0 1] \n",
      "\n",
      "Predictor name: ['IsMalignant']\n"
     ]
    }
   ],
   "source": [
    "np.set_printoptions(threshold=30)\n",
    "print('Feature set data [shape: '+str(bc_X.shape)+']')\n",
    "print(np.round(bc_X, 2), '\\n')\n",
    "print('Feature names:')\n",
    "print(np.array(bc_features.columns), '\\n')\n",
    "print('Predictor Class label data [shape: '+str(bc_y.shape)+']')\n",
    "print(bc_y, '\\n')\n",
    "print('Predictor name:', np.array(bc_classes.columns))\n",
    "np.set_printoptions(threshold=pt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SelectKBest(k=15, score_func=<function chi2 at 0x00000166BF43A7B8>)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_selection import chi2, SelectKBest\n",
    "\n",
    "skb = SelectKBest(score_func=chi2, k=15)\n",
    "skb.fit(bc_X, bc_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('worst area', 112598.43156405364),\n",
       " ('mean area', 53991.655923750892),\n",
       " ('area error', 8758.5047053344697),\n",
       " ('worst perimeter', 3665.0354163405909),\n",
       " ('mean perimeter', 2011.1028637679051),\n",
       " ('worst radius', 491.68915743332195),\n",
       " ('mean radius', 266.10491719517802),\n",
       " ('perimeter error', 250.57189635982184),\n",
       " ('worst texture', 174.44939960571074),\n",
       " ('mean texture', 93.897508098633352)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_scores = [(item, score) for item, score in zip(bc_data.feature_names, skb.scores_)]\n",
    "sorted(feature_scores, key=lambda x: -x[1])[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(569, 15)\n",
      "['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'mean concavity'\n",
      " 'radius error' 'perimeter error' 'area error' 'worst radius'\n",
      " 'worst texture' 'worst perimeter' 'worst area' 'worst compactness'\n",
      " 'worst concavity' 'worst concave points']\n"
     ]
    }
   ],
   "source": [
    "select_features_kbest = skb.get_support()\n",
    "feature_names_kbest = bc_data.feature_names[select_features_kbest]\n",
    "feature_subset_df = bc_features[feature_names_kbest]\n",
    "bc_SX = np.array(feature_subset_df)\n",
    "print(bc_SX.shape)\n",
    "print(feature_names_kbest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean radius</th>\n",
       "      <th>mean texture</th>\n",
       "      <th>mean perimeter</th>\n",
       "      <th>mean area</th>\n",
       "      <th>mean concavity</th>\n",
       "      <th>radius error</th>\n",
       "      <th>perimeter error</th>\n",
       "      <th>area error</th>\n",
       "      <th>worst radius</th>\n",
       "      <th>worst texture</th>\n",
       "      <th>worst perimeter</th>\n",
       "      <th>worst area</th>\n",
       "      <th>worst compactness</th>\n",
       "      <th>worst concavity</th>\n",
       "      <th>worst concave points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>13.08</td>\n",
       "      <td>15.71</td>\n",
       "      <td>85.63</td>\n",
       "      <td>520.0</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.19</td>\n",
       "      <td>1.38</td>\n",
       "      <td>14.67</td>\n",
       "      <td>14.50</td>\n",
       "      <td>20.49</td>\n",
       "      <td>96.09</td>\n",
       "      <td>630.5</td>\n",
       "      <td>0.28</td>\n",
       "      <td>0.19</td>\n",
       "      <td>0.07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>9.50</td>\n",
       "      <td>12.44</td>\n",
       "      <td>60.34</td>\n",
       "      <td>273.9</td>\n",
       "      <td>0.03</td>\n",
       "      <td>0.28</td>\n",
       "      <td>1.91</td>\n",
       "      <td>15.70</td>\n",
       "      <td>10.23</td>\n",
       "      <td>15.66</td>\n",
       "      <td>65.13</td>\n",
       "      <td>314.9</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>15.34</td>\n",
       "      <td>14.26</td>\n",
       "      <td>102.50</td>\n",
       "      <td>704.4</td>\n",
       "      <td>0.21</td>\n",
       "      <td>0.44</td>\n",
       "      <td>3.38</td>\n",
       "      <td>44.91</td>\n",
       "      <td>18.07</td>\n",
       "      <td>19.08</td>\n",
       "      <td>125.10</td>\n",
       "      <td>980.9</td>\n",
       "      <td>0.60</td>\n",
       "      <td>0.63</td>\n",
       "      <td>0.24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>21.16</td>\n",
       "      <td>23.04</td>\n",
       "      <td>137.20</td>\n",
       "      <td>1404.0</td>\n",
       "      <td>0.11</td>\n",
       "      <td>0.69</td>\n",
       "      <td>4.30</td>\n",
       "      <td>93.99</td>\n",
       "      <td>29.17</td>\n",
       "      <td>35.59</td>\n",
       "      <td>188.00</td>\n",
       "      <td>2615.0</td>\n",
       "      <td>0.26</td>\n",
       "      <td>0.32</td>\n",
       "      <td>0.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>16.65</td>\n",
       "      <td>21.38</td>\n",
       "      <td>110.00</td>\n",
       "      <td>904.6</td>\n",
       "      <td>0.15</td>\n",
       "      <td>0.81</td>\n",
       "      <td>5.46</td>\n",
       "      <td>102.60</td>\n",
       "      <td>26.46</td>\n",
       "      <td>31.56</td>\n",
       "      <td>177.00</td>\n",
       "      <td>2215.0</td>\n",
       "      <td>0.36</td>\n",
       "      <td>0.47</td>\n",
       "      <td>0.21</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    mean radius  mean texture  mean perimeter  mean area  mean concavity  \\\n",
       "20        13.08         15.71           85.63      520.0            0.05   \n",
       "21         9.50         12.44           60.34      273.9            0.03   \n",
       "22        15.34         14.26          102.50      704.4            0.21   \n",
       "23        21.16         23.04          137.20     1404.0            0.11   \n",
       "24        16.65         21.38          110.00      904.6            0.15   \n",
       "\n",
       "    radius error  perimeter error  area error  worst radius  worst texture  \\\n",
       "20          0.19             1.38       14.67         14.50          20.49   \n",
       "21          0.28             1.91       15.70         10.23          15.66   \n",
       "22          0.44             3.38       44.91         18.07          19.08   \n",
       "23          0.69             4.30       93.99         29.17          35.59   \n",
       "24          0.81             5.46      102.60         26.46          31.56   \n",
       "\n",
       "    worst perimeter  worst area  worst compactness  worst concavity  \\\n",
       "20            96.09       630.5               0.28             0.19   \n",
       "21            65.13       314.9               0.11             0.09   \n",
       "22           125.10       980.9               0.60             0.63   \n",
       "23           188.00      2615.0               0.26             0.32   \n",
       "24           177.00      2215.0               0.36             0.47   \n",
       "\n",
       "    worst concave points  \n",
       "20                  0.07  \n",
       "21                  0.06  \n",
       "22                  0.24  \n",
       "23                  0.20  \n",
       "24                  0.21  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.round(feature_subset_df.iloc[20:25], 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model accuracy statistics with 5-fold cross validation\n",
      "Model accuracy with complete feature set (569, 30) : 0.950904193921\n",
      "Model accuracy with selected feature set (569, 15) : 0.952643324356\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "# build logistic regression model\n",
    "lr = LogisticRegression()\n",
    "\n",
    "# evaluating accuracy for model built on full featureset\n",
    "full_feat_acc = np.average(cross_val_score(lr, bc_X, bc_y, scoring='accuracy', cv=5))\n",
    "# evaluating accuracy for model built on selected featureset\n",
    "sel_feat_acc = np.average(cross_val_score(lr, bc_SX, bc_y, scoring='accuracy', cv=5))\n",
    "\n",
    "print('Model accuracy statistics with 5-fold cross validation')\n",
    "print('Model accuracy with complete feature set', bc_X.shape, ':', full_feat_acc)\n",
    "print('Model accuracy with selected feature set', bc_SX.shape, ':', sel_feat_acc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Recursive Feature Elimination"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
       "          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
       "          verbose=0, warm_start=False),\n",
       "  n_features_to_select=15, step=1, verbose=0)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_selection import RFE\n",
    "\n",
    "lr = LogisticRegression()\n",
    "rfe = RFE(estimator=lr, n_features_to_select=15, step=1)\n",
    "rfe.fit(bc_X, bc_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['mean radius' 'mean texture' 'mean perimeter' 'mean smoothness'\n",
      " 'mean concavity' 'mean concave points' 'mean symmetry' 'texture error'\n",
      " 'worst radius' 'worst texture' 'worst smoothness' 'worst concavity'\n",
      " 'worst concave points' 'worst symmetry' 'worst fractal dimension']\n"
     ]
    }
   ],
   "source": [
    "select_features_rfe = rfe.get_support()\n",
    "feature_names_rfe = bc_data.feature_names[select_features_rfe]\n",
    "print(feature_names_rfe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'mean concavity',\n",
       " 'mean perimeter',\n",
       " 'mean radius',\n",
       " 'mean texture',\n",
       " 'worst concave points',\n",
       " 'worst concavity',\n",
       " 'worst radius',\n",
       " 'worst texture'}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(feature_names_kbest) & set(feature_names_rfe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model based selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n",
       "            verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rfc = RandomForestClassifier()\n",
    "rfc.fit(bc_X, bc_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('worst concave points', 0.22465186401289805),\n",
       " ('worst area', 0.22183657032316897),\n",
       " ('mean concave points', 0.18192574025833769),\n",
       " ('worst perimeter', 0.099521838900566054),\n",
       " ('worst radius', 0.084068507192381611),\n",
       " ('worst texture', 0.02243708745933972),\n",
       " ('mean perimeter', 0.020073882937172081),\n",
       " ('worst smoothness', 0.014608966775322443),\n",
       " ('mean radius', 0.01374196961657885),\n",
       " ('worst concavity', 0.011340255118074721)]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "importance_scores = rfc.feature_importances_\n",
    "feature_importances = [(feature, score) for feature, score in zip(bc_data.feature_names, importance_scores)]\n",
    "sorted(feature_importances, key=lambda x: -x[1])[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature extraction using dimensionality reduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30, 3)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# center the feature set\n",
    "bc_XC = bc_X - bc_X.mean(axis=0)\n",
    "\n",
    "# decompose using SVD\n",
    "U, S, VT = np.linalg.svd(bc_XC)\n",
    "\n",
    "# get principal components\n",
    "PC = VT.T\n",
    "\n",
    "# get first 3 principal components\n",
    "PC3 = PC[:, 0:3]\n",
    "PC3.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1160.14,  -293.92,   -48.58],\n",
       "       [-1269.12,    15.63,    35.39],\n",
       "       [ -995.79,    39.16,     1.71],\n",
       "       ..., \n",
       "       [ -314.5 ,    47.55,    10.44],\n",
       "       [-1124.86,    34.13,    19.74],\n",
       "       [  771.53,   -88.64,   -23.89]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reduce feature set dimensionality \n",
    "np.round(bc_XC.dot(PC3), 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,\n",
       "  svd_solver='auto', tol=0.0, whiten=False)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "pca = PCA(n_components=3)\n",
    "pca.fit(bc_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.98204467,  0.01617649,  0.00155751])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pca.explained_variance_ratio_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 1160.14,  -293.92,    48.58],\n",
       "       [ 1269.12,    15.63,   -35.39],\n",
       "       [  995.79,    39.16,    -1.71],\n",
       "       ..., \n",
       "       [  314.5 ,    47.55,   -10.44],\n",
       "       [ 1124.86,    34.13,   -19.74],\n",
       "       [ -771.53,   -88.64,    23.89]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bc_pca = pca.transform(bc_X)\n",
    "np.round(bc_pca, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.92808003078106949"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.average(cross_val_score(lr, bc_pca, bc_y, scoring='accuracy', cv=5))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}