{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Why do Feature Selection ??\n",
    "\n",
    "# Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.\n",
    "# Improves Accuracy: Less misleading data means modeling accuracy improves.\n",
    "# Reduces Training Time: Less data means that algorithms train faster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.19.1\n",
      "0.20.3\n",
      "1.14.2\n"
     ]
    }
   ],
   "source": [
    "# Imports\n",
    "from sklearn.utils import shuffle\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "np.random.seed(10)\n",
    "import sklearn\n",
    "from numpy import set_printoptions\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import chi2\n",
    "from sklearn.feature_selection import RFE\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn import preprocessing\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "print sklearn.__version__\n",
    "print pd.__version__\n",
    "print np.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# this class takes care for scaling the features to the scale of 0-1\n",
    "# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which \n",
    "# also has the range from 0-1\n",
    "class Normalizer:\n",
    "\n",
    "    def __init__(self):\n",
    "        self.sc = MinMaxScaler()\n",
    "    \n",
    "    def scale(self, X, dtype):\n",
    "        if dtype=='train':\n",
    "            XX = self.sc.fit_transform(X)\n",
    "        elif dtype=='test':\n",
    "            XX = self.sc.transform(X)\n",
    "        else:\n",
    "            return None\n",
    "        return XX"
   ]
  },
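  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the `Normalizer` class above: a minimal sketch on a tiny made-up array (the values are hypothetical, purely for illustration). It fits the per-column min/max on the 'train' call and reuses them on the 'test' call."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# hypothetical toy data, just to exercise the Normalizer class above\n",
    "toy_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])\n",
    "toy_test = np.array([[1.5, 25.0]])\n",
    "\n",
    "toy_norm = Normalizer()\n",
    "print(toy_norm.scale(toy_train, 'train'))  # fit_transform: learns per-column min/max\n",
    "print(toy_norm.scale(toy_test, 'test'))    # transform only: reuses the train min/max\n",
    "print(toy_norm.scale(toy_test, 'foo'))     # unknown dtype -> None"
   ]
  },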
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     feat1  feat2  feat3  feat4            class\n",
      "87     6.3    2.3    4.4    1.3  Iris-versicolor\n",
      "111    6.4    2.7    5.3    1.9   Iris-virginica\n",
      "********************\n",
      "[[0.55555556 0.125      0.57627119 0.5       ]\n",
      " [0.58333333 0.29166667 0.72881356 0.75      ]] ['Iris-versicolor' 'Iris-virginica']\n",
      "********************\n",
      "[11.377  4.34  26.827 30.569]\n",
      "********************\n",
      "[[0.576 0.5  ]\n",
      " [0.729 0.75 ]]\n"
     ]
    }
   ],
   "source": [
    "#### METHOD-1\n",
    "# Univariate Feature Selection\n",
    "###\n",
    "\n",
    "# Select those features that have strong relationship with the output variable\n",
    "df = pd.read_csv('../data/day1/iris.csv')\n",
    "df = shuffle(df)\n",
    "print df.head(2)\n",
    "print '*'*20\n",
    "norm = Normalizer()\n",
    "X = norm.scale(df.iloc[:,:-1].values, 'train')\n",
    "Y = df.iloc[:,-1].values\n",
    "print X[:2], Y[:2]\n",
    "print '*'*20\n",
    "# feature extraction\n",
    "test = SelectKBest(score_func=chi2, k=2)\n",
    "fit = test.fit(X, Y)\n",
    "# summarize scores\n",
    "np.set_printoptions(precision=3)\n",
    "print(fit.scores_)\n",
    "print '*'*20\n",
    "features = fit.transform(X)\n",
    "# summarize selected features\n",
    "print(features[0:2,:])\n",
    "\n",
    "# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique"
   ]
  },
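  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Rather than eyeballing the transformed values, `SelectKBest` exposes `get_support()`, a boolean mask that can be mapped back to the dataframe's column names. A small sketch reusing `df` and `fit` from the cell above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# map the boolean support mask back to the original feature names\n",
    "feature_names = df.columns[:-1]\n",
    "mask = fit.get_support()\n",
    "selected = [name for name, keep in zip(feature_names, mask) if keep]\n",
    "print(selected)  # should list feat3 and feat4, matching the scores above"
   ]
  },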
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Suggestions before you see the next cell\n",
    "\n",
    "* Read about chi-square test and get the intution."
   ]
  },
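  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is a hand-rolled sketch of what, to the best of my understanding, sklearn's `chi2` computes in the METHOD-1 cell: treat each (non-negative) feature value as a count, total it per class, and measure how far the observed per-class totals deviate from the totals expected if the feature and the class were independent. It reuses `X`, `Y` and `fit` from above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# hand-rolled chi-square scores, for intuition (reuses X, Y, fit from METHOD-1)\n",
    "from sklearn.preprocessing import LabelBinarizer\n",
    "\n",
    "Y_ind = LabelBinarizer().fit_transform(Y)       # one-hot class indicator matrix\n",
    "observed = np.dot(Y_ind.T, X)                   # per-class total of each feature\n",
    "class_prob = Y_ind.mean(axis=0)                 # fraction of samples in each class\n",
    "expected = np.outer(class_prob, X.sum(axis=0))  # totals if feature and class were independent\n",
    "chi2_scores = ((observed - expected) ** 2 / expected).sum(axis=0)\n",
    "print(chi2_scores)   # should match fit.scores_ printed above\n",
    "print(fit.scores_)"
   ]
  },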
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     feat1  feat2  feat3  feat4            class\n",
      "87     6.3    2.3    4.4    1.3  Iris-versicolor\n",
      "100    6.3    3.3    6.0    2.5   Iris-virginica\n",
      "********************\n",
      "[[0.556 0.125 0.576 0.5  ]\n",
      " [0.556 0.542 0.847 1.   ]] ['Iris-versicolor' 'Iris-virginica']\n",
      "********************\n",
      "Selected Features: [False False  True  True]\n"
     ]
    }
   ],
   "source": [
    "#### METHOD-2\n",
    "# Recursive Feature Elimination (Backwards)\n",
    "###\n",
    "# The Recursive Feature Elimination (or RFE) works by recursively removing attributes and\n",
    "# building a model on those attributes that remain. It uses the model accuracy to identify which attributes \n",
    "# (and combination of attributes) contribute the most to predicting the target attribute\n",
    "\n",
    "df = pd.read_csv('../data/day1/iris.csv')\n",
    "df = shuffle(df)\n",
    "print df.head(2)\n",
    "print '*'*20\n",
    "norm = Normalizer()\n",
    "X = norm.scale(df.iloc[:,:-1].values, 'train')\n",
    "Y = df.iloc[:,-1].values\n",
    "print X[:2], Y[:2]\n",
    "print '*'*20\n",
    "model = LogisticRegression()\n",
    "rfe = RFE(model, 2)\n",
    "fit = rfe.fit(X, Y)\n",
    "print(\"Selected Features: %s\") % fit.support_\n",
    "\n",
    "# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique"
   ]
  },
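  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a small follow-up sketch: besides `support_`, the fitted `RFE` object exposes `ranking_` (1 means the feature was kept; larger numbers were eliminated earlier), which can be mapped back to the column names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# map RFE's ranking back to the original feature names\n",
    "feature_names = df.columns[:-1]\n",
    "for name, rank in zip(feature_names, fit.ranking_):\n",
    "    print('%s -> rank %d' % (name, rank))  # rank 1 = selected"
   ]
  },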
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Suggestions before you see the next cell\n",
    "\n",
    "* http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html"
   ]
  },
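  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The docs linked above also cover `RFECV`, which picks the number of features by cross-validation instead of hard-coding k. A minimal sketch, reusing `X` and `Y` from the METHOD-2 cell (the choice of `cv=5` is arbitrary here):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# RFECV: RFE with the number of features chosen by cross-validation\n",
    "from sklearn.feature_selection import RFECV\n",
    "\n",
    "rfecv = RFECV(estimator=LogisticRegression(), step=1, cv=5)\n",
    "rfecv.fit(X, Y)\n",
    "print('Optimal number of features: %d' % rfecv.n_features_)\n",
    "print('Selected Features: %s' % rfecv.support_)"
   ]
  },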
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    feat1  feat2  feat3  feat4        class\n",
      "36    5.5    3.5    1.3    0.2  Iris-setosa\n",
      "43    5.0    3.5    1.6    0.6  Iris-setosa\n",
      "********************\n",
      "[[0.333 0.625 0.051 0.042]\n",
      " [0.194 0.625 0.102 0.208]] ['Iris-setosa' 'Iris-setosa']\n",
      "********************\n",
      "[0.083 0.086 0.447 0.384]\n"
     ]
    }
   ],
   "source": [
    "#### METHOD-3\n",
    "# Using Feature importance from any bagging algorithm\n",
    "###\n",
    "df = pd.read_csv('../data/day1/iris.csv')\n",
    "df = shuffle(df)\n",
    "print df.head(2)\n",
    "print '*'*20\n",
    "norm = Normalizer()\n",
    "X = norm.scale(df.iloc[:,:-1].values, 'train')\n",
    "Y = df.iloc[:,-1].values\n",
    "print X[:2], Y[:2]\n",
    "print '*'*20\n",
    "# feature extraction\n",
    "model = ExtraTreesClassifier()\n",
    "model.fit(X, Y)\n",
    "print model.feature_importances_\n",
    "\n",
    "# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique"
   ]
  },
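  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To turn the importances above into an actual selector, sklearn's `SelectFromModel` can wrap the fitted model; by default for tree-based models (if I read the docs correctly) it keeps features whose importance is above the mean importance. A short sketch reusing `model` and `X` from the cell above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# SelectFromModel: keep features whose importance clears the default threshold\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "\n",
    "sfm = SelectFromModel(model, prefit=True)  # prefit=True: reuse the fitted model above\n",
    "print('Selected Features: %s' % sfm.get_support())\n",
    "print(sfm.transform(X)[:2])  # data reduced to the selected columns"
   ]
  },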
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Feature Selection and Elimination are mutually exclusive techniques for getting correct set of features for\n",
    "# your ML model\n",
    "\n",
    "# Few techniques for elimination are:\n",
    "## Remove Zero/Less Variance columns\n",
    "## Remove Columns with many missing values\n",
    "## Remove Highly +ve/-ve co-related features"
   ]
  },
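  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A rough sketch of the three elimination ideas above on a hypothetical toy dataframe (the column names, values, and thresholds are all made up for illustration; sklearn's `VarianceThreshold` offers the first step as a ready-made transformer):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# toy dataframe: one constant column, one mostly-missing column, one duplicated signal\n",
    "toy = pd.DataFrame({\n",
    "    'constant': [1.0, 1.0, 1.0, 1.0],\n",
    "    'sparse':   [1.0, np.nan, np.nan, 2.0],\n",
    "    'a':        [1.0, 2.0, 3.0, 4.0],\n",
    "    'a_copy':   [2.0, 4.0, 6.0, 8.0]   # perfectly correlated with 'a'\n",
    "})\n",
    "\n",
    "# 1. remove zero/low-variance columns\n",
    "toy = toy.loc[:, toy.var() > 1e-8]\n",
    "# 2. remove columns with many missing values (here: half or more missing)\n",
    "toy = toy.loc[:, toy.isnull().mean() < 0.5]\n",
    "# 3. remove highly +ve/-ve correlated features (keep one of each pair)\n",
    "corr = toy.corr().abs()\n",
    "upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))\n",
    "to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]\n",
    "toy = toy.drop(to_drop, axis=1)\n",
    "print(toy.columns.tolist())  # only 'a' should survive"
   ]
  },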
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Other techniques for feature selection include: \n",
    "## PCA Decomposition\n",
    "## Auto Encoders\n",
    "\n",
    "### we will see them in few upcoming days!!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}