{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "## Why do Feature Selection ??\n", "\n", "# Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.\n", "# Improves Accuracy: Less misleading data means modeling accuracy improves.\n", "# Reduces Training Time: Less data means that algorithms train faster." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.19.1\n", "0.20.3\n", "1.14.2\n" ] } ], "source": [ "# Imports\n", "from sklearn.utils import shuffle\n", "import pandas as pd\n", "import numpy as np\n", "np.random.seed(10)\n", "import sklearn\n", "from numpy import set_printoptions\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import chi2\n", "from sklearn.feature_selection import RFE\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import ExtraTreesClassifier\n", "from sklearn import preprocessing\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "print sklearn.__version__\n", "print pd.__version__\n", "print np.__version__" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# this class takes care for scaling the features to the scale of 0-1\n", "# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which \n", "# also has the range from 0-1\n", "class Normalizer:\n", "\n", " def __init__(self):\n", " self.sc = MinMaxScaler()\n", " \n", " def scale(self, X, dtype):\n", " if dtype=='train':\n", " XX = self.sc.fit_transform(X)\n", " elif dtype=='test':\n", " XX = self.sc.transform(X)\n", " else:\n", " return None\n", " return XX" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " feat1 feat2 feat3 feat4 class\n", "87 6.3 2.3 4.4 1.3 Iris-versicolor\n", "111 6.4 2.7 5.3 1.9 Iris-virginica\n", "********************\n", "[[0.55555556 0.125 0.57627119 0.5 ]\n", " [0.58333333 0.29166667 0.72881356 0.75 ]] ['Iris-versicolor' 'Iris-virginica']\n", "********************\n", "[11.377 4.34 26.827 30.569]\n", "********************\n", "[[0.576 0.5 ]\n", " [0.729 0.75 ]]\n" ] } ], "source": [ "#### METHOD-1\n", "# Univariate Feature Selection\n", "###\n", "\n", "# Select those features that have strong relationship with the output variable\n", "df = pd.read_csv('../data/day1/iris.csv')\n", "df = shuffle(df)\n", "print df.head(2)\n", "print '*'*20\n", "norm = Normalizer()\n", "X = norm.scale(df.iloc[:,:-1].values, 'train')\n", "Y = df.iloc[:,-1].values\n", "print X[:2], Y[:2]\n", "print '*'*20\n", "# feature extraction\n", "test = SelectKBest(score_func=chi2, k=2)\n", "fit = test.fit(X, Y)\n", "# summarize scores\n", "np.set_printoptions(precision=3)\n", "print(fit.scores_)\n", "print '*'*20\n", "features = fit.transform(X)\n", "# summarize selected features\n", "print(features[0:2,:])\n", "\n", "# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Suggestions before you see the next cell\n", "\n", "* Read about chi-square test and get the intution." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " feat1 feat2 feat3 feat4 class\n", "87 6.3 2.3 4.4 1.3 Iris-versicolor\n", "100 6.3 3.3 6.0 2.5 Iris-virginica\n", "********************\n", "[[0.556 0.125 0.576 0.5 ]\n", " [0.556 0.542 0.847 1. ]] ['Iris-versicolor' 'Iris-virginica']\n", "********************\n", "Selected Features: [False False True True]\n" ] } ], "source": [ "#### METHOD-2\n", "# Recursive Feature Elimination (Backwards)\n", "###\n", "# The Recursive Feature Elimination (or RFE) works by recursively removing attributes and\n", "# building a model on those attributes that remain. It uses the model accuracy to identify which attributes \n", "# (and combination of attributes) contribute the most to predicting the target attribute\n", "\n", "df = pd.read_csv('../data/day1/iris.csv')\n", "df = shuffle(df)\n", "print df.head(2)\n", "print '*'*20\n", "norm = Normalizer()\n", "X = norm.scale(df.iloc[:,:-1].values, 'train')\n", "Y = df.iloc[:,-1].values\n", "print X[:2], Y[:2]\n", "print '*'*20\n", "model = LogisticRegression()\n", "rfe = RFE(model, 2)\n", "fit = rfe.fit(X, Y)\n", "print(\"Selected Features: %s\") % fit.support_\n", "\n", "# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Suggestions before you see the next cell\n", "\n", "* http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " feat1 feat2 feat3 feat4 class\n", "36 5.5 3.5 1.3 0.2 Iris-setosa\n", "43 5.0 3.5 1.6 0.6 Iris-setosa\n", "********************\n", "[[0.333 0.625 0.051 0.042]\n", " [0.194 0.625 0.102 0.208]] ['Iris-setosa' 'Iris-setosa']\n", "********************\n", "[0.083 0.086 0.447 0.384]\n" ] } ], "source": [ "#### METHOD-3\n", "# Using Feature importance from any bagging algorithm\n", "###\n", "df = pd.read_csv('../data/day1/iris.csv')\n", "df = shuffle(df)\n", "print df.head(2)\n", "print '*'*20\n", "norm = Normalizer()\n", "X = norm.scale(df.iloc[:,:-1].values, 'train')\n", "Y = df.iloc[:,-1].values\n", "print X[:2], Y[:2]\n", "print '*'*20\n", "# feature extraction\n", "model = ExtraTreesClassifier()\n", "model.fit(X, Y)\n", "print model.feature_importances_\n", "\n", "# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Feature Selection and Elimination are mutually exclusive techniques for getting correct set of features for\n", "# your ML model\n", "\n", "# Few techniques for elimination are:\n", "## Remove Zero/Less Variance columns\n", "## Remove Columns with many missing values\n", "## Remove Highly +ve/-ve co-related features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Other techniques for feature selection include: \n", "## PCA Decomposition\n", "## Auto Encoders\n", "\n", "### we will see them in few upcoming days!!" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }