{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Import necessary dependencies and settings" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "np.set_printoptions(suppress=True)\n", "pt = np.get_printoptions()['threshold']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Threshold based methods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Limiting features in bag of word based models" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", " dtype=, encoding='utf-8', input='content',\n", " lowercase=True, max_df=0.85, max_features=2000, min_df=0.1,\n", " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", " strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n", " tokenizer=None, vocabulary=None)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "cv = CountVectorizer(min_df=0.1, max_df=0.85, max_features=2000)\n", "cv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Variance based thresholding" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Gen 1Gen 2Gen 3Gen 4Gen 5Gen 6
0100000
1100000
2100000
3100000
4100000
\n", "
" ], "text/plain": [ " Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6\n", "0 1 0 0 0 0 0\n", "1 1 0 0 0 0 0\n", "2 1 0 0 0 0 0\n", "3 1 0 0 0 0 0\n", "4 1 0 0 0 0 0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('datasets/Pokemon.csv')\n", "poke_gen = pd.get_dummies(df['Generation'])\n", "poke_gen.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "VarianceThreshold(threshold=0.15)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import VarianceThreshold\n", "\n", "vt = VarianceThreshold(threshold=.15)\n", "vt.fit(poke_gen)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Gen 1Gen 2Gen 3Gen 4Gen 5Gen 6
select_featureTrueFalseTrueFalseTrueFalse
variance0.1644440.1149440.160.1283730.1637110.0919937
\n", "
" ], "text/plain": [ " Gen 1 Gen 2 Gen 3 Gen 4 Gen 5 Gen 6\n", "select_feature True False True False True False\n", "variance 0.164444 0.114944 0.16 0.128373 0.163711 0.0919937" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'variance': vt.variances_,\n", " 'select_feature': vt.get_support()},\n", " index=poke_gen.columns).T" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Gen 1Gen 3Gen 5
0100
1100
2100
3100
4100
\n", "
" ], "text/plain": [ " Gen 1 Gen 3 Gen 5\n", "0 1 0 0\n", "1 1 0 0\n", "2 1 0 0\n", "3 1 0 0\n", "4 1 0 0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head()\n", "poke_gen_subset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Statistical Methods" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Feature set shape: (569, 30)\n", "Response class shape: (569,)\n" ] } ], "source": [ "from sklearn.datasets import load_breast_cancer\n", "\n", "bc_data = load_breast_cancer()\n", "bc_features = pd.DataFrame(bc_data.data, columns=bc_data.feature_names)\n", "bc_classes = pd.DataFrame(bc_data.target, columns=['IsMalignant'])\n", "\n", "# build featureset and response class labels \n", "bc_X = np.array(bc_features)\n", "bc_y = np.array(bc_classes).T[0]\n", "print('Feature set shape:', bc_X.shape)\n", "print('Response class shape:', bc_y.shape)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Feature set data [shape: (569, 30)]\n", "[[ 17.99 10.38 122.8 ..., 0.27 0.46 0.12]\n", " [ 20.57 17.77 132.9 ..., 0.19 0.28 0.09]\n", " [ 19.69 21.25 130. ..., 0.24 0.36 0.09]\n", " ..., \n", " [ 16.6 28.08 108.3 ..., 0.14 0.22 0.08]\n", " [ 20.6 29.33 140.1 ..., 0.26 0.41 0.12]\n", " [ 7.76 24.54 47.92 ..., 0. 0.29 0.07]] \n", "\n", "Feature names:\n", "['mean radius' 'mean texture' 'mean perimeter' 'mean area'\n", " 'mean smoothness' 'mean compactness' 'mean concavity'\n", " 'mean concave points' 'mean symmetry' 'mean fractal dimension'\n", " 'radius error' 'texture error' 'perimeter error' 'area error'\n", " 'smoothness error' 'compactness error' 'concavity error'\n", " 'concave points error' 'symmetry error' 'fractal dimension error'\n", " 'worst radius' 'worst texture' 'worst perimeter' 'worst area'\n", " 'worst smoothness' 'worst compactness' 'worst concavity'\n", " 'worst concave points' 'worst symmetry' 'worst fractal dimension'] \n", "\n", "Predictor Class label data [shape: (569,)]\n", "[0 0 0 ..., 0 0 1] \n", "\n", "Predictor name: ['IsMalignant']\n" ] } ], "source": [ "np.set_printoptions(threshold=30)\n", "print('Feature set data [shape: '+str(bc_X.shape)+']')\n", "print(np.round(bc_X, 2), '\\n')\n", "print('Feature names:')\n", "print(np.array(bc_features.columns), '\\n')\n", "print('Predictor Class label data [shape: '+str(bc_y.shape)+']')\n", "print(bc_y, '\\n')\n", "print('Predictor name:', np.array(bc_classes.columns))\n", "np.set_printoptions(threshold=pt)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "SelectKBest(k=15, score_func=)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import chi2, SelectKBest\n", "\n", "skb = SelectKBest(score_func=chi2, k=15)\n", "skb.fit(bc_X, bc_y)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('worst area', 112598.43156405364),\n", " ('mean area', 53991.655923750892),\n", " ('area error', 8758.5047053344697),\n", " ('worst perimeter', 3665.0354163405909),\n", " ('mean perimeter', 2011.1028637679051),\n", " ('worst radius', 491.68915743332195),\n", " ('mean radius', 266.10491719517802),\n", " ('perimeter error', 250.57189635982184),\n", " ('worst texture', 174.44939960571074),\n", " ('mean texture', 93.897508098633352)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_scores = [(item, score) for item, score in zip(bc_data.feature_names, skb.scores_)]\n", "sorted(feature_scores, key=lambda x: -x[1])[:10]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(569, 15)\n", "['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'mean concavity'\n", " 'radius error' 'perimeter error' 'area error' 'worst radius'\n", " 'worst texture' 'worst perimeter' 'worst area' 'worst compactness'\n", " 'worst concavity' 'worst concave points']\n" ] } ], "source": [ "select_features_kbest = skb.get_support()\n", "feature_names_kbest = bc_data.feature_names[select_features_kbest]\n", "feature_subset_df = bc_features[feature_names_kbest]\n", "bc_SX = np.array(feature_subset_df)\n", "print(bc_SX.shape)\n", "print(feature_names_kbest)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean radiusmean texturemean perimetermean areamean concavityradius errorperimeter errorarea errorworst radiusworst textureworst perimeterworst areaworst compactnessworst concavityworst concave points
2013.0815.7185.63520.00.050.191.3814.6714.5020.4996.09630.50.280.190.07
219.5012.4460.34273.90.030.281.9115.7010.2315.6665.13314.90.110.090.06
2215.3414.26102.50704.40.210.443.3844.9118.0719.08125.10980.90.600.630.24
2321.1623.04137.201404.00.110.694.3093.9929.1735.59188.002615.00.260.320.20
2416.6521.38110.00904.60.150.815.46102.6026.4631.56177.002215.00.360.470.21
\n", "
" ], "text/plain": [ " mean radius mean texture mean perimeter mean area mean concavity \\\n", "20 13.08 15.71 85.63 520.0 0.05 \n", "21 9.50 12.44 60.34 273.9 0.03 \n", "22 15.34 14.26 102.50 704.4 0.21 \n", "23 21.16 23.04 137.20 1404.0 0.11 \n", "24 16.65 21.38 110.00 904.6 0.15 \n", "\n", " radius error perimeter error area error worst radius worst texture \\\n", "20 0.19 1.38 14.67 14.50 20.49 \n", "21 0.28 1.91 15.70 10.23 15.66 \n", "22 0.44 3.38 44.91 18.07 19.08 \n", "23 0.69 4.30 93.99 29.17 35.59 \n", "24 0.81 5.46 102.60 26.46 31.56 \n", "\n", " worst perimeter worst area worst compactness worst concavity \\\n", "20 96.09 630.5 0.28 0.19 \n", "21 65.13 314.9 0.11 0.09 \n", "22 125.10 980.9 0.60 0.63 \n", "23 188.00 2615.0 0.26 0.32 \n", "24 177.00 2215.0 0.36 0.47 \n", "\n", " worst concave points \n", "20 0.07 \n", "21 0.06 \n", "22 0.24 \n", "23 0.20 \n", "24 0.21 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.round(feature_subset_df.iloc[20:25], 2)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model accuracy statistics with 5-fold cross validation\n", "Model accuracy with complete feature set (569, 30) : 0.950904193921\n", "Model accuracy with selected feature set (569, 15) : 0.952643324356\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import cross_val_score\n", "\n", "# build logistic regression model\n", "lr = LogisticRegression()\n", "\n", "# evaluating accuracy for model built on full featureset\n", "full_feat_acc = np.average(cross_val_score(lr, bc_X, bc_y, scoring='accuracy', cv=5))\n", "# evaluating accuracy for model built on selected featureset\n", "sel_feat_acc = np.average(cross_val_score(lr, bc_SX, bc_y, scoring='accuracy', cv=5))\n", "\n", "print('Model accuracy statistics with 5-fold cross validation')\n", "print('Model accuracy with complete feature set', bc_X.shape, ':', full_feat_acc)\n", "print('Model accuracy with selected feature set', bc_SX.shape, ':', sel_feat_acc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Recursive Feature Elimination" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False),\n", " n_features_to_select=15, step=1, verbose=0)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_selection import RFE\n", "\n", "lr = LogisticRegression()\n", "rfe = RFE(estimator=lr, n_features_to_select=15, step=1)\n", "rfe.fit(bc_X, bc_y)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['mean radius' 'mean texture' 'mean perimeter' 'mean smoothness'\n", " 'mean concavity' 'mean concave points' 'mean symmetry' 'texture error'\n", " 'worst radius' 'worst texture' 'worst smoothness' 'worst concavity'\n", " 'worst concave points' 'worst symmetry' 'worst fractal dimension']\n" ] } ], "source": [ "select_features_rfe = rfe.get_support()\n", "feature_names_rfe = bc_data.feature_names[select_features_rfe]\n", "print(feature_names_rfe)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'mean concavity',\n", " 'mean perimeter',\n", " 'mean radius',\n", " 'mean texture',\n", " 'worst concave points',\n", " 'worst concavity',\n", " 'worst radius',\n", " 'worst texture'}" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set(feature_names_kbest) & set(feature_names_rfe)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model based selection" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", " min_impurity_split=1e-07, min_samples_leaf=1,\n", " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", " n_estimators=10, n_jobs=1, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rfc = RandomForestClassifier()\n", "rfc.fit(bc_X, bc_y)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('worst concave points', 0.22465186401289805),\n", " ('worst area', 0.22183657032316897),\n", " ('mean concave points', 0.18192574025833769),\n", " ('worst perimeter', 0.099521838900566054),\n", " ('worst radius', 0.084068507192381611),\n", " ('worst texture', 0.02243708745933972),\n", " ('mean perimeter', 0.020073882937172081),\n", " ('worst smoothness', 0.014608966775322443),\n", " ('mean radius', 0.01374196961657885),\n", " ('worst concavity', 0.011340255118074721)]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "importance_scores = rfc.feature_importances_\n", "feature_importances = [(feature, score) for feature, score in zip(bc_data.feature_names, importance_scores)]\n", "sorted(feature_importances, key=lambda x: -x[1])[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Feature extraction using dimensionality reduction" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(30, 3)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# center the feature set\n", "bc_XC = bc_X - bc_X.mean(axis=0)\n", "\n", "# decompose using SVD\n", "U, S, VT = np.linalg.svd(bc_XC)\n", "\n", "# get principal components\n", "PC = VT.T\n", "\n", "# get first 3 principal components\n", "PC3 = PC[:, 0:3]\n", "PC3.shape" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[-1160.14, -293.92, -48.58],\n", " [-1269.12, 15.63, 35.39],\n", " [ -995.79, 39.16, 1.71],\n", " ..., \n", " [ -314.5 , 47.55, 10.44],\n", " [-1124.86, 34.13, 19.74],\n", " [ 771.53, -88.64, -23.89]])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reduce feature set dimensionality \n", "np.round(bc_XC.dot(PC3), 2)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,\n", " svd_solver='auto', tol=0.0, whiten=False)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.decomposition import PCA\n", "pca = PCA(n_components=3)\n", "pca.fit(bc_X)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.98204467, 0.01617649, 0.00155751])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pca.explained_variance_ratio_" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1160.14, -293.92, 48.58],\n", " [ 1269.12, 15.63, -35.39],\n", " [ 995.79, 39.16, -1.71],\n", " ..., \n", " [ 314.5 , 47.55, -10.44],\n", " [ 1124.86, 34.13, -19.74],\n", " [ -771.53, -88.64, 23.89]])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bc_pca = pca.transform(bc_X)\n", "np.round(bc_pca, 2)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.92808003078106949" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.average(cross_val_score(lr, bc_pca, bc_y, scoring='accuracy', cv=5))" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }