{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Caravan insurance: SVC on an imbalanced target\n",
    "\n",
    "Compares three `SVC` models for predicting `Purchase`:\n",
    "\n",
    "- a baseline fitted on the raw training split,\n",
    "- a model fitted on an oversampled training split,\n",
    "- a model fitted on an undersampled training split.\n",
    "\n",
    "The data is split into train and test **once, before any resampling**. Only the training rows are resampled, and all three models are scored on the same untouched test set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.metrics import classification_report\n",
    "%matplotlib inline\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings a reader might want to tune, collected in one place.\n",
    "DATA_PATH = '../data/classification/Caravan.csv'\n",
    "TARGET = 'Purchase'\n",
    "TEST_SIZE = 0.3\n",
    "# One seed for every shuffle, split, resample and SVC fit,\n",
    "# so that Restart & Run All reproduces the same numbers.\n",
    "SEED = 2\n",
    "C = 1\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The file is semicolon-separated. `sep` is passed by keyword because\n",
    "# recent pandas versions no longer accept it positionally.\n",
    "df = pd.read_csv(DATA_PATH, sep=';')\n",
    "df = df.sample(frac=1, random_state=SEED)\n",
    "df.shape\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / test split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "le = LabelEncoder()\n",
    "le.fit(df[TARGET])\n",
    "\n",
    "# Split the rows before any resampling: the test set keeps the original\n",
    "# class balance and cannot contain copies of rows used for training.\n",
    "# Stratifying keeps the rare positive class present in both parts.\n",
    "train_df, test_df = train_test_split(df,\n",
    "                                     test_size=TEST_SIZE,\n",
    "                                     shuffle=True,\n",
    "                                     random_state=SEED,\n",
    "                                     stratify=df[TARGET])\n",
    "train_df[TARGET].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_and_labels(frame):\n",
    "    \"\"\"Split a frame into its feature columns and its encoded target.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    frame : pandas.DataFrame\n",
    "        Rows holding the feature columns plus the TARGET column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple\n",
    "        (X, y) where X is `frame` without the TARGET column and y is the\n",
    "        TARGET column encoded with the already fitted `le`.\n",
    "    \"\"\"\n",
    "    return frame.drop(TARGET, axis=1), le.transform(frame[TARGET])\n",
    "\n",
    "\n",
    "def report(clf, X_eval, y_eval):\n",
    "    \"\"\"Print confusion matrix, ROC AUC and classification report for a fitted classifier.\n",
    "\n",
    "    The scikit-learn metrics take the true labels first and the predictions\n",
    "    second, so the rows of the printed confusion matrix are the true classes.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    clf : fitted classifier exposing `predict` and `predict_proba`\n",
    "    X_eval : features to score\n",
    "    y_eval : true encoded labels for `X_eval`\n",
    "    \"\"\"\n",
    "    yhat = clf.predict(X_eval)\n",
    "    # Column 1 is the estimated probability of the class encoded as 1.\n",
    "    yhat_proba = clf.predict_proba(X_eval)[:, 1]\n",
    "    print(confusion_matrix(y_eval, yhat))\n",
    "    print('-- ROC AUC Score test {:.4f}'.format(roc_auc_score(y_eval, yhat_proba)))\n",
    "    print(classification_report(y_eval, yhat))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, y_train = features_and_labels(train_df)\n",
    "X_test, y_test = features_and_labels(test_df)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Baseline (no resampling)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random_state makes the internal probability calibration repeatable.\n",
    "clf = SVC(kernel='sigmoid', C=C, probability=True, random_state=SEED)\n",
    "clf.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plain accuracy; on a heavily imbalanced target this is dominated by the majority class.\n",
    "clf.score(X_test, y_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "report(clf, X_test, y_test)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Oversampling\n",
    "\n",
    "The minority class is drawn with replacement from the **training rows only**. Resampling before the split would put copies of the same row in both train and test and inflate the test scores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_maj = train_df[train_df[TARGET] == 'No']\n",
    "# frac=4 with replacement draws four times as many minority rows as the training split holds.\n",
    "train_min_over = train_df[train_df[TARGET] == 'Yes'].sample(frac=4, replace=True, random_state=SEED)\n",
    "\n",
    "odf = pd.concat([train_maj, train_min_over]).sample(frac=1, random_state=SEED)\n",
    "odf[TARGET].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_over, y_train_over = features_and_labels(odf)\n",
    "clf_over = SVC(kernel='poly', C=C, probability=True, degree=6, random_state=SEED)\n",
    "clf_over.fit(X_train_over, y_train_over)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "report(clf_over, X_test, y_test)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Undersample\n",
    "\n",
    "Only the majority class of the **training rows** is subsampled; the test set keeps the original class balance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_maj_under = train_df[train_df[TARGET] == 'No'].sample(frac=0.2, random_state=SEED)\n",
    "train_min = train_df[train_df[TARGET] == 'Yes']\n",
    "\n",
    "udf = pd.concat([train_maj_under, train_min]).sample(frac=1, random_state=SEED)\n",
    "udf[TARGET].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_under, y_train_under = features_and_labels(udf)\n",
    "clf_under = SVC(kernel='rbf', C=C, probability=True, random_state=SEED)\n",
    "clf_under.fit(X_train_under, y_train_under)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "report(clf_under, X_test, y_test)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}