{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.metrics import confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.metrics import classification_report\n",
    "%matplotlib inline\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5822, 86)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('../data/classification/Caravan.csv', ';')\n",
    "df = df.sample(frac = 1)\n",
    "df.shape\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[ df.columns[:-1]  ]\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(df['Purchase'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
    "    test_size=0.3,\n",
    "    shuffle = True,\n",
    "    random_state=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',\n",
       "  max_iter=-1, probability=True, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "C = 1\n",
    "clf = SVC(kernel='sigmoid', C = C, probability = True)\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.93589009730967376"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1635,  101],\n",
       "       [  11,    0]])"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "yhat = clf.predict(X_test)\n",
    "confusion_matrix(yhat, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-- ROC AUC Score test 0.4099\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.99      0.94      0.97      1736\n",
      "          1       0.00      0.00      0.00        11\n",
      "\n",
      "avg / total       0.99      0.94      0.96      1747\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# ROC AUC\n",
    "\n",
    "yhat_proba = clf.predict_proba(X_test)[:,-1]\n",
    "print(\"-- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba)  ))\n",
    "\n",
    "print(classification_report(yhat, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Oversampling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "No     5474\n",
       "Yes    1392\n",
       "Name: Purchase, dtype: int64"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_maj = df[df.Purchase == 'No']\n",
    "df_min = df[df.Purchase == 'Yes'].sample(frac = 4, replace = True )\n",
    "\n",
    "odf = pd.concat([df_maj, df_min])\n",
    "\n",
    "\n",
    "odf = odf.sample(frac = 1)\n",
    "odf.Purchase.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "X = odf[ odf.columns[:-1]  ]\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(odf['Purchase'])\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
    "    test_size=0.3,\n",
    "    shuffle = True,\n",
    "    random_state=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=6, gamma='auto', kernel='poly',\n",
       "  max_iter=-1, probability=True, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "C = 1\n",
    "clf = SVC(kernel='poly', C = C, probability = True, degree = 6)\n",
    "clf.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1465   35]\n",
      " [ 184  376]]\n",
      "-- ROC AUC Score test 0.9127\n"
     ]
    }
   ],
   "source": [
    "yhat = clf.predict(X_test)\n",
    "print(confusion_matrix(yhat, y_test))\n",
    "\n",
    "yhat_proba = clf.predict_proba(X_test)[:,-1]\n",
    "print(\"-- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba)  ))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Undersample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "No     1095\n",
       "Yes     348\n",
       "Name: Purchase, dtype: int64"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_maj = df[df.Purchase == 'No'].sample(frac = 0.2)\n",
    "df_min = df[df.Purchase == 'Yes']\n",
    "\n",
    "udf = pd.concat([df_maj, df_min])\n",
    "\n",
    "\n",
    "udf = udf.sample(frac = 1)\n",
    "udf.Purchase.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = udf[ udf.columns[:-1]  ]\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(udf['Purchase'])\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y,\n",
    "    test_size=0.3,\n",
    "    shuffle = True,\n",
    "    random_state=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=True, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "C = 1\n",
    "clf = SVC(kernel='rbf', C = C, probability = True)\n",
    "clf.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[318  85]\n",
      " [ 14  16]]\n",
      "-- ROC AUC Score test 0.6596\n"
     ]
    }
   ],
   "source": [
    "yhat = clf.predict(X_test)\n",
    "print(confusion_matrix(yhat, y_test))\n",
    "\n",
    "yhat_proba = clf.predict_proba(X_test)[:,-1]\n",
    "print(\"-- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba)  ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}