{
 "metadata": {
  "name": "",
  "signature": "sha256:90bebe71bb888c187d3c839b2bdc2395e00f81bf0cf2c8164b225fb4856d0acc"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "More on classification: SVM and Boosting"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import time\n",
      "\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "import pandas as pd\n",
      "\n",
      "import scipy as sp\n",
      "import scipy.sparse.linalg as linalg\n",
      "import scipy.cluster.hierarchy as hr\n",
      "from scipy.spatial.distance import pdist, squareform\n",
      "\n",
      "import sklearn.datasets as datasets\n",
      "import sklearn.metrics as metrics\n",
      "import sklearn.utils as utils\n",
      "import sklearn.linear_model as linear_model\n",
      "import sklearn.svm as svm\n",
      "import sklearn.cross_validation as cross_validation\n",
      "import sklearn.cluster as cluster\n",
      "from sklearn.ensemble import AdaBoostClassifier\n",
      "from sklearn.neighbors import KNeighborsClassifier\n",
      "from sklearn.decomposition import TruncatedSVD\n",
      "from sklearn.preprocessing import StandardScaler\n",
      "\n",
      "import statsmodels.api as sm\n",
      "\n",
      "from patsy import dmatrices\n",
      "\n",
      "import seaborn as sns\n",
      "%matplotlib inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Support Vector Machines (SVM)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Working with the wine dataset available here:\n",
      "\n",
      "https://archive.ics.uci.edu/ml/datasets/Wine"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "wine = pd.read_table(\"datasets/wine.data\", sep=',')\n",
      "\n",
      "attributes = ['region',\n",
      "'Alcohol',\n",
      "            'Malic acid',\n",
      "            'Ash',\n",
      "            'Alcalinity of ash',\n",
      "            'Magnesium',\n",
      "            'Total phenols',\n",
      "            'Flavanoids',\n",
      "            'Nonflavanoid phenols',\n",
      "            'Proanthocyanins',\n",
      "            'Color intensity',\n",
      "            'Hue',\n",
      "            'OD280/OD315 of diluted wines',\n",
      "            'Proline']\n",
      "\n",
      "\n",
      "\n",
      "wine.columns = attributes\n",
      "X = wine[['Alcohol', 'Proline']].values\n",
      "grape = wine.pop('region')\n",
      "y = grape.values\n",
      "\n",
      "print wine.info()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "wine.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
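     {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
       "A quick look at the class balance of the target (a minimal check, using the grape Series extracted above):"
      ]
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "# Number of wines from each region (the classification target)\n",
       "print grape.value_counts()"
      ],
      "language": "python",
      "metadata": {},
      "outputs": []
     },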
    {
     "cell_type": "heading",
     "level": 5,
     "metadata": {},
     "source": [
      "Using the SVM library of scikit-learn:\n",
      "\n",
      "http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "svc = svm.SVC(kernel='linear')\n",
      "svc.fit(X, y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Evaluating the fit of the classifier graphically"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from matplotlib.colors import ListedColormap\n",
      "# Create color maps for 3-class classification problem, as with iris\n",
      "cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])\n",
      "cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])\n",
      "\n",
      "def plot_estimator(estimator, X, y):\n",
      "    \n",
      "    try:\n",
      "        X, y = X.values, y.values\n",
      "    except AttributeError:\n",
      "        pass\n",
      "    \n",
      "    estimator.fit(X, y)\n",
      "    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1\n",
      "    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1\n",
      "    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),\n",
      "                         np.linspace(y_min, y_max, 100))\n",
      "    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])\n",
      "\n",
      "    # Put the result into a color plot\n",
      "    Z = Z.reshape(xx.shape)\n",
      "    plt.figure()\n",
      "    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)\n",
      "\n",
      "    # Plot also the training points\n",
      "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)\n",
      "    plt.axis('tight')\n",
      "    plt.axis('off')\n",
      "    plt.tight_layout()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plot_estimator(svc, X, y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "\n",
      "\n",
      "The SVM gets its name from the samples in the dataset from each class that lie closest to the other class. These training samples are called \"support vectors\" because changing their position in p-dimensional space would change the location of the decision boundary.\n",
      "\n",
      "In scikits-learn, the indices of the support vectors for each class can be found in the support_vectors_ attribute of the SVC object. Here is a 2 class problem using only classes 1 and 2 in the wine dataset.\n",
      "\n",
      "The support vectors are circled.\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Extract classes 1 and 2\n",
      "X, y = X[np.in1d(y, [1, 2])], y[np.in1d(y, [1, 2])]\n",
      "\n",
      "plot_estimator(svc, X, y)\n",
      "plt.scatter(svc.support_vectors_[:, 0], \n",
      "           svc.support_vectors_[:, 1], \n",
      "           s=120, \n",
      "           facecolors='none', \n",
      "           linewidths=2,\n",
      "           zorder=10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
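     {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
       "A quick numeric look at the same fit (a minimal sketch; it assumes svc was just refit on the two-class data by plot_estimator above): support_ holds the indices of the support vectors, n_support_ the count per class, and support_vectors_ the vectors themselves."
      ]
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "# Sketch: inspect the support-vector attributes of the fitted SVC.\n",
       "# Assumes svc was refit on the two-class (X, y) by plot_estimator above.\n",
       "print \"indices of support vectors:\", svc.support_\n",
       "print \"support vectors per class:\", svc.n_support_\n",
       "print \"shape of support_vectors_:\", svc.support_vectors_.shape"
      ],
      "language": "python",
      "metadata": {},
      "outputs": []
     },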
    {
     "cell_type": "heading",
     "level": 5,
     "metadata": {},
     "source": [
      "Regularization"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "\n",
      "These two classes do not appear to be lineary separable.\n",
      "\n",
      "\n",
      "For non-linearly separable classes we turn into regularization; regularization is tuned via the C parameter. In practice, a large C value means that the number of support vectors is small (less regularization), while a small C implies many support vectors (more regularization). scikit-learn sets a default value of C=1.\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "svc = svm.SVC(kernel='linear', C=1e6)\n",
      "plot_estimator(svc, X, y)\n",
      "plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1], s=80, \n",
      "            facecolors='none', linewidths=2, zorder=10)\n",
      "plt.title('High C values: small number of support vectors')\n",
      "\n",
      "svc = svm.SVC(kernel='linear', C=1e-2)\n",
      "plot_estimator(svc, X, y)\n",
      "plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1], s=80, \n",
      "            facecolors='none', linewidths=2, zorder=10)\n",
      "plt.title('Low C values: high number of support vectors')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
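     {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
       "To make the effect of C concrete, the small sweep below (a sketch; the grid of C values is arbitrary) counts the support vectors of a linear SVC fit on the same two-class data."
      ]
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "# Sketch: count support vectors for a few values of C (grid chosen arbitrarily).\n",
       "for C in [1e-2, 1, 1e2, 1e6]:\n",
       "    clf = svm.SVC(kernel='linear', C=C).fit(X, y)\n",
       "    print \"C = %g: %d support vectors\" % (C, len(clf.support_vectors_))"
      ],
      "language": "python",
      "metadata": {},
      "outputs": []
     },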
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Kernels"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "We can also choose from a suite of available kernels (linear, poly, rbf, sigmoid, precomputed) or a custom kernel can be passed as a function. Note that the radial basis function (rbf) kernel is just a Gaussian kernel, but with parameter $\\gamma = \\frac{1}{\\sigma^2}$."
     ]
    },
    {
     "cell_type": "heading",
     "level": 5,
     "metadata": {},
     "source": [
      "Linear Kernel"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "svc_lin = svm.SVC(kernel='linear')\n",
      "plot_estimator(svc_lin, X, y)\n",
      "plt.scatter(svc_lin.support_vectors_[:, 0], svc_lin.support_vectors_[:, 1], \n",
      "            s=80, facecolors='none', linewidths=2, zorder=10)\n",
      "plt.title('Linear kernel')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 5,
     "metadata": {},
     "source": [
      "Polynomial Kernel"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#svc_poly = svm.SVC(kernel='poly', degree=3)\n",
      "#plot_estimator(svc_poly, X, y)\n",
      "#plt.scatter(svc_poly.support_vectors_[:, 0], svc_poly.support_vectors_[:, 1], \n",
      "#           s=80, facecolors='none', linewidths=2, zorder=10)\n",
      "#plt.title('Polynomial kernel')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 5,
     "metadata": {},
     "source": [
      "RBF kernel"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#svc_rbf = svm.SVC(kernel='rbf', gamma=1e-2)\n",
      "#plot_estimator(svc_rbf, X, y)\n",
      "#plt.scatter(svc_rbf.support_vectors_[:, 0], svc_rbf.support_vectors_[:, 1], \n",
      "#           s=80, facecolors='none', linewidths=2, zorder=10)\n",
      "#plt.title('RBF kernel')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
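     {
      "cell_type": "heading",
      "level": 5,
      "metadata": {},
      "source": [
       "Custom kernel"
      ]
     },
     {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
       "As noted above, SVC also accepts a callable as the kernel. The callable receives two data matrices and must return the Gram matrix between them. Below is a minimal sketch (the name my_linear_kernel is made up for illustration) that simply reproduces the linear kernel."
      ]
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "# Sketch of a custom kernel: a callable returning the Gram matrix between\n",
       "# two data matrices. This one just reproduces the linear kernel.\n",
       "def my_linear_kernel(A, B):\n",
       "    return np.dot(A, B.T)\n",
       "\n",
       "svc_custom = svm.SVC(kernel=my_linear_kernel)\n",
       "svc_custom.fit(X, y)\n",
       "print svc_custom.score(X, y)"
      ],
      "language": "python",
      "metadata": {},
      "outputs": []
     },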
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train, X_test, y_train, y_test = cross_validation.train_test_split(\n",
      "        wine.values, grape.values, test_size=0.4, random_state=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "f = svm.SVC(kernel='linear', C=1)\n",
      "f.fit(X_train, y_train)\n",
      "f.score(X_test, y_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
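     {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
       "Beyond the single accuracy number, a confusion matrix (a small sketch using the sklearn.metrics module imported above) shows how the held-out wines are distributed across the three regions."
      ]
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "# Sketch: confusion matrix on the held-out set\n",
       "# (rows = true region, columns = predicted region).\n",
       "y_pred = f.predict(X_test)\n",
       "print metrics.confusion_matrix(y_test, y_pred)"
      ],
      "language": "python",
      "metadata": {},
      "outputs": []
     },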
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#mean accuracy\n",
      "scores = cross_validation.cross_val_score(f, wine.values, grape.values, cv=5)\n",
      "print scores"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#precision\n",
      "cross_validation.cross_val_score(f, wine.values, grape.values, cv=5, scoring='precision')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "Analyzing the iris dataset"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "iris = datasets.load_iris()\n",
      "X = iris.data[:, :2]  # we only take the first two features. We could\n",
      "                      # avoid this ugly slicing by using a two-dim dataset\n",
      "y = iris.target\n",
      "\n",
      "h = .02  # step size in the mesh\n",
      "\n",
      "# we create an instance of SVM and fit out data. We do not scale our\n",
      "# data since we want to plot the support vectors\n",
      "C = 1.0  # SVM regularization parameter\n",
      "svc = svm.SVC(kernel='linear', C=C).fit(X, y)\n",
      "rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)\n",
      "poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)\n",
      "lin_svc = svm.LinearSVC(C=C).fit(X, y)\n",
      "\n",
      "# create a mesh to plot in\n",
      "x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n",
      "y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n",
      "xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n",
      "                     np.arange(y_min, y_max, h))\n",
      "\n",
      "# title for the plots\n",
      "titles = ['SVC with linear kernel',\n",
      "          'LinearSVC (linear kernel)',\n",
      "          'SVC with RBF kernel', 'SVC with poly kernel']\n",
      "\n",
      "fig = plt.figure(figsize=(10,10))\n",
      "\n",
      "for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):\n",
      "    # Plot the decision boundary. For that, we will assign a color to each\n",
      "    # point in the mesh [x_min, m_max]x[y_min, y_max].\n",
      "    plt.subplot(2, 2, i + 1)\n",
      "    plt.subplots_adjust(wspace=0.4, hspace=0.4)\n",
      "\n",
      "    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
      "\n",
      "    # Put the result into a color plot\n",
      "    Z = Z.reshape(xx.shape)\n",
      "    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)\n",
      "\n",
      "    # Plot also the training points\n",
      "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)\n",
      "    plt.xlabel('Sepal length')\n",
      "    plt.ylabel('Sepal width')\n",
      "    plt.xlim(xx.min(), xx.max())\n",
      "    plt.ylim(yy.min(), yy.max())\n",
      "    plt.xticks(())\n",
      "    plt.yticks(())\n",
      "    plt.title(titles[i])\n",
      "\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Boosting"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "adaboost = AdaBoostClassifier(svm.SVC(kernel='linear'),\n",
      "                         algorithm=\"SAMME\",\n",
      "                         n_estimators=200)\n",
      "#adaboost.fit(wine.values,grape.values)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "scores = cross_validation.cross_val_score(adaboost, wine.values, grape.values, cv=5)\n",
      "print scores\n",
      "print scores.mean()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
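     {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
       "For comparison, a sketch of AdaBoost with its default base estimator (shallow decision trees), scored with the same 5-fold cross-validation; the n_estimators value is arbitrary."
      ]
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
       "# Sketch: AdaBoost with its default decision-tree base estimator,\n",
       "# evaluated with the same 5-fold cross-validation.\n",
       "adaboost_trees = AdaBoostClassifier(n_estimators=200)\n",
       "scores = cross_validation.cross_val_score(adaboost_trees, wine.values, grape.values, cv=5)\n",
       "print scores\n",
       "print scores.mean()"
      ],
      "language": "python",
      "metadata": {},
      "outputs": []
     },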
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Code for setting the style of the notebook\n",
      "from IPython.core.display import HTML\n",
      "def css_styling():\n",
      "    styles = open(\"custom.css\", \"r\").read()\n",
      "    return HTML(styles)\n",
      "css_styling()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<link href='http://fonts.googleapis.com/css?family=EB+Garamond' rel='stylesheet' type='text/css'>\r\n",
        "<link href='http://fonts.googleapis.com/css?family=Alegreya+Sans:100,300,400,500,700,800,900,100italic,300italic,400italic,500italic,700italic,800italic,900italic' rel='stylesheet' type='text/css'>\r\n",
        "<link href='http://fonts.googleapis.com/css?family=Source+Code+Pro:300,400' rel='stylesheet' type='text/css'>\r\n",
        "<style>\r\n",
        "    @font-face {\r\n",
        "        font-family: \"Computer Modern\";\r\n",
        "        src: url('http://mirrors.ctan.org/fonts/cm-unicode/fonts/otf/cmunss.otf');\r\n",
        "    }\r\n",
        "    .code_cell {\r\n",
        "        width: 105ex !important ;\r\n",
        "        margin-bottom: 15px !important;\r\n",
        "    }\r\n",
        "    div.cell {\r\n",
        "        margin-left: auto;\r\n",
        "        margin-right: auto;\r\n",
        "        width: 70%;\r\n",
        "    }    \r\n",
        "    div.cell.selected {\r\n",
        "        border: thin rgba(171, 171, 171, 0.5) dashed;\r\n",
        "    }\r\n",
        "    h1 {\r\n",
        "        font-family: 'Alegreya Sans', sans-serif;\r\n",
        "    }\r\n",
        "    h2 {\r\n",
        "        font-family: 'EB Garamond', serif;\r\n",
        "    }\r\n",
        "    h3 {\r\n",
        "        font-family: 'EB Garamond', serif;\r\n",
        "        margin-top:12px;\r\n",
        "        margin-bottom: 3px;\r\n",
        "    }\r\n",
        "    h4 {\r\n",
        "        font-family: 'EB Garamond', serif;\r\n",
        "    }\r\n",
        "    h5 {\r\n",
        "        font-family: 'Alegreya Sans', sans-serif;\r\n",
        "    }\r\n",
        "    div.text_cell_render {\r\n",
        "        font-family: 'EB Garamond',Computer Modern, \"Helvetica Neue\", Arial, Helvetica, Geneva, sans-serif;\r\n",
        "        line-height: 145%;\r\n",
        "        font-size: 140%;\r\n",
        "    }\r\n",
        "    div.input_area {\r\n",
        "        border-color: rgba(0,0,0,0.10) !important;\r\n",
        "        background: #fafafa;\r\n",
        "    }\r\n",
        "    .CodeMirror {\r\n",
        "            font-family: \"Source Code Pro\";\r\n",
        "            font-size: 90%;\r\n",
        "    }\r\n",
        "    .prompt {\r\n",
        "        display: None;\r\n",
        "    }\r\n",
        "    .output {\r\n",
        "        padding-left: 50px;\r\n",
        "        padding-top: 5px;\r\n",
        "    }\r\n",
        "    .output_wrapper {\r\n",
        "        padding-left: 5px;\r\n",
        "        padding-top: inherit;\r\n",
        "    }\r\n",
        "    div.output_scroll {\r\n",
        "        width: inherit;\r\n",
        "    }\r\n",
        "    .inner_cell {\r\n",
        "        padding-left: 5px;\r\n",
        "    }\r\n",
        "    .text_cell_render h1 {\r\n",
        "        font-weight: 200;\r\n",
        "        font-size: 50pt;\r\n",
        "        line-height: 100%;\r\n",
        "        color:#CD2305;\r\n",
        "        margin-bottom: 0.5em;\r\n",
        "        margin-top: 0.5em;\r\n",
        "        display: block;\r\n",
        "    }\r\n",
        "    .text_cell_render h5 {\r\n",
        "        font-weight: 300;\r\n",
        "        font-size: 16pt;\r\n",
        "        color: #CD2305;\r\n",
        "        font-style: italic;\r\n",
        "        margin-bottom: .5em;\r\n",
        "        margin-top: 0.5em;\r\n",
        "        display: block;\r\n",
        "    }\r\n",
        "    .warning {\r\n",
        "        color: rgb( 240, 20, 20 )\r\n",
        "        }  \r\n",
        "</style>\r\n",
        "<script>\r\n",
        "    MathJax.Hub.Config({\r\n",
        "                        TeX: {\r\n",
        "                           extensions: [\"AMSmath.js\"]\r\n",
        "                           },\r\n",
        "                tex2jax: {\r\n",
        "                    inlineMath: [ ['$','$'], [\"\\\\(\",\"\\\\)\"] ],\r\n",
        "                    displayMath: [ ['$$','$$'], [\"\\\\[\",\"\\\\]\"] ]\r\n",
        "                },\r\n",
        "                displayAlign: 'center', // Change this to 'center' to center equations.\r\n",
        "                \"HTML-CSS\": {\r\n",
        "                    styles: {'.MathJax_Display': {\"margin\": 4}}\r\n",
        "                }\r\n",
        "        });\r\n",
        "</script>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 27,
       "text": [
        "<IPython.core.display.HTML at 0x10b348510>"
       ]
      }
     ],
     "prompt_number": 27
    }
   ],
   "metadata": {}
  }
 ]
}