{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Choosing the right complexity for a model\n",
      "==============================================="
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "\n",
      "from sklearn.datasets import load_iris\n",
      "from sklearn.svm import SVC\n",
      "try:\n",
      "    from sklearn.model_selection import train_test_split\n",
      "except ImportError:\n",
      "    # older scikit-learn releases only ship the legacy module\n",
      "    from sklearn.cross_validation import train_test_split\n",
      "\n",
      "\n",
      "iris = load_iris()\n",
      "X = iris.data\n",
      "y = iris.target\n",
      "\n",
      "\n",
      "# dataset for decision function visualization:\n",
      "# keep the first two features and only the samples with label > 0,\n",
      "# shifting the remaining labels down by one so they become 0 / 1\n",
      "X_2d = X[:, :2]\n",
      "X_2d = X_2d[y > 0]\n",
      "y_2d = y[y > 0] - 1\n",
      "\n",
      "# fixed seed so that re-running the notebook reproduces the same split\n",
      "X_train, X_test, y_train, y_test = train_test_split(\n",
      "    X_2d, y_2d, random_state=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def show_decision_function(clf, ax):\n",
      "    \"\"\"Draw the decision surface of a fitted binary classifier on ``ax``.\n",
      "\n",
      "    The classifier is evaluated on a 200 x 200 grid covering the plotted\n",
      "    feature range; the grid is coloured by the score of the second class\n",
      "    and the training points (``X_train`` / ``y_train`` from the notebook\n",
      "    namespace) are drawn on top.\n",
      "\n",
      "    Parameters\n",
      "    ----------\n",
      "    clf : fitted classifier exposing ``decision_function`` or\n",
      "        ``predict_proba``\n",
      "    ax : matplotlib axes to draw into\n",
      "    \"\"\"\n",
      "    xx, yy = np.meshgrid(np.linspace(4.5, 8, 200), np.linspace(1.5, 4.0, 200))\n",
      "    grid = np.c_[xx.ravel(), yy.ravel()]\n",
      "    try:\n",
      "        Z = clf.decision_function(grid)\n",
      "    except AttributeError:\n",
      "        # No decision_function available: use the probability of the second\n",
      "        # class (column 1) so that large values point to the same class as a\n",
      "        # positive decision_function value would.\n",
      "        Z = clf.predict_proba(grid)[:, 1]\n",
      "\n",
      "    Z = Z.reshape(xx.shape)\n",
      "    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.jet)\n",
      "    ax.set_xlim(4.5, 8)\n",
      "    ax.set_ylim(1.5, 4.0)\n",
      "    ax.set_xticks(())\n",
      "    ax.set_yticks(())\n",
      "    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "training_scores = []\n",
      "test_scores = []\n",
      "fig, axes = plt.subplots(2, 3, figsize=(20, 10))\n",
      "Cs = [0.01, 0.1, 1, 10, 100, 1000]\n",
      "\n",
      "# Fit one SVC per value of C, record its accuracy on the training and\n",
      "# test split, and draw its decision surface in its own panel.\n",
      "for C, ax in zip(Cs, axes.ravel()):\n",
      "    clf = SVC(gamma=10, C=C)\n",
      "    clf.fit(X_train, y_train)\n",
      "    training_scores.append(clf.score(X_train, y_train))\n",
      "    test_scores.append(clf.score(X_test, y_test))\n",
      "    show_decision_function(clf, ax)\n",
      "    ax.set_title(\"C = %g\" % C)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.figure(figsize=(20, 10))\n",
      "plt.plot(training_scores, label=\"training scores\")\n",
      "plt.plot(test_scores, label=\"test scores\")\n",
      "plt.legend(loc=\"best\")\n",
      "# one tick per fitted model, labelled with the C value it was trained with\n",
      "plt.xticks(range(len(Cs)), Cs)\n",
      "plt.xlabel(\"C\")\n",
      "plt.ylabel(\"accuracy\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Tasks\n",
      "======\n",
      "1. Play with the ``n_neighbors`` parameter of ``KNeighborsClassifier`` on the digits dataset. Compare training set and test set performance to see how it is related to complexity."
     ]
    }
   ],
   "metadata": {}
  }
 ]
}