{ "metadata": { "name": "01B_sklearn_overview" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "An Overview of Scikit-learn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*Adapted from* [*http://scikit-learn.org/stable/tutorial/basic/tutorial.html*](http://scikit-learn.org/stable/tutorial/basic/tutorial.html)" ] }, { "cell_type": "code", "collapsed": false, "input": [ "%pylab inline" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline].\n", "For more information, type 'help(pylab)'.\n" ] } ], "prompt_number": 0 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Loading an Example Dataset" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn import datasets\n", "digits = datasets.load_digits()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.data" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 2, "text": [ "array([[ 0., 0., 5., ..., 0., 0., 0.],\n", " [ 0., 0., 0., ..., 10., 0., 0.],\n", " [ 0., 0., 0., ..., 16., 9., 0.],\n", " ..., \n", " [ 0., 0., 1., ..., 6., 0., 0.],\n", " [ 0., 0., 2., ..., 12., 0., 0.],\n", " [ 0., 0., 10., ..., 12., 1., 0.]])" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.target" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 3, "text": [ "array([0, 1, 2, ..., 8, 9, 8])" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "digits.images[0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 4, "text": [ "array([[ 0., 0., 5., 
13., 9., 1., 0., 0.],\n", " [ 0., 0., 13., 15., 10., 15., 5., 0.],\n", " [ 0., 3., 15., 2., 0., 11., 8., 0.],\n", " [ 0., 4., 12., 0., 0., 8., 8., 0.],\n", " [ 0., 5., 8., 0., 0., 9., 8., 0.],\n", " [ 0., 4., 11., 0., 1., 12., 7., 0.],\n", " [ 0., 2., 14., 5., 10., 12., 0., 0.],\n", " [ 0., 0., 6., 13., 10., 0., 0., 0.]])" ] } ], "prompt_number": 4 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Learning and Predicting" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn import svm\n", "clf = svm.SVC(gamma=0.001, C=100.)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stderr", "text": [ "/usr/local/lib/python2.7/site-packages/scikits/__init__.py:1: UserWarning: Module argparse was already imported from /usr/local/Cellar/python/2.7.5/Frameworks/Python.framework/Versions/2.7/lib/python2.7/argparse.pyc, but /usr/local/lib/python2.7/site-packages is being added to sys.path\n", " __import__('pkg_resources').declare_namespace(__name__)\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# Train on every sample except the last one, which is held out for the prediction below.\n", "clf.fit(digits.data[:-1], digits.target[:-1])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 6, "text": [ "SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,\n", " gamma=0.001, kernel='rbf', max_iter=-1, probability=False,\n", " shrinking=True, tol=0.001, verbose=False)" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "# predict() expects a 2-D (n_samples, n_features) array, so slice out the last row\n", "# instead of indexing it; a bare 1-D row is rejected by current scikit-learn.\n", "clf.predict(digits.data[-1:])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 7, "text": [ "array([8])" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "plt.figure(figsize=(2, 2))\n", "plt.imshow(digits.images[-1], interpolation='nearest', cmap=plt.cm.binary)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 8, "text": [ 
"" ] }, { "output_type": "display_data", "png": "iVBORw0KGgoAAAANSUhEUgAAAIcAAACKCAYAAACAeVnUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC55JREFUeJzt3V9sU+UbB/Bv52bImA40c13WxeL4s3UbbWXYhKyBqkgm\nYBzMuIH8G3pjTJw3RrwRvSBDY3SIV0YFNRkmXogSNnSByhgxk7SNFxgwsYtlEAPGAe2cXdvHC7P+\n4Le9Pee87Tk9Zc8n4WL0POc8dA/nz/uc9xwLEREYm0VRvhNg5sXFwYS4OJgQFwcT4uJgQlwcTKg4\n2xVYLJZc5MHySDSakXVxZFr53r17sXfvXs3ryxS3c+dOYVwoFILL5Zr1M7/fL4wbHx/HggULNG/P\n7/djzZo1ws+7u7tn/fuenh689tprwjhRLnp8n5n+c/NhhQkpFsfAwADq6uqwZMkS7N+/34icmElk\nLI5kMomXXnoJAwMDOH/+PPr6+vDLL7+oXnmmXa4ecVarVSpu3rx5UnF2u10qrqWlRSrO6O8zY3GM\njIxg8eLFsNvtKCkpQUdHB44ePap7UlwcszNVcYyNjaGmpib9s81mw9jYmNSGWOHJeLWi9jL11jPh\nNWvWSFcq05/f78945XarjMVRXV2NSCSS/jkSicBms81YTubyiuXH///nffPNN4XLZjysNDc349df\nf8Xo6Cji8Ti+/PJLPPXUUzlLlJlbxj1HcXExDh48iHXr1iGZTGL37t2or683KjeWZ4ojpK2trWht\nbTUiF2YyPELKhHLSW5ExOjoqFXf48GGpuAcffFAqTnYs407Aew4mxMXBhLg4mJBicXR1daGyshJN\nTU1G5MNMRLE4du3ahYGBASNyYSajWBxerxcLFy40IhdmMnzOwYRyMs7BXdnCkbOurFrclS0cOevK\nsrlNsTg6OzuxatUqXLx4ETU1Nfj000+NyIuZgOJhpa+vz4g8mAnxYYUJ5a0rK9vtLC8vl4obHx+X\nipPtHgPy/0bZXHON9xxMiIuDCXFxMCHF4ohEIvD5fGhoaEBjYyMOHDhgRF7MBBRPSEtKSvDee+/B\n5XIhGo1ixYoVWLt2Ld+FPgco7jmsVmv6mRdlZWWor6/H5cuXdU+M5Z+mc47R0VEEg0F4PB698mEm\nonqcIxqNor29Hb29vSgrK7vtM+7KFg4tXVmLmsdbT01NYcOGDWhtbZ3xKCOLxSJ87JMeRI9E0ovo\n0U1qvP/++1JxRg6CZfr9KR5WiAi7d++Gw+HI6otihUexOIaHh/HFF1/g1KlTcLvdcLvdfE/pHKF4\nztHS0oJUKmVELsxkeISUCak6Ic24AoNPSL/++mupuLa2thxnomzHjh1ScYcOHcptIhlkdULK5i4u\nDibExcGEFItjcnISHo8HLpcLDocDe/bsMSIvZgKKl7Lz5s3DqVOnUFpaikQigZaWFpw5c0b6Qaus\ncKg6rJSWlgIA4vE4kskk7rvvPl2TYuagqjhSqRRcLhcqKyvh8/ngcDj0zouZgKqubFFREUKhEK5f\nv45169bNeM8Id2ULh25zZcvLy7F+/XqcO3dOWBzM3HI6V/batWvpFvLff/+N77//Hm63O/ssmekp\n7jmuXLmCHTt2IJVKIZVKYdu2bXjssceMyI3lmWJxNDU1IRAIGJELMxkeIWVCXBxMKG8TqWXJ3pcp\nOwE7G9lMwjYD3nMwIS4OJqSqOJLJJNxuNzZu3Kh3PsxEVBVHb28vHA4Hv7d+jlEsjkuXLuH48eN4\n/vnnDb1XlOWfYnG88soreOedd1BUxKcnc03GS9ljx47hgQcegNvtztjJ465s4cjZXNnXX38dn3/+\nOYqLizE5OYkbN25g8+bN+Oyzz/63AoOnJsgWXigUym0iKkw/ukIrtb+8XJCemrBv3z5EIhGEw2Ec\nO
XIEjz766G2Fwe5smk4k+GplblE9fL569WqsXr1az1yYyfAlCBPi4mBCeevKyp6R//DDD1Jxsm97\nyOalwz6fTypOdiL1zp07peJEeM/BhLg4mJCqw4rdbse9996Lu+66CyUlJRgZGdE7L2YCqorDYrHA\n7/fzNMg5RvVhhTuyc4+q4rBYLHj88cfR3NyMjz76SO+cmEmoOqwMDw+jqqoKV69exdq1a1FXVwev\n15v+nLuyhSPnc2WrqqoAABUVFWhra8PIyIiwOJi55XSu7MTEBG7evAkAiMVi+O6779DU1JR9lsz0\nFPccf/zxR/oxjYlEAlu3bsUTTzyhe2Is/xSLY9GiRXm5UYblH4+QMiEuDiZUcF1Zo7eXTVdWllnm\n2PKegwlxcTAhxeIYHx9He3s76uvr4XA48OOPPxqRFzMBxXOOl19+GU8++SS++uorJBIJxGIxI/Ji\nJpCxOK5fv46hoSEcPnz4v4WLi/PyEBSWHxkPK+FwGBUVFdi1axcefvhhvPDCC5iYmDAqN5ZnGfcc\niUQCgUAABw8exMqVK9Hd3Y2enh689dZbty3HXdnCkbOurM1mg81mw8qVKwEA7e3t6OnpmbEcd2UL\nR866slarFTU1Nbh48SIAYHBwEA0NDbnJkpme4tXKBx98gK1btyIej6O2tlZ6/gcrPIrF4XQ68dNP\nPxmRCzMZHiFlQlwcTChvLx2efk2HVrJPMJbtymbTIZXt6Mq+WHnBggWaY/ilw0wKFwcTUiyOCxcu\nwO12p/+Ul5fjwIEDRuTG8kzxUnbZsmUIBoMA/ntLZHV1dfpudHZn03RYGRwcRG1tLWpqavTKh5mI\npuI4cuQItmzZolcuzGRU32Acj8fx7bffYv/+/TM+465s4dDlvbL9/f1YsWIFKioqZnzGXdnCkdO5\nstP6+vrQ2dmZVWKssKgqjlgshsHBQWzatEnTymVHJc+cOSMVJzuaKTtaOzk5aej2ZL8X2d+DquKY\nP38+rl27hnvuuUfTyrk4crs9UxYHm5u4OJhQTrqyrLCJSiDridT8lME7Fx9WmBAXBxPSrTgGBgZQ\nV1eHJUuWzDrkPpuuri5UVlZqfiBdJBKBz+dDQ0MDGhsbVd9SMDk5CY/HA5fLBYfDgT179mjarszL\nmO12O5YvXw63241HHnlEdZzMhPasb7cgHSQSCaqtraVwOEzxeJycTiedP39eMe706dMUCASosbFR\n0/auXLlCwWCQiIhu3rxJS5cuVbU9IqJYLEZERFNTU+TxeGhoaEj1dt99913asmULbdy4UXWM3W6n\nP//8U/Xy07Zv304ff/xxOtfx8XFN8clkkqxWK/3++++qY3TZc4yMjGDx4sWw2+0oKSlBR0cHjh49\nqhjn9XqxcOFCzduzWq3pNzGWlZWhvr4ely9fVhVbWloK4L/GYjKZVP1892xexqx1+ekJ7V1dXQDk\nJrTL3G6hS3GMjY3dloTNZsPY2Jgem5phdHQUwWAQHo9H1fKpVAoulwuVlZXw+XxwOByq4mRfxizz\nqPBcTGiXud1Cl+LI19hHNBpFe3s7ent7UVZWpiqmqKgIoVAIly5dwunTp1UNNd/6Mmate4Hh4WEE\ng0H09/fjww8/xNDQkGLM9IT2F198EYFAAPPnz591zrLI9O0WzzzzjKZcdSmO6upqRCKR9M+RSAQ2\nm02PTaVNTU1h8+bNeO655/D0009rji8vL8f69etx7tw5xWXPnj2Lb775BosWLUJnZydOnjyJ7du3\nq9rObI8KVzLbhPZAIKBqe0Dm2y0y0nRWo9LU1BQ99NBDFA6H6Z9//lF9QkpEFA6HNZ+QplIp2rZt\nG3V3d2uKu3r1Kv31119ERDQxMUFer5cGBwc1rcPv99OGDRtULRuLxejGjRtERBSNRmnVqlV04sQJ\nVbFer5cuXLhARERvvPEGvfrqq6pzfPbZZ+nQoUOql5+mS3EQER0
/fpyWLl1KtbW1tG/fPlUxHR0d\nVFVVRXfffTfZbDb65JNPVMUNDQ2RxWIhp9NJLpeLXC4X9ff3K8b9/PPP5Ha7yel0UlNTE7399tuq\ntncrv9+v+mrlt99+I6fTSU6nkxoaGlR/L0REoVCImpubafny5dTW1qb6aiUajdL999+fLkotsu6t\nsDsXj5AyIS4OJsTFwYS4OJgQFwcT4uJgQv8CdYFJ8R3MeG0AAAAASUVORK5CYII=\n" } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "print(digits.target[-1])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "8\n" ] } ], "prompt_number": 9 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Model Persistence" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn import svm\n", "from sklearn import datasets\n", "clf = svm.SVC()\n", "iris = datasets.load_iris()\n", "X, y = iris.data, iris.target\n", "clf.fit(X, y)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 10, "text": [ "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,\n", " kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001,\n", " verbose=False)" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "import pickle\n", "s = pickle.dumps(clf)\n", "clf2 = pickle.loads(s)\n", "# Slice (X[0:1]) rather than index (X[0]) so the sample stays 2-D, as predict() requires.\n", "clf2.predict(X[0:1])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 11, "text": [ "array([0])" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "y[0]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 12, "text": [ "0" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the standalone\n", "# package and fall back to the bundled copy on older installations.\n", "try:\n", "    import joblib\n", "except ImportError:\n", "    from sklearn.externals import joblib\n", "joblib.dump(clf, 'filename.pkl')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "pyout", "prompt_number": 13, "text": [ "['filename.pkl',\n", " 'filename.pkl_01.npy',\n", " 'filename.pkl_02.npy',\n", " 'filename.pkl_03.npy',\n", 
" 'filename.pkl_04.npy',\n", " 'filename.pkl_05.npy',\n", " 'filename.pkl_06.npy',\n", " 'filename.pkl_07.npy',\n", " 'filename.pkl_08.npy',\n", " 'filename.pkl_09.npy',\n", " 'filename.pkl_10.npy',\n", " 'filename.pkl_11.npy']" ] } ], "prompt_number": 13 } ], "metadata": {} } ] }