{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Advanced scikit-learn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Agenda\n", "\n", "- StandardScaler\n", "- Pipeline (bonus content)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## StandardScaler\n", "\n", "### What is the problem we're trying to solve?" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# fake data\n", "import pandas as pd\n", "train = pd.DataFrame({'id':[0,1,2], 'length':[0.9,0.3,0.6], 'mass':[0.1,0.2,0.8], 'rings':[40,50,60]})\n", "test = pd.DataFrame({'length':[0.59], 'mass':[0.79], 'rings':[54]})" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlengthmassrings
000.90.140
110.30.250
220.60.860
\n", "
" ], "text/plain": [ " id length mass rings\n", "0 0 0.9 0.1 40\n", "1 1 0.3 0.2 50\n", "2 2 0.6 0.8 60" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# training data\n", "train" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lengthmassrings
00.590.7954
\n", "
" ], "text/plain": [ " length mass rings\n", "0 0.59 0.79 54" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# testing data\n", "test" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# define X and y\n", "feature_cols = ['length', 'mass', 'rings']\n", "X = train[feature_cols]\n", "y = train.id" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_neighbors=1, p=2, weights='uniform')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# KNN with K=1\n", "from sklearn.neighbors import KNeighborsClassifier\n", "knn = KNeighborsClassifier(n_neighbors=1)\n", "knn.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([1], dtype=int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# what \"should\" it predict?\n", "knn.predict(test)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# allow plots to appear in the notebook\n", "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "plt.rcParams['font.size'] = 14\n", "plt.rcParams['figure.figsize'] = (5, 5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# create a \"colors\" array for plotting\n", "import numpy as np\n", "colors = np.array(['red', 'green', 'blue'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAVYAAAFdCAYAAABYaEVAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmcXFWd/vHPQyALKEEWWRRJAgRENKCCUQO0OiwSGQUR\nFAThhwgM4ALuEhMZEBhUXEAcRCCMyi6IiJAMSbMEZYsgA6JIAIGsRgyQhDSkv78/zu1wU1QnvZy6\n1d153q9Xvbrr3OWcW1391LnnLqWIwMzM8lmr2Q0wMxtoHKxmZpk5WM3MMnOwmpll5mA1M8vMwWpm\nlpmD1bKT1CKpXdLuzW7LQCFpRPGafqWJbWiXNLFZ9fcnDtYeknRE8UbbtZPpN0h6vOp29SE9OkFa\n0jBJkyTtkbtBzSDpEEmfyzh/s08873b9krYo/qZjGtGgvsjB2ljN/idolluBYcDtPVh2PeCbwIAI\nVuAQ4PMNnL8/2IL0N3WwmvVUJG3Ru8v6lK1BgKTBkgZlWM96PVisu6/DQP1Azvo37cscrBWSNEjS\nNyT9TdKLkp6UdJakoaV5vivp2ZrlziiGHb5eKhsmadmqxrwknShpuaSNSmXHFOu6oGbev0u6pKbs\nEEn3SFoi6Z+SrpQ0ogvb+aoxVkmXSFpa7BZeJ+l5SfMlnS1prWKeEcD8YpGJxTraJV1cWs/mki6U\nNLd4DR+WdGwn9R9S7IL+HVgCvKFm2rckzZa0WNJNkrapWU9Hm7eSdL2kRcANXX19JLUC+wId46Pt\nktpX8bp1ZX5JOlrSY8X23y3pnXXWNbpozz+KbZgp6aOd1V2z7BBJ50haIOk5Sb+W9MY6820l6TxJ\nfy5ew2cl/UbSjqV5WoC7i6cXl7brm8X0t0m6uNiepUWdl0nasitt7avWbnYDBoANJG1cp3ydOmX/\nDfw/4BrgO8AuwJeAHYHxxTy3AV+QtFNE3F+U7Q60Fz+/XZS9q6jjtlW07TZSL2E34Lqade3WMVMR\nBm8k7cJ3lH0VOB24CvgZsCFwAjBD0piI+Mcq6u3MWsBNwF3AycCexc/HgJ+QQvU44HzgV8WDYjqS\nXg/8odimc4v5/w34saSNIuL0mvq+DiwHzimWWVya9pWiPf9VbNvngOmS3hYR5Q+2tYApRZu/CLzc\njdfntGL9b6Rru/ddmf9g4DWk1wjgy8CvJI2KiI62vRm4E5gNnAW8ABwAXCXpsIj4xWracSFwKPCL\nYj3vA35bZ753kt5HVwJ/B94AHAPcKuktETEXeJg0DHAq6f3fMTz0p+LnvwGjgUuK9m4DHAvsKmnH\niFi6mrb2TRHhRw8ewBGkgFrVY1Zp/rcVZT+rWc/Eonx88Xzj4vlni+dDgReBy4HnABXlE4BlwNBV\ntHEt4Fnge6WyJ4t1tQObFGWHFc+3Lp6/CXgJOKVmfaOApcDpq3ltWor17V4qu6Qoq13nfcA9pecd\n2//NOuu9gPTPt1Gd8sXA8Jr6nwCGddK2OcD6pfL3FeX/WafN36lZR5dfH1IPd1bttqzitas7PzCi\naMv8ju0syvcrv3+KsinAg8CQmnXcDDy1mvrHFOs7t6Z8cu3fpd57DxhZvAbfKJW9s1j28DrzD6tT\n9u5i/kMb+T/cyIeHAnrvRNKnbvmxJ6mHU9bRI/1eTfk5pF7VeIBIPZ0/k3qWAGNJPdNvk3oqby/K\ndwPui4gXO2tYRLSTehy7w4qe6ZZFG5aW6tgNmBMRjxXPDwAGAVdK2rjjQQr2/yOFUE/9tOb5HaRA\nWiVJAg4k9ZxU066ppINl76pZ7NLovMdzaUQ81/EkIqYDDwEfqjPvj2ueN/L1WZ1rImJR6fkdxc+R\nAJI2BD5A6km/tqZ9N5OGQ7Zdxfr3LX6eW1P+w9oZy+89SesqDTk9D/wVeEdXNqb895H0mmIdjwL/\n4pX3er/joYDeuyci7q4tlPQFYNNS0VakgxJ/Lc8XEc9JmlNM7
3A7sH/x+27AQxHxJ6XTt3aXdD/p\nU/28LrTvDuBUSa8t1rUIuIe0S70HaVhiN1Y+gj+6+PlIJ+t8rJPy1WmLiHk1Zc8Cr+vCspsAGwBH\nFY9aUcxTtqp2PtpJWW0odvR8yxr1+nTF38tPIuLZ9Jmz4jXchjTsMal41Arg9dTffnjlffq3mvJX\nza90bOBU4JPAZjWTF3Sy/tp1vA44k/ShWfs+GN6VdfRFDta+ofZo6e3AZyRtT+pV3lYq36P4uR5d\nO53pNlLv6r3FumZEREi6A/hw0ZPZjpV7KB17MvtQjCnW6Om4V2+Odne06ZfARZ3M83DN8560s7aN\nbUXPv15bcr8+XbG8k/KO91BH274H3NjJvA9lasuPgCNJvdk7Sb3MAL5P1w+MXwm8h3TM4Y+kHi+k\n4ap+u0ftYK3Ok6Q3/3ak3UUAJK0PbA5cX5q3IzDfTxoK6Nh9vo10cKPjANQdrN49pDHa3YFxQMcR\n9tuAbwD/XnreoaO38lRE/LkLdeTUWfguIP3TrRMR0zLUM7qTsidqyuqdItSd16fqU61mFT+X9/B1\n6nifbksakupQ7/X6GDA5Ik4qFxbDEeUea91tKnqrHwAmRsR/lsqHkg4G9lv99hOhH+o4Taf2aO/n\nSH+HFafxRMTfSbt8JwLrsnKPdUPgaODBmrG2uiKijXS6ywGkUO9Y1x9IvZ8vAc9GxIOlxa4ppn2z\n3jpVOn2rm7oSGkuKnyv9Y0XEcuBq4COS3lanTbXDAKtzuKQVu5qS3g/swKuPftdrc3den8V0baij\np/OvJCLmA9OBoyVtUadtq3udOnq5J9SUn1hn3pepyRBJnyB1FMo6zsaoDcuO3ndtDn2Bfn7Oq3us\njbXizRERD0r6GXBU8Q89nTQ4fyTwu4j4Xc2yt5NOeflbpNNWiIhHJc3n1bvuq3M7qXe6hNSDJSIW\nS5pJOuBzQ3nmiHi8OJ3obElbAb8m7eaNJPVwLyeNrXXXav9ZImKppIeAj0v6K/BP0lHyu4Gvko7q\n/17ST0m7/q8DdgI+QjqA1VVzSadG/axYx+dJZxzUHlx8VZu78PpcAXyrmP0e4CBJ3yd9wLVHxOWr\naFd356/nOGAG8KfidZpFGld9F/BmUm+0roh4QNJlwHHF+7TjdKt6y1xP+oB6jjS8sBNwUFFf+XV7\njDSWfpykxaQ9jwcj4qHi3N0vSxpM6kyMI+1dLaQ/h2vVpyGQPs0mk04bWUr6g9Q7Laf8uLPZp0/U\n2Y4jSJ+4u3Yy/TfUnDZD+mT+OmlXchlpt+tMak6LKeb9TLHtF9aUX1nUe2A32rpXsa7/rSn/r2Jd\nX+xkuX8HWklHu18g7RqeB7x5NfW1FOst/10vBpbUmXciabe1XLYrqUe9tGj3RaVpGwM/IO2yLyOd\nNnULcFyd+g/qpG3twCdI4Teb9IFzE7Btzbx129yd14cU9pcA/yjatLyz9a1qfl453erLdZZ51elp\npINQFwHPFK/TU6QP0IO78H4ZQhon7Rh+uY50jmrt6VavJZ2bOrfY/umkswGmA9Nq1rkv8EDRluUd\n6yEd9Lqi2N5FpLDeGni8/Hfvb4+OcyIrIWkDYCZpd/Rc0h9uFOlUn0eKeS4mXVt8WGnRtoj4V2UN\ntQGruBJoGvDxiLiyyc2xAarqoYAvA89ExBGlsidr5hEpSOdjZtYPVX3w6iPA3ZKukDRP0h8lHV8z\nTwDjiul/kXRBDw5MmJk1TdXBOgr4D9IY416ksbIza8L1JtIwwPtJ15HvCkwrBrfNchiod4+yPqLq\nMdY24O6IGFcqOx3YPyJ26GSZzUnDBQdHxLXVtNTMrOeqHmOdzauvjnmEdFOLuiJijqSnSZfqrSDJ\nvQ4za4iI6NWpXlUPBcwAtq8pq3e1ywrF+OobSKfVrKRZp1JMnDjRdbtu1z1A686h6mA9Bxgr6euS\ntpH0MdIVHedBuju7pO9IG
qv05WktpPPa5gEeBjCzfqHSoYCIuFfSR0i3wJtAGjs9JSI6btq7nHTT\n58NIdzKaQzrn8MCIWFxnlWZmfU7ll7RGxI10ctedSPd33KfaFnVfS0uL63bdrnuA1p1DpWcF5CQp\n+mvbzazvkkT0s4NXZmYDnoPVzCwzB6uZWWYOVjOzzBysZmaZOVjNzDJzsJqZZeZgNTPLzMFqZpaZ\ng9XMLDMHq5lZZg5WM7PMHKxmZpk5WM3MMnOwmpll5mA1M8vMwWpmlpmD1cwsMwermVlmDlYzs8wc\nrGZmmTlYzcwyc7CamWXmYDUzy8zBamaWmYPVzCwzB6uZWWYOVjOzzBysZmaZOVjNzDJzsJqZZeZg\nNTPLzMFqZpaZg9XMLDMHq5lZZpUHq6TNJU2WNF/SUkkPSdq9Zp5Jkp6RtETSdEk7VN1OM7OeqjRY\nJW0AzAAC2BfYHjgBmF+a5yvASUX5LsW0qZJeU2VbzaxaEcEtt9zCAQccxtixe3PyyV/liSeeaHaz\nekQRUV1l0reB3SJit06mC5gN/DAizijKhpLC9YsRcUFp3qiy7WbWOBHBCSeczOTJv2Hx4s8DIxk8\neBrrrDOZG264ipaWlsraIomIUK/WUXGwPgz8Dngj0EIK0Qsj4rxi+ijgb8AuEXFfabkbgH9ExBGl\nMger2QDR2trKhz70aRYvvhfYoDRlChtt9Gnmzp3F2muvXUlbcgRr1WOso4D/IIXnXsAPgDMlHV9M\n36z4Oa9mufmlaWY2wJx//mSWLDmRlUMVYC/a2jZj+vTpzWhWj1XzEfCKtYC7I+IbxfMHJG0LHA+c\nt5plX9U9nTRp0orfW1paKt1dMLN85s5dSMRWnUwdwcKFCxtWd2trK62trVnXWfVQwBPAlIj4TKns\nMOD8iHjNKoYCfgvMj4gjS2UeCjAbICZM+BZnnz2fZctq+1fLGDbsTdx//+2MHj26krb0x6GAGaQz\nAcpGA08Uvz8OzCUNEwArDl6NA+6soH1m1gTHHXc0gwdfDVzHKzunLzJkyPGMG/eeykI1l6qD9Rxg\nrKSvS9pG0seAEymGAYou6PeBr0jaX9KOwCXA88AvK26rmVVkiy22YOrU69l88y/x2tfuxPrrf4Sh\nQ99ES8uzXH315GY3r9sqHQoAkLQv8G1gO+BJ4NyIOLdmnonAMcDrgD8Ax0fEwzXzeCjAbIBpb29n\nxowZLFiwgDFjxrD11ltX3oZ+d7pVTg5WM2uE/jjGamY24DlYzcwyc7CamWXmYDUzy8zBamaWmYPV\nzCwzB6uZWWYOVjOzzBysZmaZOVjNzDJzsJqZZeZgNTPLzMFqZpaZg9XMLDMHq5lZZg5WM7PMHKxm\nZpk5WM3MMnOwmpll5mA1M8vMwWpmlpmD1cwsMwermVlmDlYzs8wcrGZmmTlYzcwyc7CamWXmYDUz\ny8zBamaWmYPVzCwzB6uZWWYOVjOzzBysZmaZOVjNzDJzsJqZZVZpsEqaJKm95jG7NP2SOtPvrLKN\nZma9tXYT6nwEaCk9X176PYCpwGGlsrYK2mRmlk0zgnV5RMzvZJqAtlVMNzPr85oxxjpK0jOSZkm6\nTNLI0rQAxkmaJ+kvki6QtEkT2mhm1mOKiOoqk/YBXkMaDtgUOAXYHnhLRPxT0sHAYuBxYCRwGjAI\neEdEtNWsK6psu5mtGSQREerVOpoZTpLWJYXomRFxTp3pmwNPAgdHxLU10xysZpZdjmBtxhjrChGx\nRNJDwDadTJ8j6enOpk+aNGnF7y0tLbS0tDSglWY2kLW2ttLa2pp1nc3usQ4l9VjPi4jT6kzfBHga\nOCoifl4zzT1WM8suR4+16vNYvyNpd0kjJb0LuBoYBkyWtF4xfaykEZJagOuBecC1q1itmVmfUvVQ\nwBuAy4CNgQXA74GxEfFU0XvdkXQO6wbAHGAacGBELK64nWZmPdbUoYDe8FCA2cCxfHm6Tmj
QoEFN\nbkk/HAowMyubOXMm++23H0OHDmXo0KHst99+/PGPf2x2s3rNwWpmTTFz5kz23ntvxo8fz6JFi1i0\naBHjx49nr732YubMmc1uXq94KMDMmmK//fZj/PjxHHvssSuV/+QnP+HGG2/k+uuvb0q7+v0FAr3h\nYDXrv5YvX87QoUNZtGgR66677krTlixZwvDhw3nxxRebMubqMVYzsz7IwWpmlRs0aBD77LMPl156\n6aumXXrppXzwgx/sE2cI9FRTL2k1szXXqaeeyl577QXA4YcfDqRQnTBhAlOmTGlm03rNPVYza4qd\nd96Zm2++mRtvvJHhw4czfPhwbrzxRqZMmcLOO+/c7Ob1ig9emVnTDbQLBDwUYGZN1xcCNScPBZiZ\nZeZgNTPLzMFqZpaZg9XMLDMHq5lZZg5WM7PMHKxmZpk5WM3MMnOwmpll5mA1M8vMwWpmlpmD1cws\nMwermVlmDlYzs8wcrGZmmTlYzcwy61GwKtlG0tDcDTIz6++6FKySzpD0qeJ3AVOBvwJzJI1tYPvM\nzPqdrvZYDyUFKcAHgTHAWOBS4IwGtMvMrN/q6ndevR54qvh9X+CqiLhb0j+B+xrSMjOzfqqrPdaF\nwIji972AW4rf1wF69W2GZmYDTVd7rNcAv5T0V2BD4OaifAzwaCMaZmbWX3U1WE8GngTeBHwpIl4o\nyrcAzm9Ew8zM+itFRLPb0COSor+23cz6LklERK+GOLvUY5W0eyeTAngReCwi/tmbhpiZDRRd6rFK\naieFaGcpHsD1wCcjYnG+5q2yTe6xmll2OXqsXT0rYF/gz6TzWbctHocCDwEHAgcAOwFnrWolkiZJ\naq95zK4zzzOSlkiaLmmHbm6TmVlTdfXg1WnA5yPif0tlj0laAJwVEe+QtBw4FzhhNet6BGgpPV/e\n8YukrwAnAZ8iXZDwTWCqpO1KB8zWOBHBtGnTOO+C85g9dzbv3fW9nHj8iYwYMaLZTTOzOro6FLAU\neHtE/LmmfAdgZkQMlTQCeCQiOr1/gKRJwEcj4q11pgmYDfwwIs4oyoYC84EvRsQFNfOvEUMBEcEJ\nnzuByVdNZvHbF8MGMPipwazz4DrccO0NtLS0NLuJZgNKlUMBfwa+IWlIqfKhwNeAh4uiLYE5XVjX\nqGJXf5akyySNLMpHApsCUzpmjIgXgduA93SxnQPOrbfeyuQrJ7P4U4thV2A0tH2gjcUfWsyBnziQ\nl19+udlNNLMaXQ3W44C9gdmSWiXdCjxTlB1XzDMK+PFq1vMH0m7+3sDRwGbAnZI2LH4HmFezzPzS\ntDXO+Reez5Kdl8CwmgnbQNuwNqZPn96UdplZ57o0xhoRdxU9y0OB7YviXwC/7Bj7jIjJXVjPTaWn\n/yfp98DjpLC9a1WL1iucNGnSit9bWloG5G7x3PlziQ07GfLYABYuXFhtg8wGmNbWVlpbW7Ous+kX\nCEiaRhpq+A7wGLBLRNxXmv5bYH5EHFmz3Boxxjph4gTOnno2y/ZetvKEl2HYj4Zx/933M3r06OY0\nzmwAyjHG2uVglbQlsBvpTlcrDSFExPd6VHkap30cOC8iTitOvfpRzcGreaSDVz+tWXaNCNbZs2ez\n/Vu35/k9n0/7CgJegiFThrD7Jrsz5bdTVrcKM+uGyoJV0qHARcDLwAJqds0jYmS95eqs5zukCwme\nIgX0BGAc8NaIeErSl4GvA0eSbu5ySjF9u9oLD9aUYAW466672P/g/XkhXkCvE21PtLHHbntw5S+u\nZP31129288wGlCqD9THgCmBCRCxf3fyrWM9lwO7AxqSA/n2xzkdK80wEjgFeRzrYdXxEPFxnXWtM\nsAK0t7czY8YMFixYwJgxY9h6662b3SSzAanKYH0BeFtEzOpNZTmtacFqZtWo8jzW35G+isXMzFaj\nq5e0TgHOkvQW4E/AS+WJEfGr3A0zM+uvunN3q05FRI+
+Rrs3PBRgZo1Q2f1YmxGcZmb9lQPTzCyz\nTnuskk4Czo+IpZJOppPLSqHnFwiYmQ1EnY6xSnoceGdELCx+71RXLxDIyWOsZtYIDR1j7QhLSYNJ\nJ/MfXj6R38zM6lvtGGtEtAEjgFWeGWBmZklXD15dSrp/qpmZrUZXLxBYF/ikpD2B+4COG6IIiIj4\nbCMaZ2bWH3U1WHcAZha/jyqVi1WcLWBmtiZq+o2ue8pnBZhZI1R5ExYzM+siB6uZWWYOVjOzzBys\nZmaZOVjNzDJzsJqZZeZgNTPLzMFqZpaZg9XMLDMHq5lZZg5WM7PMHKxmZpk5WM3MMnOwmpll5mA1\nM8vMwWpmlpmD1cwsMwermVlmDlYzs8wcrGZmmTlYzcwyc7CamWXWtGCV9DVJ7ZJ+VCq7pCgrP+5s\nVhvNzHpi7WZUKmkscDTwJyBKkwKYChxWKmursGlmZr1WeY9V0nDg58CRwLO1k4G2iJhfevyr6jaa\nmfVGM4YCLgCuiohbSUFaFsA4SfMk/UXSBZI2qb6JZmY9V+lQgKSjgVHAIUVR1MxyE3AN8DgwEjgN\nmCbpHRHhIQEz6xcqC1ZJ2wGnA+MiYnlHMaVea0RcUVrkIUn3AU8C44Frq2qrmVlvVNljfTewMSkw\nO8oGAbtJOgZYLyJeKi8QEXMkPQ1sU2+FkyZNWvF7S0sLLS0t+VttZgNaa2srra2tWdepiNq98cYo\nDlq9oVwEXAz8Ffh2RDxcZ5lNgKeBoyLi5zXToqq2m9maQxIRUXv8p1sq67FGxCJgUblM0hLg2Yh4\nWNJrgEnA1cBcYARwBjAPDwOYWT/SlPNYS4JXDmC9DOxIOod1A2AOMA04MCIWN6d5ZmbdV9lQQG4e\nCjCzRsgxFOB7BZiZZeZgNTPLzMFqZpaZg9XMLDMHq5lZZg5WM7PMHKxmZpk5WM3MMnOwmpll5mA1\nM8vMwWpmlpmD1cwsMwermVlmDlYzs8wcrGZmmTlYzcwyc7CamWXmYDUzy8zBamaWmYPVzCwzB6uZ\nWWYOVjOzzBysZmaZOVjNzDJzsJqZZeZgNTPLzMFqZpaZg9XMLDMHq5lZZg5WM7PMHKxmZpk5WM3M\nMnOwmpll5mA1M8vMwWpmlpmD1cwss6YFq6SvSWqX9KOa8kmSnpG0RNJ0STs0q41mZj3RlGCVNBY4\nGvgTEKXyrwAnAScAuwDzgamSXtOMdpZFBLfccguHHXAAe48dy1dPPpknnnii2c0ysz5IEbH6uXJW\nKA0H7gOOAiYBD0bEZyUJmA38MCLOKOYdSgrXL0bEBTXriaraHhGcfMIJ/GbyZD6/eDEjgWmDBzN5\nnXW46oYbaGlpqaQdZtZ4kogI9WYda+dqTDdcAFwVEbcWYdphJLApMKWjICJelHQb8J5iuaa49dZb\nuX7yZO5dvJgNirJ929rYq62Nww88kFlz57L22s14Kc2sL6p0KEDS0cAo4JSiqNzl3Kz4Oa9msfml\naU0x+fzzOXHJkhWh2mEvYLO2NqZPn96MZplZH1VZN0vSdsDpwLiIWN5RXDxWp+4+/6RJk1b83tLS\n0rBd8oVz57JVJ8MOI4CFCxc2pF4za7zW1lZaW1uzrrOyMVZJRwAXActLxYNIobkc2BF4BNglIu4r\nLfdbYH5EHFmzvsrGWL81YQLzzz6b85YtW6l8GfCmYcO4/f77GT16dCVtMbPGyjHGWuVQwLWk8BxT\nPHYC7gUuK35/FJhL2sMGVhy8GgfcWWE7X+Xo447j6sGDuY5Xus4vAscPGcJ7xo1zqJrZSiobCoiI\nRcCicpmkJcCzEfFw8fz7wNclPUIK2lOA54FfVtXOerbYYguunzqVT+6/PxNfeIGREne2tbHbHnsw\n+corm9k0M+uDKj/daqXKpekUp1uVyiYCxwCvA/4AHN8RvDXLVjYU0KG9vZ0ZM2awYMECxowZw9Zb\nb11p/WbWeDmGApo
arL3RjGA1s4Gvv42xmpmtERysZmaZOVjNzDJzsJqZZeZgNTPLzMFqZpaZg9XM\nLDMHq5lZZg5WM7PMHKxmZpk5WM3MMnOwmpll5mA1M8vMwWpmlpmD1cwsMwermVlmDlYzs8wcrGZm\nmTlYzcwyc7CamWXmYDUzy8zBamaWmYPVzCwzB6uZWWYOVjOzzBysZmaZOVjNzDJzsJqZZeZgNTPL\nzMFqZpaZg9XMLDMHq5lZZg5WM7PMHKxmZpk5WM3MMqs0WCUdL+kBSYuKx52S9i1Nv0RSe83jzirb\naGbWW2tXXN9TwJeBR0mhfgRwnaRdIuIBIICpwGGlZdoqbqOZWa9U2mONiOsj4uaImBURf4uIU4Dn\ngV2LWQS0RcT80uNfVbaxK1pbW12363bdA7TuHJo2xippkKSPA0OB24riAMZJmifpL5IukLRJs9rY\nmTX1Dee6XfeaUHcOVQ8FIOmtwO+BIcBS4KCI+Esx+SbgGuBxYCRwGjBN0jsiwkMCZtYvVB6swCPA\n24DhwMeAyyW9LyLujYgrSvM9JOk+4ElgPHBt9U01M+s+RURzGyBNBZ6OiCM7mT4LOD8izq4pb27D\nzWzAigj1Zvlm9FhrDaKTsd5ifPUNwJzaab3dcDOzRqk0WCWdCdwAPA28FjgE2APYR9J6wLeAq4G5\nwAjgDGAeHgYws36k6h7rpsDPgc2ARcADwD4RMVXSUGBH0jmsG5B6qdOAAyNiccXtNDPrsaaPsZqZ\nDTR99l4Bkv5D0uOSlkq6V9K41cz/Vkm3Sloi6WlJE6qoW9KQ4lLcByS1SZre03p7UHeLpF9Lmi1p\ncdGGugcBG1D3DpKmS5pbzP+YpNMlrdPoumuW21bS85Ke70m93a1b0og6l123S9qr0XWXlvm8pEck\nvVj87c9odN2SJnWy3e2SNm5k3cX8+0r6g6TnJC2QdJ2kbbtbbw/rPkjS/cX/2BOSvrjaSiKizz2A\ng0mXsh4FbAf8kHSF1padzL8+aVz2cmAH4KPAc8BJFdS9LnA+8GnSWPC0Crf7a8CpwLtJY9LHAi8B\nn6ig7q2Bw4G3AlsC+xV/g7MbXXdpucHAfaRx++cqes1HAO3AnsDrS491qthu4HvAX4rXewQwhjSc\n1ujtXq9mezcFpgO3VFD3NsX8ZwKjim2+CXi0gro/WPxPHVu83vsCzwDHr7KenrwZG/0A7gL+u6bs\nr8C3O5n/OOBfwJBS2TdIp3E1tO6a+c4Fple13Z2s4wrg6ibV/T3gzqrqBs4BfgZ8Cni+ovfaCFKw\nvqOnf+dglcuhAAAGvUlEQVRe1L1dEQrbVV13neW3BF4GPl7Bdh9Y1KVS2fuKv8OGDa77l8A1NWUn\nAH9fVT19bihA0mDg7cCUmklTgPd0sti7gdsjYlnN/FtI2qrBdWeRse7hwD+rrlvSNsDeddbRkLol\njSddOHIi6R4T3dbL7f6V0qXXd0j6aEV1fxiYBewraVaxO3uJunnZd6b32lGk99k1FdQ9A3gBOFrp\nUvjXkm7gdHdEdPm93sO6BwPLaspeBN4o6U2d1dXnghXYmHRu67ya8vmkswnq2azO/PNK0xpZdy69\nrlvSh4D3AxdUVbfSrR+Xkj7174qISY2uW9IWpG08NCKWdLO+XtVN2m08mXTV4AeBW4ArJB1aQd2j\ngK2Ag0jDMIcB2wO/kdSdD5devdckDQL+H/A/EfFSN+rtUd0RMYe0C34aKdT+BbyFNBzS0LqBm4EP\nS9pT0lqSRpP+/gCbd1ZRXwzWnljjT22Q9F7gF8CJEXFvhVUfBOxMOid5T0n/VUGd/0O6Gu+eCupa\nSUQsjIhzIuLuiJgZEROBn5Buh9loa5HusXFYRNwREXeQwnVX4J0V1N9hH+CNwE+rqEzSKOA64GLS\ndraQPuCu7OYHSrdFxE+BHwG/JvVc7wQuKya3d7ZcXwzWfwDLSYPjZZtS5wqswlxe/
YmzaWlaI+vO\npcd1F0c1bwQmRMR/V1l3RDwdEY9ExOXAV4HPFT2aRtb9PmCipJckvQRcCKxXPP90g+uu5x6gu0eo\ne1L3HODliPhbqexvxXo63S3NVHfZZ4AZEfFIN+rsTd3HAE9FxFci4oGIuB34JOnionc3uG4i4quk\ng3dvIuVMxwf6rM6W6XPBGukuVvcBtaev7En6tKjn98BukobUzP9MRDzZ4Lqz6GndknYnherEiPhh\nlXXX0XF5cpffVz2se0fSkeGOxzdJd0obQ7pyr5F117MTMLsb8/e07juAtYseXIdRpNe9kvd5MQyz\nLz3srfawbvHq3mHH80a/1zqWjYiYExEvA58gHaRduKoF+tyDtHu5jDRA/mbgB6TTp7Yspp8B/G9p\n/vVJnziXkcZeDiBd2fWFRtddlO1A+ue6nPRpNgbYqYLtbgEWA2eRPnU3Kx6bVFD3YaSjtduT/rkP\nIl2q/PMqXvOa5Y+g52cFdHe7P0X6x3oz6Sj9F4vlP1dB3QLuBVqL99vOwK307EyMHr3mwCnAs8DQ\nnrzePdzucaSe5gTSnsHbSadbPQEMa3DdG5HOOnpz8Zr/gPQ/985V1tPTF6fRj2JjHicNVt8DjCtN\nuxiYVTP/jsWbbCnpPLMJFdb9OOkTtL14A7QDyxtdd/F8eanujsesCur+OOnT/znSeNf/kYYChjS6\n7jrLHkEPz2PtwXYfDjxEOkq9CLgbOKTC99pmwJXF6z6PNN7c7Q/SHtYt0u7vuT3d3l7UfSDpQ+X5\nYruvA7av4O+9Eak3+3zxN58C7LK6OnxJq5lZZn1ujNXMrL9zsJqZZeZgNTPLzMFqZpaZg9XMLDMH\nq5lZZg5WM7PMHKxmZpk5WM3MMnOwmpll5mC1Pk1Sq6QfS/qupIWS5kv6rKShkn4i6V+SnpT0idIy\nZxZftrekuMv+WeU7n0naUulLGBcWXxD3Z0kHl6Z/s/jSuBclzZE0uerttv5t7WY3wKwLDgW+S7qh\n84eB75O+luUG0p2OjgAuknRLRMwn3SzjSNLNeN5CuhH1MtLtBQF+TPrKjRbSzUy276io+JqVk0k3\nmXmQdNewdzVy42zg8U1YrE+T1Er6BtT3lsrmk26V95Hi+dqkW7l9IiJ+VWcdxwInR8S2xfMHSF8Q\nd2qdeU8i3ch5x0j33jTrNg8FWF8XwJ9qyuaTepNphhSAz5K+lhlJBxZf8jdH0vOkb4/dsrT8D4BT\niu/r+k9Jby9NuxIYCjwu6cJiXYPzb5YNZA5W6w9qv7AuOilbS9JY0g3Pfwd8iHRz4lNIu/5pxoiL\ngJGke2+OBu6UNLGY9jTpBtbHkIYJvgvcJ2ndzNtkA5iD1Qaa95K+kuf0iLgvIh4DRtTOFBHPRMRP\nI+Jg0tjrZ0rTlkXEjRFxErALaZy2oV9/bgOLD15ZX6fiUVvWmb8Ab5B0CPAHYG/SgahXFpZ+QPqe\nsEdJX+vzQdK3AiDpCNJ3SN1NOgh2MNBWzGvWJe6xWl8XvPrrzTs94hoRNwBnk84ceAD4AKlHWl5G\npK80foj0VRtzSN9lBWms9ijgNtI47v7AAdGNL6U081kBZmaZucdqZpaZg9XMLDMHq5lZZg5WM7PM\nHKxmZpk5WM3MMnOwmpll5mA1M8vMwWpmltn/B2mbZ3Qk566jAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# scatter plot of training data, colored by id (0=red, 1=green, 2=blue)\n", "plt.scatter(train.mass, train.rings, c=colors[train.id], s=50)\n", "\n", "# testing data\n", 
"plt.scatter(test.mass, test.rings, c='white', s=50)\n", "\n", "# add labels\n", "plt.xlabel('mass')\n", "plt.ylabel('rings')\n", "plt.title('How we interpret the data')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(0, 30)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAFdCAYAAABcnZV9AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmcnWV9/vHPRQJMgBJAEJGCSaps4gIIBhtjtCUsKXVj\nsSBLSkFFbC2uIEu0simISAIYLduPCrKITTGGYMMUBZElCBQIa6CEhAQQQkhIgsn398f9TPLk5JzM\nmZl7zjJc79frvGbO/Szne56Zuc5z388yigjMzKzv1mt2AWZmA4UD1cwsEweqmVkmDlQzs0wcqGZm\nmThQzcwycaBav5B0tKSVkrZvdi1vJpImFNv9rU16/Tf1z92BWlL6ZdirxvSbJM1udF1V6rhc0utV\n2neUNE/SXEk7FG2dxXv6dZX5tyymnV5qG1O0rZT0wSrLTJS0ss5Se3WSs6S3F8Hwvt4s32okHS/p\nqIzrG1Jsn4/kWmcrkHRA+XexHTlQe65VroRYow5JOwK3AiuBMRHxWMV8+1YLyGrrKpnQw/nLrgSG\nRMT/1TFvpbcDpwEDIlCB44GjM65vY9L2GVCBChwAOFCtKbTqm7Q3eisp6D5aCtOu+eYAL1I7IKv5\nI7VDWFXa1hARKyNieQ9er5puX6dHK5M2zrSeITnWk0HW7dMiWmWHpVccqH0kaZCkb0l6QtJSSc9I\nOkdSR2me8yS9XLHcWUW3+uRS2xBJy3rS7ZH0LmqHaZfFwHmkgBxZ56onkUL42/XWUlHXWmNpxfDD\nI5J2kTRD0mJJcyR9rTTPGOCu4ullpeGH8rDEDpKulfSipNclzZT06Rqv/1FJP5I0H1hUMW1MMYTx\noqRXJf28cuyxVPP7Jd0q6TXgomKaJH1J0oNFHfMl/VTSW0rLPw3sAnyk9F5ml6YfXyz/mqRXJN0n\n6bh1bNdhwILi6emldV5aMetmxdDQy8V6L632QSDpMEl3S1oi6U/Fdh1W6/Urln138XNcIulZSd+i\nSqZI+ntJ/1XMs1TS05K+J2nD0jyXk/bkVXpPq35/ip/Zb5SGtJZKekzSNyW11IfK4GYX0KI2k7Rl\nlfb1q7T9GPhH4AbgXGBP4GvArsC4Yp7bgH+V9P6I+GPRNprUPR8NnFm0fbB4jdvqqDEk/RXQWTyv\nFaaQwnYi8BXSXup+dax/UfF+zpY0MiLurGOZ7gQwFJgK/AL4OXAwcI6kByNiGvAwqTv7HdK2/W2x\n7AMAknYG7gDmAucArwGfAq6TdERE/EfFa14I/An4t+K1yy4olp8ADAO+BOwoac+IeKNU82bANOB6\n4D+AV4ppF5N+9pcDPwK2L9axV7GOZcC/FDUsAs4olnuteC/HkH4u1xXLr0/6vdkbmFxjGy4AvlC8\n9i+KB8CTFfNdU7R9E9gD+Kdi2W92zSDpm0VN1wH/DmwBnADcLul9EfFijRqQ9DbSB/l6wNnFezoO\nWFZl9qOB10nbe2Hx/v4V2A74h2KeS4BtgH2Az5aW7arheNLvxk3AUuBvSX83Q4GTatXZcBHhR/Eg\n/eBXdvN4qjT/e4u2f69Yz+lF+7
ji+ZbF838unneQfimuAV4FVLSfSvqF7OimzsuBN0hd+eeAHdYx\nbyfwcPH914s6RlbUdVpp/jFF2yHARqQ/wmml6ROBlT3YlttX1LIS+GypbX1SOF5XavtAMd+RVdY7\nHXgQ2LCi/Wbg2Sqv/3tgvRq1/RFYv9Q+vmg/pkrNJ1Ss40NF++EV7X9dtB9bavtfYEaV93Ij8EAv\nfk/X+rmVpk0opv20ov0G4IXS8+2L36FTKuYbQQq/M7qp4fzidT5QatuC9OG1ouLnPqTK8icV8/1l\nPb9bVPmbIH3gLir/DJv9cJe/ui+RPgHLj32AP1TM17UH+oOK9vNJvyzjACJ90j9C2hsFGEkKkjOB\nTYDdi/YPA/dGxNI6alyP9If1Cqs/xbszkR504yNiCWkvdaykvet8je4siYirSq/xBqmLP6K7BSVt\nAfwNaY/qL5TOUtiy6E3cDGxbDIGU/SQiap2V8ONYvScK6UDaK8DfVcz3BmvvMR5C2iubXlHHo6QP\noY92936K19pO0gfqmLenflLx/HfAWyRtUjz/FDAIuLai/ldJHwDd1X8AcFdE3NPVEBF/Iu3Br9EN\nj4jXASStJ2lo8Tq3F/PtVs+b6fqbUBpi27xYx22kA3Q71bOORnCgVnd3RMyoePw38FLFfO8gdQnX\n6GpHxKvAvGJ6l9+SApPi60MR8QAwGxgtaRCpK1RPdx9gOXAYsCPw69IfSk1FQH4f2KcHATkReIGe\nHdBal+eqtL0CbF7Hsu8k/RFOIIVW+XEu6WdRef5lZVe47PHyk4hYATzNmj83gLmx9gG2HUgfhvOr\n1PJWYKs63s85pD2su5TG4C8uxpBzqDy7omsMv2s771B8ncXa9e9B9/W/g4rtV1irTdKukqaS3uvL\nxWt0FpMrh2GqkjRK0m2k4wEvFev4fz1ZRyN4DLX/VA6W/xY4TtJOpD3V20rtHym+bszqMcPuRET8\nojiA8VNgiqT9I43brcsk4KukUDqsjhdZIulc0jhnjr3UFTXa6zm40LUD8APSOGw1D1U8X+t83V6o\nto71SH/Yh9ZY5uUa7atExCyl090OAPYl7Rl/TtJFEXFCb4stdLedu7blfsCfq8zX3Xar62i8pKGk\nsdZFwMnAE8W6/5I0dNXtTp2kEcBvSOH/ZdKHxVJS8J9TzzoaxYHaN8+QfkF3JHWTAJC0KWmAfUpp\n3q6g/Bipy9/VJbsN+B6rD1L9rs7XFkBEXFr80p5HOjDzyWJPq6pyQJLG++rRFcLfJnVp+1utP9an\niq8rImJGhtfZgfSHCoCkwcBwUgB050nSUNAfImJxN/PWDJ+iO3wDcEPRS7kcOF7SGRExr6fr64Gu\nPfdnI+KRXiz/DKv3cssq2z4KvAX4VESs2lmQtE+VZWu9r78HNgAOjIhnS+v4qx5V3AAtk+xt6qbi\n65cr2v+FtG27phPpBPf/I43PbsSae6hbAMcCD0bEwjpfe9UvX0ScD3yXtIdzRR2nkkwidePrOj2r\nNFTwt6SDMf1xrmB5nV0BtUVFHQtIYXespLdXrkBSPd3sss9J2qD0/EhS9/FXdSx7DelnfFqVOgZJ\n2qzUtJiK91LM95by8+KDsOuDebPK+UuWFF/XWmcPXE/ai12r/mq1VTEV2FPSnhXLHMaaP8uuD/f1\nSvOtB5xYZZ2Li+mV773aOjYknZHQUryH2nOrwioiHpT078Axpa7N7qSjxb+OiMrLPX8LHA48ERHP\nF+t4XNIC0l7uxN7UUazntOIX8QTSgYXj1zHvEknfJ+0Z12sS6XSwug4irEOtsC+3P0nqMn9B0mJS\nd/HBiHiIdMrQ7cADkn5C2mt9K+mUs52ByoNS6xLArZKuYfVpUw8CV3RXc0T8VtIk4GuS3ks6+2AZ\naZz306QzNq4sZr+btNd5GmmMcVFE3EQ6oDW/eD/PF8ueANy/rr3GiHhd0kPAZyQ9Rjqy/lRE3FVr
\nmSrrmF2cNvV9Se8A/pM0lj2ctEd4DenUtVq+BxwBTJN0ASkMjyXtNLy3NN/vSEMjV0i6kDS8cBBp\neKvS3cXXiZKmFfNOIZ2ythy4SdKPSWfJHEHtYY3mafRpBaSu8BWkQeXXSWNeo0vTL2ftU5XuaFBt\nR5N+SHvVmP5flE6bKtrWY/XY0DJSV+hsKk7rKeY9juqntFxbvO5BddZ5GeloebVpVxTrOrN4fivF\naVMV8w0h/RGvYO3TplYAh1RZ5itF/St6sC3Lp8/UquWyKtv1AOD+YptW1vgO4FLSAa5lwLOk3sCh\n9fwsWX3a1EdYfdDt1eLnsHXFvFVrLk0fTzr7YzHpHMsHSGFTPh1oK+CXpMBadeodKYBuLf4WlpI+\nSH4IbFnH9t0LuLP4G1oJXFq0Tyje91u7+3kU7X9POkD0KumshUdIH54711HDrkX9S0hBenKxPSp/\n7nuRdiZeIx2snVgsu8apcaQPrh8U86wor4c01juzeK1nSOcV/20xz+juam3Uo+v8x4Yo9qBmkrq7\nXb/II4B5ETGrmOcy0rXcR5QWXR4Rr2CWgaSjSYE8MnqwV2fWnUZ3+b8OPBcRR5fanqmYR6QAXYCZ\nWRtp9EGpT5DOufu50nXP90n6YsU8AYwqpj8qaXIvDjaYmTVcowN1BOlgyRPAWNK1vWdXhOo0Unf/\nY6Qxu72AGRVHY836qq3vamStqdFjqMtJl6uNKrWdAXwyInapscw2pGGBQyPixsZUambWc40eQ51L\numNM2SzSjRqqioh5kuaQTilZRZL3MMysX0REr24L2Ogu/+2sfSODHUjXT1dVjJ9uSzqVYg3NPkWi\nt4/TTz+96TW8mepu59rbte52rr0vGh2o5wMjJZ0s6Z2SDiadTD0J0h3VJZ0raaSkYcWNIqaQbkDh\n7r6ZtbSGdvkj4h5JnyDdtu5U0tjoKRFxcTHLCtIJv0eQLr2bB8wgnfDe3fXSZmZN1fBLTyNiKjXu\nFBTpnof13E2+rY0ZM6bZJfRKu9YN7Vt7u9YN7V17bzX0KH9OkqJdazez1iWJaJODUmZmA5YD1cws\nEweqmVkmDlQzs0wcqGZmmThQzcwycaCamWXiQDUzy8SBamaWiQPVzCwTB6qZWSYOVDOzTByoZmaZ\nOFDNzDJxoJqZZeJANTPLxIFqZpaJA9XMLBMHqplZJg5UM7NMHKhmZpk4UM3MMnGgmpll4kA1M8vE\ngWpmlokD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwycaCamWXiQDUzy8SBamaWiQPVzCwTB6qZWSYN\nD1RJ20i6QtICSa9LekjS6Ip5Jkh6TtISSbdK2qXRdZqZ9VRDA1XSZsDtQAAHADsBJwALSvN8Azix\naN+zmHaLpE26W/+8efM49dRvs/fe+zFu3KFMmTKFlStX9sM7MTNbmyKicS8mnQl8OCI+XGO6gLnA\njyLirKKtgxSqX42IyaV5o1z7H//4R0aP3pflyz/FsmV/B8xj440vZL/93su1117Beut5dMPMuieJ\niFBvlm10ynwCuEvSzyXNl3SfpC+Wpg8HtgamdzVExFLgNuBDtVYaERx88HgWLTqPZcsuBsYB/8Ti\nxXcybdojXH/99f3zbszMShodqCOA44EngLHABcDZpVB9W/F1fsVyC0rT1vLggw8yb94rwGEVU4aw\nePFXmDjxij4XbmbWncENfr31gLsi4lvF8/slvQv4IjCpm2XXGpuYMGECALNnzwY2pfrnwzAWLHix\nt/Wa2QDX2dlJZ2dnlnU1egz1aWB6RBxXajsCuDgiNpE0grT3umdE3Fua51fAgogYX2pbNYb6wgsv\nsP32O7B06VPA5mu85uDB3+aII+Zz6aUX9eM7M7OBop3GUG8nHdkv2wF4uvh+NvA8aTgAWHVQahRw\nR62VbrXVVhxyyKEMGfKPwGulKZ1ssMEkvva1L2Uo3cxs3Rrd
5T8fuEPSycC1wG7Al4CTACIiJP0Q\nOFnSLOBx4BRgEfCzda148uQLWLbsC/zyl9uz4YajgOcZPHgeV199FTvvvHM/viUzs6ShXX4ASQcA\nZwI7As8AEyNiYsU8pwOfI/Xf7wS+GBEPV8wT1WqfM2cOd999N0OHDmX06NEMHtzozwwza2d96fI3\nPFBzqRWoZmZ90U5jqGZmA5YD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwycaCamWXiQDUzy8SBamaW\niQPVzCwTB6qZWSYOVDOzTByoZmaZOFDNzDJxoJqZZeJANTPLxIFqZpaJA9XMLBMHqplZJg5UM7NM\nHKhmZpk4UM3MMnGgmpll4kA1M8vEgWpmlokD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwycaCamWXi\nQDUzy8SBamaWiQPVzCwTB6qZWSYNDVRJEyStrHjMLU2/vMr0OxpZo5lZbw1uwmvOAsaUnq8ofR/A\nLcARpbblDajJzKzPmhGoKyJiQY1pApavY7qZWctqxhjqCEnPSXpK0tWShpemBTBK0nxJj0qaLGmr\nJtRoZtZjiojGvZi0H7AJqdu/NXAKsBPw7oj4k6RDgcXAbGA48F1gELBHRCyvWFc0snYze3OQRESo\nV8s2M5QkbUQKz7Mj4vwq07cBngEOjYgbK6Y5UM0su74EajPGUFeJiCWSHgLeWWP6PElzak2fMGHC\nqu/HjBnDmDFj+qFKMxvIOjs76ezszLKuZu+hdpD2UCdFxHerTN8KmAMcExFXVUzzHqqZZdeXPdRG\nn4d6rqTRkoZL+iBwPTAEuELSxsX0kZKGSRoDTAHmAzeuY7VmZi2h0V3+bYGrgS2BF4DfAyMj4tli\nb3VX0jmomwHzgBnAQRGxuMF1mpn1WFO7/H2xri7/ihXpWoFBgwY1siQzGwDapsvf32bOnMmBBx5I\nR0cHHR0dHHjggdx3333NLsvM3iQGTKDOnDmTfffdl3HjxrFw4UIWLlzIuHHjGDt2LDNnzmx2eWb2\nJjBguvwHHngg48aN4/Of//wa811yySVMnTqVKVOmNLpEM2tDbXtif1+UA3XFihV0dHSwcOFCNtpo\nozXmW7JkCUOHDmXp0qUeUzWzbnkM1cysBQyIQB00aBD77bcfV1555VrTrrzySvbff3/vnZpZv2vq\npac5fec732Hs2LEAHHnkkUAK01NPPZXp06c3szQze5MYEHuoALvtths333wzU6dOZejQoQwdOpSp\nU6cyffp0dtttt2aXZ2ZvAgPioFQln9hvZr3Vtneb6i8OUjNrhgHT5TczazYHqplZJg5UM7NMHKhm\nZpk4UM3MMnGgmpll4kA1M8vEgWpmlokD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwycaCamWXiQDUz\ny8SBamaWSa8CVck7JXXkLsjMrF3VFaiSzpJ0VPG9gFuAx4B5kkb2Y31mZm2j3j3Uw0kBCrA/8D5g\nJHAlcFY/1GVm1nbq/Z9SbwWeLb4/ALguIu6S9Cfg3n6pzMyszdS7h/oSMKz4fizw38X36wO9+u+A\nZmYDTb17qDcAP5P0GLAFcHPR/j7g8f4ozMys3dQbqF8BngG2B74WEa8V7W8HLu6PwszM2o0iotk1\n9IqkaNfazax1SSIiejWUWdceqqTRNSYFsBR4MiL+1JsCzMwGirr2UCWtJIVnrdQOYArw2YhYnK+8\nddbkPVQzy64ve6j1HuU/AHiEdD7qu4rH4cBDwEHAp4D3A+d0U+gESSsrHnOrzPOcpCWSbpW0Sw/f\nk5lZU9R7UOq7wJcj4jelticlvQCcExF7SFoBTARO6GZds4Axpecrur6R9A3gROAo0oUEpwG3SNqx\ndCBslf0/vj8LX13I/n+zP5//3OfZaqut6nw7Zmb51dvlfx3YPSIeqWjfBZgZER2ShgGzIqLm9f2S\nJgCfjoj3VJkmYC7wo4g4
q2jrABYAX42IyRXzBx8HNoKOJzoY8swQbu+8nZ133rnb92NmVksjuvyP\nAN+StGHpRTuAk4CHi6btgHl1rGtE0aV/StLVkoYX7cOBrYHpXTNGxFLgNuBDVde0G7AjLB23lFf2\nfIXDxx9e59sxM8uv3kD9ArAvMFdSp6T/AZ4r2r5QzDMCuKib9dxJ6s7vCxwLvA24Q9IWxfcA8yuW\nWVCaVlPsHsyaNYvZs2fX8XbMzPKraww1Iv5Q7EkeDuxUNP8H8LOusc2IuKKO9UwrPf1fSb8HZpNC\n9g/rWrRq662l74fB+puvz0svvcTw4cOrzm5mVqmzs5POzs4s62r6if2SZpCGFM4FngT2jIh7S9N/\nBSyIiPEVywUTSg2vQsfkDuY/N59NN920AZWb2UDU7yf2Fy+yHfBh0p2n1hgqiIgf9ObFi3HYnYEZ\nETFb0vOkm6/cW5o+CvjqOle0FDaauhHjjxnvMDWzpqn3KP/hwKXAn4EXqOiCR0RdfWxJ55IuAHiW\nFMynkgLzPRHxrKSvAycD40k3XTmlmL5j5QUDkmLTv9oUNoLlTy/nM4d+hskXTWb99devpxQzs6oa\nsYf6HeA84NSIWNHdzOuwLXA1sCUpmH8PjIyIZwEi4nuShgCTgM1JB7HG1rr66oZLbmDRokWMHDmS\nbbbZpg9lmZn1Xb17qK8B742Ip/q/pPr40lMz6w+NOA/116R/eWJmZjXU2+WfDpwj6d3AA8Ab5YkR\n8YvchZmZtZue3G2qpojo1b+j7gt3+c2sP/T7QalmBKaZWbtxUJqZZVJzD1XSicDFEfG6pK9Q6/JP\nen9iv5nZQFJzDFXSbOADEfFS8X1N9Z7Yn5PHUM2sP/TLGGpXSEragHQS/pERMat3JZqZDXzdjqFG\nxHJgGLDOI/1mZm929R6UupJ0/1IzM6uh3hP7NwI+K2kf0p2guq6tFxAR8c/9UZyZWTupN1B3AWYW\n348otYt1HP03M3szafoNpnvLR/nNrD804uYoZmbWDQeqmVkmDlQzs0wcqGZmmThQzcwycaCamWXi\nQDUzy8SBamaWiQPVzCwTB6qZWSYOVDOzTByoZmaZOFDNzDJxoJqZZeJANTPLxIFqZpaJA9XMLBMH\nqplZJg5UM7NMHKhmZpk4UM3MMnGgmpll0rRAlXSSpJWSLiy1XV60lR93NKtGM7OeGNyMF5U0EjgW\neACI0qQAbgGOKLUtb2BpZma91vA9VElDgauA8cDLlZOB5RGxoPR4pdE1mpn1RjO6/JOB6yLif0gB\nWhbAKEnzJT0qabKkrRpfoplZzzW0yy/pWGAEcFjRFBWzTANuAGYDw4HvAjMk7RER7vqbWUtrWKBK\n2hE4AxgVESu6mintpUbEz0uLPCTpXuAZYBxwY6NqNTPrjUbuoe4NbEkKyq62QcCHJX0O2Dgi3igv\nEBHzJM0B3llthRMmTFj1/ZgxYxgzZkz+qs1sQOvs7KSzszPLuhRR2evuH8XBqG3LTcBlwGPAmRHx\ncJVltgLmAMdExFUV06JRtZvZm4ckIqLy+E5dGraHGhELgYXlNklLgJcj4mFJmwATgOuB54FhwFnA\nfNzdN7M20JTzUEuC1Qem/gzsSjoHdTNgHjADOCgiFjenPDOz+jWsy5+bu/xm1h/60uX3tfxmZpk4\nUM3MMnGgmpll4kA1M8vEgWpmlokD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwycaCamWXiQDUzy8SB\namaWiQPVzCwTB6qZWSYOVDOzTByoZmaZOFDNzDJxoJqZZeJANTPLxIFqZpaJA9XMLBMHqplZJg5U\nM7NMHKhmZpk4UM3MMnGgmpll4kA1M8vEgWpmlokD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwycaCa\nmWXiQDUzy6RpgSrpJEkrJV1Y0T5B0nOSlki6VdIuzarRzKwnmhKokkYCxwIPAFFq/wZwIn
ACsCew\nALhF0ibV1rPf3ntz+re+xdy5c/u/aDOzbjQ8UCUNBa4CxgMvl9oFfBk4KyJujIiHgKOAvwAOq7au\nL915Jy+edx677bQT99xzT/8Xb2a2Ds3YQ50MXBcR/wOo1D4c2BqY3tUQEUuB24APVVvROGDSsmX8\naNEijj74YCKi2mxmZg3R0ECVdCwwAjilaCon4NuKr/MrFltQmlbVIcCyF19k5syZOco0M+uVwY16\nIUk7AmcAoyJiRVcza+6l1lJ113NC6ftNI3jppZf6VKOZvfl0dnbS2dmZZV1qVDdZ0tHApcCKUvMg\nUliuAHYFZgF7RsS9peV+BSyIiPEV61tV+UJg2IYb8vDs2WyzzTb99ybMbMCTRETUs6O3lkZ2+W8k\nheb7isf7gXuAq4vvHweeB8Z2LSCpAxgF3FFrpYuBf+ro4JOf+ITD1MyaqmFd/ohYSNqZXEXSEuDl\niHi4eP5D4GRJs0gBewqwCPhZtXV+fNNN+d3y5YwbN45Jl13Wr/WbmXWnYYFaQ1AaH42I70kaAkwC\nNgfuBMZGxOJqCx912WVc+IEPsP322zekWDOzdWnYGGpukqJdazez1tUuY6hmZgOaA9XMLBMHqplZ\nJg5UM7NMHKhmZpk4UM3MMnGgmpll4kA1M8vEgWpmlokD1cwsEweqmVkmDlQzs0wcqGZmmThQzcwy\ncaCamWXiQDUzy8SBamaWiQPVzCwTB6qZWSYOVDOzTByoZmaZOFDNzDJxoJqZZeJANTPLxIFqZpaJ\nA9XMLBMHqplZJg5UM7NMHKhmZpk4UM3MMnGgmpll4kA1M8vEgWpmlokD1cwsEweqmVkmDQ1USV+U\ndL+khcXjDkkHlKZfLmllxeOORtZoZtZbgxv8es8CXwceJ4X50cAvJe0ZEfcDAdwCHFFaZnmDazQz\n65WG7qFGxJSIuDkinoqIJyLiFGARsFcxi4DlEbGg9HilkTU2QmdnZ7NL6JV2rRvat/Z2rRvau/be\natoYqqRBkj4DdAC3Fc0BjJI0X9KjkiZL2qpZNfaXdv1Fa9e6oX1rb9e6ob1r761Gd/mR9B7g98CG\nwOvAIRHxaDF5GnADMBsYDnwXmCFpj4hw19/MWlrDAxWYBbwXGAocDFwj6aMRcU9E/Lw030OS7gWe\nAcYBNza+VDOz+ikimluAdAswJyLG15j+FHBxRHy/or25hZvZgBUR6s1yzdhDrTSIGmO5xfjptsC8\nymm9fcNmZv2loYEq6WzgJmAO8BfAYcBHgP0kbQx8G7geeB4YBpwFzMfdfTNrA43eQ90auAp4G7AQ\nuB/YLyJukdQB7Eo6B3Uz0l7pDOCgiFjc4DrNzHqs6WOoZmYDRVteyy/peEmzJb0u6R5Jo5pdU3ck\nTahyWe3cZtdVSdJoSVMkzSlqPKrKPBMkPSdpiaRbJe3SjForalpn3a16WbOkkyTdXVyKvaB4D++u\nMl8rbvNua2/F7d7dJfDFPL3a3m0XqJIOBX5IOkf1/cAdwK8lbdfUwuozizTc0fV4T3PLqWpj4AHg\nX0jnCa/RhZH0DeBE4ARgT2ABcIukTRpcZ6V11s3qy5rL2/8Amu8jwERgb+BjwJ+B30javGuGFt7m\n3dZOa273rkvgdwP2IA0t/lLS+6CP2zsi2uoB/AH4cUXbY8CZza6tm7onAA82u44e1rwIOLL0XKSx\n7ZNKbR2hntv0AAAFEElEQVTAq8Bxza63Vt1F2+XAfzW7tjpq35gUTOPaaZtXq73NtvtLwLF93d5t\ntYcqaQNgd2B6xaTpwIcaX1GPjSi6EU9JulrS8GYX1EPDSQcWV23/iFhKunS41bd/u1zWvCmp5/hy\n8bydtnll7dDi273KJfB92t5tFajAlqTzVudXtC8gdSVa2Z3AUcC+pE/CtwF3SNqiqVX1TNc2bsft\nP410BsnHgK+Qbsgzo/iQbiUXAPeRLs+G9trmlbVDi2
53Se+R9BqwFJjM6kvg+7S9W+HE/jeFiJhW\nevq/kn5PumfBUcD5zakqq5Y+XSTa4LJmST8g7QWNiqKv2Y2W2ea1am/h7V71Evhulul2e7fbHuqL\nwArSLnnZ1lS5mqqVRcQS4CHgnc2upQeeL75W2/7P00YiYh7pApOW2P6SzgcOBT4WEU+XJrX8Nl9H\n7Wtple0eEW9Euo3ofRFxMqkH+UVW50ivtndbBWqkO07dC4ytmLQP6Wh/2yguZNiZ9vogmE36pVq1\n/Yv3MYr22/41L2tuQi0XsDqQHquY3NLbvJvaq83fMtu9wiBgvYjo2/Zu9tG1XhyNOwRYBhxDCqQL\nSEfgtmt2bd3UfS4wmjTo/UHSJbivtFrdpCO17y8ei4FTi++3K6Z/vaj7k6Qr264h7XFs3Kp1F9PO\nBUaSLmkeQxrn+78WqHsS6arBj7LmqUUbl+Zp1W2+ztpbdbsDZxcBOYx06uJZpJ7vPn3d3k37YfRx\ng3yB9Mm9FLibNG7T9Lq6qflq4Lniw2AOcB2wU7PrqlLnGGBl8VhR+v7S0jynA3NJ53veCuzSynWT\njuBOIx1oWAY8XbRv2wJ1V9bb9TitYr5W3ObrrL1VtztwWVHL0qK26V1h2tft7UtPzcwyaasxVDOz\nVuZANTPLxIFqZpaJA9XMLBMHqplZJg5UM7NMHKhmZpk4UM3MMnGgmpll4kA1M8vEgWotTVKnpIsk\nnSfppeKfwf2zpA5Jl0h6RdIzkv6htMzZkmYV/2BttqRzJG1Ymr6dpP8s1rdY0iPF/yrrmn6apKcl\nLZU0T9IVjX7f1p58g2lrB4cD55Hu9v5x0j9pHEe6Y9fuwNHApZL+OyIWAK8B40k3o3k3cAnp5hyn\nFeu7CNiAdEOVV4Gdul5I0qdJd5b/DPAg6T6YH+zPN2cDh2+OYi1NUiewfkT8daltAXBHRHyieD6Y\ndMu+f4iIX1RZx+eBr0TEu4rn9wM3RMR3qsx7InAcsGtE/Lkf3pINYO7yW6sL0r+HLltA2ntMM6Tg\nexl4K4CkgyT9ruiuLwJ+QLovapcLgFOK/8f+b5J2L027lnTbudmSflqsq9X+75S1KAeqtYM3Kp5H\njbb1JI0k3Xv218DfkW4yfQqpi59mjLiUdKPvy4AdSP8s8fRi2hxgR+BzpOGA84B7JW2U+T3ZAORA\ntYHmr4HnIuKMiLg3Ip4k3Zl9DRHxXET8JCIOJY2tHleatiwipkbEicCepHHYVvuXzdaCfFDKWp2K\nR2VbLY8C20o6jPSP1/YlHWBavXD6P0hTgcdJ/0t+f9I/TETS0aT/L3QX6eDWocDyYl6zdfIeqrW6\nYO1/31vzSGpE3AR8n3QmwP3A35D2QMvLCLiQFKLTSf8w7qhi2suk/1d2G2mc9pPApyLimb6+ERv4\nfJTfzCwT76GamWXiQDUzy8SBamaWiQPVzCwTB6qZWSYOVDOzTByoZmaZOFDNzDJxoJqZZfL/AQOp\n739TP3uWAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# adjust the x-limits\n", "plt.scatter(train.mass, train.rings, c=colors[train.id], s=50)\n", "plt.scatter(test.mass, test.rings, c='white', s=50)\n", "plt.xlabel('mass')\n", "plt.ylabel('rings')\n", "plt.title('How KNN interprets the data')\n", "plt.xlim(0, 30)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How does StandardScaler solve the 
problem?\n", "\n", "[StandardScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) is used for the \"standardization\" of features, also known as \"center and scale\" or \"z-score normalization\"." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# standardize the features\n", "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()\n", "scaler.fit(X)\n", "X_scaled = scaler.transform(X)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0.9, 0.1, 40. ],\n", " [ 0.3, 0.2, 50. ],\n", " [ 0.6, 0.8, 60. ]])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# original values\n", "X.values" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1.22474487, -0.86266219, -1.22474487],\n", " [-1.22474487, -0.53916387, 0. ],\n", " [ 0. , 1.40182605, 1.22474487]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# standardized values\n", "X_scaled" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 0.6 0.36666667 50. ]\n", "[ 0.24494897 0.30912062 8.16496581]\n" ] } ], "source": [ "# figure out how it standardized: the per-feature mean and standard deviation learned by fit()\n", "# (scale_ is used instead of std_, which was deprecated in scikit-learn 0.17 and later removed)\n", "print(scaler.mean_)\n", "print(scaler.scale_)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1.22474487, -0.86266219, -1.22474487],\n", " [-1.22474487, -0.53916387, 0. ],\n", " [ 0.
, 1.40182605, 1.22474487]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# manually standardize: subtract each column's mean, then divide by its standard deviation\n", "(X.values - scaler.mean_) / scaler.scale_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Applying StandardScaler to a real dataset\n", "\n", "- Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine)\n", "- **Goal:** Predict the origin of wine using chemical analysis" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# read three columns from the dataset into a DataFrame\n", "url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'\n", "col_names = ['label', 'color', 'proline']\n", "wine = pd.read_csv(url, header=None, names=col_names, usecols=[0, 10, 13])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelcolorproline
015.641065
114.381050
215.681185
317.801480
414.32735
\n", "
" ], "text/plain": [ " label color proline\n", "0 1 5.64 1065\n", "1 1 4.38 1050\n", "2 1 5.68 1185\n", "3 1 7.80 1480\n", "4 1 4.32 735" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wine.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelcolorproline
count178.000000178.000000178.000000
mean1.9382025.058090746.893258
std0.7750352.318286314.907474
min1.0000001.280000278.000000
25%1.0000003.220000500.500000
50%2.0000004.690000673.500000
75%3.0000006.200000985.000000
max3.00000013.0000001680.000000
\n", "
" ], "text/plain": [ " label color proline\n", "count 178.000000 178.000000 178.000000\n", "mean 1.938202 5.058090 746.893258\n", "std 0.775035 2.318286 314.907474\n", "min 1.000000 1.280000 278.000000\n", "25% 1.000000 3.220000 500.500000\n", "50% 2.000000 4.690000 673.500000\n", "75% 3.000000 6.200000 985.000000\n", "max 3.000000 13.000000 1680.000000" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wine.describe()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# define X and y\n", "feature_cols = ['color', 'proline']\n", "X = wine[feature_cols]\n", "y = wine.label" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# split into training and testing sets\n", "from sklearn.cross_validation import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# standardize X_train\n", "scaler.fit(X_train)\n", "X_train_scaled = scaler.transform(X_train)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-3.90664944003e-16\n", "1.0\n", "1.6027279754e-16\n", "1.0\n" ] } ], "source": [ "# check that it standardized properly\n", "print X_train_scaled[:, 0].mean()\n", "print X_train_scaled[:, 0].std()\n", "print X_train_scaled[:, 1].mean()\n", "print X_train_scaled[:, 1].std()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# standardize X_test\n", "X_test_scaled = scaler.transform(X_test)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0305898576303\n", "0.866822198488\n", 
"0.0546533341088\n", "1.14955947533\n" ] } ], "source": [ "# is this right?\n", "print X_test_scaled[:, 0].mean()\n", "print X_test_scaled[:, 0].std()\n", "print X_test_scaled[:, 1].mean()\n", "print X_test_scaled[:, 1].std()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.644444444444\n" ] } ], "source": [ "# KNN accuracy on original data\n", "knn = KNeighborsClassifier(n_neighbors=3)\n", "knn.fit(X_train, y_train)\n", "y_pred_class = knn.predict(X_test)\n", "from sklearn import metrics\n", "print metrics.accuracy_score(y_test, y_pred_class)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.866666666667\n" ] } ], "source": [ "# KNN accuracy on scaled data\n", "knn.fit(X_train_scaled, y_train)\n", "y_pred_class = knn.predict(X_test_scaled)\n", "print metrics.accuracy_score(y_test, y_pred_class)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pipeline (bonus content)\n", "\n", "### What is the problem we're trying to solve?" 
] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# define X and y\n", "feature_cols = ['color', 'proline']\n", "X = wine[feature_cols]\n", "y = wine.label" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.71983168041991563" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# proper cross-validation on the original (unscaled) data\n", "knn = KNeighborsClassifier(n_neighbors=3)\n", "from sklearn.cross_validation import cross_val_score\n", "cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.90104247104247115" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# why is this improper cross-validation on the scaled data?\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)\n", "cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### How does Pipeline solve the problem?\n", "\n", "[Pipeline](http://scikit-learn.org/stable/modules/pipeline.html) is used for chaining steps together:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.89516011810129448" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fix the cross-validation process using Pipeline\n", "from sklearn.pipeline import make_pipeline\n", "pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))\n", "cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pipeline can also be used with 
[GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) for parameter searching:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.910112359551\n", "{'kneighborsclassifier__n_neighbors': 1}\n" ] } ], "source": [ "# search for an optimal n_neighbors value using GridSearchCV\n", "neighbors_range = range(1, 21)\n", "param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range)\n", "from sklearn.grid_search import GridSearchCV\n", "grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')\n", "grid.fit(X, y)\n", "print grid.best_score_\n", "print grid.best_params_" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }