{ "metadata": { "name": "", "signature": "sha256:8118d8c90dd1ce86803a05df8efa453000e17cf728dc7696dd62e63609d4ff15" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "# Load the data into x and y\n", "data = pd.DataFrame(boston['data'], columns = boston['feature_names'][:-1])\n", "# Make x 2D\n", "x = boston.data[:, np.newaxis]\n", "# Add median housing prices as y\n", "y = boston.target\n", "\n", "# Use pandas to describe\n", "print pd.Series(boston['target'], name = boston['feature_names'][-1]).describe()\n", "print pd.DataFrame(boston['data'], columns = boston['feature_names'][:-1]).describe()" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'boston' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Load the data into x and y\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mboston\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mboston\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'feature_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;31m# Make x 2D\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mboston\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnewaxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# Add median housing prices as y\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'boston' is not defined" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.linear_model import LinearRegression\n", "# Create a new variable which is just number of rooms\n", "rooms = x[:, :, 5]\n", "\n", "# LinearRegression?\n", "# LinearRegression(self, fit_intercept=True, normalize=False, copy_X=True)\n", "lr = LinearRegression(fit_intercept=True)\n", "lr.fit(rooms, y)\n", "\n", "# Calculate R^2\n", "print \"R^2: {}\".format(lr.score(rooms, y))\n", "# Calculate residual sum of squares (residual error)\n", "print \"Residual sum of squares: {}\".format(np.mean(lr.predict(rooms) - y)**2)" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "K-Fold Cross-Validation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will now build a K-folds cross-validation function to utilize on future analysis. Dont be afraid to use help in IPython notebooks. \n", "\n", "**KFold** divides all the samples in `k` groups of samples, called folds (if `k = n`, this is equivalent to the **Leave One Out** strategy), of equal sizes (if possible). The prediction function is learned using `k - 1` folds, and the fold left out is used for test.\n", "\n", "__KFold?__\n", "\n", "*KFold(self, n, n_folds=3, indices=True, shuffle=False, random_state=None, k=None)*\n", "\n", "Provides train/test indices to split data in train test sets. Split\n", "dataset into k consecutive folds (without shuffling).\n", "\n", "Each fold is then used a validation set once while the k - 1 remaining\n", "fold form the training set.\n", "\n", "__n__ : int\n", "\n", "- Total number of elements.\n", "\n", "__n_folds__ : int, default=3\n", "\n", "- Number of folds. Must be at least 2.\n", "\n", "__indices__ : boolean, optional (default True)\n", "\n", "- Return train/test split as arrays of indices, rather than a boolean mask array. Integer indices are required when dealing with sparse matrices, since those cannot be indexed by boolean masks.\n", "\n", "__shuffle__ : boolean, optional\n", "\n", "- Whether to shuffle the data before splitting into batches.\n", "\n", "__random_state__ : int or RandomState\n", "\n", "- Pseudo number generator state used for random sampling." ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cross_validation import KFold \n", "\n", "def cross_vd(x, y, folds):\n", " kf = KFold(len(x), n_folds=folds, shuffle=True)\n", " lr = LinearRegression(fit_intercept=True)\n", "\n", " scores = []\n", " # train and test are outputs of kf\n", " for train, test in kf:\n", " lr.fit(x[train], y[train])\n", " prediction = lr.predict(x[test])\n", " score = lr.score(x[test], y[test])\n", " scores.append(score)\n", " \n", " return sum(scores)/len(scores)\n", " \n", "cross_vd(rooms, y, 10) " ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'rooms' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mcross_vd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrooms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'rooms' is not defined" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cross_validation import cross_val_score, ShuffleSplit\n", "cross_val_score?" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, the only thing we need to really worry about is specifying a custom cross validation iterator with **ShuffleSplit**. Then we simply pass our data, and iterator. \n", "\n", "The beauty of this approach is:\n", "\n", "1. Fewer lines of code\n", "2. Easy manipulation of output parameter\n", "\n", "Regarding point **2**, we can pass a different scoring parameter depending on which type of model we are fitting. Regression ('r2', 'mean_squared_error'), Classification ('accuracy', 'precision', ect)\n", "\n", "The full list can be found here.\n" ] }, { "cell_type": "code", "collapsed": false, "input": [ "cv = ShuffleSplit(len(x), n_iter=10, test_size=0.3, random_state=0)\n", "\n", "lr = LinearRegression(fit_intercept=True, normalize=True)\n", "scores = cross_val_score(lr, rooms, y, cv=cv, scoring='mean_squared_error')\n", "scores" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'x' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mShuffleSplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mlr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLinearRegression\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfit_intercept\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcross_val_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrooms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'mean_squared_error'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mscores\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'x' is not defined" ] } ], "prompt_number": 7 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Multiple Regression (OLS)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We got a little side tracked there, but lets try improving our regression model with more input features, while at the same time talking about more of the theory, assumptions, and best practices for multiple linear OLS regression. \n", "\n", "Firstly, regression (or machine learning for that matter) is not all about the predictive power of the final model. Regression also gives us the ability to start to make estimations about which features are impacting the model. For example, it would be useful to know which house variables contribute the median value, or which variables do not significantly contribute. **With regression, we can begin to claim things like \"as crime rates go up, housing prices go down\"**." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Load all the possible predictors into one dataframe\n", "X = pd.DataFrame(boston['data'], columns = boston['feature_names'][:-1])\n", "y = boston.target" ], "language": "python", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'boston' is not defined", "output_type": "pyerr", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Load all the possible predictors into one dataframe\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mboston\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mboston\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'feature_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mboston\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'boston' is not defined" ] } ], "prompt_number": 8 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's improve out cross validation model fitting function so that we can analyze the variables" ] }, { "cell_type": "code", "collapsed": false, "input": [ "\n", "\n", "def cross_vd(x, y, folds):\n", " kf = KFold(len(x), n_folds=folds, shuffle=True)\n", " lr = LinearRegression(fit_intercept=True)\n", "\n", " scores = []\n", " # train and test are outputs of kf\n", " for train, test in kf:\n", " lr.fit(x[train], y[train])\n", " prediction = lr.predict(x[test])\n", " score = lr.score(x[test], y[test])\n", " scores.append(score)\n", " \n", " return sum(scores)/len(scores)\n", " \n", "cross_vd(rooms, y, 10) " ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }