{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# HIDDEN\n", "from datascience import *\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lecture 32" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton = Table.read_table('galton.csv')\n", "\n", "heights = Table().with_columns(\n", " 'MidParent', galton.column('midparentHeight'),\n", " 'Child', galton.column('childHeight')\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def standard_units(arr):\n", " return (arr - np.average(arr))/np.std(arr)\n", "\n", "def correlation(t, x, y):\n", " x_standard = standard_units(t.column(x))\n", " y_standard = standard_units(t.column(y))\n", " return np.average(x_standard * y_standard)\n", "\n", "def slope(t, x, y):\n", " r = correlation(t, x, y)\n", " y_sd = np.std(t.column(y))\n", " x_sd = np.std(t.column(x))\n", " return r * y_sd / x_sd\n", "\n", "def intercept(t, x, y):\n", " x_mean = np.mean(t.column(x))\n", " y_mean = np.mean(t.column(y))\n", " return y_mean - slope(t, x, y)*x_mean\n", "\n", "def fitted_values(t, x, y):\n", " \"\"\"Return an array of the regression estimates at all the x values\"\"\"\n", " a = slope(t, x, y)\n", " b = intercept(t, x, y)\n", " return a*t.column(x) + b\n", "\n", "def residuals(t, x, y):\n", " predictions = fitted_values(t, x, y)\n", " return t.column(y) - predictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "heights = heights.with_columns(\n", " 'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),\n", " 'Residual', residuals(heights, 'MidParent', 
'Child')\n", ")\n", "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(heights, 'MidParent', 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "heights.scatter('MidParent')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_residuals(t, x, y):\n", " tbl = t.with_columns(\n", " 'Fitted', fitted_values(t, x, y),\n", " 'Residual', residuals(t, x, y)\n", " )\n", " tbl.select(x, y, 'Fitted').scatter(0)\n", " tbl.scatter(x, 'Residual')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_residuals(heights, 'MidParent', 'Child')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Diagnostics with Residuals ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Length in meters\n", "# Age in years\n", "# Ages are estimated based on variables (e.g. 
condition of teeth)\n", "dugong = Table.read_table('dugong.csv')\n", "dugong.show(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "dugong.scatter('Length', 'Age')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(dugong, 'Length', 'Age')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_residuals(dugong, 'Length', 'Age')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Height and average weight of US women\n", "us_women = Table.read_table('us_women.csv')\n", "us_women.show(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(us_women, 'height', 'ave weight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_residuals(us_women, 'height', 'ave weight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "demographics = Table.read_table('district_demographics2016.csv')\n", "demographics.show(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(demographics, 'Median Income', 'Percent voting for Clinton')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "plot_residuals(demographics, 'Median Income', 'Percent voting for Clinton')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "movies = Table.read_table('actors.csv')\n", "movies.show(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_residuals(movies, 'Number of Movies', 'Average per Movie')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "movies.sort(\"Average 
per Movie\", descending = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Average of Residuals ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Nonlinear\n", "round(np.average(residuals(dugong, 'Length', 'Age')), 6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Linear\n", "round(np.average(residuals(heights, 'MidParent', 'Child')), 6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Heteroscedasticity (\"uneven spread\")\n", "round(np.average(residuals(demographics, 'Median Income', 'Percent voting for Clinton')), 6)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A Measure of Clustering ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_fitted(t, x, y):\n", " tbl = t.select(x, y)\n", " tbl.with_columns('Fitted Value', fitted_values(t, x, y)).scatter(0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_fitted(heights, 'MidParent', 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "child_predictions_sd = np.std(fitted_values(heights, 'MidParent', 'Child'))\n", "child_observed_sd = np.std(heights.column('Child'))\n", "print(child_predictions_sd)\n", "print(child_observed_sd)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "child_predictions_sd / child_observed_sd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(heights, 'MidParent', 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(dugong, 'Length', 'Age')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dugong_prediction_sd = 
np.std(fitted_values(dugong, 'Length', 'Age'))\n", "# Select 'Age' by label (not position) so the result does not depend on the CSV's column order\n", "dugong_observed_sd = np.std(dugong.column('Age'))\n", "dugong_prediction_sd / dugong_observed_sd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hybrid = Table.read_table('hybrid.csv')\n", "hybrid.show(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_residuals(hybrid, 'acceleration', 'mpg')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correlation(hybrid, 'acceleration', 'mpg')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(fitted_values(hybrid, 'acceleration', 'mpg'))/np.std(hybrid.column('mpg'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. The fraction is |r|.\n", "\n", "$$\n", "\\frac{\\mbox{SD of fitted values}}{\\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \\mbox{That is,} ~~ \\mbox{SD of fitted values} = |r|\\cdot \\mbox{SD of }y\n", "$$" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SD of the Residuals ##\n", "No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. 
The fraction is $\\sqrt{1-r^2}$.\n", "\n", "$$\n", "\\mbox{SD of residuals} ~=~ \\sqrt{1 - r^2} \\cdot \\mbox{SD of }y\n", "$$\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plot_fitted(heights, 'MidParent', 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "plot_fitted(heights, 'MidParent', 'Child')\n", "ave_child = np.mean(heights.column('Child'))\n", "plots.plot([64, 76], [ave_child, ave_child]);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(heights.column('Child')) ** 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(residuals(heights, 'MidParent', 'Child')) ** 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(heights.column('Fitted Value')) ** 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(residuals(heights, 'MidParent', 'Child')) ** 2 + np.std(heights.column('Fitted Value')) ** 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The above comes from the variance decomposition:\n", "$$\n", "\\frac{\\mbox{Variance of residuals}}{\\mbox{Variance of }y} ~+~ \\frac{\\mbox{Variance of fitted values}}{\\mbox{Variance of }y} = (1-r^2) + r^2 = 1,\n", "$$ \n", "which leads to:\n", "$$\n", "\\mbox{Variance of residuals} ~+~ \\mbox{Variance of fitted values} = \\mbox{Variance of }y\n", "$$\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(dugong.column('Age')) ** 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(fitted_values(dugong, 'Length', 'Age')) ** 2" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(residuals(dugong, 'Length', 'Age')) ** 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(fitted_values(dugong, 'Length', 'Age')) ** 2 + np.std(residuals(dugong, 'Length', 'Age')) ** 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "r = correlation(heights, 'MidParent', 'Child')\n", "r" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.sqrt(1 - r**2) * np.std(heights.column('Child'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(residuals(heights, 'MidParent', 'Child'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.std(residuals(hybrid, 'acceleration', 'mpg'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "r = correlation(hybrid, 'acceleration', 'mpg')\n", "r" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 1 }