{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HIDDEN\n",
    "from datascience import *\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 32"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton = Table.read_table('galton.csv')\n",
    "\n",
    "heights = Table().with_columns(\n",
    "    'MidParent', galton.column('midparentHeight'),\n",
    "    'Child', galton.column('childHeight')\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def standard_units(arr):\n",
    "    return (arr - np.average(arr))/np.std(arr)\n",
    "\n",
    "def correlation(t, x, y):\n",
    "    x_standard = standard_units(t.column(x))\n",
    "    y_standard = standard_units(t.column(y))\n",
    "    return np.average(x_standard * y_standard)\n",
    "\n",
    "def slope(t, x, y):\n",
    "    r = correlation(t, x, y)\n",
    "    y_sd = np.std(t.column(y))\n",
    "    x_sd = np.std(t.column(x))\n",
    "    return r * y_sd / x_sd\n",
    "\n",
    "def intercept(t, x, y):\n",
    "    x_mean = np.mean(t.column(x))\n",
    "    y_mean = np.mean(t.column(y))\n",
    "    return y_mean - slope(t, x, y)*x_mean\n",
    "\n",
    "def fitted_values(t, x, y):\n",
    "    \"\"\"Return an array of the regression estimates at all the x values\"\"\"\n",
    "    a = slope(t, x, y)\n",
    "    b = intercept(t, x, y)\n",
    "    return a*t.column(x) + b\n",
    "\n",
    "def residuals(t, x, y):\n",
    "    predictions = fitted_values(t, x, y)\n",
    "    return t.column(y) - predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "heights = heights.with_columns(\n",
    "    'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),\n",
    "    'Residual', residuals(heights, 'MidParent', 'Child')\n",
    ")\n",
    "heights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(heights, 'MidParent', 'Child')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "heights.scatter('MidParent')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_residuals(t, x, y):\n",
    "    tbl = t.with_columns(\n",
    "        'Fitted', fitted_values(t, x, y),\n",
    "        'Residual', residuals(t, x, y)\n",
    "    )\n",
    "    tbl.select(x, y, 'Fitted').scatter(0)\n",
    "    tbl.scatter(x, 'Residual')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_residuals(heights, 'MidParent', 'Child')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Diagnostics with Residuals ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Length in meters\n",
    "# Age in years\n",
    "# Ages are estimated based on variables (e.g. condition of teeth)\n",
    "dugong = Table.read_table('dugong.csv')\n",
    "dugong.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "dugong.scatter('Length', 'Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(dugong, 'Length', 'Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_residuals(dugong, 'Length', 'Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Height and average weight of US women\n",
    "us_women = Table.read_table('us_women.csv')\n",
    "us_women.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(us_women, 'height', 'ave weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_residuals(us_women, 'height', 'ave weight')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "demographics = Table.read_table('district_demographics2016.csv')\n",
    "demographics.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(demographics, 'Median Income', 'Percent voting for Clinton')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "plot_residuals(demographics, 'Median Income', 'Percent voting for Clinton')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies = Table.read_table('actors.csv')\n",
    "movies.show(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_residuals(movies, 'Number of Movies', 'Average per Movie')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies.sort(\"Average per Movie\", descending = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Average of Residuals ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nonlinear\n",
    "round(np.average(residuals(dugong, 'Length', 'Age')), 6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear\n",
    "round(np.average(residuals(heights, 'MidParent', 'Child')), 6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heteroscedasticity (\"uneven spread\")\n",
    "round(np.average(residuals(demographics, 'Median Income', 'Percent voting for Clinton')), 6)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## A Measure of Clustering ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_fitted(t, x, y):\n",
    "    tbl = t.select(x, y)\n",
    "    tbl.with_columns('Fitted Value', fitted_values(t, x, y)).scatter(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_fitted(heights, 'MidParent', 'Child')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "child_predictions_sd = np.std(fitted_values(heights, 'MidParent', 'Child'))\n",
    "child_observed_sd = np.std(heights.column('Child'))\n",
    "print(child_predictions_sd)\n",
    "print(child_observed_sd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "child_predictions_sd / child_observed_sd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(heights, 'MidParent', 'Child')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(dugong, 'Length', 'Age')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dugong_prediction_sd = np.std(fitted_values(dugong, 'Length', 'Age'))\n",
    "dugong_observed_sd = np.std(dugong.column(1))\n",
    "dugong_prediction_sd / dugong_observed_sd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hybrid = Table.read_table('hybrid.csv')\n",
    "hybrid.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_residuals(hybrid, 'acceleration', 'mpg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correlation(hybrid, 'acceleration', 'mpg')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(fitted_values(hybrid, 'acceleration', 'mpg'))/np.std(hybrid.column('mpg'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. The fraction is |r|.\n",
    "\n",
    "$$\n",
    "\\frac{\\mbox{SD of fitted values}}{\\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \\mbox{That is,} ~~ \\mbox{SD of fitted values} = |r|\\cdot \\mbox{SD of }y\n",
    "$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SD of the Residuals ##\n",
    "No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is  $\\sqrt{1-r^2}$.\n",
    "\n",
    "$$\n",
    "\\mbox{SD of residuals} ~=~ \\sqrt{1 - r^2} \\cdot \\mbox{SD of }y\n",
    "$$\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_fitted(heights, 'MidParent', 'Child')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "plot_fitted(heights, 'MidParent', 'Child')\n",
    "ave_child = np.mean(heights.column('Child'))\n",
    "plots.plot([64, 76], [ave_child, ave_child]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(heights.column('Child')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(residuals(heights, 'MidParent', 'Child')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(heights.column('Fitted Value')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(residuals(heights, 'MidParent', 'Child')) ** 2 + np.std(heights.column('Fitted Value')) ** 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The above comes from the variance decomposition:\n",
    "$$\n",
    "\\frac{\\mbox{Variance of residuals}}{\\mbox{Variance of }y} ~+~ \\frac{\\mbox{Variance of fitted values}}{\\mbox{Variance of }y} = r^2 + (1-r^2) = 1,\n",
    "$$    \n",
    "which is leads to:\n",
    "$$\n",
    "\\mbox{Variance of residuals} ~+~ \\mbox{Variance of fitted values} = \\mbox{Variance of }y\n",
    "$$\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(dugong.column('Age')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(fitted_values(dugong, 'Length', 'Age')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(residuals(dugong, 'Length', 'Age')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(fitted_values(dugong, 'Length', 'Age')) ** 2 + np.std(residuals(dugong, 'Length', 'Age')) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r = correlation(heights, 'MidParent', 'Child')\n",
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.sqrt(1 - r**2) * np.std(heights.column('Child'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(residuals(heights, 'MidParent', 'Child'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.std(residuals(hybrid, 'acceleration', 'mpg'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r = correlation(hybrid, 'acceleration', 'mpg')\n",
    "r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}