{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# HIDDEN\n",
    "from datascience import *\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lecture 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def r_table(r, num_points=1000):\n",
    "    \"\"\"\n",
    "    Generate a table of N data points with a correlation approximately r\n",
    "    \"\"\"\n",
    "    np.random.seed(8)\n",
    "    x = np.random.normal(0, 1, num_points)\n",
    "    z = np.random.normal(0, 1, num_points)\n",
    "    y = r*x + (np.sqrt(1-r**2))*z\n",
    "    return Table().with_columns('x', x, 'y', y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):\n",
    "    y = x*slope + intercept\n",
    "    plots.plot(x, y, color=color)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def draw_vertical_line(x_position, color='black'):\n",
    "    x = make_array(x_position, x_position)\n",
    "    y = make_array(-4, 4)\n",
    "    plots.plot(x, y, color=color)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def resize_window(lim=3.5):\n",
    "    plots.xlim(-lim, lim)\n",
    "    plots.ylim(-lim, lim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Ecological correlations\n",
    "sat2014 = Table.read_table('sat2014.csv').sort('State')\n",
    "sat2014"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sat2014.scatter('Critical Reading', 'Math')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Example dataset with high correlation\n",
    "example = r_table(0.99)\n",
    "example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "example.scatter('x', 'y')\n",
    "# resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Function to predict values of y for a given x\n",
    "def predict_y(x_val):\n",
    "    \"\"\"\n",
    "    Predicts y-values for the example table \n",
    "    \"\"\"\n",
    "    nearby_points = example.where('x', are.between(x_val-0.25, x_val + 0.25))\n",
    "    return np.mean(nearby_points.column('y'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "example = example.with_column('Predicted y', example.apply(predict_y, 'x'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Visualize predicted values\n",
    "example.scatter('x')\n",
    "resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Blue line matches our predicted points\n",
    "example.scatter('x')\n",
    "draw_line(slope=1, intercept=0, color='dodgerblue')\n",
    "resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Example with correlation of 0\n",
    "example = r_table(0)\n",
    "example.scatter('x', 'y')\n",
    "resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Predictions produce approximately horizontal line\n",
    "example = example.with_column('Predicted y', example.apply(predict_y, 'x'))\n",
    "example.scatter('x')\n",
    "resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Example with correlation of 0.5\n",
    "### which line follows the pattern?\n",
    "example = r_table(0.5)\n",
    "example.scatter('x', 'y')\n",
    "resize_window()\n",
    "draw_vertical_line(1.5)\n",
    "draw_line(slope=1, intercept=0)\n",
    "draw_line(slope=0.2, intercept=0, color='dodgerblue')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Nearest neighbors predictions\n",
    "example = example.with_column('Predicted y', example.apply(predict_y, 'x'))\n",
    "example.scatter('x')\n",
    "resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Blue line matches our predictions well\n",
    "example.scatter('x')\n",
    "draw_line(slope=1, intercept=0, color='red')\n",
    "draw_line(slope=0.5, intercept=0, color='dodgerblue')\n",
    "resize_window()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "################################################\n",
    "def standard_units(arr):\n",
    "    return (arr - np.average(arr))/np.std(arr)\n",
    "\n",
    "def correlation(t, x, y):\n",
    "    x_standard = standard_units(t.column(x))\n",
    "    y_standard = standard_units(t.column(y))\n",
    "    return np.average(x_standard * y_standard)\n",
    "\n",
    "def slope(t, x, y):\n",
    "    r = correlation(t, x, y)\n",
    "    y_sd = np.std(t.column(y))\n",
    "    x_sd = np.std(t.column(x))\n",
    "    return r * y_sd / x_sd\n",
    "\n",
    "\n",
    "def intercept(t, x, y):\n",
    "    x_mean = np.mean(t.column(x))\n",
    "    y_mean = np.mean(t.column(y))\n",
    "    return y_mean - slope(t, x, y)*x_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton = Table.read_table('galton.csv')\n",
    "\n",
    "heights = Table().with_columns(\n",
    "    'MidParent', galton.column('midparentHeight'),\n",
    "    'Child', galton.column('childHeight'))\n",
    "heights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_child(h):\n",
    "    \"\"\"Return a prediction of the height of a child \n",
    "    whose parents have a midparent height of h.\n",
    "    \n",
    "    The prediction is the average height of the children \n",
    "    whose midparent height is in the range h plus or minus 0.25 inches.\n",
    "    \"\"\"\n",
    "    \n",
    "    close_points = heights.where('MidParent', are.between(h-0.5, h + 0.5))\n",
    "    return close_points.column('Child').mean()   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights_with_predictions = heights.with_column(\n",
    "    'Average neighbor prediction', heights.apply(predict_child, 'MidParent'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton_slope = slope(heights, 'MidParent', 'Child')\n",
    "galton_intercept = intercept(heights, 'MidParent', 'Child')\n",
    "galton_slope, galton_intercept"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights.take(123)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "galton_slope*heights.take(123)[0] + galton_intercept"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights_with_predictions.where('MidParent', are.equal_to(69.48))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "heights_with_predictions = heights_with_predictions.with_column(\n",
    "    'Regression Prediction', galton_slope*heights.column('MidParent') + galton_intercept\n",
    ")\n",
    "heights_with_predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "heights_with_predictions.scatter('MidParent')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "demographics = Table.read_table('district_demographics2016.csv')\n",
    "demographics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "demographics.scatter('College%', 'Median Income')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "demographics_slope = slope(demographics, 'College%', 'Median Income')\n",
    "demographics_intercept = intercept(demographics, 'College%', 'Median Income')\n",
    "(demographics_slope, demographics_intercept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "demographics.scatter('College%', 'Median Income')\n",
    "draw_line(slope=demographics_slope, intercept=demographics_intercept, x=make_array(0, 80))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}