{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# HIDDEN\n", "from datascience import *\n", "import numpy as np\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lecture 30" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def r_table(r, num_points=1000):\n", " \"\"\"\n", " Generate a table of num_points data points whose columns x and y have a correlation of approximately r\n", " \"\"\"\n", " np.random.seed(8)\n", " x = np.random.normal(0, 1, num_points)\n", " z = np.random.normal(0, 1, num_points)\n", " y = r*x + (np.sqrt(1-r**2))*z\n", " return Table().with_columns('x', x, 'y', y)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def draw_line(slope=0, intercept=0, x=make_array(-4, 4), color='r'):\n", " y = x*slope + intercept\n", " plots.plot(x, y, color=color)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def draw_vertical_line(x_position, color='black'):\n", " x = make_array(x_position, x_position)\n", " y = make_array(-4, 4)\n", " plots.plot(x, y, color=color)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def resize_window(lim=3.5):\n", " plots.xlim(-lim, lim)\n", " plots.ylim(-lim, lim)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Ecological correlations\n", "sat2014 = Table.read_table('sat2014.csv').sort('State')\n", "sat2014" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sat2014.scatter('Critical Reading', 'Math')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Example dataset with high correlation\n", "example = r_table(0.99)\n", 
"example" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "example.scatter('x', 'y')\n", "# resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Function to predict values of y for a given x\n", "def predict_y(x_val):\n", " \"\"\"\n", " Predicts y-values for the example table \n", " \"\"\"\n", " nearby_points = example.where('x', are.between(x_val-0.25, x_val + 0.25))\n", " return np.mean(nearby_points.column('y'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "example = example.with_column('Predicted y', example.apply(predict_y, 'x'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Visualize predicted values\n", "example.scatter('x')\n", "resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Blue line matches our predicted points\n", "example.scatter('x')\n", "draw_line(slope=1, intercept=0, color='dodgerblue')\n", "resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Example with correlation of 0\n", "example = r_table(0)\n", "example.scatter('x', 'y')\n", "resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Predictions produce approximately horizontal line\n", "example = example.with_column('Predicted y', example.apply(predict_y, 'x'))\n", "example.scatter('x')\n", "resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Example with correlation of 0.5\n", "### which line follows the pattern?\n", "example = r_table(0.5)\n", "example.scatter('x', 'y')\n", "resize_window()\n", "draw_vertical_line(1.5)\n", "draw_line(slope=1, intercept=0)\n", "draw_line(slope=0.2, intercept=0, 
color='dodgerblue')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Nearest neighbors predictions\n", "example = example.with_column('Predicted y', example.apply(predict_y, 'x'))\n", "example.scatter('x')\n", "resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Blue line matches our predictions well\n", "example.scatter('x')\n", "draw_line(slope=1, intercept=0, color='red')\n", "draw_line(slope=0.5, intercept=0, color='dodgerblue')\n", "resize_window()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "################################################\n", "def standard_units(arr):\n", " return (arr - np.average(arr))/np.std(arr)\n", "\n", "def correlation(t, x, y):\n", " x_standard = standard_units(t.column(x))\n", " y_standard = standard_units(t.column(y))\n", " return np.average(x_standard * y_standard)\n", "\n", "def slope(t, x, y):\n", " r = correlation(t, x, y)\n", " y_sd = np.std(t.column(y))\n", " x_sd = np.std(t.column(x))\n", " return r * y_sd / x_sd\n", "\n", "\n", "def intercept(t, x, y):\n", " x_mean = np.mean(t.column(x))\n", " y_mean = np.mean(t.column(y))\n", " return y_mean - slope(t, x, y)*x_mean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton = Table.read_table('galton.csv')\n", "\n", "heights = Table().with_columns(\n", " 'MidParent', galton.column('midparentHeight'),\n", " 'Child', galton.column('childHeight'))\n", "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def predict_child(h):\n", " \"\"\"Return a prediction of the height of a child \n", " whose parents have a midparent height of h.\n", " \n", " The prediction is the average height of the children \n", " whose midparent height is in the range h plus or minus 0.5 inches.\n", " \"\"\"\n", " \n", 
close_points = heights.where('MidParent', are.between(h-0.5, h + 0.5))\n", " return close_points.column('Child').mean() " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights_with_predictions = heights.with_column(\n", " 'Average neighbor prediction', heights.apply(predict_child, 'MidParent'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton_slope = slope(heights, 'MidParent', 'Child')\n", "galton_intercept = intercept(heights, 'MidParent', 'Child')\n", "galton_slope, galton_intercept" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.take(123)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton_slope*heights.take(123)[0] + galton_intercept" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights_with_predictions.where('MidParent', are.equal_to(69.48))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights_with_predictions = heights_with_predictions.with_column(\n", " 'Regression Prediction', galton_slope*heights.column('MidParent') + galton_intercept\n", ")\n", "heights_with_predictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "heights_with_predictions.scatter('MidParent')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "demographics = Table.read_table('district_demographics2016.csv')\n", "demographics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "demographics.scatter('College%', 'Median Income')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "demographics_slope = slope(demographics, 'College%', 'Median Income')\n", 
"demographics_intercept = intercept(demographics, 'College%', 'Median Income')\n", "(demographics_slope, demographics_intercept)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "demographics.scatter('College%', 'Median Income')\n", "draw_line(slope=demographics_slope, intercept=demographics_intercept, x=make_array(0, 80))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 1 }