{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Lecture 30"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction\n",
    "\n",
    "Let's revisit Galton's predictions of children's heights based on their parent's heights..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "galton = Table.read_table('galton.csv')\n",
    "\n",
    "heights = Table().with_columns(\n",
    "    'MidParent', galton.column('midparentHeight'),\n",
    "    'Child', galton.column('childHeight')\n",
    "    )\n",
    "heights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# look at a scatter plot of the relationship \n",
    "heights.scatter('MidParent')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_child(h):\n",
    "    \"\"\"Return a prediction of the height of a child \n",
    "    whose parents have a midparent height of h.\n",
    "    \n",
    "    The prediction is the average height of the children \n",
    "    whose midparent height is in the range h plus or minus 0.25 inches.\n",
    "    \"\"\"\n",
    "    \n",
    "    close_points = heights.where('MidParent', are.between(h-0.5, h + 0.5))\n",
    "    return close_points.column('Child').mean()   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict the height for each child in the data set\n",
    "heights_with_predictions = heights.with_column(\n",
    "    'Prediction', heights.apply(predict_child, 'MidParent')\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# visualize the predicted heights\n",
    "heights_with_predictions.scatter('MidParent')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Association\n",
    "\n",
    "Data on hybrid passenger cars sold in US 1997-2013:\n",
    "\n",
    "- `vehicle`: model of the car\n",
    "- `year`: year of manufacture\n",
    "- `msrp`: manufacturer's suggested retail price in 2013 dollars\n",
    "- `acceleration`: acceleration rate in km per hour per second\n",
    "- `mpg`: fuel econonmy in miles per gallon\n",
    "- `class`: the model's class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "hybrid = Table.read_table('hybrid.csv')\n",
    "hybrid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# explore the data to see which cars cost the most\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vehicles with higher mpg tend to cost less on average - surprising?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vehicles that accelerate faster tend to cost more, and have lower mpg (not as fuel efficient)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlation coefficient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def r_scatter(r):\n",
    "    plots.figure(figsize=(5,5))\n",
    "    \"Generate a scatter plot with a correlation approximately r\"\n",
    "    x = np.random.normal(0, 1, 1000)\n",
    "    z = np.random.normal(0, 1, 1000)\n",
    "    y = r*x + (np.sqrt(1-r**2))*z\n",
    "    plots.scatter(x, y)\n",
    "    plots.xlim(-4, 4)\n",
    "    plots.ylim(-4, 4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# try r = 0.3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# try r = 0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# try r = -0.2\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# try r = -0.95\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Calculating the correlation coefficient\n",
    "\n",
    "To calculate the correlation coefficient r, we first convert our data to standardized units (by z-scoring out data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert data to standard units\n",
    "def standard_units(x):\n",
    "    \"Convert any array of numbers to standard units.\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# simplify the hybrid data set to only have msrp and acceleration and add the standard units\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use of standard units does not change the point patterns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use of standard units does not change the point patterns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we then calculate the product of the standardized units\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# r is the average of the products of standard units\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# we can create a function to calculate the correlation coefficient\n",
    "def correlation(t, label_x, label_y):\n",
    "    ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate correlation between acceleration and msrp\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate correlation between mpg and msrp\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# order doesn't matter\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation cautions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# correlation only captures linear trends\n",
    "new_x = np.arange(-4, 4.1, 0.5)\n",
    "nonlinear = Table().with_columns(\n",
    "        'x', new_x,\n",
    "        'y', new_x**2\n",
    "    )\n",
    "nonlinear.scatter('x', 'y', s=30, color='r')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# correlation for the curved data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# correlation is heavily influenced by outliers\n",
    "anscombes = Table.read_table('anscombes.csv')\n",
    "data1 = anscombes.where(\"dataset\", \"I\")\n",
    "data2 = anscombes.where(\"dataset\", \"II\")\n",
    "data3 = anscombes.where(\"dataset\", \"III\")\n",
    "data4 = anscombes.where(\"dataset\", \"IV\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data set 1\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data set 2\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data set 3\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data set 4\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# original scatter plot of the data and the correlation for the Galton data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predictions made by taking average of children's heights in a neighborhood\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# predictions made by the regression line\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# comparing prediction of the regression line and average in a neighborhood\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Regression in standardized units"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's look at the relationship in standaridized units (z-score transformed units)\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation between children's and parent's heights\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predictions are less than the identity line -> regression to the mean\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# function to calculate the slope \n",
    "def slope(t, x, y):\n",
    "    ...\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# slope for predicting child's height\n",
    "\n",
    "\n",
    "# Q: for every additional inch a parent is taller, how much taller is the predicted child's height?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# function to calculate the intercept\n",
    "def intercept(t, x, y):\n",
    "    ...\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# intercept for predicting child's height\n",
    "\n",
    "\n",
    "#Q: How a parents that are 0\" tall, how tall is the predicted height of their child? "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Regression equation for Galton data\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How tall would be predict a child to be if their parents were 70 inches?\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}