In [None]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 30

## Prediction

Let's revisit Galton's predictions of children's heights based on their parents' heights...

In [None]:
# Read Galton's family-height data, then keep just the two columns used for
# prediction, relabelled as 'MidParent' and 'Child'.
galton = Table.read_table('galton.csv')

heights = (
    Table()
    .with_column('MidParent', galton.column('midparentHeight'))
    .with_column('Child', galton.column('childHeight'))
)
heights

In [None]:
# look at a scatter plot of the relationship between midparent height
# (passed as the x column) and child height
heights.scatter('MidParent')

In [None]:
def predict_child(h, window=0.5):
    """Return a prediction of the height of a child
    whose parents have a midparent height of h.

    The prediction is the average height of the children
    whose midparent height is in the range h plus or minus `window` inches
    (0.5 inches by default).

    Parameters
    ----------
    h : number
        Midparent height, in inches, to predict for.
    window : number, optional
        Half-width, in inches, of the neighborhood around h that is averaged.

    Returns
    -------
    float
        Mean of the 'Child' column over the rows of the global `heights`
        table whose 'MidParent' value lies in the neighborhood. If no row
        falls in the neighborhood the mean of an empty array is taken,
        which numpy reports as nan.
    """
    # are.between(a, b) keeps values v with a <= v < b: the lower edge is
    # included and the upper edge is excluded.
    close_points = heights.where('MidParent', are.between(h - window, h + window))
    return close_points.column('Child').mean()

In [None]:
# predict the height for each child in the data set:
# predict_child is called once per row with that row's 'MidParent' value,
# and the results are added to a copy of `heights` as a 'Prediction' column
heights_with_predictions = heights.with_column(
 'Prediction', heights.apply(predict_child, 'MidParent')
 )

In [None]:
# visualize the predicted heights: 'MidParent' is the x column, and the table
# also holds the actual 'Child' heights and the 'Prediction' values
heights_with_predictions.scatter('MidParent')

## Association

Data on hybrid passenger cars sold in the US, 1997-2013:

- `vehicle`: model of the car
- `year`: year of manufacture
- `msrp`: manufacturer's suggested retail price in 2013 dollars
- `acceleration`: acceleration rate in km per hour per second
- `mpg`: fuel economy in miles per gallon
- `class`: the model's class.

In [None]:
# load the hybrid car data from hybrid.csv and display the table
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# explore the data to see which cars cost the most


In [None]:
# vehicles with higher mpg tend to cost less on average - surprising?


In [None]:
# vehicles that accelerate faster tend to cost more, and have lower mpg (not as fuel efficient)


### Correlation coefficient

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points (x, y) where x and z are independent standard normal
    samples and y = r*x + sqrt(1 - r**2)*z, then plots them on a 5x5 inch
    figure with both axes fixed to [-4, 4]. The random generator is not
    seeded here, so every call produces a different cloud of points.

    Parameters
    ----------
    r : number
        Target correlation; must lie in [-1, 1].

    Raises
    ------
    ValueError
        If r is outside [-1, 1] (including nan); sqrt(1 - r**2) would
        otherwise be nan and the plot would silently come out empty.
    """
    if not -1 <= r <= 1:
        raise ValueError(f"r must be between -1 and 1, got {r}")

    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mixing x with independent noise z using these weights keeps the SD of y
    # at 1 and makes the population correlation between x and y equal to r.
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

In [None]:
# try r = 0.3


In [None]:
# try r = 0


In [None]:
# try r = -0.2


In [None]:
# try r = -0.95


### Calculating the correlation coefficient

To calculate the correlation coefficient r, we first convert our data to standard units (by z-scoring our data).


In [None]:
# Convert data to standard units
def standard_units(x):
    """Convert any array of numbers to standard units.

    Each value is replaced by its z-score: the number of standard deviations
    it lies above (positive) or below (negative) the mean of x.

    Parameters
    ----------
    x : array-like of numbers

    Returns
    -------
    numpy.ndarray
        (x - mean(x)) / SD(x), where SD is numpy's default population
        standard deviation (ddof=0). The result has mean 0 and SD 1. If all
        values of x are equal the SD is 0 and the result is nan.
    """
    values = np.asarray(x)  # also accept plain lists, not only arrays
    return (values - np.mean(values)) / np.std(values)


In [None]:
# simplify the hybrid data set to only have msrp and acceleration and add the standard units






In [None]:
# Use of standard units does not change the point patterns


In [None]:
# Use of standard units does not change the point patterns


In [None]:
# we then calculate the product of the standardized units





In [None]:
# r is the average of the products of standard units



In [None]:
# we can create a function to calculate the correlation coefficient
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of a table.

    r is the average of the products of the two columns after each has been
    converted to standard units.

    Parameters
    ----------
    t : table
        Any object whose ``column(label)`` method returns the numeric values
        of the named column.
    label_x, label_y : str
        Labels of the two columns. Swapping them gives the same result.

    Returns
    -------
    float
        A value in [-1, 1]; nan if either column has zero spread.
    """
    x = np.asarray(t.column(label_x))
    y = np.asarray(t.column(label_y))
    # Standardize inline so this function does not depend on other cells.
    x_su = (x - np.mean(x)) / np.std(x)
    y_su = (y - np.mean(y)) / np.std(y)
    return np.mean(x_su * y_su)

In [None]:
# calculate correlation between acceleration and msrp


In [None]:
# calculate correlation between mpg and msrp


In [None]:
# order doesn't matter


## Correlation cautions

In [None]:
# correlation only captures linear trends:
# build points that lie exactly on the parabola y = x**2, with x running
# from -4 to 4 in steps of 0.5, and plot them as red dots
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_column('x', new_x).with_column('y', np.square(new_x))
nonlinear.scatter('x', 'y', s=30, color='r')

In [None]:
# correlation for the curved data


In [None]:
# correlation is heavily influenced by outliers
# read the combined file, then split it into one table per value
# ("I" through "IV") of its "dataset" column
anscombes = Table.read_table('anscombes.csv')
data1, data2, data3, data4 = (
    anscombes.where("dataset", dataset_label)
    for dataset_label in ("I", "II", "III", "IV")
)

In [None]:
# data set 1



In [None]:
# data set 2



In [None]:
# data set 3



In [None]:
# data set 4



## Linear regression

In [None]:
# original scatter plot of the data and the correlation for the Galton data


In [None]:
# predictions made by taking average of children's heights in a neighborhood


In [None]:
# predictions made by the regression line


In [None]:
# comparing prediction of the regression line and average in a neighborhood


## Regression in standardized units

In [None]:
# Let's look at the relationship in standardized units (z-score transformed units)





In [None]:
# Correlation between children's and parents' heights


In [None]:
# predictions are less than the identity line -> regression to the mean



In [None]:
# function to calculate the slope 
def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    In original units the slope is r * SD(y) / SD(x), where r is the
    correlation between the two columns.

    Parameters
    ----------
    t : table
        Any object whose ``column(label)`` method returns the numeric values
        of the named column.
    x : str
        Label of the predictor column.
    y : str
        Label of the column being predicted.

    Returns
    -------
    float
        Predicted change in y per one-unit increase in x; nan if either
        column has zero spread.
    """
    xs = np.asarray(t.column(x))
    ys = np.asarray(t.column(y))
    sd_x = np.std(xs)
    sd_y = np.std(ys)
    # r = mean of the products of the two columns in standard units,
    # computed inline so this function does not depend on other cells.
    r = np.mean(((xs - np.mean(xs)) / sd_x) * ((ys - np.mean(ys)) / sd_y))
    return r * sd_y / sd_x



In [None]:
# slope for predicting child's height


# Q: for every additional inch a parent is taller, how much taller is the predicted child's height?

In [None]:
# function to calculate the intercept
def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    The regression line passes through the point of averages, so the
    intercept is mean(y) - slope * mean(x), with slope = r * SD(y) / SD(x).

    Parameters
    ----------
    t : table
        Any object whose ``column(label)`` method returns the numeric values
        of the named column.
    x : str
        Label of the predictor column.
    y : str
        Label of the column being predicted.

    Returns
    -------
    float
        Predicted value of y when x is 0; nan if either column has zero
        spread.
    """
    xs = np.asarray(t.column(x))
    ys = np.asarray(t.column(y))
    sd_x = np.std(xs)
    sd_y = np.std(ys)
    # Correlation and slope are computed inline so this function does not
    # depend on other cells.
    r = np.mean(((xs - np.mean(xs)) / sd_x) * ((ys - np.mean(ys)) / sd_y))
    regression_slope = r * sd_y / sd_x
    return np.mean(ys) - regression_slope * np.mean(xs)


In [None]:
# intercept for predicting child's height


#Q: For parents that are 0" tall, what is the predicted height of their child? 

### Regression equation for Galton data




In [None]:
# How tall would we predict a child to be if their parents were 70 inches?

