In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 32

In [None]:
# Read the galton table from the CSV file in the working directory.
galton = Table.read_table('galton.csv')

# Build a new two-column table from galton's 'midparentHeight' and
# 'childHeight' columns, relabeled 'MidParent' and 'Child'.
heights = Table().with_columns(
 'MidParent', galton.column('midparentHeight'),
 'Child', galton.column('childHeight')
 )

In [None]:
# Display the two-column table.
heights

In [None]:
def standard_units(arr):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its deviation from the array's average,
    divided by the array's standard deviation (np.std, default settings).
    """
    deviations = arr - np.average(arr)
    return deviations / np.std(arr)

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the average of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for estimating y from x in t.

    slope = r * (SD of y) / (SD of x), where r is the correlation of the
    two columns.
    """
    x_values, y_values = t.column(x), t.column(y)
    return correlation(t, x, y) * np.std(y_values) / np.std(x_values)

def intercept(t, x, y):
    """Return the intercept of the regression line for estimating y from x in t.

    intercept = (mean of y) - slope * (mean of x)
    """
    slope_times_x_mean = slope(t, x, y) * np.mean(t.column(x))
    return np.mean(t.column(y)) - slope_times_x_mean

def fitted_values(t, x, y):
    """Return an array of the regression estimates of y, one per x value in t.

    Each estimate is slope * x + intercept, using the regression line of
    column y on column x.
    """
    line_slope, line_intercept = slope(t, x, y), intercept(t, x, y)
    return line_slope * t.column(x) + line_intercept

def residuals(t, x, y):
    """Return an array of residuals: observed y minus the regression estimate.

    The estimates come from the regression of column y on column x of t.
    """
    return t.column(y) - fitted_values(t, x, y)

In [None]:
# Add the regression estimates and the residuals (Child regressed on
# MidParent) as columns, rebinding heights to the extended table.
# Later cells read the 'Fitted Value' column from this table.
heights = heights.with_columns(
 'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),
 'Residual', residuals(heights, 'MidParent', 'Child')
)
heights

In [None]:
# Correlation coefficient r between the MidParent and Child columns.
correlation(heights, 'MidParent', 'Child')

In [None]:
# NOTE(review): heights now also holds 'Fitted Value' and 'Residual'; with no
# y column named, scatter presumably plots every remaining column against
# 'MidParent' — confirm that is the intended picture.
heights.scatter('MidParent')

In [None]:
def plot_residuals(t, x, y):
    """Draw two scatter plots for the regression of column y on column x of t.

    The first plot shows the selected x, y and 'Fitted' columns, with the
    first of them (x) passed as the scatter column. The second plot shows
    the residuals against x.
    """
    fitted = fitted_values(t, x, y)
    errors = residuals(t, x, y)
    augmented = t.with_columns('Fitted', fitted, 'Residual', errors)
    # Column 0 of the selection is x.
    augmented.select(x, y, 'Fitted').scatter(0)
    augmented.scatter(x, 'Residual')

In [None]:
# Fit plot and residual plot for the regression of Child on MidParent.
plot_residuals(heights, 'MidParent', 'Child')

## Diagnostics with Residuals ##

In [None]:
# Length in meters
# Age in years
# Ages are estimated based on variables (e.g. condition of teeth)
dugong = Table.read_table('dugong.csv')
# Preview the first 5 rows.
dugong.show(5)

In [None]:
# Scatter plot of Age against Length.
dugong.scatter('Length', 'Age')

In [None]:
# Correlation coefficient r between Length and Age.
correlation(dugong, 'Length', 'Age')

In [None]:
# Fit plot and residual plot for the regression of Age on Length.
plot_residuals(dugong, 'Length', 'Age')

In [None]:
# Height and average weight of US women
us_women = Table.read_table('us_women.csv')
# Preview the first 5 rows.
us_women.show(5)

In [None]:
# Correlation coefficient r between 'height' and 'ave weight'.
correlation(us_women, 'height', 'ave weight')

In [None]:
# Fit plot and residual plot for the regression of 'ave weight' on 'height'.
plot_residuals(us_women, 'height', 'ave weight')

In [None]:
# Load the district demographics table from the CSV file and preview 5 rows.
demographics = Table.read_table('district_demographics2016.csv')
demographics.show(5)

In [None]:
# Correlation coefficient r between 'Median Income' and
# 'Percent voting for Clinton'.
correlation(demographics, 'Median Income', 'Percent voting for Clinton')

In [None]:
# Fit plot and residual plot for the regression of
# 'Percent voting for Clinton' on 'Median Income'.
plot_residuals(demographics, 'Median Income', 'Percent voting for Clinton')

In [None]:
# Load the actors table from the CSV file and preview the first 3 rows.
movies = Table.read_table('actors.csv')
movies.show(3)

In [None]:
# Fit plot and residual plot for the regression of 'Average per Movie'
# on 'Number of Movies'.
plot_residuals(movies, 'Number of Movies', 'Average per Movie')

In [None]:
# Display the rows ordered from largest to smallest 'Average per Movie'.
# The sorted table is only shown, not stored; movies itself is not rebound.
movies.sort("Average per Movie", descending = True)

## Average of Residuals ##

In [None]:
# Nonlinear
# Average of the residuals from regressing Age on Length, rounded to
# 6 decimal places for display.
round(np.average(residuals(dugong, 'Length', 'Age')), 6)

In [None]:
# Linear
# Average of the residuals from regressing Child on MidParent, rounded to
# 6 decimal places for display.
round(np.average(residuals(heights, 'MidParent', 'Child')), 6)

In [None]:
# Heteroscedasticity ("uneven spread")
# Average of the residuals from regressing 'Percent voting for Clinton' on
# 'Median Income', rounded to 6 decimal places for display.
round(np.average(residuals(demographics, 'Median Income', 'Percent voting for Clinton')), 6)

## A Measure of Clustering ##

In [None]:
def plot_fitted(t, x, y):
    """Draw a scatter plot of columns x, y and the fitted values of y on x.

    The plotted table holds x, y and a 'Fitted Value' column; its first
    column (x) is the one passed to scatter.
    """
    fitted = fitted_values(t, x, y)
    t.select(x, y).with_columns('Fitted Value', fitted).scatter(0)

In [None]:
# Child heights and their fitted values, plotted against MidParent.
plot_fitted(heights, 'MidParent', 'Child')

In [None]:
# SD of the fitted values and SD of the observed Child heights.
child_predictions_sd = np.std(fitted_values(heights, 'MidParent', 'Child'))
child_observed_sd = np.std(heights.column('Child'))
print(child_predictions_sd)
print(child_observed_sd)

In [None]:
# Ratio of the two SDs, for comparison with the correlation shown next.
child_predictions_sd / child_observed_sd

In [None]:
# Correlation coefficient r for the heights data.
correlation(heights, 'MidParent', 'Child')

In [None]:
# Correlation coefficient r for the dugong data.
correlation(dugong, 'Length', 'Age')

In [None]:
# Same SD-ratio check for the dugong data, which an earlier cell labels
# as nonlinear.
dugong_prediction_sd = np.std(fitted_values(dugong, 'Length', 'Age'))
# Look the observed column up by its label rather than by position, so the
# right column is read regardless of the CSV's column order and the access
# matches every other use of the 'Age' column in this notebook.
dugong_observed_sd = np.std(dugong.column('Age'))
# Ratio of the SD of the fitted ages to the SD of the observed ages.
dugong_prediction_sd / dugong_observed_sd

In [None]:
# Load the hybrid table from the CSV file and preview the first 5 rows.
hybrid = Table.read_table('hybrid.csv')
hybrid.show(5)

In [None]:
# Fit plot and residual plot for the regression of 'mpg' on 'acceleration'.
plot_residuals(hybrid, 'acceleration', 'mpg')

In [None]:
# Correlation coefficient r between 'acceleration' and 'mpg'.
correlation(hybrid, 'acceleration', 'mpg')

In [None]:
# Ratio of the SD of the fitted mpg values to the SD of the observed mpg
# values, for comparison with the correlation above.
np.std(fitted_values(hybrid, 'acceleration', 'mpg'))/np.std(hybrid.column('mpg'))

No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. The fraction is $|r|$.

$$
\frac{\mbox{SD of fitted values}}{\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \mbox{That is,} ~~ \mbox{SD of fitted values} = |r|\cdot \mbox{SD of }y
$$

## SD of the Residuals ##
No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$.

$$
\mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y
$$


In [None]:
# Child heights and their fitted values, plotted against MidParent.
plot_fitted(heights, 'MidParent', 'Child')

In [None]:
# Same plot, with a horizontal line added at the average Child height.
# The line is drawn between x = 64 and x = 76.
plot_fitted(heights, 'MidParent', 'Child')
ave_child = np.mean(heights.column('Child'))
plots.plot([64, 76], [ave_child, ave_child]);

In [None]:
# Variance (squared SD) of the observed Child heights.
np.std(heights.column('Child')) ** 2

In [None]:
# Variance of the residuals.
np.std(residuals(heights, 'MidParent', 'Child')) ** 2

In [None]:
# Variance of the fitted values, read from the 'Fitted Value' column that an
# earlier cell added to heights.
np.std(heights.column('Fitted Value')) ** 2

In [None]:
# Variance of the residuals plus variance of the fitted values, for
# comparison with the variance of Child computed above.
np.std(residuals(heights, 'MidParent', 'Child')) ** 2 + np.std(heights.column('Fitted Value')) ** 2

The above comes from the variance decomposition:
$$
\frac{\mbox{Variance of residuals}}{\mbox{Variance of }y} ~+~ \frac{\mbox{Variance of fitted values}}{\mbox{Variance of }y} = (1-r^2) + r^2 = 1,
$$ 
which leads to:
$$
\mbox{Variance of residuals} ~+~ \mbox{Variance of fitted values} = \mbox{Variance of }y
$$


In [None]:
# Variance (squared SD) of the observed Age values.
np.std(dugong.column('Age')) ** 2

In [None]:
# Variance of the fitted values from regressing Age on Length.
np.std(fitted_values(dugong, 'Length', 'Age')) ** 2

In [None]:
# Variance of the residuals from regressing Age on Length.
np.std(residuals(dugong, 'Length', 'Age')) ** 2

In [None]:
# Variance of the fitted values plus variance of the residuals, for
# comparison with the variance of Age computed above.
np.std(fitted_values(dugong, 'Length', 'Age')) ** 2 + np.std(residuals(dugong, 'Length', 'Age')) ** 2

In [None]:
# Correlation coefficient for the heights data, kept in r for the next cell.
r = correlation(heights, 'MidParent', 'Child')
r

In [None]:
# sqrt(1 - r^2) times the SD of Child, for comparison with the SD of the
# residuals computed directly in the next cell.
np.sqrt(1 - r**2) * np.std(heights.column('Child'))

In [None]:
# SD of the residuals from regressing Child on MidParent.
np.std(residuals(heights, 'MidParent', 'Child'))

In [None]:
# SD of the residuals from regressing 'mpg' on 'acceleration'.
np.std(residuals(hybrid, 'acceleration', 'mpg'))

In [None]:
# r is rebound here to the hybrid correlation; the cell below depends on
# this cell having run after the heights cells above.
r = correlation(hybrid, 'acceleration', 'mpg')
r

In [None]:
# sqrt(1 - r^2) times the SD of 'mpg', for comparison with the SD of the
# hybrid residuals computed directly above.
np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))