In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def standard_units(arr):
    """Convert an array to standard units.

    Each value is expressed as its distance from the array's average,
    measured in units of the array's standard deviation (np.std).
    """
    center = np.average(arr)
    spread = np.std(arr)
    deviations = arr - center
    return deviations / spread

def correlation(t, x, y):
    """Return the correlation coefficient of columns x and y of table t.

    Both columns are converted to standard units; the result is the
    average of the element-wise products of the standardized values.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    Computed as r * SD(y) / SD(x), where r is the correlation of the
    two columns of table t.
    """
    r = correlation(t, x, y)
    spread_y, spread_x = np.std(t.column(y)), np.std(t.column(x))
    return r * spread_y / spread_x

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    Computed as mean(y) - slope * mean(x) over the columns of table t.
    """
    x_center, y_center = (np.mean(t.column(label)) for label in (x, y))
    return y_center - slope(t, x, y) * x_center

def fitted_values(t, x, y):
    """Return an array of the regression estimates at all the x values.

    Each estimate is slope * x + intercept, using the regression line
    fitted to columns x and y of table t.
    """
    m, c = slope(t, x, y), intercept(t, x, y)
    return m * t.column(x) + c

def residuals(t, x, y):
    """Return the residuals: observed y values minus the regression estimates."""
    estimates = fitted_values(t, x, y)
    observed = t.column(y)
    return observed - estimates

## Hypothesis testing for the slope

In [None]:
def bootstrap_slope(t, x, y, repetitions=5000):
    """Bootstrap the regression slope and report an approximate 95% confidence interval.

    Parameters:
        t: table holding the sample (must provide ``sample()`` and ``column()``).
        x: label of the predictor column.
        y: label of the response column.
        repetitions: number of bootstrap resamples to draw (default 5000).

    Side effects:
        Draws a histogram of the bootstrapped slopes with the interval marked
        in yellow along the horizontal axis, and prints the observed slope
        together with the interval endpoints.

    Returns:
        None.
    """
    # Bootstrap the scatter, find the slope, collect.
    # The array is built once: growing it with np.append inside the loop
    # would copy every previously collected slope on each iteration.
    slopes = np.array([slope(t.sample(), x, y) for _ in np.arange(repetitions)])

    # Find the endpoints of the 95% confidence interval for the true slope:
    # the middle 95% of the bootstrapped slopes.
    left = percentile(2.5, slopes)
    right = percentile(97.5, slopes)

    # Slope of the regression line from the original sample
    observed_slope = slope(t, x, y)

    # Display results
    Table().with_column('Bootstrap Slopes', slopes).hist(bins=20)
    plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=8);
    print('Slope of regression line:', observed_slope)
    print('Approximate 95%-confidence interval for the slope of the true line:')
    print(left, 'to', right)

In [None]:
# Load the data from baby.csv into a table and preview its first 5 rows.
baby = Table.read_table('baby.csv')
baby.show(5)

In [None]:
# Slope of the regression line for predicting 'Birth Weight' from 'Maternal Age'.
slope(baby, 'Maternal Age', 'Birth Weight')

In [None]:
# Scatter plot of the same two columns; fit_line=True asks for the fitted line to be drawn.
baby.scatter('Maternal Age', 'Birth Weight', fit_line=True)

In [None]:
# Bootstrap the slope with 1000 resamples (rather than the default 5000);
# if the printed interval contains 0, a true slope of 0 cannot be ruled out.
bootstrap_slope(baby, 'Maternal Age', 'Birth Weight', 1000)

## Classification

In [None]:
# ckd = chronic kidney disease
# Class = 1: has ckd
# Class = 0: does not have ckd
# The 'Blood Glucose Random' column is relabeled to the shorter 'Glucose'.
ckd = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd.show(3)

In [None]:
# Group the rows by 'Class' to see how many patients fall in each class.
ckd.group('Class')

In [None]:
# Could you predict if a patient has ckd?
# Plot 'White Blood Cell Count' against 'Glucose', coloring the points by 'Class'.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
# Same question with a different pair of attributes: 'Hemoglobin' against 'Glucose'.
ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

In [None]:
# Can you tell if a banknote is counterfeit or legitimate?
# The variables are based on photographs of many banknotes
# (a few numbers calculated for each image).
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# Plot 'WaveletVar' against 'WaveletCurt', coloring the points by 'Class'.
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Class')

In [None]:
# Try a different pair of attributes: 'WaveletSkew' against 'Entropy'.
banknotes.scatter('WaveletSkew', 'Entropy', colors='Class')

In [None]:
# Two attributes have some overlap of classes... what happens with three attributes?
fig = plots.figure(figsize=(8,8))
# Create the 3D axes through the figure so they are registered with it.
# Constructing Axes3D(fig) directly relies on the axes adding itself to the
# figure, which Matplotlib deprecated and later turned off, leaving the
# figure empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'),
           banknotes.column('WaveletVar'),
           banknotes.column('WaveletCurt'),
           c=banknotes.column('Class'),
           cmap='viridis',
           s=50)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('WaveletSkew')
ax.set_ylabel('WaveletVar')
ax.set_zlabel('WaveletCurt');