In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 9 ##

## Numerical Distribution ##

Let's examine visualizations of numerical data by looking at how old the top-grossing movies are.

In [None]:
top = Table.read_table('top_movies_2017.csv')
top

In [None]:
# add the movie age to the top Table
ages = 2022 - top.column('Year')
top = top.with_column('Age', ages)
top

## Binning ##

We can bin numerical data by creating a set of bin endpoints and then counting how many data points fall within each bin.


In [None]:
[min(ages), max(ages)]

In [None]:
# create the bin end points
my_bins = np.arange(0, 121, 20)
my_bins

In [None]:
# Bin the movie ages into half-open intervals [start, end). The final row only marks the right end of the last bin, so its count is always 0.
top.bin('Age', bins = my_bins)

In [None]:
# It is possible to bin with intervals of different sizes
uneven_bins = make_array(0, 5, 10, 15, 25, 40, 65, 101)
uneven_bins

In [None]:
# Bin the movie ages into half-open intervals [start, end). The final row only marks the right end of the last bin, so its count is always 0.
top.bin('Age', bins = uneven_bins)

In [None]:
sum(top.bin('Age', bins = uneven_bins).column(1))

## Histograms ##

Histograms are a useful way to visualize numerical data. To create a histogram, we bin the data, treat the bins as categories, and create a bar plot of the resulting data.

In [None]:
# histogram with even bin sizes
# The bins run from 0 to 110 (not 100) so that ages above 100 are not silently
# left out of the plot; the other bin sets in this notebook extend to 101 and 120.
top.hist('Age', bins = np.arange(0, 111, 10), unit = 'Years')

In [None]:
# We can also specify the number of evenly sized bins we want.
top.hist('Age', bins = 20, unit = 'Years')

In [None]:
# We can create histograms of uneven bin sizes. 
# The *area* of the bar should be proportional to the number of items in a bin range. 
top.hist('Age', bins = uneven_bins, unit = 'Years')

## Writing functions ##

In [None]:
def double(x):
    """Return x doubled, i.e. the result of x * 2.

    Works for any value that supports multiplication by an int: numbers are
    doubled, arrays are doubled element-wise, and strings are repeated twice.
    """
    doubled = x * 2
    return doubled

In [None]:
double(7)

In [None]:
double(15/3)

In [None]:
my_number = 12

In [None]:
double(my_number)

In [None]:
double(my_number / 8)

In [None]:
double(make_array(3, 4, 5))

In [None]:
double('data')

In [None]:
# "Local scope": x is only a name inside double, so on a fresh kernel this cell is expected to raise a NameError
x

In [None]:
x = 17

In [None]:
double(2)

In [None]:
x

In [None]:
double(x)

In [None]:
x

### Discussion Question

In [None]:
#What does this function do?
def percents(values):
    """Convert an array of amounts into percentages of their total.

    Each entry is multiplied by 100 and divided by the sum of all entries;
    the result is rounded to 2 decimal places.
    """
    total = sum(values)
    scaled = 100 * values
    return np.round(scaled / total, 2)

In [None]:
percents(make_array(1, 2, 3, 4))

In [None]:
percents(make_array(1, 4, 30))

In [None]:
#Can have multiple inputs
def percents(values, places=2):
    """Convert an array of amounts into percentages of their total.

    Parameters:
        values: array of numbers to express as percentages of sum(values).
        places: number of decimal places to round to. Defaults to 2 so that
            one-argument calls written for the earlier definition of
            percents keep working after this cell redefines the name.

    Returns:
        An array of percentages, rounded to `places` decimal places.
    """
    return np.round(values / sum(values) * 100, places)

In [None]:
percents(make_array(1, 4, 30), 1)

## Apply ##

In [None]:
ages = Table().with_columns(
    'Person', make_array('A', 'B', 'C', 'D'),
    'Age', make_array(63, 110, 99, 102)
)
ages

In [None]:
def cut_off_at_100(z):
    """Cap z at 100: values above 100 become 100, anything else is returned unchanged."""
    if 100 < z:
        return 100
    return z

In [None]:
cut_off_at_100(3)

In [None]:
cut_off_at_100(107)

In [None]:
cut_age_array = ages.apply(cut_off_at_100, 'Age')
cut_age_array

In [None]:
ages.with_column('Cut off ages', cut_age_array)

In [None]:
type(cut_off_at_100)

## Prediction ##

In [None]:
galton = Table.read_table('galton.csv')

In [None]:
#Each row corresponds to one adult child
#family = family indicator
#father height (inches) 
#mother height (inches) 
#"midparent height"= weighted average of parents' heights
#children= # of children in the family
#childNum = child's birth rank (1 = oldest)
#gender
#height (inches)
galton

In [None]:
# Keep only columns 3 and 7 of galton (midparent height and child height, going by
# the column listing in the previous cell) and give them shorter labels.
parent_child_columns = galton.select(3, 7)
heights = parent_child_columns.relabeled(0, 'MidParent').relabeled(1, 'Child')

In [None]:
heights

In [None]:
# Side note: overlapping histogram of both height columns.
# my_bins (0 to 120 in steps of 20) was built for movie ages and is far too coarse
# for heights in inches, so use 1-inch bins instead. The 50-85 range matches the
# vertical span of the red reference lines drawn on the scatter plots in this notebook.
heights.hist(bins=np.arange(50, 86, 1), unit='inches')

In [None]:
heights.scatter('MidParent', 'Child')

In [None]:
heights.scatter('MidParent', 'Child')
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);

In [None]:
nearby = heights.where('MidParent', are.between(67.5, 68.5))
nearby.column('Child').mean()

In [None]:
heights.scatter('MidParent', 'Child')
# Red vertical lines mark the strip of midparent heights from 67.5 to 68.5 inches
plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)
plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)
# Gold point: mean child height within the strip, computed from the table rather
# than hard-coded so the marker stays correct if the data changes
strip_mean = heights.where('MidParent', are.between(67.5, 68.5)).column('Child').mean()
plots.scatter(68, strip_mean, color='gold', s=75);

In [None]:
def predict_child(h, half_width=0.5):
    """Predict a child's height from a midparent height h.

    The prediction is the mean 'Child' value over the rows of the global
    `heights` table whose 'MidParent' value lies in the window
    h - half_width (inclusive) to h + half_width (exclusive).

    Parameters:
        h: midparent height to predict for.
        half_width: half the width of the window around h; the default of
            0.5 reproduces the original one-unit-wide window.

    Returns:
        The mean 'Child' height of the rows in the window.
    """
    close_rows = heights.where('MidParent', are.between(h - half_width, h + half_width))
    return close_rows.column('Child').mean()

In [None]:
predict_child(68)

In [None]:
predict_child(65)

In [None]:
predictions = heights.apply(predict_child, 'MidParent')

In [None]:
heights = heights.with_column('Child Prediction', predictions)

In [None]:
heights

In [None]:
heights.scatter('MidParent')