{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from datascience import *\n",
"import numpy as np\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plots\n",
"plots.style.use('fivethirtyeight')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Recap: Comparisons with arrays"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"pets = make_array('cat', 'dog', 'cat', 'cat', 'dog', 'rabbit')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, True, False, False, True, False])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pets == 'dog'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(pets == 'dog')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([False, True, False, False, True, True])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pets > 'cat'"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(pets > 'cat')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Recap: Predicates and advanced `where`"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" Semester | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
"
\n",
" \n",
" 6 | \n",
"
\n",
" \n",
" 7 | \n",
"
\n",
" \n",
" 8 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"Semester\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"7\n",
"8"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"terms = Table().with_column('Semester', np.arange(1, 9))\n",
"terms"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" Semester | \n",
"
\n",
" \n",
" \n",
" \n",
" 7 | \n",
"
\n",
" \n",
" 8 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"Semester\n",
"7\n",
"8"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"terms.where('Semester', are.above(6))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" Semester | \n",
"
\n",
" \n",
" \n",
" \n",
" 7 | \n",
"
\n",
" \n",
" 8 | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"Semester\n",
"7\n",
"8"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"terms.where([False, False, False, False, False, False, True, True])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Conditional Statements "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def year_from_semesters(x):\n",
" if x <= 0:\n",
" return 'Not a valid input'\n",
" elif x <= 2:\n",
" return 'First Year'\n",
" elif x <= 4:\n",
" return 'Sophomore'\n",
" elif x <= 6:\n",
" return 'Junior'\n",
" elif x <= 8:\n",
" return 'Senior'\n",
" else:\n",
" return \"NA\""
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Not a valid input'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"year_from_semesters(-15.6)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Junior'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"year_from_semesters(5)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'NA'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"year_from_semesters(9001)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" Semester | Year | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | First Year | \n",
"
\n",
" \n",
" 2 | First Year | \n",
"
\n",
" \n",
" 3 | Sophomore | \n",
"
\n",
" \n",
" 4 | Sophomore | \n",
"
\n",
" \n",
" 5 | Junior | \n",
"
\n",
" \n",
" 6 | Junior | \n",
"
\n",
" \n",
" 7 | Senior | \n",
"
\n",
" \n",
" 8 | Senior | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"Semester | Year\n",
"1 | First Year\n",
"2 | First Year\n",
"3 | Sophomore\n",
"4 | Sophomore\n",
"5 | Junior\n",
"6 | Junior\n",
"7 | Senior\n",
"8 | Senior"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"terms.with_column('Year', terms.apply(year_from_semesters, 'Semester'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Appending Arrays ##"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"first = np.arange(1, 6)\n",
"second = np.arange(6, 11)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"first"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 3, 4, 5, 30])"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.append(first, 30)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.append(first, second)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5])"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"first"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"first_second = np.append(first, second)\n",
"first_second"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Random Selection ##"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['treatment', 'control'], dtype='"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"Table().with_column(\n",
" 'Heads in 100 Tosses', outcomes\n",
").hist(bins = np.arange(25.5, 75.5))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now you get to try. Instead of tossing a coin 100 times, suppose you roll TWO 6-sided dice and consider the sum of their outcomes. \n",
"What is the estimated **probability** that the sum is less than or equal to 5? Use 10,000 runs of your experiment to estimate this.\n",
"Hints: \n",
"1. You will need to define the possible outcomes of 1,...,6\n",
"2. Then use the possible outcomes with np.random.choice twice - one for each roll."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 2, 3, 4, 5, 6])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Outcomes\n",
"roll = np.arange(1,7)\n",
"roll\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2, 6])"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Two rolls\n",
"np.random.choice(roll,2)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Sum of two rolls\n",
"sum(np.random.choice(roll,2))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.2803"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Setup iterations\n",
"outcomes = make_array()\n",
"\n",
"for i in np.arange(10000):\n",
" total = sum(np.random.choice(roll,2))\n",
" outcomes = np.append(outcomes, total <= 5)\n",
" \n",
"outcomes_average = sum(outcomes)/len(outcomes)\n",
"outcomes_average\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.2819"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Another option\n",
"roll1 = np.random.choice(roll,10000)\n",
"roll2 = np.random.choice(roll,10000)\n",
"total = roll1+roll2\n",
"outcomes_under_five = total <= 5\n",
"sum(outcomes_under_five)/len(total)\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Let's apply some of our new skills to the Covid-19 data\n",
"\n",
"We'll first process the data as just as we did in last class. Then, we'll sample from all of the counties, and display that subset. We'll also use a loop to create a more informative label for each bubble in the map.\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"covid_table = Table.read_table(\"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv\")\n",
"county_geo = Table.read_table(\"https://raw.githubusercontent.com/jdlafferty/covid-19/master/data/geo-counties.csv\") \n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"first_date = '2021-02-21'\n",
"\n",
"# Some subsets of states to visualize:\n",
"all_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',\n",
" 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',\n",
" 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',\n",
" 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',\n",
" 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',\n",
" 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',\n",
" 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',\n",
" 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',\n",
" 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',\n",
" 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']\n",
"\n",
"states = all_states\n",
"\n",
"recent_data = covid_table.where('date', are.above(first_date))\n",
"recent_state_data = recent_data.where('state', are.contained_in(states))\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# remove extra columns\n",
"data = recent_state_data.drop('date').drop('county').drop('state').drop('deaths')\n",
"\n",
"# exclude cases where fips is not known\n",
"data = data.where('fips', are.above(0))\n",
"\n",
"# now, group by fips and form a list of the cumlative cases\n",
"data = data.group('fips', list)\n",
"\n",
"# apply the difference function np.diff to get the new cases\n",
"data = data.with_column('new cases', data.apply(np.diff, 'cases list'))\n",
"data = data.drop('cases list')\n",
"\n",
"# Now average to get the average new cases in each county over the past week\n",
"# We add a small amount .001 to avoid zeros, which the graphics handles badly \n",
"new_cases = Table().with_columns('fips', data['fips'], \n",
" 'new cases', data.apply(np.mean, 'new cases') + .001)\n"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" \n",
" county | state | lat | lon | new cases | \n",
"
\n",
" \n",
" \n",
" \n",
" Autauga | Alabama | 32.5077 | -86.651 | 21.2867 | \n",
"
\n",
" \n",
" Baldwin | Alabama | 30.7698 | -87.7827 | 42.4296 | \n",
"
\n",
" \n",
" Blount | Alabama | 34.0128 | -86.5337 | 8.42957 | \n",
"
\n",
" \n",
" Bullock | Alabama | 32.0927 | -85.7129 | 2.28671 | \n",
"
\n",
" \n",
" Butler | Alabama | 32.0894 | -88.2213 | 3.28671 | \n",
"
\n",
" \n",
" Calhoun | Alabama | 33.7623 | -85.8421 | 40.4296 | \n",
"
\n",
" \n",
" Chambers | Alabama | 32.9188 | -85.3938 | 6.57243 | \n",
"
\n",
" \n",
" Cherokee | Alabama | 34.7555 | -87.9734 | 2.42957 | \n",
"
\n",
" \n",
" Chilton | Alabama | 32.866 | -86.6652 | 11.7153 | \n",
"
\n",
" \n",
" Choctaw | Alabama | 32.004 | -88.2858 | 0.715286 | \n",
"
\n",
" \n",
"
\n",
"... (1631 rows omitted)
"
],
"text/plain": [
"county | state | lat | lon | new cases\n",
"Autauga | Alabama | 32.5077 | -86.651 | 21.2867\n",
"Baldwin | Alabama | 30.7698 | -87.7827 | 42.4296\n",
"Blount | Alabama | 34.0128 | -86.5337 | 8.42957\n",
"Bullock | Alabama | 32.0927 | -85.7129 | 2.28671\n",
"Butler | Alabama | 32.0894 | -88.2213 | 3.28671\n",
"Calhoun | Alabama | 33.7623 | -85.8421 | 40.4296\n",
"Chambers | Alabama | 32.9188 | -85.3938 | 6.57243\n",
"Cherokee | Alabama | 34.7555 | -87.9734 | 2.42957\n",
"Chilton | Alabama | 32.866 | -86.6652 | 11.7153\n",
"Choctaw | Alabama | 32.004 | -88.2858 | 0.715286\n",
"... (1631 rows omitted)"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"state_geo = county_geo.where('state', are.contained_in(states)).sort('fips')\n",
"new_cases_geo = state_geo.join('fips', new_cases)\n",
"new_cases_geo = new_cases_geo.drop('fips')\n",
"new_cases_geo"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"Make this Notebook Trusted to load map: File -> Trust Notebook
"
],
"text/plain": [
""
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = new_cases_geo.num_rows\n",
"\n",
"# A random sample of 100 counties across the US:\n",
"rows = np.random.choice(np.arange(n), 100, replace=False)\n",
"sample = new_cases_geo.take(rows)\n",
"\n",
"labels = []\n",
"for i in np.arange(sample.num_rows):\n",
" s = sample['county'][i] + \" County, \" + \\\n",
" sample['state'][i] + \": \" + \\\n",
" str(np.round(sample['new cases'][i],1))\n",
" labels.append(s)\n",
"\n",
"dat = Table().with_columns('lat', sample['lat'], \n",
" 'long', sample['lon'], \n",
" 'labels', labels,\n",
" 'areas', 10*sample['new cases'],\n",
" 'colors', 'red')\n",
"Circle.map_table(dat, weight=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can check our results by comparing to the *Times* numbers [reported here](https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html#states)."
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}