{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Recap: Comparisons with arrays" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "pets = make_array('cat', 'dog', 'cat', 'cat', 'dog', 'rabbit')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, True, False, False, True, False])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pets == 'dog'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(pets == 'dog')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, True, False, False, True, True])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pets > 'cat'" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(pets > 'cat')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Recap: Predicates and advanced `where`" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", "
Semester
1
2
3
4
5
6
7
8
" ], "text/plain": [ "Semester\n", "1\n", "2\n", "3\n", "4\n", "5\n", "6\n", "7\n", "8" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "terms = Table().with_column('Semester', np.arange(1, 9))\n", "terms" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Semester
7
8
" ], "text/plain": [ "Semester\n", "7\n", "8" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "terms.where('Semester', are.above(6))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Semester
7
8
" ], "text/plain": [ "Semester\n", "7\n", "8" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "terms.where([False, False, False, False, False, False, True, True])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Conditional Statements " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def year_from_semesters(x):\n", " if x <= 0:\n", " return 'Not a valid input'\n", " elif x <= 2:\n", " return 'First Year'\n", " elif x <= 4:\n", " return 'Sophomore'\n", " elif x <= 6:\n", " return 'Junior'\n", " elif x <= 8:\n", " return 'Senior'\n", " else:\n", " return \"NA\"" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Not a valid input'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "year_from_semesters(-15.6)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Junior'" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "year_from_semesters(5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'NA'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "year_from_semesters(9001)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Semester Year
1 First Year
2 First Year
3 Sophomore
4 Sophomore
5 Junior
6 Junior
7 Senior
8 Senior
" ], "text/plain": [ "Semester | Year\n", "1 | First Year\n", "2 | First Year\n", "3 | Sophomore\n", "4 | Sophomore\n", "5 | Junior\n", "6 | Junior\n", "7 | Senior\n", "8 | Senior" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "terms.with_column('Year', terms.apply(year_from_semesters, 'Semester'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Appending Arrays ##" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "first = np.arange(1, 6)\n", "second = np.arange(6, 11)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 3, 4, 5])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 2, 3, 4, 5, 30])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.append(first, 30)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.append(first, second)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 3, 4, 5])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "first_second = np.append(first, second)\n", "first_second" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random Selection ##" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['treatment', 'control'], dtype='" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "Table().with_column(\n", " 'Heads in 100 Tosses', outcomes\n", ").hist(bins = np.arange(25.5, 75.5))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now you get to try. Instead of tossing a coin 100 times, suppose you roll TWO 6-sided dice and consider the sum of their outcomes. \n", "What is the estimated **probability** that the sum is less than or equal to 5? Use 10,000 runs of your experiment to estimate this.\n", "Hints: \n", "1. You will need to define the possible outcomes of 1,...,6\n", "2. Then use the possible outcomes with np.random.choice twice - one for each roll." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 2, 3, 4, 5, 6])" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Outcomes\n", "roll = np.arange(1,7)\n", "roll\n" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([2, 6])" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Two rolls\n", "np.random.choice(roll,2)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Sum of two rolls\n", "sum(np.random.choice(roll,2))" ] }, { "cell_type": "code", "execution_count": 47, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.2803" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Setup iterations\n", "outcomes = make_array()\n", "\n", "for i in np.arange(10000):\n", " total = sum(np.random.choice(roll,2))\n", " outcomes = np.append(outcomes, total <= 5)\n", " \n", "outcomes_average = sum(outcomes)/len(outcomes)\n", "outcomes_average\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.2819" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Another option\n", "roll1 = np.random.choice(roll,10000)\n", "roll2 = np.random.choice(roll,10000)\n", "total = roll1+roll2\n", "outcomes_under_five = total <= 5\n", "sum(outcomes_under_five)/len(total)\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Let's apply some of our new skills to the Covid-19 data\n", "\n", "We'll first process the data as just as we did in last class. Then, we'll sample from all of the counties, and display that subset. 
We'll also use a loop to create a more informative label for each bubble in the map.\n" ] },
 { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [],
 "source": [
 "covid_table = Table.read_table(\"https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv\")\n",
 "county_geo = Table.read_table(\"https://raw.githubusercontent.com/jdlafferty/covid-19/master/data/geo-counties.csv\") \n" ] },
 { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [],
 "source": [
 "# Dates are 'YYYY-MM-DD' strings, so their sorted order is chronological.\n",
 "report_dates = np.unique(covid_table['date'])\n",
 "\n",
 "# Keep only the 8 most recent report dates: 8 cumulative counts per county\n",
 "# give 7 daily differences, i.e. one week of new cases. A fixed cutoff such\n",
 "# as '2021-02-21' stops meaning 'the past week' as soon as the data moves on.\n",
 "first_date = report_dates[-9]\n",
 "\n",
 "# Some subsets of states to visualize:\n",
 "all_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',\n",
 "              'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',\n",
 "              'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',\n",
 "              'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',\n",
 "              'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',\n",
 "              'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',\n",
 "              'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',\n",
 "              'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',\n",
 "              'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',\n",
 "              'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']\n",
 "\n",
 "states = all_states\n",
 "\n",
 "# are.above is a strict comparison, so first_date itself is excluded\n",
 "recent_data = covid_table.where('date', are.above(first_date))\n",
 "recent_state_data = recent_data.where('state', are.contained_in(states))\n" ] },
 { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [],
 "source": [
 "# remove extra columns\n",
 "data = recent_state_data.drop('date').drop('county').drop('state').drop('deaths')\n",
 "\n",
 "# exclude cases where fips is not known\n",
 "data = data.where('fips', are.above(0))\n",
 "\n",
 "# now, group by fips and form a list of the cumulative cases\n",
 "data = data.group('fips', list)\n",
 "\n",
 "# apply the difference function np.diff to get the new cases\n",
 "data = data.with_column('new cases', 
data.apply(np.diff, 'cases list'))\n", "data = data.drop('cases list')\n", "\n", "# Now average to get the average new cases in each county over the past week\n", "# We add a small amount .001 to avoid zeros, which the graphics handles badly \n", "new_cases = Table().with_columns('fips', data['fips'], \n", " 'new cases', data.apply(np.mean, 'new cases') + .001)\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
county state lat lon new cases
Autauga Alabama 32.5077 -86.651 21.2867
Baldwin Alabama 30.7698 -87.7827 42.4296
Blount Alabama 34.0128 -86.5337 8.42957
Bullock Alabama 32.0927 -85.7129 2.28671
Butler Alabama 32.0894 -88.2213 3.28671
Calhoun Alabama 33.7623 -85.8421 40.4296
Chambers Alabama 32.9188 -85.3938 6.57243
Cherokee Alabama 34.7555 -87.9734 2.42957
Chilton Alabama 32.866 -86.6652 11.7153
Choctaw Alabama 32.004 -88.2858 0.715286
\n", "

... (1631 rows omitted)

" ], "text/plain": [ "county | state | lat | lon | new cases\n", "Autauga | Alabama | 32.5077 | -86.651 | 21.2867\n", "Baldwin | Alabama | 30.7698 | -87.7827 | 42.4296\n", "Blount | Alabama | 34.0128 | -86.5337 | 8.42957\n", "Bullock | Alabama | 32.0927 | -85.7129 | 2.28671\n", "Butler | Alabama | 32.0894 | -88.2213 | 3.28671\n", "Calhoun | Alabama | 33.7623 | -85.8421 | 40.4296\n", "Chambers | Alabama | 32.9188 | -85.3938 | 6.57243\n", "Cherokee | Alabama | 34.7555 | -87.9734 | 2.42957\n", "Chilton | Alabama | 32.866 | -86.6652 | 11.7153\n", "Choctaw | Alabama | 32.004 | -88.2858 | 0.715286\n", "... (1631 rows omitted)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_geo = county_geo.where('state', are.contained_in(states)).sort('fips')\n", "new_cases_geo = state_geo.join('fips', new_cases)\n", "new_cases_geo = new_cases_geo.drop('fips')\n", "new_cases_geo" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
Make this Notebook Trusted to load map: File -> Trust Notebook
" ], "text/plain": [ "" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "n = new_cases_geo.num_rows\n", "\n", "# A random sample of 100 counties across the US:\n", "rows = np.random.choice(np.arange(n), 100, replace=False)\n", "sample = new_cases_geo.take(rows)\n", "\n", "labels = []\n", "for i in np.arange(sample.num_rows):\n", " s = sample['county'][i] + \" County, \" + \\\n", " sample['state'][i] + \": \" + \\\n", " str(np.round(sample['new cases'][i],1))\n", " labels.append(s)\n", "\n", "dat = Table().with_columns('lat', sample['lat'], \n", " 'long', sample['lon'], \n", " 'labels', labels,\n", " 'areas', 10*sample['new cases'],\n", " 'colors', 'red')\n", "Circle.map_table(dat, weight=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can check our results by comparing to the *Times* numbers [reported here](https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html#states)." ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 1 }