{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# california-coronavirus-data examples\n", "\n", "By [Ben Welsh](https://palewi.re/who-is-ben-welsh)\n", "\n", "A demonstration of how to use Python to work with the Los Angeles Times' independent tally of coronavirus cases in California published on GitHub at [datadesk/california-coronavirus-data](https://github.com/datadesk/california-coronavirus-data#state-cdph-totalscsv). To run this notebook immediately in the cloud, click the [Binder](https://mybinder.org/) launcher below.\n", "\n", "[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/datadesk/california-coronavirus-data/master?urlpath=lab/tree/notebooks/examples.ipynb)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%load_ext lab_black" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import Python tools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our data analysis and plotting tools" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import altair as alt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Customizations to the Altair theme" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import altair_latimes as lat" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ThemeRegistry.enable('latimes')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.themes.register(\"latimes\", lat.theme)\n", "alt.themes.enable(\"latimes\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataTransformerRegistry.enable('default')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.data_transformers.disable_max_rows()" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## Import data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the agency totals" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "agency_df = pd.read_csv(\"../latimes-agency-totals.csv\", parse_dates=[\"date\"])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agencycountyfipsdateconfirmed_casesdeathsrecoveriesdid_not_update
0AlamedaAlameda12020-06-214686117.0NaNNaN
1BerkeleyAlameda12020-06-211191.0NaNNaN
2AlpineAlpine32020-06-2110.01.0True
3AmadorAmador52020-06-21130.010.0NaN
4ButteButte72020-06-21941.072.0True
\n", "
" ], "text/plain": [ " agency county fips date confirmed_cases deaths recoveries \\\n", "0 Alameda Alameda 1 2020-06-21 4686 117.0 NaN \n", "1 Berkeley Alameda 1 2020-06-21 119 1.0 NaN \n", "2 Alpine Alpine 3 2020-06-21 1 0.0 1.0 \n", "3 Amador Amador 5 2020-06-21 13 0.0 10.0 \n", "4 Butte Butte 7 2020-06-21 94 1.0 72.0 \n", "\n", " did_not_update \n", "0 NaN \n", "1 NaN \n", "2 True \n", "3 NaN \n", "4 True " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "agency_df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 6293 entries, 0 to 6292\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 agency 6293 non-null object \n", " 1 county 6293 non-null object \n", " 2 fips 6293 non-null int64 \n", " 3 date 6293 non-null datetime64[ns]\n", " 4 confirmed_cases 6293 non-null int64 \n", " 5 deaths 6292 non-null float64 \n", " 6 recoveries 2095 non-null float64 \n", " 7 did_not_update 893 non-null object \n", "dtypes: datetime64[ns](1), float64(2), int64(2), object(3)\n", "memory usage: 393.4+ KB\n" ] } ], "source": [ "agency_df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aggregate data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### By state" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lump all the agencies together and you get the statewide totals." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "state_df = (\n", " agency_df.groupby([\"date\"])\n", " .agg({\"confirmed_cases\": \"sum\", \"deaths\": \"sum\"})\n", " .reset_index()\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateconfirmed_casesdeaths
02020-01-2620.0
12020-01-2730.0
22020-01-2830.0
32020-01-2940.0
42020-01-3040.0
\n", "
" ], "text/plain": [ " date confirmed_cases deaths\n", "0 2020-01-26 2 0.0\n", "1 2020-01-27 3 0.0\n", "2 2020-01-28 3 0.0\n", "3 2020-01-29 4 0.0\n", "4 2020-01-30 4 0.0" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "state_df.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 148 entries, 0 to 147\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 148 non-null datetime64[ns]\n", " 1 confirmed_cases 148 non-null int64 \n", " 2 deaths 148 non-null float64 \n", "dtypes: datetime64[ns](1), float64(1), int64(1)\n", "memory usage: 3.6 KB\n" ] } ], "source": [ "state_df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### By county" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Three cities — Berkeley, Long Beach and Pasadena — run independent public health departments. Calculating county-level totals requires grouping them with their local peers." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "county_df = (\n", " agency_df.groupby([\"date\", \"county\"])\n", " .agg({\"confirmed_cases\": \"sum\", \"deaths\": \"sum\"})\n", " .reset_index()\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecountyconfirmed_casesdeaths
02020-01-26Alameda00.0
12020-01-26Calaveras00.0
22020-01-26Contra Costa00.0
32020-01-26Humboldt00.0
42020-01-26Los Angeles10.0
\n", "
" ], "text/plain": [ " date county confirmed_cases deaths\n", "0 2020-01-26 Alameda 0 0.0\n", "1 2020-01-26 Calaveras 0 0.0\n", "2 2020-01-26 Contra Costa 0 0.0\n", "3 2020-01-26 Humboldt 0 0.0\n", "4 2020-01-26 Los Angeles 1 0.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "county_df.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 5948 entries, 0 to 5947\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 5948 non-null datetime64[ns]\n", " 1 county 5948 non-null object \n", " 2 confirmed_cases 5948 non-null int64 \n", " 3 deaths 5948 non-null float64 \n", "dtypes: datetime64[ns](1), float64(1), int64(1), object(1)\n", "memory usage: 186.0+ KB\n" ] } ], "source": [ "county_df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chart the statewide totals over time" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a base chart with the common x-axis\n", "chart = alt.Chart(state_df).encode(x=alt.X(\"date:T\", title=None))\n", "\n", "# Create the cases line\n", "cases = chart.mark_line(color=lat.palette[\"default\"]).encode(\n", " y=alt.Y(\"confirmed_cases:Q\", title=\"Confirmed cases\")\n", ")\n", "\n", "# Create the deaths line\n", "deaths = chart.mark_line(color=lat.palette[\"schemes\"][\"ice-7\"][3]).encode(\n", " y=alt.Y(\"deaths:Q\", title=\"Deaths\")\n", ")\n", "\n", "# Combine them into a single chart\n", "(cases & deaths).properties(title=\"Statewide cumulative totals\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chart the county totals" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First on a linear scale" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create the base chart\n", "chart = (\n", " alt.Chart(county_df)\n", " .mark_line()\n", " .encode(\n", " x=alt.X(\"date:T\", title=None),\n", " color=alt.Color(\"county:N\", title=\"County\", legend=None),\n", " )\n", ")\n", "\n", "# The cases line\n", "cases = chart.encode(y=alt.Y(\"confirmed_cases:Q\", title=\"Confirmed cases\"),)\n", "\n", "# The deaths line\n", "deaths = chart.mark_line().encode(y=alt.Y(\"deaths:Q\", title=\"Deaths\"),)\n", "\n", "# Combined into a chart\n", "(cases & deaths).properties(title=\"Cumulative totals by county\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Again on a logarithmic scale" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a base chart\n", "chart = (\n", " alt.Chart(county_df)\n", " .mark_line()\n", " .encode(\n", " x=alt.X(\"date:T\", title=None),\n", " color=alt.Color(\"county:N\", title=\"County\", legend=None),\n", " )\n", ")\n", "\n", "# The cases lines\n", "cases = chart.transform_filter(alt.datum.confirmed_cases > 0).encode(\n", " y=alt.Y(\"confirmed_cases:Q\", scale=alt.Scale(type=\"log\"), title=\"Confirmed cases\"),\n", ")\n", "\n", "# The deaths lines\n", "deaths = chart.transform_filter(alt.datum.deaths > 0).encode(\n", " y=alt.Y(\"deaths:Q\", scale=alt.Scale(type=\"log\"), title=\"Deaths\"),\n", ")\n", "\n", "# Slapping them together\n", "(cases & deaths).properties(title=\"Cumulative totals by county\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A common technique for clarifying these charts is to begin each line on the day the county hit a minimum number of cases. Let's try it with 10." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "day_10_df = (\n", " county_df[\n", " # Filter down to only days with 10 or more cumulative cases\n", " county_df.confirmed_cases\n", " >= 10\n", " ]\n", " .groupby(\n", " # And then get the minimum date for each county\n", " \"county\"\n", " )\n", " .date.min()\n", " .reset_index()\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Merge that date to each row in the data." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "county_date_diff_df = county_df.merge(\n", " day_10_df, how=\"inner\", on=\"county\", suffixes=[\"\", \"_gte_10_cases\"]\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculate each day's distance from its tenth day." 
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "county_date_diff_df[\"days_since_10\"] = (\n", " county_date_diff_df.date - county_date_diff_df.date_gte_10_cases\n", ").dt.days" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Chart it." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.Chart(county_date_diff_df).transform_filter(\n", " # Only keep everything once they hit 10 cases\n", " alt.datum.days_since_10\n", " >= 0\n", ").mark_line().encode(\n", " x=alt.X(\"days_since_10:O\", title=\"Days since 10th case\"),\n", " y=alt.Y(\"confirmed_cases:Q\", scale=alt.Scale(type=\"log\"), title=\"Confirmed cases\"),\n", " color=alt.Color(\"county:N\", title=\"County\", legend=None),\n", ").properties(\n", " title=\"Cumulative totals by county\"\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## County trends on a linear 'Pez' plot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fill in any date gaps so that every county has a row for every date. The totals are cumulative, so a gap after a county's first report carries its previous totals forward, and the dates before its first report are counted as zero." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "backfilled_county_df = (\n", "    county_df.set_index([\"county\", \"date\"])\n", "    .unstack(\"county\")\n", "    # One row per date, in date order, so the forward fill runs through time\n", "    .sort_index()\n", "    # Carry the last reported cumulative totals across any missing dates;\n", "    # zero-filling them would show up as a drop and a spike in the daily change\n", "    .ffill()\n", "    # Anything still missing comes before the county's first report\n", "    .fillna(0)\n", "    .stack(\"county\")\n", "    .reset_index()\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sort the rows so that each county's dates run in chronological order." 
] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "chronological_county_df = backfilled_county_df.sort_values([\"county\", \"date\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculate the daily change in each county." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "chronological_county_df[\"new_confirmed_cases\"] = chronological_county_df.groupby(\n", " \"county\"\n", ").confirmed_cases.diff()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's chill that out as a seven-day average." 
] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "chronological_county_df[\"new_confirmed_cases_rolling_average\"] = (\n", " chronological_county_df.groupby(\"county\")\n", " .new_confirmed_cases.rolling(7)\n", " .mean()\n", " .droplevel(0)\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make the chart." ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.Chart(chronological_county_df, title=\"New cases by day\").mark_rect(\n", " stroke=None\n", ").encode(\n", " x=alt.X(\n", " \"date:O\", axis=alt.Axis(ticks=False, grid=False, labels=False,), title=None\n", " ),\n", " y=alt.Y(\n", " \"county:N\",\n", " title=\"County\",\n", " axis=alt.Axis(ticks=False, grid=False, labelPadding=5),\n", " ),\n", " color=alt.Color(\n", " \"new_confirmed_cases_rolling_average:Q\",\n", " scale=alt.Scale(\n", " type=\"threshold\", domain=[0, 3, 10, 25, 50, 100, 500], scheme=\"blues\"\n", " ),\n", " title=\"New cases (7-day average)\",\n", " ),\n", ").properties(\n", " height=800\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chart new cases and deaths" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculate the number of new cases each day using pandas' [diff](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.diff.html) method." ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "state_df[\"new_confirmed_cases\"] = state_df.confirmed_cases.diff()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do the same for deaths." ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "state_df[\"new_deaths\"] = state_df.deaths.diff()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now calculate the moving seven-day average of each using pandas' [rolling](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html) method." 
] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "state_df[\"new_confirmed_cases_rolling_average\"] = state_df.new_confirmed_cases.rolling(\n", " 7\n", ").mean()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "state_df[\"new_deaths_rolling_average\"] = state_df.new_deaths.rolling(7).mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Put it all together on the chart " ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One base chart object with the data they all share\n", "chart = alt.Chart(state_df).encode(x=alt.X(\"date:T\", title=None))\n", "\n", "# The new cases bars\n", "cases_bars = chart.mark_bar(color=lat.palette[\"default\"]).encode(\n", "    y=alt.Y(\"new_confirmed_cases:Q\", title=\"New confirmed cases\")\n", ")\n", "\n", "# The cases rolling average\n", "cases_line = chart.mark_line(color=lat.palette[\"accent\"]).encode(\n", "    y=alt.Y(\"new_confirmed_cases_rolling_average:Q\", title=\"7-day average\")\n", ")\n", "\n", "# The new deaths bars\n", "deaths_bars = chart.mark_bar(color=lat.palette[\"schemes\"][\"ice-7\"][3]).encode(\n", "    y=alt.Y(\"new_deaths:Q\", title=\"New deaths\")\n", ")\n", "\n", "# The deaths rolling average\n", "deaths_line = chart.mark_line(color=lat.palette[\"schemes\"][\"ice-7\"][6]).encode(\n", "    y=alt.Y(\"new_deaths_rolling_average:Q\", title=\"7-day average\")\n", ")\n", "\n", "# Layer each average over its bars, then stack the cases panel above the deaths panel\n", "((cases_bars + cases_line) & (deaths_bars + deaths_line)).properties(\n", "    title=\"New cases and deaths statewide by day\"\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now do it by county" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecountyconfirmed_casesdeathsnew_confirmed_casesnew_confirmed_cases_rolling_average
02020-01-26Alameda0.00.0NaNNaN
582020-01-27Alameda0.00.00.0NaN
1162020-01-28Alameda0.00.00.0NaN
1742020-01-29Alameda0.00.00.0NaN
2322020-01-30Alameda0.00.00.0NaN
\n", "
" ], "text/plain": [ " date county confirmed_cases deaths new_confirmed_cases \\\n", "0 2020-01-26 Alameda 0.0 0.0 NaN \n", "58 2020-01-27 Alameda 0.0 0.0 0.0 \n", "116 2020-01-28 Alameda 0.0 0.0 0.0 \n", "174 2020-01-29 Alameda 0.0 0.0 0.0 \n", "232 2020-01-30 Alameda 0.0 0.0 0.0 \n", "\n", " new_confirmed_cases_rolling_average \n", "0 NaN \n", "58 NaN \n", "116 NaN \n", "174 NaN \n", "232 NaN " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chronological_county_df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Try it by county" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alt.Chart(chronological_county_df, title=\"New cases by day\").mark_line().encode(\n", " x=alt.X(\"date:O\", axis=alt.Axis(ticks=False, grid=False, labels=False), title=None),\n", " y=alt.Y(\"new_confirmed_cases_rolling_average:Q\", title=\"7-day average\"),\n", " color=alt.Color(\"county:N\", title=\"County\", legend=None),\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a statistic to measure recent changes in new cases" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecountyconfirmed_casesdeathsnew_confirmed_casesnew_confirmed_cases_rolling_average
78292020-06-08Yuba32.01.00.00.285714
78872020-06-09Yuba32.01.00.00.142857
79452020-06-10Yuba33.01.01.00.285714
80032020-06-11Yuba34.01.01.00.428571
80612020-06-12Yuba35.01.01.00.428571
81192020-06-13Yuba35.01.00.00.428571
81772020-06-14Yuba35.01.00.00.428571
82352020-06-15Yuba36.01.01.00.571429
82932020-06-16Yuba37.01.01.00.714286
83512020-06-17Yuba39.01.02.00.857143
84092020-06-18Yuba39.01.00.00.714286
84672020-06-19Yuba41.01.02.00.857143
85252020-06-20Yuba42.01.01.01.000000
85832020-06-21Yuba42.01.00.01.000000
\n", "
" ], "text/plain": [ " date county confirmed_cases deaths new_confirmed_cases \\\n", "7829 2020-06-08 Yuba 32.0 1.0 0.0 \n", "7887 2020-06-09 Yuba 32.0 1.0 0.0 \n", "7945 2020-06-10 Yuba 33.0 1.0 1.0 \n", "8003 2020-06-11 Yuba 34.0 1.0 1.0 \n", "8061 2020-06-12 Yuba 35.0 1.0 1.0 \n", "8119 2020-06-13 Yuba 35.0 1.0 0.0 \n", "8177 2020-06-14 Yuba 35.0 1.0 0.0 \n", "8235 2020-06-15 Yuba 36.0 1.0 1.0 \n", "8293 2020-06-16 Yuba 37.0 1.0 1.0 \n", "8351 2020-06-17 Yuba 39.0 1.0 2.0 \n", "8409 2020-06-18 Yuba 39.0 1.0 0.0 \n", "8467 2020-06-19 Yuba 41.0 1.0 2.0 \n", "8525 2020-06-20 Yuba 42.0 1.0 1.0 \n", "8583 2020-06-21 Yuba 42.0 1.0 0.0 \n", "\n", " new_confirmed_cases_rolling_average \n", "7829 0.285714 \n", "7887 0.142857 \n", "7945 0.285714 \n", "8003 0.428571 \n", "8061 0.428571 \n", "8119 0.428571 \n", "8177 0.428571 \n", "8235 0.571429 \n", "8293 0.714286 \n", "8351 0.857143 \n", "8409 0.714286 \n", "8467 0.857143 \n", "8525 1.000000 \n", "8583 1.000000 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chronological_county_df.tail(14)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "chronological_county_df[\n", " \"new_confirmed_cases_rolling_average_two_week_pct_change\"\n", "] = chronological_county_df.groupby(\n", " \"county\"\n", ").new_confirmed_cases_rolling_average.pct_change(\n", " 14\n", ")" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "latest_county_df = chronological_county_df[\n", " chronological_county_df.date == chronological_county_df.date.max()\n", "]" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "biggest_county_jumps = latest_county_df[\n", " latest_county_df.new_confirmed_cases_rolling_average >= 25\n", "].sort_values(\n", " \"new_confirmed_cases_rolling_average_two_week_pct_change\", ascending=False\n", ")" ] }, { "cell_type": "code", "execution_count": 75, 
"metadata": {}, "outputs": [], "source": [ "def facet_wrap(subplts, plots_per_row):\n", "    \"\"\"Lay out a flat list of charts as a grid that wraps after `plots_per_row` columns.\n", "\n", "    Args:\n", "        subplts: Sequence of Altair charts, placed left to right, then top to bottom.\n", "        plots_per_row: Number of charts in each row; the last row may hold fewer.\n", "\n", "    Returns:\n", "        A vertical concatenation holding one horizontally concatenated row per chunk.\n", "    \"\"\"\n", "    rows = [\n", "        subplts[i : i + plots_per_row] for i in range(0, len(subplts), plots_per_row)\n", "    ]\n", "    # Charts sit side by side within a row (hconcat) and rows stack top to bottom\n", "    # (vconcat). Each level is built directly from its members rather than by\n", "    # appending to an empty seed chart, which would remain in the layout as an\n", "    # empty placeholder.\n", "    return alt.vconcat(*[alt.hconcat(*row) for row in rows])" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chart_list = []\n", "for county in list(biggest_county_jumps.head(12).county):\n", " this_df = chronological_county_df[chronological_county_df.county == county]\n", " chart = alt.Chart(this_df, title=county).encode(\n", " x=alt.X(\"date:T\", title=None, axis=None),\n", " )\n", " lines = chart.mark_line(color=lat.palette[\"accent\"]).encode(\n", " y=alt.Y(\"new_confirmed_cases_rolling_average:Q\", title=None,),\n", " )\n", " bars = chart.mark_bar(color=lat.palette[\"default\"], opacity=0.33).encode(\n", " y=alt.Y(\"new_confirmed_cases:Q\", title=\"New confirmed cases\",),\n", " )\n", " chart_list.append((bars + lines).properties(height=200, width=250))\n", "facet_wrap(chart_list, plots_per_row=4)" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chart_list = []\n", "for county in list(biggest_county_jumps.tail(12).county):\n", " this_df = chronological_county_df[chronological_county_df.county == county]\n", " chart = alt.Chart(this_df, title=county).encode(\n", " x=alt.X(\"date:T\", title=None, axis=None),\n", " )\n", " lines = chart.mark_line(color=lat.palette[\"accent\"]).encode(\n", " y=alt.Y(\"new_confirmed_cases_rolling_average:Q\", title=None,),\n", " )\n", " bars = chart.mark_bar(color=lat.palette[\"default\"], opacity=0.33).encode(\n", " y=alt.Y(\"new_confirmed_cases:Q\", title=\"New confirmed cases\",),\n", " )\n", " chart_list.append((bars + lines).properties(height=200, width=250))\n", "facet_wrap(chart_list, plots_per_row=4)" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 22.000000\n", "mean 0.997674\n", "std 0.959813\n", "min -0.311563\n", "25% 0.362187\n", "50% 0.772164\n", "75% 1.450797\n", "max 3.142857\n", "Name: new_confirmed_cases_rolling_average_two_week_pct_change, dtype: float64" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "biggest_county_jumps.new_confirmed_cases_rolling_average_two_week_pct_change.describe()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datecountyconfirmed_casesdeathsnew_confirmed_casesnew_confirmed_cases_rolling_averagenew_confirmed_cases_rolling_average_two_week_pct_change
85412020-06-21Kings2104.016.013.042.714286-0.281250
85382020-06-21Imperial4800.064.00.091.857143-0.311563
\n", "
" ], "text/plain": [ " date county confirmed_cases deaths new_confirmed_cases \\\n", "8541 2020-06-21 Kings 2104.0 16.0 13.0 \n", "8538 2020-06-21 Imperial 4800.0 64.0 0.0 \n", "\n", " new_confirmed_cases_rolling_average \\\n", "8541 42.714286 \n", "8538 91.857143 \n", "\n", " new_confirmed_cases_rolling_average_two_week_pct_change \n", "8541 -0.281250 \n", "8538 -0.311563 " ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "biggest_county_jumps[\n", " biggest_county_jumps.new_confirmed_cases_rolling_average_two_week_pct_change < 0\n", "]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.10" } }, "nbformat": 4, "nbformat_minor": 4 }