{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Loading libraries and data " ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import altair as alt\n", "\n", "from github import Github\n", "import configparser" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [], "source": [ "config = configparser.ConfigParser()\n", "config.read('secrets.ini');" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [], "source": [ "g = Github(config['corona']['api_key'])" ] }, { "cell_type": "code", "execution_count": 209, "metadata": {}, "outputs": [], "source": [ "START_CASES_INFECTION = 100\n", "START_CASES_DEATH = 10" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [], "source": [ "URL = \"https://covid.ourworldindata.org/data/ecdc/full_data.csv\"" ] }, { "cell_type": "code", "execution_count": 211, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(URL)" ] }, { "cell_type": "code", "execution_count": 212, "metadata": {}, "outputs": [], "source": [ "df.date = pd.to_datetime(df.date)" ] }, { "cell_type": "code", "execution_count": 213, "metadata": {}, "outputs": [], "source": [ "df = df.sort_values(['location','date'], ascending=True)" ] }, { "cell_type": "code", "execution_count": 214, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7213" ] }, "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape[0]" ] }, { "cell_type": "code", "execution_count": 215, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datelocationnew_casesnew_deathstotal_casestotal_deaths
02019-12-31Afghanistan0000
12020-01-01Afghanistan0000
22020-01-02Afghanistan0000
32020-01-03Afghanistan0000
42020-01-04Afghanistan0000
\n", "
" ], "text/plain": [ " date location new_cases new_deaths total_cases total_deaths\n", "0 2019-12-31 Afghanistan 0 0 0 0\n", "1 2020-01-01 Afghanistan 0 0 0 0\n", "2 2020-01-02 Afghanistan 0 0 0 0\n", "3 2020-01-03 Afghanistan 0 0 0 0\n", "4 2020-01-04 Afghanistan 0 0 0 0" ] }, "execution_count": 215, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 7213 entries, 0 to 7212\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 date 7213 non-null datetime64[ns]\n", " 1 location 7213 non-null object \n", " 2 new_cases 7213 non-null int64 \n", " 3 new_deaths 7213 non-null int64 \n", " 4 total_cases 7213 non-null int64 \n", " 5 total_deaths 7213 non-null int64 \n", "dtypes: datetime64[ns](1), int64(4), object(1)\n", "memory usage: 394.5+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Date of data" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2020-03-27 00:00:00')" ] }, "execution_count": 217, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.date.max()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Creating data for graphs" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [], "source": [ "df = df.drop(columns=['new_cases', 'new_deaths'])" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [], "source": [ "df_infection = df[df.total_cases >= START_CASES_INFECTION].copy()\n", "df_death = df[df.total_deaths >= START_CASES_DEATH].copy()" ] }, { "cell_type": "code", "execution_count": 220, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datelocationtotal_casestotal_deaths
932020-03-24Albania1004
942020-03-25Albania1235
952020-03-26Albania1465
962020-03-27Albania1746
1752020-03-23Algeria10215
\n", "
" ], "text/plain": [ " date location total_cases total_deaths\n", "93 2020-03-24 Albania 100 4\n", "94 2020-03-25 Albania 123 5\n", "95 2020-03-26 Albania 146 5\n", "96 2020-03-27 Albania 174 6\n", "175 2020-03-23 Algeria 102 15" ] }, "execution_count": 220, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_infection.head()" ] }, { "cell_type": "code", "execution_count": 221, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datelocationtotal_casestotal_deaths
4882020-03-25Austria528230
4892020-03-26Austria588834
4902020-03-27Austria702949
\n", "
" ], "text/plain": [ " date location total_cases total_deaths\n", "488 2020-03-25 Austria 5282 30\n", "489 2020-03-26 Austria 5888 34\n", "490 2020-03-27 Austria 7029 49" ] }, "execution_count": 221, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_infection[df_infection.location == 'Austria'].tail(3)" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datelocationtotal_casestotal_deaths
1732020-03-21Algeria9410
1742020-03-22Algeria9410
1752020-03-23Algeria10215
1762020-03-24Algeria18917
1772020-03-25Algeria23117
\n", "
" ], "text/plain": [ " date location total_cases total_deaths\n", "173 2020-03-21 Algeria 94 10\n", "174 2020-03-22 Algeria 94 10\n", "175 2020-03-23 Algeria 102 15\n", "176 2020-03-24 Algeria 189 17\n", "177 2020-03-25 Algeria 231 17" ] }, "execution_count": 222, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_death.head()" ] }, { "cell_type": "code", "execution_count": 223, "metadata": {}, "outputs": [], "source": [ "df_infection['total_cases_normalized'] = df_infection.groupby(\n", " \"location\")[['total_cases']].transform(lambda x: x / x.min() * START_CASES_INFECTION)\n", "df_infection.total_cases_normalized = df_infection.total_cases_normalized.astype(\"int\")\n", "\n", "df_death['total_deaths_normalized'] = df_death.groupby(\n", " \"location\")[['total_deaths']].transform(lambda x: x / x.min() * START_CASES_DEATH)\n", "df_death.total_deaths_normalized = df_death.total_deaths_normalized.astype(\"int\")" ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [], "source": [ "df_infection[\"days_after\"] = df_infection.groupby(\"location\").cumcount()\n", "df_death[\"days_after\"] = df_death.groupby(\"location\").cumcount()" ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datelocationtotal_casestotal_deathstotal_cases_normalizeddays_after
932020-03-24Albania10041000
942020-03-25Albania12351231
952020-03-26Albania14651462
962020-03-27Albania17461743
1752020-03-23Algeria102151000
\n", "
" ], "text/plain": [ " date location total_cases total_deaths total_cases_normalized \\\n", "93 2020-03-24 Albania 100 4 100 \n", "94 2020-03-25 Albania 123 5 123 \n", "95 2020-03-26 Albania 146 5 146 \n", "96 2020-03-27 Albania 174 6 174 \n", "175 2020-03-23 Algeria 102 15 100 \n", "\n", " days_after \n", "93 0 \n", "94 1 \n", "95 2 \n", "96 3 \n", "175 0 " ] }, "execution_count": 225, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_infection.head()" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datelocationtotal_casestotal_deathstotal_deaths_normalizeddays_after
1732020-03-21Algeria9410100
1742020-03-22Algeria9410101
1752020-03-23Algeria10215152
1762020-03-24Algeria18917173
1772020-03-25Algeria23117174
\n", "
" ], "text/plain": [ " date location total_cases total_deaths total_deaths_normalized \\\n", "173 2020-03-21 Algeria 94 10 10 \n", "174 2020-03-22 Algeria 94 10 10 \n", "175 2020-03-23 Algeria 102 15 15 \n", "176 2020-03-24 Algeria 189 17 17 \n", "177 2020-03-25 Algeria 231 17 17 \n", "\n", " days_after \n", "173 0 \n", "174 1 \n", "175 2 \n", "176 3 \n", "177 4 " ] }, "execution_count": 226, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_death.head()" ] }, { "cell_type": "code", "execution_count": 227, "metadata": {}, "outputs": [], "source": [ "df_summary = df[df.date == df.date.max()].drop(columns='date')\n", "df_summary['cases'] = df_summary.total_cases\n", "df_summary = df_summary.drop(['total_cases', 'total_deaths'], axis=1)" ] }, { "cell_type": "code", "execution_count": 228, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
locationcases
77Afghanistan75
96Albania174
179Algeria305
193Andorra224
199Angola3
\n", "
" ], "text/plain": [ " location cases\n", "77 Afghanistan 75\n", "96 Albania 174\n", "179 Algeria 305\n", "193 Andorra 224\n", "199 Angola 3" ] }, "execution_count": 228, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_summary.head()" ] }, { "cell_type": "code", "execution_count": 229, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Albania', 'Algeria', 'Andorra', 'Argentina', 'Armenia',\n", " 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Belgium',\n", " 'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',\n", " 'Burkina Faso', 'Canada', 'Chile', 'China', 'Colombia',\n", " 'Costa Rica', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark',\n", " 'Dominican Republic', 'Ecuador', 'Egypt', 'Estonia',\n", " 'Faeroe Islands', 'Finland', 'France', 'Germany', 'Ghana',\n", " 'Greece', 'Hungary', 'Iceland', 'India', 'Indonesia',\n", " 'International', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy',\n", " 'Japan', 'Jordan', 'Kazakhstan', 'Kuwait', 'Latvia', 'Lebanon',\n", " 'Lithuania', 'Luxembourg', 'Macedonia', 'Malaysia', 'Malta',\n", " 'Mexico', 'Moldova', 'Morocco', 'Netherlands', 'New Zealand',\n", " 'Norway', 'Oman', 'Pakistan', 'Panama', 'Peru', 'Philippines',\n", " 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'San Marino',\n", " 'Saudi Arabia', 'Senegal', 'Serbia', 'Singapore', 'Slovakia',\n", " 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka',\n", " 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Tunisia', 'Turkey',\n", " 'Ukraine', 'United Arab Emirates', 'United Kingdom',\n", " 'United States', 'Uruguay', 'Venezuela', 'Vietnam', 'World'],\n", " dtype=object)" ] }, "execution_count": 229, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_infection.location.unique()" ] }, { "cell_type": "code", "execution_count": 230, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Algeria', 'Argentina', 'Australia', 'Austria', 'Belgium',\n", " 'Brazil', 'Canada', 'China', 'Denmark', 'Dominican Republic',\n", " 'Ecuador', 'Egypt', 'France', 'Germany', 'Greece', 'Hungary',\n", " 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Japan',\n", " 'Malaysia', 'Morocco', 'Netherlands', 'Norway', 'Philippines',\n", " 'Poland', 'Portugal', 'Romania', 'San Marino', 'South Korea',\n", " 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'United Kingdom',\n", " 'United States', 'World'], dtype=object)" ] }, "execution_count": 230, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_death.location.unique()" ] }, { "cell_type": "code", "execution_count": 231, "metadata": {}, "outputs": [], "source": [ "COUNTRIES = set(df[(df.date == df.date.max())].nlargest(columns='total_cases', n=34).sort_values('location').location)\n", "COUNTRIES.remove('World')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Altair" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Storing data locally " ] }, { "cell_type": "code", "execution_count": 232, "metadata": {}, "outputs": [], "source": [ "url_infection = 'data_infection.json'\n", "url_death = 'data_death.json'\n", "url_summary = 'data_summary.json'\n", "\n", "df_infection[df_infection.location.isin(COUNTRIES)].to_json(url_infection, orient='records')\n", "df_death[df_death.location.isin(COUNTRIES)].to_json(url_death, orient='records')\n", "df_summary[df_summary.location.isin(COUNTRIES)].to_json(url_summary, orient='records')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Uploading data to github" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [], "source": [ "repo = g.get_repo(\"Datenspieler/notebooks_for_blog\")" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [], "source": [ "FILELIST = ['data_infection.json', 'data_death.json', 'data_summary.json']" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Uploading data_infection.json - Sat, 28 Mar 2020 04:46:56 GMT\n", "Uploading data_death.json - Sat, 28 Mar 2020 04:46:59 GMT\n", "Uploading data_summary.json - Sat, 28 Mar 2020 04:47:00 GMT\n" ] } ], "source": [ "for file_for_upload in FILELIST:\n", " print('Uploading', file_for_upload, end=' - ')\n", " with open(file_for_upload) as f:\n", " data = f.read()\n", " contents = repo.get_contents(\"2020-corona/\" + file_for_upload)\n", " repo.update_file(contents.path, \"Update data as of \" + df.date.max().strftime('%Y-%m-%d'), data, \n", " contents.sha, branch=\"master\")\n", " \n", " contents = repo.get_contents(\"2020-corona/\" + file_for_upload)\n", "\n", " print(contents.last_modified)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparing link to data in github " ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "# In future version files should be pushed to git automatically, this time I uploaded them \n", "\n", "BASEURL = 'https://raw.githubusercontent.com/Datenspieler/notebooks_for_blog/master/2020-corona/'\n", "\n", "url_infection = BASEURL + url_infection\n", "url_death = BASEURL + url_death\n", "url_summary = BASEURL + url_summary" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Alternatively local version" ] }, { "cell_type": "code", "execution_count": 233, "metadata": {}, "outputs": [], "source": [ "url_infection = df_infection[df_infection.location.isin(COUNTRIES)]\n", "url_death = df_death[df_death.location.isin(COUNTRIES)]\n", "url_summary = df_summary[df_summary.location.isin(COUNTRIES)]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting data " ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, "execution_count": 235, "metadata": {}, "output_type": "execute_result" } ], "source": [ "highlight = alt.selection(type='single', on='mouseover',\n", " fields=['location'], nearest=True)\n", "\n", "base_infection = alt.Chart(url_infection).encode(\n", " alt.X('days_after', type='quantitative', scale=alt.Scale(domain=[0,30], type='ordinal'),\n", " title='Days since the 100th confirmed infection'),\n", " alt.Y('total_cases_normalized', type='quantitative', scale=alt.Scale(type='log', base=10), \n", " title='Total confirmed infections of COVID-19, normalized'),\n", " alt.Color('location:N', title=\"Country\", legend=None),\n", " alt.Tooltip(['location:N', 'total_cases:Q', 'date:T'])\n", ").properties(\n", " title='Development of Corona infections',\n", ")\n", "\n", "base_death = alt.Chart(url_death).encode(\n", " alt.X('days_after', type='quantitative', scale=alt.Scale(domain=[0,30], type='ordinal'),\n", " title='Days since the 10th confirmed death'),\n", " alt.Y('total_deaths_normalized', type='quantitative', scale=alt.Scale(type='log', base=10), \n", " title='Total confirmed deaths of COVID-19, normalized'),\n", " alt.Color('location:N', title=\"Country\", legend=None),\n", " alt.Tooltip(['location:N', 'total_deaths:Q', 'date:T'])\n", ").properties(\n", " title='Development of Corona deaths',\n", ")\n", "\n", "base_summary = alt.Chart(url_summary).mark_bar().encode(\n", " x = alt.X('cases', type='quantitative', title='Confirmed cases', scale=alt.Scale(type='linear')),\n", " y = alt.Y('location:N', title='Country'),\n", " color = alt.condition(highlight, alt.Color('location:N', title=\"Country\", legend=None), alt.ColorValue(\"grey\")),\n", " tooltip = alt.Tooltip(['location:N', 'cases:Q'])\n", ").add_selection(\n", " highlight\n", ").properties(\n", " width=200,\n", " title='Corona cases by country'\n", ")\n", "\n", "\n", "points_infection = base_infection.mark_circle().encode(\n", " opacity=alt.value(0)\n", ").add_selection(\n", " highlight\n", ").properties(\n", " width=600\n", ").interactive(\n", " bind_y = False\n", ")\n", "\n", "points_death = base_death.mark_circle().encode(\n", " opacity=alt.value(0)\n", ").add_selection(\n", " highlight\n", ").properties(\n", " width=600\n", ").interactive(\n", " bind_y = False\n", ")\n", "\n", "\n", "lines_infection = base_infection.mark_line().encode(\n", " size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)\n", ")\n", "\n", "lines_death = base_death.mark_line().encode(\n", " size=alt.condition(~highlight, alt.value(1), alt.value(5), legend=None)\n", ")\n", "\n", "chart = alt.vconcat(points_infection + lines_infection, points_death + lines_death)\n", "\n", "chart = alt.hconcat(base_summary, chart)\n", "\n", "\n", "#chart.save('corona.html')\n", "chart.save('corona.json')\n", "\n", "chart" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }