{ "cells": [ { "cell_type": "code", "execution_count": 280, "metadata": {}, "outputs": [], "source": [ "import os\n", "import shutil\n", "import datetime\n", "import pandas as pd\n", "import requests\n", "\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data Links" ] }, { "cell_type": "code", "execution_count": 281, "metadata": {}, "outputs": [], "source": [ "census_url='http://censusindia.gov.in/2011census/C-series/c-14/DDW-0000C-14.xls'\n", "raw_data_covid='https://api.covid19india.org/raw_data.json'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Processing Census Data" ] }, { "cell_type": "code", "execution_count": 282, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TableStateDistt.Area NameAge-groupTotalUnnamed: 6Unnamed: 7RuralUnnamed: 9Unnamed: 10UrbanUnnamed: 12Unnamed: 13
0NameCodeCodeNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaNNaNNaNNaNPersonsMalesFemalesPersonsMalesFemalesPersonsMalesFemales
2NaNNaNNaNNaN12345678910
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
5C411400000IndiaAll ages1210854977623270258587584719833748852427781058405967794377106125195489200181616925
6C411400000India0-41128067785863207454174704829866604303637739950283298201181559569714224421
7C411400000India5-91269281266630046660627660938076124882525944982353331205141747520715645307
8C411400000India10-141327092126941883563290377968044945048815846316336359047181893067716974041
9C411400000India15-191205264496398239656544053839024724457055739331915366239771941183917212138
\n", "
" ], "text/plain": [ " Table State Distt. Area Name Age-group Total Unnamed: 6 Unnamed: 7 \\\n", "0 Name Code Code NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN NaN Persons Males Females \n", "2 NaN NaN NaN NaN 1 2 3 4 \n", "3 NaN NaN NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN NaN NaN \n", "5 C4114 00 000 India All ages 1210854977 623270258 587584719 \n", "6 C4114 00 000 India 0-4 112806778 58632074 54174704 \n", "7 C4114 00 000 India 5-9 126928126 66300466 60627660 \n", "8 C4114 00 000 India 10-14 132709212 69418835 63290377 \n", "9 C4114 00 000 India 15-19 120526449 63982396 56544053 \n", "\n", " Rural Unnamed: 9 Unnamed: 10 Urban Unnamed: 12 Unnamed: 13 \n", "0 NaN NaN NaN NaN NaN NaN \n", "1 Persons Males Females Persons Males Females \n", "2 5 6 7 8 9 10 \n", "3 NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN \n", "5 833748852 427781058 405967794 377106125 195489200 181616925 \n", "6 82986660 43036377 39950283 29820118 15595697 14224421 \n", "7 93807612 48825259 44982353 33120514 17475207 15645307 \n", "8 96804494 50488158 46316336 35904718 18930677 16974041 \n", "9 83902472 44570557 39331915 36623977 19411839 17212138 " ] }, "execution_count": 282, "metadata": {}, "output_type": "execute_result" } ], "source": [ "censusDS = pd.read_excel(census_url,skiprows=range(0, 1), skipinitialspace=True)\n", "censusDS.head(10)" ] }, { "cell_type": "code", "execution_count": 283, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_populationpopulation_percentage
age-group
023973490419.798812
125323566120.913789
221283818717.577513
317373563514.348179
413475643911.129032
5882153097.285374
6641186905.295324
7284413452.348865
8112890050.932317
A44898020.370796
\n", "
" ], "text/plain": [ " total_population population_percentage\n", "age-group \n", "0 239734904 19.798812\n", "1 253235661 20.913789\n", "2 212838187 17.577513\n", "3 173735635 14.348179\n", "4 134756439 11.129032\n", "5 88215309 7.285374\n", "6 64118690 5.295324\n", "7 28441345 2.348865\n", "8 11289005 0.932317\n", "A 4489802 0.370796" ] }, "execution_count": 283, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Clean the Headers\n", "censusDS.columns = [c.strip().lower().replace(' ','_') for c in censusDS.columns]\n", "\n", "# We are interested only in total India population\n", "censusDF = pd.DataFrame(censusDS[censusDS['area_name'] == 'India'][['age-group','total']])\n", "\n", "# Rename the column for better verbosity\n", "censusDF.rename(columns={'total': 'total_population'}, inplace=True)\n", "\n", "# Remove the all-age group summary data\n", "censusDF = censusDF[censusDF['age-group'] !='All ages']\n", "\n", "# Correcting the age column 5-9 for helping in grouping\n", "censusDF.loc[(censusDF['age-group'] == '5-9'),'age-group']='05-9'\n", "\n", "# Strip the age-group by keeping only the first character. So 1 denoter 10-19, 2 denote 20-29 and so on\n", "censusDF['age-group'] = censusDF['age-group'].str[:1]\n", "\n", "# Group by the Age-Group and Sum\n", "censusAvgDF = censusDF.groupby(['age-group']).agg('sum')\n", "\n", "# Calculate the Percentage for each age group\n", "censusAvgDF['population_percentage']=(censusAvgDF['total_population']/censusAvgDF['total_population'].sum())*100\n", "\n", "censusAvgDF" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Processing Covid Data" ] }, { "cell_type": "code", "execution_count": 284, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of patient records (27891, 21)\n" ] } ], "source": [ "# Download the covid data\n", "r=requests.get(raw_data_covid)\n", "o=r.json()\n", "df_orig = pd.DataFrame(o['raw_data'])\n", "df_orig.loc[:, 'datekey'] = df_orig.dateannounced.apply(lambda x: datetime.datetime.strptime(x, '%d/%m/%Y').strftime(\"%Y-%m-%d\"))\n", "\n", "# Set the empty age as 555 to denote the unknown\n", "df_orig['agebracket'].replace('', '55', inplace=True)\n", "\n", "# if there are any range value set in Age Bracket, keep the start age. For eg if the age is '28-35', then keep 28 as the age\n", "df_clean_covid = df_orig.apply(lambda x: x['agebracket'][:2] if '-' in x.agebracket else x,axis=1)\n", "\n", "print('Shape of patient records', df_clean_covid.shape)" ] }, { "cell_type": "code", "execution_count": 275, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape after filter deceased patient records (46, 21)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\users\\sanal\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\ipykernel_launcher.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " import sys\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_deceased
agebracketcurrentstatus
(0, 10]Deceased1
(10, 20]Deceased0
(20, 30]Deceased1
(30, 40]Deceased2
(40, 50]Deceased8
(50, 60]Deceased7
(60, 70]Deceased17
(70, 80]Deceased9
(80, 150]Deceased1
(150, 1000]Deceased0
\n", "
" ], "text/plain": [ " total_deceased\n", "agebracket currentstatus \n", "(0, 10] Deceased 1\n", "(10, 20] Deceased 0\n", "(20, 30] Deceased 1\n", "(30, 40] Deceased 2\n", "(40, 50] Deceased 8\n", "(50, 60] Deceased 7\n", "(60, 70] Deceased 17\n", "(70, 80] Deceased 9\n", "(80, 150] Deceased 1\n", "(150, 1000] Deceased 0" ] }, "execution_count": 275, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We are only interested in Deceased patients\n", "df_deceased = df_clean_covid[df_clean_covid['currentstatus'] == 'Deceased']\n", "print ('Shape after filter deceased patient records', df_deceased.shape)\n", "\n", "# Bucket them into age groups\n", "df_deceased['agebracket'] = df_deceased['agebracket'].astype(float)\n", "age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 150, 1000]\n", "bins = pd.cut(df_deceased['agebracket'], age_bins)\n", "\n", "# Group age wise and get the summary\n", "deceased_agewise = df_deceased.groupby([bins,'currentstatus'])['agebracket'].agg(['count'])\n", "\n", "# Rename the column for better verbosity\n", "deceased_agewise.rename(columns={'count': 'total_deceased'}, inplace=True)\n", "\n", "deceased_agewise" ] }, { "cell_type": "code", "execution_count": 285, "metadata": {}, "outputs": [], "source": [ "# Combine the sensus and covid data\n", "censusAvgDF.index = deceased_agewise.index\n", "\n", "data = pd.concat([deceased_agewise,censusAvgDF], axis=1)" ] }, { "cell_type": "code", "execution_count": 286, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_deceasedtotal_populationpopulation_percentage
agebracketcurrentstatus
(0, 10]Deceased123973490419.798812
(10, 20]Deceased025323566120.913789
(20, 30]Deceased121283818717.577513
(30, 40]Deceased217373563514.348179
(40, 50]Deceased813475643911.129032
(50, 60]Deceased7882153097.285374
(60, 70]Deceased17641186905.295324
(70, 80]Deceased9284413452.348865
(80, 150]Deceased1112890050.932317
(150, 1000]Deceased044898020.370796
\n", "
" ], "text/plain": [ " total_deceased total_population \\\n", "agebracket currentstatus \n", "(0, 10] Deceased 1 239734904 \n", "(10, 20] Deceased 0 253235661 \n", "(20, 30] Deceased 1 212838187 \n", "(30, 40] Deceased 2 173735635 \n", "(40, 50] Deceased 8 134756439 \n", "(50, 60] Deceased 7 88215309 \n", "(60, 70] Deceased 17 64118690 \n", "(70, 80] Deceased 9 28441345 \n", "(80, 150] Deceased 1 11289005 \n", "(150, 1000] Deceased 0 4489802 \n", "\n", " population_percentage \n", "agebracket currentstatus \n", "(0, 10] Deceased 19.798812 \n", "(10, 20] Deceased 20.913789 \n", "(20, 30] Deceased 17.577513 \n", "(30, 40] Deceased 14.348179 \n", "(40, 50] Deceased 11.129032 \n", "(50, 60] Deceased 7.285374 \n", "(60, 70] Deceased 5.295324 \n", "(70, 80] Deceased 2.348865 \n", "(80, 150] Deceased 0.932317 \n", "(150, 1000] Deceased 0.370796 " ] }, "execution_count": 286, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }