{
"cells": [
{
"cell_type": "code",
"execution_count": 280,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import shutil\n",
"import datetime\n",
"import pandas as pd\n",
"import requests\n",
"\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Links"
]
},
{
"cell_type": "code",
"execution_count": 281,
"metadata": {},
"outputs": [],
"source": [
"census_url='http://censusindia.gov.in/2011census/C-series/c-14/DDW-0000C-14.xls'\n",
"raw_data_covid='https://api.covid19india.org/raw_data.json'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Processing Census Data"
]
},
{
"cell_type": "code",
"execution_count": 282,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Table | \n",
" State | \n",
" Distt. | \n",
" Area Name | \n",
" Age-group | \n",
" Total | \n",
" Unnamed: 6 | \n",
" Unnamed: 7 | \n",
" Rural | \n",
" Unnamed: 9 | \n",
" Unnamed: 10 | \n",
" Urban | \n",
" Unnamed: 12 | \n",
" Unnamed: 13 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Name | \n",
" Code | \n",
" Code | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" Persons | \n",
" Males | \n",
" Females | \n",
" Persons | \n",
" Males | \n",
" Females | \n",
" Persons | \n",
" Males | \n",
" Females | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
"
\n",
" \n",
" 3 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 4 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" C4114 | \n",
" 00 | \n",
" 000 | \n",
" India | \n",
" All ages | \n",
" 1210854977 | \n",
" 623270258 | \n",
" 587584719 | \n",
" 833748852 | \n",
" 427781058 | \n",
" 405967794 | \n",
" 377106125 | \n",
" 195489200 | \n",
" 181616925 | \n",
"
\n",
" \n",
" 6 | \n",
" C4114 | \n",
" 00 | \n",
" 000 | \n",
" India | \n",
" 0-4 | \n",
" 112806778 | \n",
" 58632074 | \n",
" 54174704 | \n",
" 82986660 | \n",
" 43036377 | \n",
" 39950283 | \n",
" 29820118 | \n",
" 15595697 | \n",
" 14224421 | \n",
"
\n",
" \n",
" 7 | \n",
" C4114 | \n",
" 00 | \n",
" 000 | \n",
" India | \n",
" 5-9 | \n",
" 126928126 | \n",
" 66300466 | \n",
" 60627660 | \n",
" 93807612 | \n",
" 48825259 | \n",
" 44982353 | \n",
" 33120514 | \n",
" 17475207 | \n",
" 15645307 | \n",
"
\n",
" \n",
" 8 | \n",
" C4114 | \n",
" 00 | \n",
" 000 | \n",
" India | \n",
" 10-14 | \n",
" 132709212 | \n",
" 69418835 | \n",
" 63290377 | \n",
" 96804494 | \n",
" 50488158 | \n",
" 46316336 | \n",
" 35904718 | \n",
" 18930677 | \n",
" 16974041 | \n",
"
\n",
" \n",
" 9 | \n",
" C4114 | \n",
" 00 | \n",
" 000 | \n",
" India | \n",
" 15-19 | \n",
" 120526449 | \n",
" 63982396 | \n",
" 56544053 | \n",
" 83902472 | \n",
" 44570557 | \n",
" 39331915 | \n",
" 36623977 | \n",
" 19411839 | \n",
" 17212138 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Table State Distt. Area Name Age-group Total Unnamed: 6 Unnamed: 7 \\\n",
"0 Name Code Code NaN NaN NaN NaN NaN \n",
"1 NaN NaN NaN NaN NaN Persons Males Females \n",
"2 NaN NaN NaN NaN 1 2 3 4 \n",
"3 NaN NaN NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN NaN NaN \n",
"5 C4114 00 000 India All ages 1210854977 623270258 587584719 \n",
"6 C4114 00 000 India 0-4 112806778 58632074 54174704 \n",
"7 C4114 00 000 India 5-9 126928126 66300466 60627660 \n",
"8 C4114 00 000 India 10-14 132709212 69418835 63290377 \n",
"9 C4114 00 000 India 15-19 120526449 63982396 56544053 \n",
"\n",
" Rural Unnamed: 9 Unnamed: 10 Urban Unnamed: 12 Unnamed: 13 \n",
"0 NaN NaN NaN NaN NaN NaN \n",
"1 Persons Males Females Persons Males Females \n",
"2 5 6 7 8 9 10 \n",
"3 NaN NaN NaN NaN NaN NaN \n",
"4 NaN NaN NaN NaN NaN NaN \n",
"5 833748852 427781058 405967794 377106125 195489200 181616925 \n",
"6 82986660 43036377 39950283 29820118 15595697 14224421 \n",
"7 93807612 48825259 44982353 33120514 17475207 15645307 \n",
"8 96804494 50488158 46316336 35904718 18930677 16974041 \n",
"9 83902472 44570557 39331915 36623977 19411839 17212138 "
]
},
"execution_count": 282,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"censusDS = pd.read_excel(census_url,skiprows=range(0, 1), skipinitialspace=True)\n",
"censusDS.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 283,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" total_population | \n",
" population_percentage | \n",
"
\n",
" \n",
" age-group | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 239734904 | \n",
" 19.798812 | \n",
"
\n",
" \n",
" 1 | \n",
" 253235661 | \n",
" 20.913789 | \n",
"
\n",
" \n",
" 2 | \n",
" 212838187 | \n",
" 17.577513 | \n",
"
\n",
" \n",
" 3 | \n",
" 173735635 | \n",
" 14.348179 | \n",
"
\n",
" \n",
" 4 | \n",
" 134756439 | \n",
" 11.129032 | \n",
"
\n",
" \n",
" 5 | \n",
" 88215309 | \n",
" 7.285374 | \n",
"
\n",
" \n",
" 6 | \n",
" 64118690 | \n",
" 5.295324 | \n",
"
\n",
" \n",
" 7 | \n",
" 28441345 | \n",
" 2.348865 | \n",
"
\n",
" \n",
" 8 | \n",
" 11289005 | \n",
" 0.932317 | \n",
"
\n",
" \n",
" A | \n",
" 4489802 | \n",
" 0.370796 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" total_population population_percentage\n",
"age-group \n",
"0 239734904 19.798812\n",
"1 253235661 20.913789\n",
"2 212838187 17.577513\n",
"3 173735635 14.348179\n",
"4 134756439 11.129032\n",
"5 88215309 7.285374\n",
"6 64118690 5.295324\n",
"7 28441345 2.348865\n",
"8 11289005 0.932317\n",
"A 4489802 0.370796"
]
},
"execution_count": 283,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Clean the Headers\n",
"censusDS.columns = [c.strip().lower().replace(' ','_') for c in censusDS.columns]\n",
"\n",
"# We are interested only in total India population\n",
"censusDF = pd.DataFrame(censusDS[censusDS['area_name'] == 'India'][['age-group','total']])\n",
"\n",
"# Rename the column for better verbosity\n",
"censusDF.rename(columns={'total': 'total_population'}, inplace=True)\n",
"\n",
"# Remove the all-age group summary data\n",
"censusDF = censusDF[censusDF['age-group'] !='All ages']\n",
"\n",
"# Correcting the age column 5-9 for helping in grouping\n",
"censusDF.loc[(censusDF['age-group'] == '5-9'),'age-group']='05-9'\n",
"\n",
"# Strip the age-group by keeping only the first character. So 1 denoter 10-19, 2 denote 20-29 and so on\n",
"censusDF['age-group'] = censusDF['age-group'].str[:1]\n",
"\n",
"# Group by the Age-Group and Sum\n",
"censusAvgDF = censusDF.groupby(['age-group']).agg('sum')\n",
"\n",
"# Calculate the Percentage for each age group\n",
"censusAvgDF['population_percentage']=(censusAvgDF['total_population']/censusAvgDF['total_population'].sum())*100\n",
"\n",
"censusAvgDF"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Processing Covid Data"
]
},
{
"cell_type": "code",
"execution_count": 284,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape of patient records (27891, 21)\n"
]
}
],
"source": [
"# Download the covid data\n",
"r=requests.get(raw_data_covid)\n",
"o=r.json()\n",
"df_orig = pd.DataFrame(o['raw_data'])\n",
"df_orig.loc[:, 'datekey'] = df_orig.dateannounced.apply(lambda x: datetime.datetime.strptime(x, '%d/%m/%Y').strftime(\"%Y-%m-%d\"))\n",
"\n",
"# Set the empty age as 555 to denote the unknown\n",
"df_orig['agebracket'].replace('', '55', inplace=True)\n",
"\n",
"# if there are any range value set in Age Bracket, keep the start age. For eg if the age is '28-35', then keep 28 as the age\n",
"df_clean_covid = df_orig.apply(lambda x: x['agebracket'][:2] if '-' in x.agebracket else x,axis=1)\n",
"\n",
"print('Shape of patient records', df_clean_covid.shape)"
]
},
{
"cell_type": "code",
"execution_count": 275,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape after filter deceased patient records (46, 21)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\sanal\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" import sys\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" total_deceased | \n",
"
\n",
" \n",
" agebracket | \n",
" currentstatus | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" (0, 10] | \n",
" Deceased | \n",
" 1 | \n",
"
\n",
" \n",
" (10, 20] | \n",
" Deceased | \n",
" 0 | \n",
"
\n",
" \n",
" (20, 30] | \n",
" Deceased | \n",
" 1 | \n",
"
\n",
" \n",
" (30, 40] | \n",
" Deceased | \n",
" 2 | \n",
"
\n",
" \n",
" (40, 50] | \n",
" Deceased | \n",
" 8 | \n",
"
\n",
" \n",
" (50, 60] | \n",
" Deceased | \n",
" 7 | \n",
"
\n",
" \n",
" (60, 70] | \n",
" Deceased | \n",
" 17 | \n",
"
\n",
" \n",
" (70, 80] | \n",
" Deceased | \n",
" 9 | \n",
"
\n",
" \n",
" (80, 150] | \n",
" Deceased | \n",
" 1 | \n",
"
\n",
" \n",
" (150, 1000] | \n",
" Deceased | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" total_deceased\n",
"agebracket currentstatus \n",
"(0, 10] Deceased 1\n",
"(10, 20] Deceased 0\n",
"(20, 30] Deceased 1\n",
"(30, 40] Deceased 2\n",
"(40, 50] Deceased 8\n",
"(50, 60] Deceased 7\n",
"(60, 70] Deceased 17\n",
"(70, 80] Deceased 9\n",
"(80, 150] Deceased 1\n",
"(150, 1000] Deceased 0"
]
},
"execution_count": 275,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We are only interested in Deceased patients\n",
"df_deceased = df_clean_covid[df_clean_covid['currentstatus'] == 'Deceased']\n",
"print ('Shape after filter deceased patient records', df_deceased.shape)\n",
"\n",
"# Bucket them into age groups\n",
"df_deceased['agebracket'] = df_deceased['agebracket'].astype(float)\n",
"age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 150, 1000]\n",
"bins = pd.cut(df_deceased['agebracket'], age_bins)\n",
"\n",
"# Group age wise and get the summary\n",
"deceased_agewise = df_deceased.groupby([bins,'currentstatus'])['agebracket'].agg(['count'])\n",
"\n",
"# Rename the column for better verbosity\n",
"deceased_agewise.rename(columns={'count': 'total_deceased'}, inplace=True)\n",
"\n",
"deceased_agewise"
]
},
{
"cell_type": "code",
"execution_count": 285,
"metadata": {},
"outputs": [],
"source": [
"# Combine the sensus and covid data\n",
"censusAvgDF.index = deceased_agewise.index\n",
"\n",
"data = pd.concat([deceased_agewise,censusAvgDF], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 286,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" total_deceased | \n",
" total_population | \n",
" population_percentage | \n",
"
\n",
" \n",
" agebracket | \n",
" currentstatus | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" (0, 10] | \n",
" Deceased | \n",
" 1 | \n",
" 239734904 | \n",
" 19.798812 | \n",
"
\n",
" \n",
" (10, 20] | \n",
" Deceased | \n",
" 0 | \n",
" 253235661 | \n",
" 20.913789 | \n",
"
\n",
" \n",
" (20, 30] | \n",
" Deceased | \n",
" 1 | \n",
" 212838187 | \n",
" 17.577513 | \n",
"
\n",
" \n",
" (30, 40] | \n",
" Deceased | \n",
" 2 | \n",
" 173735635 | \n",
" 14.348179 | \n",
"
\n",
" \n",
" (40, 50] | \n",
" Deceased | \n",
" 8 | \n",
" 134756439 | \n",
" 11.129032 | \n",
"
\n",
" \n",
" (50, 60] | \n",
" Deceased | \n",
" 7 | \n",
" 88215309 | \n",
" 7.285374 | \n",
"
\n",
" \n",
" (60, 70] | \n",
" Deceased | \n",
" 17 | \n",
" 64118690 | \n",
" 5.295324 | \n",
"
\n",
" \n",
" (70, 80] | \n",
" Deceased | \n",
" 9 | \n",
" 28441345 | \n",
" 2.348865 | \n",
"
\n",
" \n",
" (80, 150] | \n",
" Deceased | \n",
" 1 | \n",
" 11289005 | \n",
" 0.932317 | \n",
"
\n",
" \n",
" (150, 1000] | \n",
" Deceased | \n",
" 0 | \n",
" 4489802 | \n",
" 0.370796 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" total_deceased total_population \\\n",
"agebracket currentstatus \n",
"(0, 10] Deceased 1 239734904 \n",
"(10, 20] Deceased 0 253235661 \n",
"(20, 30] Deceased 1 212838187 \n",
"(30, 40] Deceased 2 173735635 \n",
"(40, 50] Deceased 8 134756439 \n",
"(50, 60] Deceased 7 88215309 \n",
"(60, 70] Deceased 17 64118690 \n",
"(70, 80] Deceased 9 28441345 \n",
"(80, 150] Deceased 1 11289005 \n",
"(150, 1000] Deceased 0 4489802 \n",
"\n",
" population_percentage \n",
"agebracket currentstatus \n",
"(0, 10] Deceased 19.798812 \n",
"(10, 20] Deceased 20.913789 \n",
"(20, 30] Deceased 17.577513 \n",
"(30, 40] Deceased 14.348179 \n",
"(40, 50] Deceased 11.129032 \n",
"(50, 60] Deceased 7.285374 \n",
"(60, 70] Deceased 5.295324 \n",
"(70, 80] Deceased 2.348865 \n",
"(80, 150] Deceased 0.932317 \n",
"(150, 1000] Deceased 0.370796 "
]
},
"execution_count": 286,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}