import pandas as pd
filepath = "C:\\Users\\abdelilah\\Desktop\\Python Projects\\Covid-19\\"
pd.set_option('display.max_rows', 10)
covid_df = pd.read_csv(filepath + "covid_19_data.csv", delimiter = ",")
covid_df
Date | State | Region | Confirmed | Deaths | Recovered | |
---|---|---|---|---|---|---|
0 | 4/29/2020 | NaN | Afghanistan | 1939 | 60 | 252 |
1 | 4/29/2020 | NaN | Albania | 766 | 30 | 455 |
2 | 4/29/2020 | NaN | Algeria | 3848 | 444 | 1702 |
3 | 4/29/2020 | NaN | Andorra | 743 | 42 | 423 |
4 | 4/29/2020 | NaN | Angola | 27 | 2 | 7 |
... | ... | ... | ... | ... | ... | ... |
316 | 4/29/2020 | Wyoming | US | 545 | 7 | 0 |
317 | 4/29/2020 | Xinjiang | Mainland China | 76 | 3 | 73 |
318 | 4/29/2020 | Yukon | Canada | 11 | 0 | 0 |
319 | 4/29/2020 | Yunnan | Mainland China | 185 | 2 | 181 |
320 | 4/29/2020 | Zhejiang | Mainland China | 1268 | 1 | 1263 |
321 rows × 6 columns
covid_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 321 entries, 0 to 320 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 321 non-null object 1 State 140 non-null object 2 Region 321 non-null object 3 Confirmed 321 non-null int64 4 Deaths 321 non-null int64 5 Recovered 321 non-null int64 dtypes: int64(3), object(3) memory usage: 15.2+ KB
values = dict()
for col in covid_df.columns:
values[col] = covid_df[col].value_counts()
values
{'Date': 4/29/2020 321 Name: Date, dtype: int64, 'State': Grand Princess 2 Diamond Princess cruise ship 2 Recovered 2 North Carolina 1 North Dakota 1 .. Zhejiang 1 Shaanxi 1 Puerto Rico 1 New Hampshire 1 Arizona 1 Name: State, Length: 137, dtype: int64, 'Region': US 58 Mainland China 31 Canada 15 France 11 UK 11 .. North Macedonia 1 El Salvador 1 Pakistan 1 Slovakia 1 Lithuania 1 Name: Region, Length: 187, dtype: int64, 'Confirmed': 11 5 6 4 16 4 18 4 8 3 .. 2727 1 168 1 299691 1 1361 1 510 1 Name: Confirmed, Length: 282, dtype: int64, 'Deaths': 0 53 1 25 3 15 2 14 6 13 .. 98 1 92 1 87 1 3670 1 26097 1 Name: Deaths, Length: 142, dtype: int64, 'Recovered': 0 78 5 5 8 5 19 4 6 4 .. 124 1 63616 1 131 1 645 1 1023 1 Name: Recovered, Length: 195, dtype: int64}
First Task: Show the number of Confirmed, Deaths and Recovered cases in each Region
# i don't get what they mean exactly, it confused me, cuz if they want the info about each Region then it's already displayed
# in the dataframe. So i think i'll just assign the columns wanted in a new dataframe
# i think i get it now, the regions here mentioned many times, maybe this task wants me to gather all those regions into one
cases_df = covid_df.drop(["Date", "State"], axis = 1)
cases_df["Region"].value_counts()
US 58 Mainland China 31 Canada 15 France 11 UK 11 .. North Macedonia 1 El Salvador 1 Pakistan 1 Slovakia 1 Lithuania 1 Name: Region, Length: 187, dtype: int64
cases_df = cases_df.groupby('Region').sum()
cases_df
Confirmed | Deaths | Recovered | |
---|---|---|---|
Region | |||
Afghanistan | 1939 | 60 | 252 |
Albania | 766 | 30 | 455 |
Algeria | 3848 | 444 | 1702 |
Andorra | 743 | 42 | 423 |
Angola | 27 | 2 | 7 |
... | ... | ... | ... |
West Bank and Gaza | 344 | 2 | 71 |
Western Sahara | 6 | 0 | 5 |
Yemen | 6 | 0 | 1 |
Zambia | 97 | 3 | 54 |
Zimbabwe | 32 | 4 | 5 |
187 rows × 3 columns
Second Task: Remove all the records where the Confirmed Cases is Less Than 10
less_10 = covid_df[covid_df["Confirmed"] >= 10]
less_10
Date | State | Region | Confirmed | Deaths | Recovered | |
---|---|---|---|---|---|---|
0 | 4/29/2020 | NaN | Afghanistan | 1939 | 60 | 252 |
1 | 4/29/2020 | NaN | Albania | 766 | 30 | 455 |
2 | 4/29/2020 | NaN | Algeria | 3848 | 444 | 1702 |
3 | 4/29/2020 | NaN | Andorra | 743 | 42 | 423 |
4 | 4/29/2020 | NaN | Angola | 27 | 2 | 7 |
... | ... | ... | ... | ... | ... | ... |
316 | 4/29/2020 | Wyoming | US | 545 | 7 | 0 |
317 | 4/29/2020 | Xinjiang | Mainland China | 76 | 3 | 73 |
318 | 4/29/2020 | Yukon | Canada | 11 | 0 | 0 |
319 | 4/29/2020 | Yunnan | Mainland China | 185 | 2 | 181 |
320 | 4/29/2020 | Zhejiang | Mainland China | 1268 | 1 | 1263 |
304 rows × 6 columns
Third Task: In which Region, maximum number of Confirmed cases were recorded ?
cases_df["Confirmed"].idxmax()
'US'
cases_df.loc['US']
Confirmed 1039909 Deaths 60967 Recovered 120720 Name: US, dtype: int64
Fourth Task: In which Region, minimum number of Deaths cases were recorded ?
cases_df["Deaths"].idxmin()
'Bhutan'
cases_df.loc['Bhutan']
Confirmed 7 Deaths 0 Recovered 5 Name: Bhutan, dtype: int64
Fifth Task: How many Confirmed, Deaths & Recovered cases were reported from India till 29 April 2020 ?
cases_df.loc["India"]
Confirmed 33062 Deaths 1079 Recovered 8437 Name: India, dtype: int64
Sixth Task:
A- Sort the entire data wrt No. of Confirmed cases in ascending order.
cases_df.sort_values(by = ["Confirmed"], ascending = True)
Confirmed | Deaths | Recovered | |
---|---|---|---|
Region | |||
Yemen | 6 | 0 | 1 |
Western Sahara | 6 | 0 | 5 |
Bhutan | 7 | 0 | 5 |
Papua New Guinea | 8 | 0 | 0 |
Sao Tome and Principe | 8 | 0 | 4 |
... | ... | ... | ... |
UK | 166441 | 26166 | 857 |
France | 166543 | 24121 | 49118 |
Italy | 203591 | 27682 | 71252 |
Spain | 236899 | 24275 | 132929 |
US | 1039909 | 60967 | 120720 |
187 rows × 3 columns
B- Sort the entire data wrt No. of Recovered cases in descending order.
cases_df.sort_values(by = ["Recovered"], ascending = False)
Confirmed | Deaths | Recovered | |
---|---|---|---|
Region | |||
Spain | 236899 | 24275 | 132929 |
US | 1039909 | 60967 | 120720 |
Germany | 161539 | 6467 | 120400 |
Mainland China | 82862 | 4633 | 77610 |
Iran | 93657 | 5957 | 73791 |
... | ... | ... | ... |
Cabo Verde | 114 | 1 | 2 |
Yemen | 6 | 0 | 1 |
South Sudan | 34 | 0 | 0 |
Papua New Guinea | 8 | 0 | 0 |
MS Zaandam | 9 | 2 | 0 |
187 rows × 3 columns
#this is a useful way to get an idea about how dispersed the missing values are in the dataset
import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(covid_df.isnull())
<AxesSubplot:>