import pandas as pd
filepath = "C:\\Users\\abdelilah\\Desktop\\Python Projects\\Covid-19\\"
pd.set_option('display.max_rows', 10)
covid_df = pd.read_csv(filepath + "covid_19_data.csv", delimiter = ",")
covid_df
| Date | State | Region | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|---|---|
| 0 | 4/29/2020 | NaN | Afghanistan | 1939 | 60 | 252 |
| 1 | 4/29/2020 | NaN | Albania | 766 | 30 | 455 |
| 2 | 4/29/2020 | NaN | Algeria | 3848 | 444 | 1702 |
| 3 | 4/29/2020 | NaN | Andorra | 743 | 42 | 423 |
| 4 | 4/29/2020 | NaN | Angola | 27 | 2 | 7 |
| ... | ... | ... | ... | ... | ... | ... |
| 316 | 4/29/2020 | Wyoming | US | 545 | 7 | 0 |
| 317 | 4/29/2020 | Xinjiang | Mainland China | 76 | 3 | 73 |
| 318 | 4/29/2020 | Yukon | Canada | 11 | 0 | 0 |
| 319 | 4/29/2020 | Yunnan | Mainland China | 185 | 2 | 181 |
| 320 | 4/29/2020 | Zhejiang | Mainland China | 1268 | 1 | 1263 |
321 rows × 6 columns
covid_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 321 entries, 0 to 320 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 321 non-null object 1 State 140 non-null object 2 Region 321 non-null object 3 Confirmed 321 non-null int64 4 Deaths 321 non-null int64 5 Recovered 321 non-null int64 dtypes: int64(3), object(3) memory usage: 15.2+ KB
values = dict()
for col in covid_df.columns:
values[col] = covid_df[col].value_counts()
values
{'Date': 4/29/2020 321
Name: Date, dtype: int64,
'State': Grand Princess 2
Diamond Princess cruise ship 2
Recovered 2
North Carolina 1
North Dakota 1
..
Zhejiang 1
Shaanxi 1
Puerto Rico 1
New Hampshire 1
Arizona 1
Name: State, Length: 137, dtype: int64,
'Region': US 58
Mainland China 31
Canada 15
France 11
UK 11
..
North Macedonia 1
El Salvador 1
Pakistan 1
Slovakia 1
Lithuania 1
Name: Region, Length: 187, dtype: int64,
'Confirmed': 11 5
6 4
16 4
18 4
8 3
..
2727 1
168 1
299691 1
1361 1
510 1
Name: Confirmed, Length: 282, dtype: int64,
'Deaths': 0 53
1 25
3 15
2 14
6 13
..
98 1
92 1
87 1
3670 1
26097 1
Name: Deaths, Length: 142, dtype: int64,
'Recovered': 0 78
5 5
8 5
19 4
6 4
..
124 1
63616 1
131 1
645 1
1023 1
Name: Recovered, Length: 195, dtype: int64}
First Task: Show the number of Confirmed, Deaths and Recovered cases in each Region
# i don't get what they mean exactly, it confused me, cuz if they want the info about each Region then it's already displayed
# in the dataframe. So i think i'll just assign the columns wanted in a new dataframe
# i think i get it now, the regions here mentioned many times, maybe this task wants me to gather all those regions into one
cases_df = covid_df.drop(["Date", "State"], axis = 1)
cases_df["Region"].value_counts()
US 58
Mainland China 31
Canada 15
France 11
UK 11
..
North Macedonia 1
El Salvador 1
Pakistan 1
Slovakia 1
Lithuania 1
Name: Region, Length: 187, dtype: int64
cases_df = cases_df.groupby('Region').sum()
cases_df
| Confirmed | Deaths | Recovered | |
|---|---|---|---|
| Region | |||
| Afghanistan | 1939 | 60 | 252 |
| Albania | 766 | 30 | 455 |
| Algeria | 3848 | 444 | 1702 |
| Andorra | 743 | 42 | 423 |
| Angola | 27 | 2 | 7 |
| ... | ... | ... | ... |
| West Bank and Gaza | 344 | 2 | 71 |
| Western Sahara | 6 | 0 | 5 |
| Yemen | 6 | 0 | 1 |
| Zambia | 97 | 3 | 54 |
| Zimbabwe | 32 | 4 | 5 |
187 rows × 3 columns
Second Task: Remove all the records where the Confirmed Cases is Less Than 10
less_10 = covid_df[covid_df["Confirmed"] >= 10]
less_10
| Date | State | Region | Confirmed | Deaths | Recovered | |
|---|---|---|---|---|---|---|
| 0 | 4/29/2020 | NaN | Afghanistan | 1939 | 60 | 252 |
| 1 | 4/29/2020 | NaN | Albania | 766 | 30 | 455 |
| 2 | 4/29/2020 | NaN | Algeria | 3848 | 444 | 1702 |
| 3 | 4/29/2020 | NaN | Andorra | 743 | 42 | 423 |
| 4 | 4/29/2020 | NaN | Angola | 27 | 2 | 7 |
| ... | ... | ... | ... | ... | ... | ... |
| 316 | 4/29/2020 | Wyoming | US | 545 | 7 | 0 |
| 317 | 4/29/2020 | Xinjiang | Mainland China | 76 | 3 | 73 |
| 318 | 4/29/2020 | Yukon | Canada | 11 | 0 | 0 |
| 319 | 4/29/2020 | Yunnan | Mainland China | 185 | 2 | 181 |
| 320 | 4/29/2020 | Zhejiang | Mainland China | 1268 | 1 | 1263 |
304 rows × 6 columns
Third Task: In which Region, maximum number of Confirmed cases were recorded ?
cases_df["Confirmed"].idxmax()
'US'
cases_df.loc['US']
Confirmed 1039909 Deaths 60967 Recovered 120720 Name: US, dtype: int64
Fourth Task: In which Region, minimum number of Deaths cases were recorded ?
cases_df["Deaths"].idxmin()
'Bhutan'
cases_df.loc['Bhutan']
Confirmed 7 Deaths 0 Recovered 5 Name: Bhutan, dtype: int64
Fifth Task: How many Confirmed, Deaths & Recovered cases were reported from India till 29 April 2020 ?
cases_df.loc["India"]
Confirmed 33062 Deaths 1079 Recovered 8437 Name: India, dtype: int64
Sixth Task:
A- Sort the entire data wrt No. of Confirmed cases in ascending order.
cases_df.sort_values(by = ["Confirmed"], ascending = True)
| Confirmed | Deaths | Recovered | |
|---|---|---|---|
| Region | |||
| Yemen | 6 | 0 | 1 |
| Western Sahara | 6 | 0 | 5 |
| Bhutan | 7 | 0 | 5 |
| Papua New Guinea | 8 | 0 | 0 |
| Sao Tome and Principe | 8 | 0 | 4 |
| ... | ... | ... | ... |
| UK | 166441 | 26166 | 857 |
| France | 166543 | 24121 | 49118 |
| Italy | 203591 | 27682 | 71252 |
| Spain | 236899 | 24275 | 132929 |
| US | 1039909 | 60967 | 120720 |
187 rows × 3 columns
B- Sort the entire data wrt No. of Recovered cases in descending order.
cases_df.sort_values(by = ["Recovered"], ascending = False)
| Confirmed | Deaths | Recovered | |
|---|---|---|---|
| Region | |||
| Spain | 236899 | 24275 | 132929 |
| US | 1039909 | 60967 | 120720 |
| Germany | 161539 | 6467 | 120400 |
| Mainland China | 82862 | 4633 | 77610 |
| Iran | 93657 | 5957 | 73791 |
| ... | ... | ... | ... |
| Cabo Verde | 114 | 1 | 2 |
| Yemen | 6 | 0 | 1 |
| South Sudan | 34 | 0 | 0 |
| Papua New Guinea | 8 | 0 | 0 |
| MS Zaandam | 9 | 2 | 0 |
187 rows × 3 columns
#this is a useful way to get an idea about how dispersed the missing values are in the dataset
import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(covid_df.isnull())
<AxesSubplot:>