Covid-19

In [1]:

import pandas as pd

In [2]:

filepath = "C:\\Users\\abdelilah\\Desktop\\Python Projects\\Covid-19\\"

In [39]:

pd.set_option('display.max_rows', 10)

In [38]:

covid_df = pd.read_csv(filepath + "covid_19_data.csv", delimiter = ",")
covid_df

Out[38]:

	Date	State	Region	Confirmed	Deaths	Recovered
0	4/29/2020	NaN	Afghanistan	1939	60	252
1	4/29/2020	NaN	Albania	766	30	455
2	4/29/2020	NaN	Algeria	3848	444	1702
3	4/29/2020	NaN	Andorra	743	42	423
4	4/29/2020	NaN	Angola	27	2	7
...	...	...	...	...	...	...
316	4/29/2020	Wyoming	US	545	7	0
317	4/29/2020	Xinjiang	Mainland China	76	3	73
318	4/29/2020	Yukon	Canada	11	0	0
319	4/29/2020	Yunnan	Mainland China	185	2	181
320	4/29/2020	Zhejiang	Mainland China	1268	1	1263

321 rows × 6 columns

In [4]:

covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321 entries, 0 to 320
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       321 non-null    object
 1   State      140 non-null    object
 2   Region     321 non-null    object
 3   Confirmed  321 non-null    int64 
 4   Deaths     321 non-null    int64 
 5   Recovered  321 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 15.2+ KB

In [9]:

values = dict()
for col in covid_df.columns:
    values[col] = covid_df[col].value_counts()

In [10]:

values

Out[10]:

{'Date': 4/29/2020    321
 Name: Date, dtype: int64,
 'State': Grand Princess                  2
 Diamond Princess cruise ship    2
 Recovered                       2
 North Carolina                  1
 North Dakota                    1
                                ..
 Zhejiang                        1
 Shaanxi                         1
 Puerto Rico                     1
 New Hampshire                   1
 Arizona                         1
 Name: State, Length: 137, dtype: int64,
 'Region': US                 58
 Mainland China     31
 Canada             15
 France             11
 UK                 11
                    ..
 North Macedonia     1
 El Salvador         1
 Pakistan            1
 Slovakia            1
 Lithuania           1
 Name: Region, Length: 187, dtype: int64,
 'Confirmed': 11        5
 6         4
 16        4
 18        4
 8         3
          ..
 2727      1
 168       1
 299691    1
 1361      1
 510       1
 Name: Confirmed, Length: 282, dtype: int64,
 'Deaths': 0        53
 1        25
 3        15
 2        14
 6        13
          ..
 98        1
 92        1
 87        1
 3670      1
 26097     1
 Name: Deaths, Length: 142, dtype: int64,
 'Recovered': 0        78
 5         5
 8         5
 19        4
 6         4
          ..
 124       1
 63616     1
 131       1
 645       1
 1023      1
 Name: Recovered, Length: 195, dtype: int64}

In [14]:

# i don't get what they mean exactly, it confused me, cuz if they want the info about each Region then it's already displayed
# in the dataframe. So i think i'll just assign the columns wanted in a new dataframe
# i think i get it now, the regions here mentioned many times, maybe this task wants me to gather all those regions into one 

In [17]:

cases_df = covid_df.drop(["Date", "State"], axis = 1)

In [64]:

cases_df["Region"].value_counts()

Out[64]:

US                 58
Mainland China     31
Canada             15
France             11
UK                 11
                   ..
North Macedonia     1
El Salvador         1
Pakistan            1
Slovakia            1
Lithuania           1
Name: Region, Length: 187, dtype: int64

In [69]:

cases_df = cases_df.groupby('Region').sum()

In [70]:

cases_df

Out[70]:

	Confirmed	Deaths	Recovered
Region
Afghanistan	1939	60	252
Albania	766	30	455
Algeria	3848	444	1702
Andorra	743	42	423
Angola	27	2	7
...	...	...	...
West Bank and Gaza	344	2	71
Western Sahara	6	0	5
Yemen	6	0	1
Zambia	97	3	54
Zimbabwe	32	4	5

187 rows × 3 columns

In [20]:

less_10 = covid_df[covid_df["Confirmed"] >= 10]

In [21]:

less_10

Out[21]:

	Date	State	Region	Confirmed	Deaths	Recovered
0	4/29/2020	NaN	Afghanistan	1939	60	252
1	4/29/2020	NaN	Albania	766	30	455
2	4/29/2020	NaN	Algeria	3848	444	1702
3	4/29/2020	NaN	Andorra	743	42	423
4	4/29/2020	NaN	Angola	27	2	7
...	...	...	...	...	...	...
316	4/29/2020	Wyoming	US	545	7	0
317	4/29/2020	Xinjiang	Mainland China	76	3	73
318	4/29/2020	Yukon	Canada	11	0	0
319	4/29/2020	Yunnan	Mainland China	185	2	181
320	4/29/2020	Zhejiang	Mainland China	1268	1	1263

304 rows × 6 columns

In [71]:

cases_df["Confirmed"].idxmax()

Out[71]:

'US'

In [72]:

cases_df.loc['US']

Out[72]:

Confirmed    1039909
Deaths         60967
Recovered     120720
Name: US, dtype: int64

In [74]:

cases_df["Deaths"].idxmin()

Out[74]:

'Bhutan'

In [75]:

cases_df.loc['Bhutan']

Out[75]:

Confirmed    7
Deaths       0
Recovered    5
Name: Bhutan, dtype: int64

In [77]:

cases_df.loc["India"]

Out[77]:

Confirmed    33062
Deaths        1079
Recovered     8437
Name: India, dtype: int64

In [78]:

cases_df.sort_values(by = ["Confirmed"], ascending = True)

Out[78]:

	Confirmed	Deaths	Recovered
Region
Yemen	6	0	1
Western Sahara	6	0	5
Bhutan	7	0	5
Papua New Guinea	8	0	0
Sao Tome and Principe	8	0	4
...	...	...	...
UK	166441	26166	857
France	166543	24121	49118
Italy	203591	27682	71252
Spain	236899	24275	132929
US	1039909	60967	120720

187 rows × 3 columns

In [79]:

cases_df.sort_values(by = ["Recovered"], ascending = False)

Out[79]:

	Confirmed	Deaths	Recovered
Region
Spain	236899	24275	132929
US	1039909	60967	120720
Germany	161539	6467	120400
Mainland China	82862	4633	77610
Iran	93657	5957	73791
...	...	...	...
Cabo Verde	114	1	2
Yemen	6	0	1
South Sudan	34	0	0
Papua New Guinea	8	0	0
MS Zaandam	9	2	0

187 rows × 3 columns

In [60]:

#this is a useful way to get an idea about how dispersed the missing values are in the dataset

In [57]:

import matplotlib.pyplot as plt
import seaborn as sns

In [59]:

sns.heatmap(covid_df.isnull())

Out[59]:

<AxesSubplot:>