#------------------------------------------------
# Subsetting, transforming (and plotting) data
# Author: Cindy Cheng & Luca Messerschmidt


# -------------------------------------------------
# load packages 
# -------------------------------------------------
rm(list=ls())

library(readr)
library(tidyverse)

# -------------------------------------------------
# load data
# -------------------------------------------------
coronaNet <- read_csv("data/coronanet_release.csv")
wb <- read_csv("data/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_988619.csv", skip = 4)


# -------------------------------------------------
# # explore data
# -------------------------------------------------
# lets take a look at the first 6 rows
head(coronaNet)

# lets take a look at what classes the different data are stored in and the values they can take on
str(coronaNet)

# lets take a look at the distribution of the countries that have initiated a policy
table(coronaNet$country)%>% sort(decreasing = TRUE) 

# lets look at the distribution of different policies
table(coronaNet$type) %>% sort()

# Class question: how could you look at the distribution of different sub-types: type_sub_cat ? 

names(coronaNet)

# -------------------------------------------------
# clean data
# -------------------------------------------------
coronaNet  = coronaNet %>% 
  mutate(type = ifelse(type == 'Quarantine/Lockdown', 'Lockdown', type))

table(coronaNet$type)
names(wb)

wb = wb %>% gather("year", "gdpPPP", -`Country Name`, -`Country Code`, -`Indicator Name`, -`Indicator Code`, -X65)

?gather

wb = wb %>% select( -`Indicator Name`, -`Indicator Code`, -X65)
wb$year = as.numeric(wb$year)
wb$lgdpPPP= log(wb$gdpPPP)


# -------------------------------------------------
# # subset data
# -------------------------------------------------

businessRestrict = coronaNet %>% filter(type == "Restriction of Non-Essential Businesses")

# what are the dimensions of this new subsetted data?
dim(businessRestrict)
# which countries have instituted restrictions of non-essential businesses?

businessRestrict$country %>% unique()
unique(businessRestrict$country)


# what are the different sub types for the broader non-essentital businesses policy type?
unique(businessRestrict$type_sub_cat)

# -------------------------------------------------
# # Merge data
# -------------------------------------------------
names(wb)[1] = 'country'

names(businessRestrict)

data = merge(businessRestrict, wb %>% filter(year == 2018), by= 'country', all.x = TRUE)


# how can we find out what gdp of the Germany is in this 'data'?
data %>% filter(country == 'Germany') %>% select(gdpPPP)


# how can we find out what the gdp of the United States is in this data?
data %>% filter(country == 'United States of America') %>% select(gdpPPP)


# how can we investigate which countries match across different datasets?
intersect(coronaNet$country, wb$country)


# and the countries that don't match?
cbind(setdiff(coronaNet$country, wb$country), 
      setdiff( wb$country, coronaNet$country)) %>% head()

names(wb)[2] = 'ccode'
names(businessRestrict)[31] = 'ccode'


data = merge(businessRestrict, wb %>% filter(year == 2018), by= 'ccode', all.x = TRUE)

# how can we find out what the gdp of the United States is in this data?
data %>% filter(ccode == 'USA') %>% select(gdpPPP)


# lets see if there's a relationship between how rich a country as measured by gdpPPP is and how many business restrictions it imposes

dataAgg = data %>% 
  group_by(ccode) %>% 
  summarise(numBusinessRestrictions = n(),
            gdpPPP = mean(gdpPPP))

head(dataAgg)

dataAgg %>% filter(ccode == 'DEU') %>% select(gdpPPP)


plot(dataAgg$gdpPPP, dataAgg$numBusinessRestrictions)
cor(dataAgg$gdpPPP, dataAgg$numBusinessRestrictions, use = 'complete.obs')

# Class question: how about the relationship between log gdpPPP and how many business restrictions it imposes?

plot(log(dataAgg$gdpPPP), dataAgg$numBusinessRestrictions)
cor(log(dataAgg$gdpPPP), dataAgg$numBusinessRestrictions, use = 'complete.obs')