#------------------------------------------------ # Difference in Difference & Regression Discontinuity Designs # Author: Luca Messerschmidt # Contributions: Princeton.edu: http://www.princeton.edu/~otorres/DID101R.pdf # ------------------------------------------------- # load packages # ------------------------------------------------- rm(list=ls()) library(readr) library(tidyverse) # ------------------------------------------------- # load data # ------------------------------------------------- coronaNet <- read_csv("data/coronanet_release_allvars.csv") wb <- read_csv("data/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_988619.csv", skip = 4) # ------------------------------------------------- # # subset and merge data # ------------------------------------------------- businessRestrict = coronaNet %>% filter(type %in% c("Restriction and Regulation of Businesses" ,"Restriction of Non-Essential Businesses")) wb = wb %>% gather("year", "gdpPPP", -`Country Name`, -`Country Code`, -`Indicator Name`, -`Indicator Code`, -X65) wb = wb %>% select( -`Indicator Name`, -`Indicator Code`, -X65) wb$year = as.numeric(wb$year) wb$lgdpPPP= log(wb$gdpPPP) data = merge(businessRestrict, wb %>% filter(year == 2018), by.x = 'ISO_A3', by.y = "Country Code", all.x = TRUE) dataAgg = data %>% group_by(country) %>% summarise(numBusinessRestrictions = n(), gdpPPP = mean(gdpPPP)) businessRestrictAgg = businessRestrict %>% group_by(date_announced, country) %>% summarise(numBusinessRestrictions = n()) dframe = expand.grid(unique(coronaNet$date_announced), unique(coronaNet$country)) names(dframe) = c('date_announced', 'country') businessRestrictFull = merge(dframe, businessRestrictAgg, by = c('date_announced', 'country'), all.x = TRUE) businessRestrictFull$numBusinessRestrictions = ifelse(is.na(businessRestrictFull$numBusinessRestrictions), 0, businessRestrictFull$numBusinessRestrictions) RestrictAgg = coronaNet %>% group_by(date_announced, country) %>% summarise(numpolicies = n())%>% filter(!is.na(date_announced)) businessRestrictFull = merge(businessRestrictFull, RestrictAgg, by = c('date_announced', 'country'), all.x = TRUE) businessRestrictFull$numpolicies = ifelse(is.na(businessRestrictFull$numpolicies), 0, businessRestrictFull$numpolicies) businessRestrictFull$ratio<-businessRestrictFull$numBusinessRestrictions/businessRestrictFull$numpolicies # ------------------------------------------------- # CoronaNet Example # ------------------------------------------------- #We want to know if countries that experienced Corona early on have a higher number of business restrictions # Create a dummy variable to indicate the time when the treatment started. Lets assume the Corona crisis started April 2020. In this case, days before April 2020 will have a value of 0 and April+ a 1. businessRestrictFull$time = ifelse(businessRestrictFull$date_announced >= "2020-04-01", 1, 0) # Create a dummy variable to identify the group exposed to the treatment. In this example lets assumed that countries Italy, Iran and China exprienced hard COVID-Waves (=1). Other countries did not (=0). businessRestrictFull$treated = ifelse(businessRestrictFull$country == "Italy" | businessRestrictFull$country == "China" | businessRestrictFull$country == "Iran", 1, 0) # Create an interaction between time and treated. We will call this interaction ‘did’. businessRestrictFull$did = businessRestrictFull$time * businessRestrictFull$treated # Estimating the DID estimator didreg = lm(numBusinessRestrictions ~ treated + time + did, data = businessRestrictFull) summary(didreg) coefplot(didreg) # The coefficient for ‘did’ is the differences-in-differences estimator. The effect is not significant with the treatment having a positive effect on business restrictions #The problem is that we can not clearly identify countries that received the treatment (early cases) and does that did not as countries policies. # With the Regression Discontinuity Design, we loose the Control/Treatment group character and focus on the time difference. # In our preliminary example, we want to check the influence of cases on the number of policies (be careful: This is only an example to illustrate the stats and does not make a lot of sense) names(coronaNet) download.file("https://covid.ourworldindata.org/data/ecdc/total_cases.csv", "./data/cases.csv") cases <- read_csv("data/cases.csv") cases<-gather(cases,"country","cases",3:212,-date) cases$date_announced<- cases$date cases$ration <- cases$cases/cases$World cases<-cases[,-1] test<- left_join(businessRestrictFull,cases, by=c("country","date_announced")) didreg = lm(numpolicies ~ ration + time + I(ratio * time), data = test) summary(didreg) library(arm) coefplot(didreg)