#------------------------------------------------ # Subsetting, transforming (and plotting) data # Author: Cindy Cheng & Luca Messerschmidt # ------------------------------------------------- # load packages # ------------------------------------------------- rm(list=ls()) library(readr) library(tidyverse) # ------------------------------------------------- # load data # ------------------------------------------------- coronaNet <- read_csv("data/coronanet_release.csv") wb <- read_csv("data/API_NY.GDP.PCAP.PP.CD_DS2_en_csv_v2_988619.csv", skip = 4) # ------------------------------------------------- # exploring and manipulating the data # ------------------------------------------------- # https://r4ds.had.co.nz/transform.html #https://garthtarr.github.io/meatR/dplyr.html # the dplyr counterpart to the str() function is the 'glimpse' function glimpse(coronaNet) # dplyr counterpart to the 'which' function is the 'filter' function filter(coronaNet, type == "Closure and Regulation of Schools") # Class question: what if you want to find all observations that fits the condition above but only if the initiating country (country) is Germany? # Class question: what if you want to find all observations that fits the condition above AND the type of policy (type) is 'External Border Restrictions'? # Class question: what if you want to find all observations that fits the condition above that are not NA for the end of policy field (date_end) # the dplyr counterpart to the '[ ]' command to subset by rows is the slice function slice(coronaNet, 1:10) # the dplyr counterpart to the 'sort' function is the arrange function arrange(coronaNet, date_start) arrange(coronaNet, desc(date_start)) # Class question: How could you use arrange() to sort all missing values to the date_end variable? (Hint: use is.na()). # the dplyr counterpart to the $ operator or the [,variableName] operator is 'select' select(coronaNet, entry_type) select(coronaNet, entry_type, country) select(coronaNet, entry_type:country) select(coronaNet, -c(entry_type:country)) select(coronaNet, contains('type')) # Class question: what are some ways you can select record_id, policy_id, target_direction, ISO_A3, and ISO_A2? # the dplyr counterpart to making a new variable is the 'mutate' function mutate(coronaNet, newVar1 = 1, newVar2 = ifelse(entry_type == 'new_entry', 1, 0)) %>% select(newVar1, newVar2) # if you only want to keep the new variables, use transmute() transmute(coronaNet, newVar1 = 1, newVar2 = ifelse(entry_type == 'new_entry', 1, 0)) # ------------------------------------------------- # summarizing data # ------------------------------------------------- coronaNet %>% group_by(country) %>% summarise(count = n(), numberOfTypesOfPolicies = n_distinct(type), mostActive = median(date_start)) coronaNet %>% group_by(country) %>% mutate(correct_type = ifelse(correct_type == 'original', 0, 1)) %>% summarise(count = n(), numberOfTypesOfPolicies = n_distinct(type), mostActive = median(date_start), percCorrected = mean(correct_type)) coronaNet %>% group_by(country, date_start) %>% summarise(count = n(), numberOfTypesOfPolicies = n_distinct(type)) # ------------------------------------------------- # # subset and merge data # ------------------------------------------------- businessRestrict = coronaNet %>% filter(type %in% c("Restriction and Regulation of Businesses" ,"Restriction of Non-Essential Businesses")) wb = wb %>% gather("year", "gdpPPP", -`Country Name`, -`Country Code`, -`Indicator Name`, -`Indicator Code`, -X65) wb = wb %>% select( -`Indicator Name`, -`Indicator Code`, -X65) wb$year = as.numeric(wb$year) wb$lgdpPPP= log(wb$gdpPPP) data = merge(businessRestrict, wb %>% filter(year == 2018), by.x = 'ISO_A3', by.y = "Country Code", all.x = TRUE) dataAgg = data %>% group_by(country) %>% summarise(numBusinessRestrictions = n(), gdpPPP = mean(gdpPPP)) plot(dataAgg$gdpPPP, dataAgg$numBusinessRestrictions) cor(dataAgg$gdpPPP, dataAgg$numBusinessRestrictions, use = 'complete.obs') # Frequency Table library(kableExtra) coronaNet %>% filter(init_country_level == "National", #Only choose the national level #entry_type=="New Entry", #Remove # When you only want to have new entries (no updates or corrections) country=="Germany") %>% # Only Germany group_by(type) %>% summarize(`Total Number of Policies`=n(), `Number of Countries`=length(unique(country)), `Number of Targeted Countries`=length(unique(target_country)), `% With Mandatory Enforcement`=round(mean(grepl(x=compliance,pattern="Mandatory")*100,na.rm=T),0)) %>% dplyr::select(Type="type",everything()) %>% filter(!is.na(Type)) %>% arrange(desc(`Total Number of Policies`)) %>% ungroup() %>% knitr::kable("html",booktabs=T, caption="Policies by Germany on National level") %>% kableExtra::kable_styling(latex_options = c("striped", "hold_position")) %>% kableExtra::column_spec(1,width="4cm") %>% kableExtra::column_spec(2:5,width="2.5cm") coronaNet %>% filter(init_country_level == "Yes, it is at the province/state level", #entry_type=="new_entry", country=="Germany") %>% group_by(type) %>% summarize(`Total Number of Policies`=n(), `Number of Countries`=length(unique(country)), `Number of Targeted Countries`=length(unique(target_country)), `% With Mandatory Enforcement`=round(mean(grepl(x=compliance,pattern="Mandatory")*100,na.rm=T),0)) %>% dplyr::select(Type="type",everything()) %>% filter(!is.na(Type)) %>% arrange(desc(`Total Number of Policies`)) %>% ungroup() %>% knitr::kable("html",booktabs=T, caption="Policies by Germany on state level (Bundesländer)") %>% kableExtra::kable_styling(latex_options = c("striped", "hold_position")) %>% kableExtra::column_spec(1,width="4cm") %>% kableExtra::column_spec(2:5,width="2.5cm") # Density Plot library(lubridate) d<- coronaNet%>% filter(!is.na(type)) %>% group_by(type,date_announced) %>% summarize(Policies=length(unique(record_id))) %>% arrange(type,date_announced) %>% mutate(Policies=cumsum(Policies)) %>% ungroup %>% mutate( type=recode(type, `External Border Restrictions`="External\nBorder\nRestrictions", `Restriction of Non-Essential Businesses`="Restriction of\nNon-Essential\nBusinesses", `Restrictions of Mass Gatherings`="Restrictions of\nMass Gatherings", `Restriction of Non-Essential Government Services`="Restriction of\nNon-Essential\nGovernment Services", `Internal Border Restrictions`="Internal\nBorder Restrictions", `External Border Restrictions`="External\nBorder Restrictions", `Public Awareness Campaigns`="Public\nAwareness Campaigns", `New Task Force or Bureau`="New Task Force")) %>% ggplot(aes(y=Policies,x=date_announced)) + geom_area() + theme_minimal() + theme(panel.grid = element_blank(), strip.background = element_blank()) + xlab("") + facet_wrap(~type) d_all<-d + scale_x_date(date_labels = "%d/%m") d_all ggsave(filename="data/viz/density_sub.jpeg", plot=d_all, pointsize = 24, width = 18 , height = 10, scale = 0.5, dpi = 800) # -------------------------------------- # T-tests # -------------------------------------- # https://www.dummies.com/education/science/biology/how-to-use-student-t-tests-to-compare-averages/ # Doing a t-test in R # https://uc-r.github.io/t_test # Doing a t-test by hand # http://onlinestatbook.com/2/tests_of_means/difference_means.html # https://stattrek.com/hypothesis-test/difference-in-means.aspx # Interpreting p-values # https://blog.minitab.com/blog/adventures-in-statistics-2/how-to-correctly-interpret-p-values # class question: what is the average gdpPPP # lets see if there is a statistical difference in the number of business restrictions for countries over the average GDP PPP level # compared to countries under the average GDP PPP level t.test(dataAgg %>% filter(gdpPPP > mean(dataAgg$gdpPPP, na.rm = TRUE)) %>% select(numBusinessRestrictions), dataAgg %>% filter(gdpPPP <= mean(dataAgg$gdpPPP, na.rm = TRUE))%>% select(numBusinessRestrictions)) dataAgg %>% filter(gdpPPP > mean(dataAgg$gdpPPP, na.rm = TRUE)) %>% summarise(avgNumBusinessRestrictions = mean(numBusinessRestrictions, na.rm = TRUE)) dataAgg %>% filter(gdpPPP <= mean(dataAgg$gdpPPP, na.rm = TRUE)) %>% summarise(avgNumBusinessRestrictions = mean(numBusinessRestrictions, na.rm = TRUE)) #------------ Creating a ggplot ------------------------ businessRestrictAgg = businessRestrict %>% group_by(date_announced, country) %>% summarise(numBusinessRestrictions = n()) dframe = expand.grid(unique(coronaNet$date_announced), unique(coronaNet$country)) names(dframe) = c('date_announced', 'country') businessRestrictFull = merge(dframe, businessRestrictAgg, by = c('date_announced', 'country'), all.x = TRUE) businessRestrictFull$numBusinessRestrictions = ifelse(is.na(businessRestrictFull$numBusinessRestrictions), 0, businessRestrictFull$numBusinessRestrictions) # lets plot business restrictions over time ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions )) # hmmm, how can we distinguish by country? ggplot(data =businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, color = country)) + theme(legend.position = "none") ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, shape = country)) + theme(legend.position = "none") ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, size = country)) + theme(legend.position = "none") ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, alpha = country)) + theme(legend.position = "none") # what happens if we try to connect the dots? ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, color = country)) + geom_line(aes(x = date_announced, y = numBusinessRestrictions, color = country)) + theme(legend.position = "none") ggplot(data = businessRestrictFull %>% filter(country %in% c('China', 'Germany'))) + geom_line(aes(x = date_announced, y = numBusinessRestrictions, color = country)) + theme(legend.position = "none") # ------ note you can also set the asethetic porperties of your geom manually. If you wanted to make all of the points red then... ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, color = 'red'))+ theme(legend.position= 'none') ggplot(data = businessRestrictFull) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, size = 15) ) # ------ Facets # If you want to look at each country individually, you can split your plot into facets ggplot(data = businessRestrictFull%>% filter(country %in% c('China', 'Germany'))) + geom_point(aes(x = date_announced, y = numBusinessRestrictions))+ facet_wrap(~country, nrow = 2) # ------ Geom_smooth # Lets plot the smoothed out data ggplot(data = businessRestrictFull%>% filter(country %in% c('China', 'Germany'))) + geom_smooth(aes(x = date_announced, y = numBusinessRestrictions)) # We can layer different geoms on top of each other! ggplot(data = businessRestrictFull%>% filter(country %in% c('China', 'Germany'))) + geom_point(aes(x = date_announced, y = numBusinessRestrictions, color = country))+ geom_smooth(aes(x = date_announced, y = numBusinessRestrictions, linetype = country)) # Note geom_smooth is a predicted line, to plot the line that goes through the actual points, use geom_line ggplot(data = businessRestrictFull%>% filter(country %in% c('China', 'Germany'))) + geom_line(aes(x = date_announced, y = numBusinessRestrictions, linetype = country)) + geom_point(aes(x = date_announced, y = numBusinessRestrictions,color = country)) + geom_smooth(aes(x = date_announced, y = numBusinessRestrictions, linetype = country)) # ------ Density Plots # how about plotting density plot? dataAgg$gdpDum = ifelse(dataAgg$gdpPPP > mean(dataAgg$gdpPPP, na.rm = TRUE), "Above Average GDP", "Below Average GDP") dataAgg = dataAgg %>% filter(!is.na(gdpDum)) ggplot(data = dataAgg, aes(x = numBusinessRestrictions, fill = gdpDum))+ geom_density(alpha = .5) # ----------- In order to make plots understandable to others, it would be helpful to have useful labels on the plots #http://r4ds.had.co.nz/graphics-for-communication.html p <-ggplot(data = dataAgg, aes(x = numBusinessRestrictions, fill = gdpDum))+ geom_density(alpha = .5)+ labs(title = "Are countries more or less likely to put restriction on businesses of they have above average vs. below average GDP", y = 'Density', x = 'Number of Policies Which Have Put Restrictions on Businesses')+ theme_minimal()+ theme(legend.position = 'bottom', legend.title = element_blank()) # saving your plots ggsave('viz/restrictionGDPDensityPlot.pdf', plot = p) # --------------------------------- # OLS #----------------------------------- # Here are some resources for learning more about OLS: # http://setosa.io/ev/ordinary-least-squares-regression/ # http://r-statistics.co/Linear-Regression.html # https://www.datacamp.com/community/tutorials/linear-regression-R # https://www.albert.io/blog/key-assumptions-of-ols-econometrics-review/ # Simple Bivariate Model model1 = lm(numBusinessRestrictions ~ gdpPPP , data = dataAgg) summary(model1) model2 = lm(numBusinessRestrictions ~ gdpPPP +gdpDum , data = dataAgg) summary(model2) # -------- Making pretty tables with Stargazer---------------- stargazer::stargazer(model1,model2, type = "text") # ---- Outlier Analysis ---- borderRestrict = coronaNet %>% filter(type %in% c("External Border Restrictions")) dataAgg_2 = borderRestrict %>% group_by(country) %>% summarise(numBorderRestrictions = n()) dataAgg_2<- dataAgg[dataAgg$numBusinessRestrictions<80,] # Plot of data with outliers. par(mfrow=c(1, 2)) plot(dataAgg$gdpPPP,dataAgg$numBusinessRestrictions, main="With Outliers", xlab="GDP",ylim=c(0,500), ylab="Policies", pch="*", col="red", cex=2) abline(lm(numBusinessRestrictions ~ gdpPPP , data=dataAgg), col="blue", lwd=3, lty=2) # Plot of original data without outliers. Note the change in slope (angle) of best fit line. plot(dataAgg_2$gdpPPP,dataAgg_2$numBusinessRestrictions, main="Without Outliers", ylim=c(0,500),xlab="GDP", ylab="Policies", pch="*", col="red", cex=2) abline(lm(numBusinessRestrictions ~ gdpPPP , data=dataAgg_2), col="blue", lwd=3, lty=2) #Detect Outliers par(mfrow=c(1, 1)) outlier_values <- boxplot.stats(dataAgg$numBusinessRestrictions) # outlier values. boxplot(dataAgg$numBusinessRestrictions, main="Outliers", boxwex=0.1) mtext(paste("Outliers: ", paste(outlier_values, collapse=", ")), cex=0.6) #Bivariate approach # For categorical variable boxplot(numBusinessRestrictions ~ date_announced, data=businessRestrictAgg,main="Ozone reading across months") # clear pattern is noticeable. #Multivariate Model Approach ##Cooks distance mod <- lm(numBusinessRestrictions ~ gdpPPP, data=dataAgg) cooksd <- cooks.distance(mod) plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance") # plot cook's distance abline(h = 4*mean(cooksd, na.rm=T), col="red") # add cutoff line text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T),names(cooksd),""), col="red") # add labels