##############################################################
### Title: "Week 5 - Randomized Controlled Trials"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
##############################################################
# Clears memory
rm(list = ls())
# Clears console
cat("\014")
options(scipen = 0)
### Load libraries
# If you don't have one of these packages installed already, you will need to run install.packages() line
library(tidyverse)
library(estimatr)
#install.packages(modelsummary)
library(modelsummary)
## Are Emily and Greg More Employable Than Lakisha and Jamal? Example of an audit study
d = read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week5/1_RCT/data/lakisha_aer.csv")
# Some variables of interest
# education: 0 = not reported; 1 = High school dropout (HSD); 2 = High school graduate (HSG); 3 = Some college; 4 = college +
# ofjobs: Number of jobs listed on resume
# yearsexp: Years of experience
# computerskills: Applicant lists computer skills
# sex: gender of the applicant (according to name)
# race: race-sounding name
# h: high quality resume
# l: low quality resume
# city: c = chicago, b = boston
# call: applicant was called back
## Let's assume we are only interested in the "race" treatment. Create a variable treat = 1 if race = "b" and 0 if race = "w"
d = d %>% mutate(treat = ifelse(race == "b", 1, 0))
## Q: How many treated observations are there? And how many control?
d %>% group_by(treat) %>% count() # group_by() groups your data according to the different levels of the variables in the argument (in this case, treat).
# You could also do the same with d %>% group_by(treat) %>% table()
############################ BALANCE CHECK ####################################
## First step is check for balance: This means, make sure baseline covariates
# are the same between both groups!
## Let's check if randomization was done correctly: Balance table
# We are going to create another dataset that only contains the variables
# we want to compare (baseline covariates/characteristics + the treatment variable)
# DO NOT INCLUDE POST-TREATMENT VARIABLES
d_bal = d %>% select(treat, education, ofjobs, yearsexp, computerskills, h, l, city)
# We will use datasummary_balance() to get a pretty table
datasummary_balance(~ treat, data = d_bal, title = "Balance table",
fmt=2, dinm_statistic = "p.value") # Q: Does education make sense as a continuous variable?
# The first argument provides the variable for the groups we want to compare (in this case treat=0 and treat=1)
# The second argument is our data
# Third argument is the title of our table (if we want to include one)
# fmt = 2 is an argument to set the number of decimal places we would like for our table (change it to fmt=4 and see what happens!)
# dinm_statistic provides the statistic that is returned to assess the difference between groups.
# (By default it provides the standard error, but the p.value is easier to assess differences)
# Note that you can obtain the same values for the balance table using tidyverse code,
# but note that it doesn't teell us if the difference is statistically significant or not!:
d_bal %>% group_by(treat) %>% summarize(across(.cols = everything(), .fns = mean))
#across() tells the summarize() function which variables (.cols) to use for summarizing (in this case, everything). We want to get the mean of each variable for treat = 0 and treat = 1.
# You can change mean for any other statistic that you're interested in (standard deviation, median, etc.).
# Q: Why does city return an NA value?
# Let's transform education and city into binary variables to assess the difference (p-values)!
d_bal = d %>% mutate(educ_noinfo = ifelse(education==0, 1, 0),
educ_hsd = ifelse(education==1, 1, 0),
educ_hsg = ifelse(education==2, 1, 0),
educ_somecoll = ifelse(education==3, 1, 0),
educ_college = ifelse(education==4, 1, 0),
boston = ifelse(city=="b",1,0)) %>%
select(treat, educ_noinfo, educ_hsd, educ_hsg, educ_somecoll, educ_college,
ofjobs, yearsexp, computerskills, h, l, boston)
# If your variables have a common root (e.g. all the dummies for education start with "educ_",
# you can also use this to select all of them!)
d_bal_alt = d %>% mutate(educ_noinfo = ifelse(education==0, 1, 0),
educ_hsd = ifelse(education==1, 1, 0),
educ_hsg = ifelse(education==2, 1, 0),
educ_somecoll = ifelse(education==3, 1, 0),
educ_college = ifelse(education==4, 1, 0),
boston = ifelse(city=="b", 1, 0)) %>%
select(treat, starts_with("educ_"),
ofjobs, yearsexp, computerskills, h, l, boston)
# Q: Check d_bal and d_bal_alt. Are they the same?
datasummary_balance(~ treat, data = d_bal, title = "Balance table", fmt=2, dinm_statistic = "p.value")
# Q: Is there a statistical significant difference? Is that a problem?
############################ ESTIMATING CAUSAL EFFECT ##########################
## Now, let's run a simple model
summary(lm_robust(call ~ treat, data = d)) #Question: Why do we use lm_robust() and not lm()?
# Interpretation: Having an African-American sounding name reduces the probability of a callback by 3.2 percentage points
# compared to having a white sounding name.
# Q: What is the estimand we are estimating? Comment your results.
# Now, let's add covariates
summary(lm_robust(call ~ treat + factor(education) + ofjobs + yearsexp + computerskills + h + factor(city), data = d))
# Q: Does the point estimate change? Why or why not?
##################### EXERCISE AT HOME ########################################
# Do the same analysis (balance and estimate the causal effect), but now using
# female vs male sounding name as the treatment variable. Is there an effect?