#' Calculate two-stage difference-in-differences following Gardner (2021) #' #' @import fixest #' #' @param data The dataframe containing all the variables #' @param yname Outcome variable #' @param first_stage Fixed effects and other covariates you want to residualize #' with in first stage. #' Formula following \code{\link[fixest:feols]{fixest::feols}}. #' Fixed effects specified after "`|`". #' @param second_stage Second stage, these should be the treatment indicator(s) #' (e.g. treatment variable or event-study leads/lags). #' Formula following \code{\link[fixest:feols]{fixest::feols}}. #' Use `i()` for factor variables, see \code{\link[fixest:i]{fixest::i}}. #' @param treatment A variable that = 1 if treated, = 0 otherwise. The first #' stage will be estimated for `treatment == 0`. The second stage will be #' estimated for the *full sample*. #' @param cluster_var What variable to cluster standard errors. This can be IDs #' or a higher aggregate level (state for example) #' @param weights Optional. Variable name for regression weights. #' @param bootstrap Optional. Should standard errors be calculated using bootstrap? #' Default is `FALSE`. #' @param n_bootstraps Optional. How many bootstraps to run. #' Default is `250`. #' @param return_bootstrap Optional. Logical. Will return each bootstrap second-stage #' estimate to allow for manual use, e.g. percentile standard errors and empirical #' confidence intervals. #' @param verbose Optional. Logical. Should information about the two-stage #' procedure be printed back to the user? #' Default is `TRUE`. #' #' @return `fixest` object with adjusted standard errors #' (either by formula or by bootstrap). All the methods from `fixest` package #' will work, including \code{\link[fixest:esttable]{fixest::esttable}} and #' \code{\link[fixest:coefplot]{fixest::coefplot}} #' #' @section Examples: #' #' Load example dataset which has two treatment groups and homogeneous treatment effects #' #' ```{r, comment = "#>", collapse = TRUE} #' # Load Example Dataset #' data("df_hom") #' ``` #' #' ### Static TWFE #' #' You can run a static TWFE fixed effect model for a simple treatment indicator #' ```{r, comment = "#>", collapse = TRUE} #' static <- did2s(df_hom, #' yname = "dep_var", treatment = "treat", cluster_var = "state", #' first_stage = ~ 0 | unit + year, #' second_stage = ~ i(treat, ref=FALSE)) #' #' fixest::esttable(static) #' ``` #' #' ### Event Study #' #' Or you can use relative-treatment indicators to estimate an event study estimate #' ```{r, comment = "#>", collapse = TRUE} #' es <- did2s(df_hom, #' yname = "dep_var", treatment = "treat", cluster_var = "state", #' first_stage = ~ 0 | unit + year, #' second_stage = ~ i(rel_year, ref=Inf)) #' #' fixest::esttable(es) #' ``` #' #' ```{r, eval = F} #' # plot rel_year coefficients and standard errors #' fixest::coefplot(es, keep = "rel_year::(.*)") #' ``` #' #' ### Example from Cheng and Hoekstra (2013) #' #' Here's an example using data from Cheng and Hoekstra (2013) #' ```{r, comment = "#>", collapse = TRUE} #' # Castle Data #' data(castle, package = "did2s") #' #' did2s( #' data = castle, #' yname = "l_homicide", #' first_stage = ~ 0 | state + year, #' second_stage = ~ i(post, ref=0), #' treatment = "post", #' cluster_var = "state", #' weights = "popwt" #' ) #' ``` #' #' @export did2s <- function( data, yname, first_stage, second_stage, treatment, cluster_var, weights = NULL, bootstrap = FALSE, n_bootstraps = 250, return_bootstrap = FALSE, verbose = FALSE ) { # Check Parameters --------------------------------------------------------- dreamerr::check_arg(data, "data.frame") dreamerr::check_value(data[[treatment]], "logical (loose) vector") if (verbose) { did2s_summary_message( yname, first_stage, second_stage, treatment, cluster_var, bootstrap ) } # Point Estimates ---------------------------------------------------------- est <- did2s_estimate( data = data, yname = yname, first_stage = first_stage, second_stage = second_stage, treatment = treatment, weights = weights, bootstrap = bootstrap ) # Analytic Standard Errors ------------------------------------------------- if (!bootstrap) { # Subset data to the observations used in the second stage # obsRemoved have - in front of rows, so they are deleted removed_rows <- est$second_stage$obs_selection$obsRemoved if (!is.null(removed_rows)) { data <- data[removed_rows, ] } # Extract weights if (is.null(weights)) { weights_vector <- rep.int(1L, nrow(data)) } else { weights_vector <- sqrt(data[[weights]]) } # Extract first stage first_u <- est$first_u if (!is.null(removed_rows)) { first_u <- first_u[removed_rows] } # x1 is matrix used to predict Y(0) x1 <- fixest::sparse_model_matrix( est$first_stage, data = data, type = c("rhs", "fixef") ) # Extract second stage second_u <- stats::residuals(est$second_stage) x2 <- fixest::sparse_model_matrix( est$second_stage, type = c("rhs", "fixef") ) # multiply by weights first_u <- weights_vector * first_u x1 <- weights_vector * x1 second_u <- weights_vector * second_u x2 <- weights_vector * x2 # x10 is matrix used to estimate first stage (zero out rows with D_it = 1) x10 <- copy(x1) # treated rows. Note dgcMatrix is 0-index !! treated_rows <- which(data[[treatment]] == 1L) - 1 idx <- x10@i %in% treated_rows x10@x[idx] <- 0 # The influence consists of two terms: # (X_2' X_2)^{-1} X_2' (y - \hat{y}(0)) # # First is the second stage standard OLS IF: # $$(X_2' X_2)^{-1} X_{2i}' \hat{v}_i$$ # # Second is the effect of the first stage estimate of $\hat{y}(0)$: # $$(X_2' X_2)^{-1} X_2' X_1 (X_{10}' X_{10})^{-1) X_{10, i} \hat{u}_i$$ # # The IF is the sum of the two terms # x2tx2_inv <- (est$second_stage$cov.iid / est$second_stage$sigma2) IF_ss <- x2tx2_inv %*% Matrix::t(x2 * second_u) # Use robust QR-based solve instead of direct Matrix::solve() gamma_hat <- robust_solve_XtX(x10, Matrix::crossprod(x1, x2)) IF_fs <- x2tx2_inv %*% Matrix::t(gamma_hat) %*% Matrix::t((x10 * first_u)) IF <- IF_fs - IF_ss cl <- data[[cluster_var]] cov <- Reduce( "+", lapply( split(1:length(cl), cl), function(cl_idx) { Matrix::tcrossprod(Matrix::rowSums(IF[, cl_idx, drop = FALSE])) } ) ) cov <- as.matrix(cov) rownames(cov) <- colnames(cov) <- names(est$second_stage$coefficients) } # Bootstrap Standard Errors ------------------------------------------------ if (bootstrap) { if (verbose) { message(sprintf( "Starting %s bootstraps at cluster level: %s\n", n_bootstraps, cluster_var )) } # Unique values of cluster variable cl <- unique(data[[cluster_var]]) stat <- function(x, i) { # select the observations to subset based on the cluster var block_obs <- unlist(lapply( i, function(n) which(x[n] == data[[cluster_var]]) )) # run regression for given replicate, return estimated coefficients stats::coefficients( did2s_estimate( data = data[block_obs, ], yname = yname, first_stage = first_stage, second_stage = second_stage, treatment = treatment, weights = weights, bootstrap = TRUE )$second_stage ) } boot <- boot::boot(cl, stat, n_bootstraps) # Get estimates and fix names estimates <- boot$t colnames(estimates) <- names(stats::coef(est$second_stage)) # Bootstrap Var-Cov Matrix cov <- stats::cov(estimates) if (return_bootstrap) { return(estimates) } } # summary creates fixest object with correct standard errors and vcov vcov_list = list() vcov_list[[sprintf("Corrected Clustered (%s)", cluster_var)]] = cov est <- base::suppressWarnings(summary(est$second_stage, vcov = vcov_list)) return(est) } # Point estimate for did2s #' Robust solve for X'X beta = X'Y using QR decomposition #' #' This function computes the least squares solution beta = (X'X)^(-1) X'Y #' in a numerically stable way using QR decomposition, handling rank-deficient #' matrices gracefully. #' #' @param X Design matrix (sparse or dense) #' @param Y Response matrix/vector (can be X'Y if already computed) #' @return The least squares solution beta (may contain 0 for rank-deficient columns) robust_solve_XtX <- function(X, Y) { # Handle both vector and matrix Y if (is.vector(Y)) { # Y is a vector, need to compute X'Y XtY <- Matrix::crossprod(X, Y) } else { # Y is a matrix, check dimensions if (nrow(Y) == nrow(X)) { # Y is the raw response matrix, need to compute X'Y XtY <- Matrix::crossprod(X, Y) } else if (nrow(Y) == ncol(X)) { # Y is already X'Y (cross product form) XtY <- Y } else { stop("Incompatible dimensions between X and Y") } } # Now solve the system X'X beta = X'Y using robust methods XtX <- Matrix::crossprod(X) # Check if the matrix is singular using condition number approach beta_hat <- tryCatch( { # Try direct solve first for speed Matrix::solve(XtX, XtY) }, error = function(e) { # If direct solve fails, fall back to SVD-based approach # Use the fact that the least squares solution is # beta_hat = (X'X)+ * X'Y where (X'X)+ is the Moore-Penrose pseudoinverse # For now, use a simple approach: set small singular values to zero SVD <- svd(as.matrix(XtX)) tol <- max(dim(XtX)) * .Machine$double.eps * max(SVD$d) positive_indices <- SVD$d > tol # Matrix is rank deficient d_inv <- ifelse(positive_indices, 1 / SVD$d, 0) XtX_pinv <- SVD$v %*% diag(d_inv) %*% t(SVD$u) XtX_pinv %*% as.matrix(XtY) } ) return(beta_hat) } # Point estimate for did2s did2s_estimate <- function( data, yname, first_stage, second_stage, treatment, weights = NULL, bootstrap = FALSE, cluster_var = NULL # Just so you can switch `did2s` to `did2s_estimate` for quick dev estimation ) { ## We'll use fixest's formula expansion macros to swap out first and second ## stages (see: ?fixest::xpd) fixest::setFixest_fml( ..first_stage = first_stage, ..second_stage = second_stage ) # First stage among untreated untreat <- data[data[[treatment]] == 0, ] if (is.null(weights)) { weights_vector <- NULL } else { weights_vector <- untreat[[weights]] } first_stage <- fixest::feols( fixest::xpd(~ 0 + ..first_stage, lhs = yname), data = untreat, weights = weights_vector, # combine.quick = FALSE, # (deprecated argument) fixef.keep_names = TRUE, # allows var1^var2 in FEs fixef.rm = "none", warn = FALSE, notes = FALSE, ) # Residualize outcome variable but keep same yname first_u <- data[[yname]] - stats::predict(first_stage, newdata = data) data[[yname]] <- first_u # Zero out residual rows with D_it = 1 (for analytical SEs later on) if (!bootstrap) { first_u[data[[treatment]] == 1] <- 0 } # Second stage if (!is.null(weights)) { weights_vector <- data[[weights]] } second_stage <- fixest::feols( fixest::xpd(~ 0 + ..second_stage, lhs = yname), data = data, weights = weights_vector, warn = FALSE, notes = FALSE ) ret <- list( first_stage = first_stage, second_stage = second_stage ) if (!bootstrap) { ret <- list( first_stage = first_stage, second_stage = second_stage, first_u = first_u ) } else { ret <- list(second_stage = second_stage) } return(ret) } did2s_summary_message <- function( yname, first_stage, second_stage, treatment, cluster_var, bootstrap ) { cluster_msg <- if (bootstrap) { sprintf( "- Standard errors will be block bootstrapped with cluster `%s`\n", cluster_var ) } else { sprintf( "- Standard errors will be clustered by `%s`\n", cluster_var ) } msg <- paste( "Running Two-stage Difference-in-Differences\n", sprintf("- first stage formula %s\n", rlang::f_label(first_stage)), sprintf("- second stage formula %s\n", rlang::f_label(second_stage)), sprintf( "- The indicator variable that denotes when treatment is on is `%s`\n", treatment ), cluster_msg, collapse = "\n" ) message(msg) }