#Depends: AER, sandwich library(AER); library(sandwich); #'The Confidence Interval Method for Selecting Valid Instrumental Variables. #' #'Implements the confidence interval (CI) downward testing procedure based on #'the Sargan/Hansen-J test for overidentifying restrictions. The algorithm #'selects the valid instrumental variables from a set of potential instruments #'that may contain invalid ones. It also provides post-selection IV estimation #'results using the selected valid instruments as IV and controlling for the #'selected invalid instruments as explanatory variables. #' #'@param Y A numeric vector of outcomes. #'@param D A numeric vector of exposures/treatments. #'@param Z A numeric matrix of instrumental variables, with each column #' referring to one instrument. #'@param X An optional numeric matrix of exogenous explanatory variables, with #' each column referring to one variable. #'@param alpha A numeric scalar between 0 and 1 specifying the significance #' level for the confidence interval for the causal effect estimate (default = #' 0.05). #'@param tuning A numeric scalar specifiying the threshold p-value for the #' Saran/Hansen-J test (default = 0.1/log(n)). #'@param robust Logical. If robust = TRUE, the linear model is robust to #' heteroskedasticity (default = TRUE). #'@param firststage Logical. If firststage = TRUE, a first-stage thresholding is #' implemented to select the relevant instrument variables (default = FALSE). #'@param firsttuning A numeric scalar specifiying the threshold critical value for the #' first stage t-test (default = sqrt(2.01*log(pz))). #'@return Valid instruments: #' Identities of the valid instrumental variables #' selected by the algorithm. #'@return Number of Valid Instruments: #' The number of the selected valid instrumental variables. #'@return Relevant instruments: #' Identities of the relevant instrumental variables #' selected by the first stage thresholding if firststage = TRUE. #'@return Number of Relevant Instruments: #' The number of the selected relevant instrumental variables #' if firststage = TRUE. #'@return Coefficients: #' The matrix for the post-selection IV estimation results #' for the coefficients of the exposure/treatment variable and exogenous #' explanatory variables using the selected valid instruments as IV and #' controlling for the selected invalid instruments. The first two columns are #' 2SLS estimates and their standard errors. If robust = TRUE, the second #' column is heteroskedasticity-robust stand errors. The two-step GMM estimates #' and their standard errors are also reported. If intercept = TRUE, the #' estimates for the intercept are reported in the last row. #'@return Confidence Interval: #' The confidence interval for the 2SLS estimates for #' the coefficient of the exposure/treatment variable with significance level #' specified by alpha (default = 0.05). If robust = TRUE, the confidence #' interval for the two-step GMM estimate is also reported. #'@return p-value of Sargan: #' The p-value for the Sargan overidentifying test for #' the selected valid instruments. If robust = TRUE, the p-value for the #' Hansen-J test is reported. #'@examples #'library(AER); library(sandwich) #'# the MASS package is only needed to #'run the working example #'library(MASS) #'#Generate data #'n = 2000; L = 10; s = 3 #'pi = c(rep(3,s),rep(0,L-s)); beta = 1; gamma = c(0, rep(1,(L-2)), 0) #'epsilonSigma = matrix(c(1,0.8,0.8,1),2,2) #'epsilon = mvrnorm(n,rep(0,2),epsilonSigma) #'Z = matrix(rnorm(n*L),n,L) #'D = 0.5 + Z %*% gamma + epsilon[,1] #'Y = -0.5 + Z %*% pi + D * beta + epsilon[,2] #'result = CIIV(Y,D,Z,robust=FALSE, firststage=TRUE) #'result #' @export CIIV <- function(Y, D, Z, X, alpha = 0.05, tuning = 0.1/log(length(Y)), robust = TRUE, firststage = FALSE, firsttuning = sqrt(2.01*log(ncol(Z)))) { #Check Input #Check Data if(is.data.frame(Y)){Y <- as.matrix(Y)} if(!is.vector(Y) && !is.matrix(Y) | !is.numeric(Y) | ncol(as.matrix(Y))!=1) stop("Y must be a numeric vector."); Y = as.numeric(Y); if(!is.null(colnames(D))){Dname = colnames(D)}else{Dname = "D"}; if(is.data.frame(D)){D <- as.matrix(D)} if(!is.vector(D) && !is.matrix(D) | !is.numeric(D) | ncol(as.matrix(D))!=1) stop("D must be a numeric vector."); D = as.numeric(D); if(is.data.frame(Z)){Z <- as.matrix(Z)} if(!is.matrix(Z) | !is.numeric(Z)) stop("Z must be a numerical matrix."); if(ncol(Z)<2) stop("The number of instruments must be greater than 1."); if( ncol(Z) > nrow(Z)) stop("The number of instruments must be smaller than the sample size."); if(!missing(X) && is.data.frame(X)){X <- as.matrix(X)} if(!missing(X) && (!is.matrix(X) | !is.numeric(X))) stop("X must be a numerical matrix."); stopifnot(length(Y) == length(D),length(Y) == nrow(Z)); #Other Arguments stopifnot(is.logical(firststage), is.logical(robust)); stopifnot(is.numeric(alpha), length(alpha) == 1,alpha <= 1,alpha >= 0); stopifnot(is.numeric(tuning), length(tuning) == 1); # Define Constants n <- length(Y); pz <- ncol(Z); # Preserve Data d = D; y = Y; z = Z; if (!missing(X)){ X <- cbind(1, X); if(is.null(colnames(X))){colnames(X) = c("intercept", paste0('X', 1:ncol(X)))}; }else{ X <- matrix(1, n, 1); colnames(X) <- "intercept"; }; Covariates = cbind(D,X); Exogenous = cbind(Z,X); # Variable Names colnames(Covariates)[1] <- Dname; CovariatesNames <- colnames(Covariates); if(!is.null(colnames(Z))){InstrumentNames = colnames(Z)}else{InstrumentNames = paste0('Z', 1:ncol(Z))}; # Centralization D <- qr.resid(qr(X), D); Z <- scale(qr.resid(qr(X), Z), center = FALSE, scale = TRUE); # First Stage if (firststage){ lm_first <- lm(D ~ Z - 1); coef_first <- coef(lm_first); sd_first <- sqrt(diag(if(robust) vcovHC(lm_first, "HC0") else vcov(lm_first))); first_index <- as.numeric(abs(coef_first/sd_first) >= firsttuning); relevant_index <- relevant_instruments <- which(first_index == 1); Nr_relevant <- length(relevant_instruments); if (sum(first_index) <= 1){ warning("Less than two IVs are individually relevant, treat all IVs as strong"); relevant_instruments <- c(1:pz); } if (length(relevant_instruments) < pz){ X <- cbind(X, z[, -relevant_instruments]); Z <- as.matrix(z[, relevant_instruments], nrow = n, ncol = Nr_relevant); pz <- ncol(Z); D <- qr.resid(qr(X), d); Z <- scale(qr.resid(qr(X), Z), center = FALSE, scale = TRUE); } }else{ relevant_index <- relevant_instruments <- c(1:pz); Nr_relevant <- pz; }; Y <- qr.resid(qr(X), Y); # OLS Estimation for the IV-Specific Estimates and Standard Error lm_Reduced <- lm(cbind(Y, D) ~ Z - 1); # IV-Specific Estimates gamma_Y <- coef(lm_Reduced)[, 1]; gamma_D <- coef(lm_Reduced)[, 2]; betaIV <- gamma_Y / gamma_D; # Variances by Delta Method U <- solve(crossprod(Z)); Jacobian_betaIV <- cbind( diag(c(1 / gamma_D)), diag(c(-gamma_Y / gamma_D^2)) ); Covmat_gamma <- if(robust) vcovHC(lm_Reduced, "HC0") else vcov(lm_Reduced); sdIV <- sqrt(diag(Jacobian_betaIV %*% Covmat_gamma %*% t(Jacobian_betaIV))); # CI Downward Testing Sargen/Hansen-J Procedure and Post-Selection Estimation CIIV.TestingSelectionEstimation( Y, D, Z, U, Covariates, Exogenous, y, z, CovariatesNames, InstrumentNames, betaIV = betaIV, sdIV = sdIV, alpha = alpha, tuning = tuning, robust = robust, gamma_D = gamma_D, Covmat_gamma = Covmat_gamma, firststage = firststage, relevant_instruments = relevant_instruments, Nr_relevant = Nr_relevant, relevant_index = relevant_index ); } #Selection by the Sargan/Hansen Downward Testing Procedure #'Internal CIIV Functions #' #'These are not to be called by the user. CIIV.TestingSelectionEstimation <- function( Y, D, Z, U, Covariates, Exogenous, y, z, CovariatesNames, InstrumentNames, betaIV, sdIV, alpha = 0.05, tuning = 0.1/log(length(Y)), robust = TRUE, gamma_D, Covmat_gamma, firststage = FALSE, relevant_instruments, Nr_relevant, relevant_index) { # Function for Sargan Test non_robust_sar_CI <- function(res_CI, Z, U, n) { (t(res_CI) %*% Z %*% U %*% t(Z) %*% res_CI) / (t(res_CI) %*% res_CI / n) } # Function for Two Step GMM and Hansen-J Test CIM.HansenJTest <- function(Y,X,Z){ res_FirstStep <- residuals(AER::ivreg(Y ~ X - 1 | Z)); Weight_SecondStep <- crossprod(res_FirstStep * Z); Coef_SecondStep <- solve( t(X) %*% Z %*% solve(Weight_SecondStep) %*%t(Z) %*% X ) %*% t(X) %*% Z %*% solve(Weight_SecondStep) %*% t(Z) %*% Y; res_SecondStep <- as.vector(Y - X %*% Coef_SecondStep); sd_SecondStep <- sqrt(diag(solve( t(X) %*% Z %*% solve(Weight_SecondStep) %*%t(Z) %*% X ) %*% t(X) %*% Z %*% solve(Weight_SecondStep)%*%crossprod(res_SecondStep * Z)%*%t( solve( t(X) %*% Z %*% solve(Weight_SecondStep) %*%t(Z) %*% X ) %*% t(X) %*% Z %*% solve(Weight_SecondStep) ))); HansenJ_Stat <- t(res_SecondStep) %*% Z %*% solve(Weight_SecondStep) %*% t(Z) %*% res_SecondStep; list(HansenJ_Stat, Coef_SecondStep,sd_SecondStep) } # Define Constants n <- length(Y); pz <- ncol(Z); colnames(Z) = paste0(relevant_instruments); colnames(z) = paste0(c(1:ncol(z))); # Break Points crit <- matrix(0, pz, pz); rownames(crit) = colnames(crit) <- paste0(relevant_instruments); for (i in 1:pz) { for (j in 1:pz) { crit[i, j] <- abs(betaIV[i] - betaIV[j]) / (sdIV[i] + sdIV[j]); } } # The Downward Testing Procedure maxs <- pz; Psar <- 0; epsi <- 10^(-7); Psi <- max(crit) + epsi; while (maxs > 0 && Psar < tuning) { # The confidence intervals ciIV <- matrix(0, pz, 3); ciIV[, 1] <- betaIV - Psi * sdIV; ciIV[, 2] <- betaIV + Psi * sdIV; ciIV[, 3] <- relevant_instruments; # Ordering the Confidence Intervals by the Lower Ends ciIV <- ciIV[order(ciIV[, 1]), ]; # Check Overlap grid_CI <- matrix(0, pz, pz) for (i in 2:pz) { for (j in 1:i) { grid_CI[i, j] <- as.numeric(ciIV[i, 1] <= ciIV[j, 2]); } } sumCI <- apply(grid_CI, 1, sum); selCI <- as.matrix(grid_CI[which(sumCI == max(sumCI)), ]); maxs <- max(sumCI); #Selection if(maxs >= 2 && length(selCI) < pz + 1) { # No tie selCI <- cbind(ciIV[, 3], selCI); selVec <- sort(selCI[selCI[, 2] == 1, 1]); Zv <- Z[,!colnames(Z) %in% paste0(selVec)]; if(robust) { sar_CI <- CIM.HansenJTest(Y, cbind(D, Zv), Z)[[1]]; } else { res_CI <- resid(AER::ivreg(Y ~ cbind(D, Zv) - 1 | Z)); sar_CI <- non_robust_sar_CI(res_CI, Z, U, n); } Psi <- max(crit[paste0(selVec), paste0(selVec)]) - epsi; # updated psi Psar <- pchisq(sar_CI, maxs - 1, lower.tail = FALSE); } else if(maxs >= 2) { # Tie selCI <- cbind(ciIV[, 3], t(selCI)); selVec <- matrix(0, maxs, ncol(selCI) - 1); Psar <- Psi <- rep(0, ncol(selCI) - 1); for (i in 1:(ncol(selCI) - 1)) { selVec[, i] <- sort(selCI[selCI[, i+1] == 1, 1]); if(robust){ sar_CI <- CIM.HansenJTest(Y, cbind(D, Z[,!colnames(Z) %in% paste0(selVec[, i])]), Z)[[1]]; } else{ res_CI <- resid(AER::ivreg(Y ~ D + Z[,!colnames(Z) %in% paste0(selVec[, i])] - 1 | Z)); sar_CI <- non_robust_sar_CI(res_CI, Z, U, n); } Psar[i] <- pchisq(sar_CI, maxs - 1, lower.tail = FALSE); Psi[i] <- max(crit[paste0(selVec[, i]), paste0(selVec[, i])]); } index.tie <- match(max(Psar), Psar); selVec <- selVec[,index.tie]; Psar <- Psar[index.tie]; Psi <- min(Psi) - epsi; # updated psi } else { # None Valid maxs <- 0; Psar <- 0; selVec <- NULL; } } Nr_valid <- length(selVec); # Post-Selection Estimation if (Nr_valid == 0) { print("None of the instruments is selected as valid, do OLS.") regressor_CIM_temp = regressor_CIM <- cbind(Covariates, z); length_regressor <- ncol(regressor_CIM)-ncol(z); Coefficients_CIM <- qr.coef(qr(regressor_CIM), y)[1:length_regressor]; res_CIM <- qr.resid(qr(regressor_CIM), y); } else { # At least one of the IVs is selected as valid. z_invalid <- matrix(z[,!colnames(z) %in% paste0(selVec)], ncol = (ncol(z) - Nr_valid), nrow = n); regressor_CIM_temp <- cbind(Covariates, z_invalid); regressor_CIM <- cbind(fitted(lm(Covariates[,1] ~ Exogenous)), Covariates[,-1], z_invalid) length_regressor <- ncol(regressor_CIM_temp) - ncol(z_invalid); iv_CIM <- AER::ivreg(y ~ regressor_CIM_temp - 1 | Exogenous); Coefficients_CIM <- coef(iv_CIM)[1:length_regressor]; if(robust){ Coefficients_CIM_GMM <- (CIM.HansenJTest(y, regressor_CIM_temp, Exogenous)[[2]])[1:length_regressor]; } res_CIM <- resid(iv_CIM); } # Standard Error and Confidence Interval if (robust) { sd_CIM <- sqrt(diag( solve(crossprod(regressor_CIM)) %*% crossprod(res_CIM * regressor_CIM) %*% solve(crossprod(regressor_CIM)) ))[1:length_regressor]; ci_CIM <- c( Coefficients_CIM[1] - qnorm(1-alpha/2) * sd_CIM[1], Coefficients_CIM[1] + qnorm(1-alpha/2) * sd_CIM[1] ); if (Nr_valid == 0){ Coefficients_CIM_GMM <- NA; sd_CIM_GMM <- NA; ci_CIM_GMM <- NA; }else{ sd_CIM_GMM <- (CIM.HansenJTest(y, regressor_CIM_temp, Exogenous)[[3]])[1:length_regressor]; ci_CIM_GMM <- c( Coefficients_CIM_GMM[1] - qnorm(1-alpha/2) * sd_CIM_GMM[1], Coefficients_CIM_GMM[1] + qnorm(1-alpha/2) * sd_CIM_GMM[1] ); } } else { sd_CIM <- sqrt(diag( mean(res_CIM^2) * solve(crossprod(regressor_CIM)) ))[1:length_regressor]; ci_CIM <- c( Coefficients_CIM[1] - qnorm(1-alpha/2) * sd_CIM[1], Coefficients_CIM[1] + qnorm(1-alpha/2) * sd_CIM[1] ); } # Results if(robust){ object <- list( robust = robust, if(is.null(selVec)){ Valid_Instruments = paste0("None instruments selected as valid."); }else{ Valid_Instruments = InstrumentNames[c(1:length(z))[selVec]]; }, Valid_Instruments = Valid_Instruments, Nr_valid = Nr_valid, if(firststage){ if(Nr_relevant == 0){ Relevant_Instruments = paste0("None instruments selected as relevant.") }else{ Relevant_Instruments = InstrumentNames[c(1:length(z))[relevant_index]]; }; }else{ Relevant_Instruments = paste0("No first stage selection."); Nr_relevant = paste0("No first stage selection."); }, Relevant_Instruments = Relevant_Instruments, Nr_relevant = Nr_relevant, Covariate_Names = CovariatesNames, Coefficients_CIM = Coefficients_CIM, sd_CIM = sd_CIM, Coefficients_CIM_GMM = Coefficients_CIM_GMM, sd_CIM_GMM = sd_CIM_GMM, ci_CIM = ci_CIM, ci_CIM_GMM = ci_CIM_GMM, HansenJ_CIM = Psar ) }else{ object <- list( robust = robust, if(is.null(selVec)){ Valid_Instruments = paste0("None instruments selected as valid."); }else{ Valid_Instruments = InstrumentNames[c(1:ncol(z))[selVec]]; }, Valid_Instruments = Valid_Instruments, Nr_valid = Nr_valid, if(firststage){ if(Nr_relevant == 0){ Relevant_Instruments = paste0("None instruments selected as relevant.") }else{ Relevant_Instruments = InstrumentNames[c(1:length(z))[relevant_index]]; }; }else{ Relevant_Instruments = paste0("No first stage selection."); Nr_relevant = paste0("No first stage selection."); }, Relevant_Instruments = Relevant_Instruments, Nr_relevant = Nr_relevant, Covariate_Names = CovariatesNames, Coefficients_CIM = Coefficients_CIM, sd_CIM = sd_CIM, ci_CIM = ci_CIM, Sargan_CIM = Psar ) } class(object) <- "CIIV" object } #'Internal CIIV Functions #' #'These are not to be called by the user. print.CIIV <- function(object,robust = object$robust,...){ cat("\nValid Instruments:\n", object$Valid_Instruments, "\n","\nNumber of Valid Instruments:\n", object$Nr_valid,"\n"); cat("\nRelevant Instruments:\n", object$Relevant_Instruments, "\n","\nNumber of Relevant Instruments:\n", object$Nr_relevant,"\n"); cat("\nCoefficients:\n") names(object$Coefficients_CIM) = names(object$sd_CIM) = names(object$ci_CIM) = NULL; if(robust){ names(object$Coefficients_CIM_GMM) = names(object$sd_CIM_GMM) = names(object$ci_CIM_GMM) = NULL; coef_cim <- cbind(object$Coefficients_CIM, object$sd_CIM, object$Coefficients_CIM_GMM,object$sd_CIM_GMM); colnames(coef_cim) = c("2SLS Estimate", "2SLS Std. Error", "GMM Estimate", "GMM Std. Error"); rownames(coef_cim) = object$Covariate_Names; print(coef_cim, quote = FALSE); cat("\nConfidence Interval 2SLS: [", object$ci_CIM[1], ",", object$ci_CIM[2], "]", "\n", sep = ''); cat("\nConfidence Interval GMM: [", object$ci_CIM_GMM[1], ",", object$ci_CIM_GMM[2], "]", "\n", sep = ''); cat("\np-value of Hansen-J: ", object$HansenJ_CIM, sep = ''); }else{ coef_cim <- cbind(object$Coefficients_CIM, object$sd_CIM); colnames(coef_cim) = c("2SLS Estimate", "Std. Error"); rownames(coef_cim) = object$Covariate_Names; print(coef_cim, quote = FALSE); cat("\nConfidence Interval: [", object$ci_CIM[1], ",", object$ci_CIM[2], "]", "\n", sep = ''); cat("\np-value of Sargan: ", object$Sargan_CIM, sep = ''); } }