####################################################### ### Statistical Computing with R 2e ### ### Maria L. Rizzo ### ### Chapman & Hall/CRC The R Series ### ### ISBN 9781466553323 - CAT# K15269 ### ### January 2019 ### ### ### ### R code for Chapter 9 ### ### Resampling Applications ### ####################################################### # packages to install: bootstrap, DAAG, ggplot2 ### Example 9.1 (Jackknife-after-bootstrap) library(boot) library(bootstrap) set.seed(1111) theta.boot <- function(patch, i) { # function to compute the patch ratio statistic y <- patch[i, "y"] z <- patch[i, "z"] mean(y) / mean(z) } boot.out <- boot(bootstrap::patch, statistic = theta.boot, R=2000) A <- boot.array(boot.out) head(A, 3) mean(A[, 1] == 0) # jackknife-after-bootstrap to est. se(se) A <- boot.array(boot.out) theta.b <- boot.out$t n <- NROW(patch) jack.se <- numeric(n) for (i in 1:n) { #in i-th replicate omit all samples with x[i] keep <- which(A[, i] == 0) jack.se[i] <- sd(theta.b[keep]) } print(boot.out) #for se_boot se.bar <- mean(jack.se) se.se <- sqrt((n-1) * mean((jack.se - se.bar)^2)) print(paste("Jackknife-after-bootstrap est. se(se)=", se.se)) ### Example 9.2 (Jackknife-after-bootstrap) # initialize data(patch, package = "bootstrap") y <- patch$y z <- patch$z dat <- cbind(y, z) n <- NROW(dat) B <- 2000 # jackknife-after-bootstrap step 1: run the bootstrap theta_boot <- function(dat, ind) { # function to compute the statistic y <- dat[ind, 1] z <- dat[ind, 2] mean(y) / mean(z) } boot.obj <- boot(dat, statistic = theta_boot, R=2000) theta.hat <- boot.obj$t0 theta.b <- boot.obj$t se.boot <- sd(theta.b) # jackknife-after-bootstrap to est. se(se) sample.freq <- boot.array(boot.obj) se.se.reps <- numeric(n) N <- 1:n for (i in N) { # jackknife-after-bootstrap # omit all bootstrap samples that contain obs i keep <- which(sample.freq[ ,i] == 0) se.se.reps[i] <- sd(theta.b[keep]) } print(boot.obj) se.bar <- mean(se.se.reps) se.se <- sqrt((n-1) * mean((se.se.reps - se.bar)^2)) se.se ### Example 9.3 (ironslag linear model) library(ggplot2) library(DAAG) L1 <- lm(magnetic ~ chemical, data=ironslag) cf3 <- round(L1$coeff, 3) cap <- paste("Fit: magnetic =", cf3[1], "+", cf3[2], "chemical") ggplot(data=ironslag, aes(chemical, magnetic)) + geom_point() + geom_smooth(method="lm") + ggtitle(cap) plot(L1, which=1:2) #residual plots ### Example 9.4 (mammals data) library(MASS) cor(log(mammals$body), log(mammals$brain)) summary(mammals) y <- log(mammals$brain) x <- log(mammals$body) L <- lm(y ~ x) L cap <- paste("Fit: log(brain) =", round(L$coeff[1],3), "+", round(L$coeff[2],3), "log(body)") ggplot(data=mammals, aes(x, y)) + geom_point() + geom_smooth(method="lm") + labs(x = "log(body)", y = "log(brain)", title = cap) summary(L)$r.squared ### Example 9.5 (ironslag data, resampling cases) x <- ironslag$chemical y <- ironslag$magnetic m <- 2000 n <- NROW(x) L1 <- lm(y ~ x) #estimate the model b0 <- L1$coeff[1]; b1 <- L1$coeff[2] ## run bootstrap of cases out <- replicate(m, expr={ i <- sample(1:n, replace=TRUE, size=n) xstar <- x[i] ystar <- y[i] Lb <- lm(ystar ~ xstar) s <- summary(Lb)$sigma c(Lb$coeff[1], slope=Lb$coeff[2], s=s) }) bootCases <- t(out) meanCases <- colMeans(bootCases) sdCases <- apply(bootCases, 2, "sd") meanCases sdCases biasInt <- mean(bootCases[,1] - b0) #bias for intercept biasSlope <- mean(bootCases[,2] - b1) #bias for slope rbind(estimate=c(b0, b1), bias=c(biasInt, biasSlope), se=sdCases[1:2], cv=c(biasInt, cv=biasSlope)/sdCases[1:2]) ### Example 9.6 (Resampling cases using the boot function) # set.seed(1104) library(boot) m <- 2000 stats <- function(dat, i) { x <- dat$chemical[i] y <- dat$magnetic[i] Lb <- lm(y ~ x) s <- summary(Lb)$sigma c(Lb$coeff[1], slope=Lb$coeff[2], s=s) } boot.out <- boot(ironslag, statistic=stats, R=2000) boot.out boot.out$t0 sd(boot.out$t[,2]) boottbl <- broom::tidy(boot.out) boottbl$std.error[2] MASS::truehist(boot.out$t[ ,2], main="", xlab="slopes") abline(v = boot.out$t0[2], lwd=2) boot.ci(boot.out, index=2, type=c("norm","perc","basic","bca")) ### Example 9.7 (Resampling errors: mammals data) m.resid <- rstandard(L, sd = 1) r <- m.resid - mean(m.resid) m <- 1000; n <- NROW(x) estsErr <- replicate(m, expr={ estar <- sample(r, replace=TRUE, size=n) ystar <- L$fitted.values + estar Lb <- lm(ystar ~ x) s <- summary(Lb)$sigma c(b0=Lb$coeff[1], b1=Lb$coeff[2], s=s) }) ests <- t(estsErr) summary(ests) ### Example 9.8 (Resampling errors, continued) sd(ests[,2]) s <- summary(L)$sigma SSx <- (n - 1) * var(x) se.beta1 <- sqrt(s^2 / SSx) se.beta1 s * sqrt(1/n + mean(x)^2 / SSx) sd(ests[,1]) betas <- summary(L)$coeff betas betas[, "Std. Error"] broom::tidy(summary(L)) broom::tidy(summary(L))$std.error ### Example 9.9 (Model based resampling with the boot function) regstats <- function(dat, i) { #dat is a data frame (r, x, yhat) #r are the modified centered residuals, yhat are the fits ystar <- dat$yhat + dat$r[i] xstar <- dat$x Lnew <- lm(ystar ~ xstar) Lnew$coefficients } y <- log(mammals$brain) x <- log(mammals$body) L <- lm(y ~ x) r <- rstandard(L, sd=1) r <- r - mean(r) df <- data.frame(r=r, x=x, yhat=L$fitted) head(df) boot.obj <- boot(data=df, statistic=regstats, R=2000) broom::tidy(boot.obj) ### Example 9.10 (Empirical influence values for the patch ratio statistic) library(boot) library(bootstrap) theta_boot <- function(dat, ind) { # function to compute the patch ratio statistic mean(dat[ind, ]$y) / mean(dat[ind, ]$z) } boot.out <- boot(patch, theta_boot, R = 2000) infl <- empinf(boot.out, type = "jack") theta.hat <- boot.out$t0 jack <- theta.hat - infl / (nrow(patch) - 1) rbind(infl, jack) ### Example 9.11 (Jackknife-after-bootstrap plot) jack.after.boot(boot.out, useJ=TRUE, stinf=FALSE) n <- NROW(patch) J <- numeric(n) b.freq <- boot.array(boot.out) theta.b <- boot.out$t for (i in 1:n) { keep <- which(b.freq[ ,i] == 0) J[i] <- mean(theta.b[keep]) } # the jackknife influence values (n - 1) * (mean(J) - J) jack.after.boot(boot.out, useJ=TRUE, stinf=TRUE)