#######################################################
    ###       Statistical Computing with R  2e          ###
    ###       Maria L. Rizzo                            ###
    ###       Chapman & Hall/CRC The R Series           ###
    ###       ISBN 9781466553323 - CAT# K15269          ###
    ###       March 6, 2019                             ###
    ###                                                 ###
    ###       R code for Chapter 8                      ###
    ###       Bootstrap and Jackknife                   ###
    #######################################################


# packages to install: bootstrap, DAAG

    
### Example 8.2 (Bootstrap estimate of standard error)

    library(bootstrap)    #for the law data
    print(cor(law$LSAT, law$GPA))
    print(cor(law82$LSAT, law82$GPA))

    #set up the bootstrap
    B <- 200            #number of replicates
    n <- nrow(law)      #sample size
    R <- numeric(B)     #storage for replicates

    #bootstrap estimate of standard error of R
    for (b in 1:B) {
        #randomly select the indices
        i <- sample(1:n, size = n, replace = TRUE)
        LSAT <- law$LSAT[i]       #i is a vector of indices
        GPA <- law$GPA[i]
        R[b] <- cor(LSAT, GPA)
    }
    #output
    print(se.R <- sd(R))
    hist(R, prob = TRUE)


### Example 8.3 (Bootstrap estimate of standard error: boot function)

    r <- function(x, i) {
        #want correlation of columns 1 and 2
        cor(x[i,1], x[i,2])
    }

    library(boot)       #for boot function
    obj <- boot(data = law, statistic = r, R = 2000)
    obj
    y <- obj$t
    sd(y)


### Example 8.4 (Bootstrap estimate of bias)

    #sample estimate for n=15
    theta.hat <- cor(law$LSAT, law$GPA)

    #bootstrap estimate of bias
    B <- 2000   #larger for estimating bias
    n <- nrow(law)
    theta.b <- numeric(B)

    for (b in 1:B) {
        i <- sample(1:n, size = n, replace = TRUE)
        LSAT <- law$LSAT[i]
        GPA <- law$GPA[i]
        theta.b[b] <- cor(LSAT, GPA)
    }
    bias <- mean(theta.b - theta.hat)
    bias

    
### Example 8.5 (Bootstrap estimate of bias of a ratio estimate)


    data(patch, package = "bootstrap")
    patch

    n <- nrow(patch)  #in bootstrap package
    B <- 2000
    theta.b <- numeric(B)
    theta.hat <- mean(patch$y) / mean(patch$z)

    #bootstrap
    for (b in 1:B) {
        i <- sample(1:n, size = n, replace = TRUE)
        y <- patch$y[i]
        z <- patch$z[i]
        theta.b[b] <- mean(y) / mean(z)
        }
    bias <- mean(theta.b) - theta.hat
    se <- sd(theta.b)
    print(list(est=theta.hat, bias = bias,
               se = se, cv = bias/se))


### Example 8.6 (Jackknife estimate of bias)

    data(patch, package = "bootstrap")
    n <- nrow(patch)
    y <- patch$y
    z <- patch$z
    theta.hat <- mean(y) / mean(z)
    print (theta.hat)

    #compute the jackknife replicates, leave-one-out estimates
    theta.jack <- numeric(n)
    for (i in 1:n)
        theta.jack[i] <- mean(y[-i]) / mean(z[-i])
    bias <- (n - 1) * (mean(theta.jack) - theta.hat)

    print(bias)  #jackknife estimate of bias
    
    
### Example 8.7 (Jackknife estimate of standard error)

    se <- sqrt((n-1) *
        mean((theta.jack - mean(theta.jack))^2))
    print(se)
    

### Example 8.8 (Failure of jackknife)

    set.seed(137) #for the specific example given
    #change the seed to see other examples

    n <- 10
    x <- sample(1:100, size = n)

    #jackknife estimate of se
    M <- numeric(n)
    for (i in 1:n) {        #leave one out
        y <- x[-i]
        M[i] <- median(y)
    }
    Mbar <- mean(M)
    print(sqrt((n-1)/n * sum((M - Mbar)^2)))

    #bootstrap estimate of se
    Mb <- replicate(1000, expr = {
            y <- sample(x, size = n, replace = TRUE)
            median(y) })
    print(sd(Mb)) 
    print(x)
    print(M)
    print(Mb)


### Example 8.9 (Bootstrap confidence intervals for patch ratio statistic)

    library(boot)       #for boot and boot.ci
    data(patch, package = "bootstrap")

    theta.boot <- function(dat, ind) {
        #function to compute the statistic
        y <- dat[ind, 1]
        z <- dat[ind, 2]
        mean(y) / mean(z)
    }

    y <- patch$y
    z <- patch$z
    dat <- cbind(y, z)
    boot.obj <- boot(dat, statistic = theta.boot, R = 2000)

    print(boot.obj)
    print(boot.ci(boot.obj,
                  type = c("basic", "norm", "perc")))


    #calculations for bootstrap confidence intervals
    alpha <- c(.025, .975)

    #normal
    print(boot.obj$t0 + qnorm(alpha) * sd(boot.obj$t))

    #basic
    print(2*boot.obj$t0 - 
        quantile(boot.obj$t, rev(alpha), type=1))

    #percentile
    print(quantile(boot.obj$t, alpha, type=6))


### Example 8.10 (Bootstrap confidence intervals for the correlation statistic)

    library(boot)
    data(law, package = "bootstrap")
    boot.obj <- boot(law, R = 2000,
             statistic = function(x, i){cor(x[i,1], x[i,2])})
    print(boot.ci(boot.obj, type=c("basic","norm","perc")))


### Example 8.11 (Bootstrap t confidence interval)

    boot.t.ci <-
    function(x, B = 500, R = 100, level = .95, statistic){
        #compute the bootstrap t CI
        x <- as.matrix(x);  n <- nrow(x)
        stat <- numeric(B); se <- numeric(B)

        boot.se <- function(x, R, f) {
            #local function to compute the bootstrap
            #estimate of standard error for statistic f(x)
            x <- as.matrix(x); m <- nrow(x)
            th <- replicate(R, expr = {
                i <- sample(1:m, size = m, replace = TRUE)
                f(x[i, ])
                })
            return(sd(th))
        }

        for (b in 1:B) {
            j <- sample(1:n, size = n, replace = TRUE)
            y <- x[j, ]
            stat[b] <- statistic(y)
            se[b] <- boot.se(y, R = R, f = statistic)
        }
        stat0 <- statistic(x)
        t.stats <- (stat - stat0) / se
        se0 <- sd(stat)
        alpha <- 1 - level
        Qt <- quantile(t.stats, c(alpha/2, 1-alpha/2), type = 1)
        names(Qt) <- rev(names(Qt))
        CI <- rev(stat0 - Qt * se0)
    }


### Example 8.12 (Bootstrap t confidence interval for patch ratio statistic)

    #boot package and patch data were loaded in Example 8.10
    #library(boot)       #for boot and boot.ci
    #data(patch, package = "bootstrap")
    
    dat <- cbind(patch$y, patch$z)
    stat <- function(dat) {
        mean(dat[, 1]) / mean(dat[, 2]) }
    ci <- boot.t.ci(dat, statistic = stat, B=2000, R=200)
    print(ci)


### Example 8.13 (BCa bootstrap confidence interval)

    boot.BCa <-
    function(x, th0, th, stat, conf = .95) {
        # bootstrap with BCa bootstrap confidence interval
        # th0 is the observed statistic
        # th is the vector of bootstrap replicates
        # stat is the function to compute the statistic

        x <- as.matrix(x)
        n <- nrow(x) #observations in rows
        N <- 1:n
        alpha <- (1 + c(-conf, conf))/2
        zalpha <- qnorm(alpha)

        # the bias correction factor
        z0 <- qnorm(sum(th < th0) / length(th))

        # the acceleration factor (jackknife est.)
        th.jack <- numeric(n)
        for (i in 1:n) {
            J <- N[1:(n-1)]
            th.jack[i] <- stat(x[-i, ], J)
        }
        L <- mean(th.jack) - th.jack
        a <- sum(L^3)/(6 * sum(L^2)^1.5)

        # BCa conf. limits
        adj.alpha <- pnorm(z0 + (z0+zalpha)/(1-a*(z0+zalpha)))
        limits <- quantile(th, adj.alpha, type=6)
        return(list("est"=th0, "BCa"=limits))
    }
    

### Example 8.14 (BCa bootstrap confidence interval)

    #boot package and patch data were loaded in Example 8.10
    #library(boot)       #for boot and boot.ci
    #data(patch, package = "bootstrap")

    n <- nrow(patch)
    B <- 2000
    y <- patch$y
    z <- patch$z
    x <- cbind(y, z)
    theta.b <- numeric(B)
    theta.hat <- mean(y) / mean(z)

    #bootstrap
    for (b in 1:B) {
        i <- sample(1:n, size = n, replace = TRUE)
        y <- patch$y[i]
        z <- patch$z[i]
        theta.b[b] <- mean(y) / mean(z)
        }
    #compute the BCa interval
    stat <- function(dat, index) {
        mean(dat[index, 1]) / mean(dat[index, 2])  }

    boot.BCa(x, th0 = theta.hat, th = theta.b, stat = stat)
    
    
### Example 8.15 (BCa bootstrap confidence interval using boot.ci)

    #using x from Example 8.15
    boot.obj <- boot(x, statistic = stat, R=2000)
    boot.ci(boot.obj, type=c("perc", "bca"))

### Example 8.16 (Model selection)

    #to prompt for next graph, uncomment line below
    #par(ask = TRUE)   
    
    library(DAAG); attach(ironslag)
    a <- seq(10, 40, .1)     #sequence for plotting fits

    L1 <- lm(magnetic ~ chemical)
    plot(chemical, magnetic, main="Linear", pch=16)
    yhat1 <- L1$coef[1] + L1$coef[2] * a
    lines(a, yhat1, lwd=2)

    L2 <- lm(magnetic ~ chemical + I(chemical^2))
    plot(chemical, magnetic, main="Quadratic", pch=16)
    yhat2 <- L2$coef[1] + L2$coef[2] * a + L2$coef[3] * a^2
    lines(a, yhat2, lwd=2)

    L3 <- lm(log(magnetic) ~ chemical)
    plot(chemical, magnetic, main="Exponential", pch=16)
    logyhat3 <- L3$coef[1] + L3$coef[2] * a
    yhat3 <- exp(logyhat3)
    lines(a, yhat3, lwd=2)

    L4 <- lm(log(magnetic) ~ log(chemical))
    plot(log(chemical), log(magnetic), main="Log-Log", pch=16)
    logyhat4 <- L4$coef[1] + L4$coef[2] * log(a)
    lines(log(a), logyhat4, lwd=2)

### Example 8.17 (Model selection: Cross validation)

    # Example 8.16, cont.
    n <- length(magnetic)   #in DAAG ironslag
    e1 <- e2 <- e3 <- e4 <- numeric(n)

    # for n-fold cross validation
    # fit models on leave-one-out samples
    for (k in 1:n) {
        y <- magnetic[-k]
        x <- chemical[-k]

        J1 <- lm(y ~ x)
        yhat1 <- J1$coef[1] + J1$coef[2] * chemical[k]
        e1[k] <- magnetic[k] - yhat1

        J2 <- lm(y ~ x + I(x^2))
        yhat2 <- J2$coef[1] + J2$coef[2] * chemical[k] +
                J2$coef[3] * chemical[k]^2
        e2[k] <- magnetic[k] - yhat2

        J3 <- lm(log(y) ~ x)
        logyhat3 <- J3$coef[1] + J3$coef[2] * chemical[k]
        yhat3 <- exp(logyhat3)
        e3[k] <- magnetic[k] - yhat3

        J4 <- lm(log(y) ~ log(x))
        logyhat4 <- J4$coef[1] + J4$coef[2] * log(chemical[k])
        yhat4 <- exp(logyhat4)
        e4[k] <- magnetic[k] - yhat4
    }


    c(mean(e1^2), mean(e2^2), mean(e3^2), mean(e4^2))

    #selected model, fitted in Example 8.16
    L2

    par(mfrow = c(2, 2))    #layout for graphs
    plot(L2$fit, L2$res)    #residuals vs fitted values
    abline(0, 0)            #reference line
    qqnorm(L2$res)          #normal probability plot
    qqline(L2$res)          #reference line
    par(mfrow = c(1, 1))    #restore display