## ---------------------------------------------------------------------------- ## Name: 01-preliminaries.R ## Description: Script for Chapter 1 of "A First Course on Statistical Inference" ## Link: https://bookdown.org/egarpor/inference/ ## License: https://creativecommons.org/licenses/by-nc-nd/4.0/ ## Author: Isabel Molina Peralta and Eduardo García-Portugués ## Version: 2.4.1 ## ---------------------------------------------------------------------------- ## ----toss-coin-freq-1, echo = FALSE------------------------------------------------------------------------ set.seed(123456) n <- c(10, 20, 30, 100, 1000) toss <- sample(c("heads", "tails"), size = max(n), replace = TRUE) p <- cumsum(toss == "heads") / (1:max(n)) knitr::kable(x = cbind(n, p[n], 1 - p[n]), col.names = c("$n$", "Heads", "Tails"), booktabs = TRUE, caption = paste("Relative frequencies of \"heads\" and", "\"tails\" for $n$ random experiments."), digits = 3, align = c("lcc"), escape = FALSE) ## ----car-accidents-1, echo = FALSE, cache = TRUE----------------------------------------------------------- set.seed(123456) lambda <- 4 n <- c(10, 20, 30, 100, 1000, 10000) accidents <- pmin(rpois(max(n), lambda), 6) rel_freq_accidents <- sapply(0:6, function(k) cumsum(k == accidents) / (1:max(n))) knitr::kable(x = cbind(n, rel_freq_accidents[n, ]), col.names = c("$n$", paste0("$", 0:5, "$"), "$\\geq 6$"), caption = paste("Relative frequencies of car accidents", "in Spain for $n$ hours."), booktabs = TRUE, digits = 3, align = c("lccccccccc"), escape = FALSE) ## ----pedestrian-1, echo = FALSE, cache = TRUE-------------------------------------------------------------- set.seed(123456) n <- c(10, 20, 30, 100, 1000, 5000) weights <- rnorm(max(n), mean = 49, sd = 5) intervals <- cut(weights, breaks = c(0, 35, 45, 55, 65, Inf)) rel_freq_intervals <- sapply(levels(intervals), function(w) cumsum(w == intervals) / (1:max(n))) knitr::kable(x = cbind(n, rel_freq_intervals[n, ]), col.names = c("$n$", "$[0, 35)$", "$[35, 45)$", "$[45, 55)$", "$[55, 65)$", "$[65, \\infty)$"), caption = paste("Relative frequencies of weight intervals", "for $n$ measured pedestrians."), booktabs = TRUE, digits = 3, align = c("lccccc"), escape = FALSE) ## ----normal-prob------------------------------------------------------------------------------------------- # Computation of P(Z > k) k <- 0.96 1 - pnorm(k) # 1 - P(Z <= k) pnorm(k, lower.tail = FALSE) # Alternatively ## ----normal-prob-2----------------------------------------------------------------------------------------- alpha <- 0.05 qnorm(1 - alpha / 2) # LOWER (1 - beta)-quantile = UPPER beta-quantile qnorm(alpha / 2, lower.tail = FALSE) # Alternatively, lower.tail = FALSE # computes the upper quantile and lower.tail = TRUE (the default) computes the # lower quantile ## ----chi-prob-1-------------------------------------------------------------------------------------------- alpha <- 0.05 qchisq(1 - alpha, df = 6) # df stands for the degrees of freedom qchisq(alpha, df = 6, lower.tail = FALSE) # Alternatively ## ----chi-prob-2-------------------------------------------------------------------------------------------- alpha <- 0.10 qchisq(1 - alpha / 2, df = 9, lower.tail = FALSE) # a1 qchisq(alpha / 2, df = 9, lower.tail = FALSE) # a2 ## ----t-prob------------------------------------------------------------------------------------------------ pt(-2, df = 5) ## ----F-prob------------------------------------------------------------------------------------------------ qf(0.05, df1 = 5, df2 = 9, lower.tail = FALSE) ## ----prob-clt---------------------------------------------------------------------------------------------- pnorm(-2.4658) pnorm(5.5, mean = 6.1, sd = 1.5 / sqrt(38)) # Alternatively ## ----prob-clt-2-------------------------------------------------------------------------------------------- pnorm(-3) ## ----prob-clt-7-------------------------------------------------------------------------------------------- pnorm(8.5, mean = 10, sd = sqrt(6)) ## ----prob-clt-8-------------------------------------------------------------------------------------------- pnorm(8.5, mean = 10, sd = sqrt(6)) - pnorm(7.5, mean = 10, sd = sqrt(6)) ## ----prob-clt-9-------------------------------------------------------------------------------------------- pbinom(8, size = 25, prob = 0.4) dbinom(8, size = 25, prob = 0.4) ## ----prob-clt-4-------------------------------------------------------------------------------------------- 1 - pbinom(54, size = 100, prob = 0.5) pbinom(54, size = 100, prob = 0.5, lower.tail = FALSE) # Alternatively ## ----prob-clt-5-------------------------------------------------------------------------------------------- 1 - pnorm(0.55, mean = 0.5, sd = sqrt(0.25 / 100)) ## ----prob-clt-6-------------------------------------------------------------------------------------------- 1 - pnorm(54.5, mean = 50, sd = sqrt(25)) ## ----ci-mu-1----------------------------------------------------------------------------------------------- # Sample X <- c(916, 892, 895, 904, 913, 916, 895, 885) # n, mean, sigma, alpha, z_{\alpha/2} n <- length(X) X_bar <- mean(X) sigma <- 12 alpha <- 0.05 z <- qnorm(alpha / 2, lower.tail = FALSE) # CI X_bar + c(-1, 1) * z * sigma / sqrt(n) ## ----ci-mu-2----------------------------------------------------------------------------------------------- # Sample X <- c(916, 892, 895, 904, 913, 916, 895, 885) # n, mean, S', alpha, t_{\alpha/2;n-1} n <- length(X) X_bar <- mean(X) S_prime <- sd(X) alpha <- 0.05 t <- qt(alpha / 2, df = n - 1, lower.tail = FALSE) # CI X_bar + c(-1, 1) * t * S_prime / sqrt(n) ## ----ci-sigma2--------------------------------------------------------------------------------------------- # Sample X <- c(4.1, 5.2, 10.2) # n, S'^2, alpha, c1, c2 n <- length(X) S2_prime <- var(X) alpha <- 0.10 c1 <- qchisq(1 - alpha / 2, df = n - 1, lower.tail = FALSE) c2 <- qchisq(alpha / 2, df = n - 1, lower.tail = FALSE) # CI (n - 1) * S2_prime / c(c2, c1) ## ----ci-diff-mu-1------------------------------------------------------------------------------------------ # Samples X_1 <- c(32, 37, 35, 28, 41, 44, 35, 31, 34) X_2 <- c(35, 31, 29, 25, 34, 40, 27, 32, 31) # n1, n2, Xbar1, Xbar2, sigma2_1, sigma2_2, alpha, z_{alpha/2} n_1 <- length(X_1) n_2 <- length(X_2) X_bar_1 <- mean(X_1) X_bar_2 <- mean(X_2) sigma2_1 <- sigma2_2 <- 22 alpha <- 0.05 z <- qnorm(alpha / 2, lower.tail = FALSE) # CI (X_bar_1 - X_bar_2) + c(-1, 1) * z * sqrt(sigma2_1 / n_1 + sigma2_2 / n_2) ## ----ci-diff-mu-2------------------------------------------------------------------------------------------ # Samples X_1 <- c(32, 37, 35, 28, 41, 44, 35, 31, 34) X_2 <- c(35, 31, 29, 25, 34, 40, 27, 32, 31) # n1, n2, Xbar1, Xbar2, S^2, alpha, t_{alpha/2;n1-n2-2} n_1 <- length(X_1) n_2 <- length(X_2) X_bar_1 <- mean(X_1) X_bar_2 <- mean(X_2) S2_prime_1 <- var(X_1) S2_prime_2 <- var(X_2) S <- sqrt(((n_1 - 1) * S2_prime_1 + (n_2 - 1) * S2_prime_2) / (n_1 + n_2 - 2)) alpha <- 0.05 t <- qt(alpha / 2, df = n_1 + n_2 - 2, lower.tail = FALSE) # CI (X_bar_1 - X_bar_2) + c(-1, 1) * t * S * sqrt(1 / n_1 + 1 / n_2) ## ----ratio-vars-1------------------------------------------------------------------------------------------ # n1, n2, S1'^2, S2'^2, alpha, c1, c2 n_1 <- 11 n_2 <- 14 S2_prime_1 <- 52 S2_prime_2 <- 71 alpha <- 0.05 c1 <- qf(1 - alpha / 2, df1 = n_1 - 1, df2 = n_2 - 1, lower.tail = FALSE) c2 <- qf(alpha / 2, df1 = n_1 - 1, df2 = n_2 - 1, lower.tail = FALSE) # CI (S2_prime_1 / S2_prime_2) / c(c2, c1) ## ----ratio-vars-2------------------------------------------------------------------------------------------ # Xbar1, Xbar2, S^2, t_{alpha/2;n1-n2-2} X_bar_1 <- 64 X_bar_2 <- 69 S2 <- ((n_1 - 1) * S2_prime_1 + (n_2 - 1) * S2_prime_2) / (n_1 + n_2 - 2) t <- qt(1 - alpha / 2, df = n_1 + n_2 - 2, lower.tail = TRUE) # CI (X_bar_1 - X_bar_2) + c(-1, 1) * t * sqrt(S2 * (1 / n_1 + 1 / n_2)) ## ----aci-1------------------------------------------------------------------------------------------------- # Sample from Exp(2) set.seed(123456) n <- 100 x <- rexp(n = n, rate = 2) # MLE lambda_mle <- 1 / mean(x) # Asymptotic confidence interval alpha <- 0.05 z <- qnorm(alpha / 2, lower.tail = FALSE) lambda_mle + c(-1, 1) * z * lambda_mle / sqrt(n) ## ----ci-1-------------------------------------------------------------------------------------------------- # Computation of percentile bootstrap confidence intervals. It requires a # sample x and an estimator theta_hat() (must be a function!). In ... you # can pass parameters to hist(), such as xlim. boot_ci <- function(x, theta_hat, B = 5e3, alpha = 0.05, plot_boot = TRUE, ...) { # Check that theta_hat is a proper function stopifnot(is.function(theta_hat)) # Creates convenience statistic for boot::boot() stat <- function(x, indexes) theta_hat(x[indexes]) # Perform bootstrap resampling (step 2) with the aid of boot::boot() boot_obj <- boot::boot(data = x, statistic = stat, sim = "ordinary", R = B) # Extract bootstrapped statistics (theta_hat_star's) from the boot object theta_hat_star <- boot_obj$t # Confidence intervals ci <- quantile(theta_hat_star, probs = c(alpha / 2, 1 - alpha / 2)) # Plot the distribution of bootstrap estimators and the confidence intervals? # Draw also theta_hat, the statistic from the original sample, in boot_obj$t0 if (plot_boot) { hist(theta_hat_star, probability = TRUE, main = "", xlab = latex2exp::TeX("$\\hat{\\theta}^*$"), ...) rug(theta_hat_star) abline(v = ci, col = 2, lwd = 2) abline(v = boot_obj$t0, col = 3, lwd = 2) } # Return confidence intervals return(ci) } ## ----ci-2-------------------------------------------------------------------------------------------------- # Sample from Exp(2) set.seed(123456) n <- 200 x <- rexp(n = n, rate = 2) ## ----ci-3-------------------------------------------------------------------------------------------------- # Bootstrap confidence interval lambda_hat_exp <- function(x) 1 / mean(x) boot_ci(x = x, B = 5e3, theta_hat = lambda_hat_exp) ## ----ci-4-------------------------------------------------------------------------------------------------- # Asymptotic confidence interval alpha <- 0.05 z <- qnorm(alpha / 2, lower.tail = FALSE) lambda_mle <- 1 / mean(x) lambda_mle + c(-1, 1) * z * lambda_mle / sqrt(n) ## ----ci-5-------------------------------------------------------------------------------------------------- # Sample X <- c(916, 892, 895, 904, 913, 916, 895, 885) # Bootstrap confidence interval boot_ci(x = X, B = 5e3, theta_hat = mean) ## ----qt-2-------------------------------------------------------------------------------------------------- qt(0.025, df = 7, lower.tail = FALSE) ## ----t-test-1-2-------------------------------------------------------------------------------------------- # Apply t.test() with H1: mu < 915 t.test(x = c(910, 921, 918, 896, 909, 905, 895, 903), mu = 915, alternative = "less") ## ----t-test-2-2-------------------------------------------------------------------------------------------- # Apply t.test() with equal variances and H1: mu1 != mu2 std <- c(32, 37, 35, 28, 41, 44, 35, 31, 34) new <- c(35, 31, 29, 25, 34, 40, 27, 32, 31) t.test(x = std, y = new, alternative = "two.sided", var.equal = TRUE, paired = FALSE) ## ----var-test-0, echo = FALSE------------------------------------------------------------------------------ men <- c(16.0, 13.4, 17.7, 10.2, 13.1, 15.4, 15.9, 11.9, 13.9, 15.5, 15.9, 12.5, 16.5, 16.5) wom <- c(5.8, 6.4, 13.1, 7.2, 12.8, 9.8, 10.5, 18.9, 13.7, 13.7, 9.8, 11.5) ## ----var-test-2-------------------------------------------------------------------------------------------- qf(0.025, df1 = 13, df2 = 11, lower.tail = FALSE) qf(0.975, df1 = 13, df2 = 11, lower.tail = FALSE) ## ----var-test-4-------------------------------------------------------------------------------------------- # Apply var.test() with H1: sigma_1^2 != sigma_2^2 men <- c(16.0, 13.4, 17.7, 10.2, 13.1, 15.4, 15.9, 11.9, 13.9, 15.5, 15.9, 12.5, 16.5, 16.5) wom <- c(5.8, 6.4, 13.1, 7.2, 12.8, 9.8, 10.5, 18.9, 13.7, 13.7, 9.8, 11.5) var.test(x = men, y = wom, alternative = "two.sided") ## ----pbiom------------------------------------------------------------------------------------------------- pbinom(3, size = 15, prob = 0.5) ## ----pt---------------------------------------------------------------------------------------------------- pt(-1.992, df = 119) ## ----lrt-1------------------------------------------------------------------------------------------------- # Data xbar_1 <- 20 xbar_2 <- 22 n_1 <- 87 n_2 <- 61 # MLE's under H0 and H1 theta_hat_0 <- (n_1 * xbar_1 + n_2 * xbar_2) / (n_1 + n_2) theta_hat_1 <- xbar_1 theta_hat_2 <- xbar_2 # Log-likelihood ratio statistic log_lamba_n <- (n_1 * xbar_1 + n_2 * xbar_2) * log(theta_hat_0) - (n_1 * xbar_1) * log(theta_hat_1) - (n_2 * xbar_2) * log(theta_hat_2) - (n_1 + n_2) * theta_hat_0 + (n_1 * theta_hat_1 + n_2 * theta_hat_2) -2 * log_lamba_n ## ----lrt-2------------------------------------------------------------------------------------------------- # p-value of the test pchisq(-2 * log_lamba_n, df = 1, lower.tail = FALSE) ## ----lrt-3------------------------------------------------------------------------------------------------- # Log-likelihood log_lik <- function(x, theta) { n <- length(x) -n * log(theta) - n * mean(x) / theta } # Estimator function theta_hat <- function(x) { mean(x) } # theta_0 theta_0 <- 1.5 # Statistic and p-value for first sample x <- c(1.69, 1.15, 2.66, 0.06, 0.11) (log_lambda_n <- -2 * (log_lik(x = x, theta = theta_0) - log_lik(x = x, theta = theta_hat(x = x)))) pchisq(q = log_lambda_n, df = 1, lower.tail = FALSE) # Statistic and p-value for second sample x <- c(1.05, 0.72, 1.66, 0.04, 0.07, 0.4, 0.39) (log_lambda_n <- -2 * (log_lik(x = x, theta = theta_0) - log_lik(x = x, theta = theta_hat(x = x)))) pchisq(q = log_lambda_n, df = 1, lower.tail = FALSE)