## ---------------------------------------------------------------------------- ## Name: 06-nptests.R ## Description: Script for Chapter 6 of "Notes for Nonparametric Statistics" ## Link: https://bookdown.org/egarpor/NP-UC3M/ ## License: https://creativecommons.org/licenses/by-nc-nd/4.0/ ## Author: Eduardo García-Portugués ## Version: 6.9.1 ## ---------------------------------------------------------------------------- ## ----ks-1, fig.cap = '(ref:ks-1-title)'------------------------------------------------------- # Sample data n <- 10 mu0 <- 2 sd0 <- 1 set.seed(54321) samp <- rnorm(n = n, mean = mu0, sd = sd0) # Fn vs. F0 plot(ecdf(samp), main = "", ylab = "Probability", xlim = c(-1, 6), ylim = c(0, 1.3)) curve(pnorm(x, mean = mu0, sd = sd0), add = TRUE, col = 4) # Add Dn+ and Dn- samp_sorted <- sort(samp) Ui <- pnorm(samp_sorted, mean = mu0, sd = sd0) Dn_plus <- (1:n) / n - Ui Dn_minus <- Ui - (1:n - 1) / n i_plus <- which.max(Dn_plus) i_minus <- which.max(Dn_minus) lines(rep(samp_sorted[i_plus], 2), c(i_plus / n, pnorm(samp_sorted[i_plus], mean = mu0, sd = sd0)), col = 3, lwd = 2, pch = 16, type = "o", cex = 0.75) lines(rep(samp_sorted[i_minus], 2), c((i_minus - 1) / n, pnorm(samp_sorted[i_minus], mean = mu0, sd = sd0)), col = 2, lwd = 2, pch = 16, type = "o", cex = 0.75) rug(samp) legend("topleft", lwd = 2, col = c(1, 4, 3, 2), legend = latex2exp::TeX(c("$F_n$", "$F_0$", "$D_n^+$", "$D_n^-$"))) ## ----ks-2------------------------------------------------------------------------------------- # Sample data from a N(0, 1) n <- 50 set.seed(3245678) x <- rnorm(n = n) # Kolmogorov-Smirnov test for H_0: F = N(0, 1). Does not reject (ks <- ks.test(x = x, y = "pnorm")) # In "y" we specify F0 as a function # Structure of "htest" class str(ks) # Kolmogorov-Smirnov test for H_0: F = N(0.5, 1). Rejects ks.test(x = x, y = "pnorm", mean = 0.5) # Kolmogorov-Smirnov test for H_0: F = Exp(2). Strongly rejects ks.test(x = x, y = "pexp", rate = 1/2) ## ----ks-3------------------------------------------------------------------------------------- # Sample data from a Pois(5) n <- 100 set.seed(3245678) x <- rpois(n = n, lambda = 5) # Kolmogorov-Smirnov test for H_0: F = Pois(5) without specifying that the # distribution is discrete. Rejects (!?) giving a warning message ks.test(x = x, y = "ppois", lambda = 5) # We rely on dgof::ks.test, which works as stats::ks.test if the "y" argument # is not marked as a "stepfun" object, the way the dgof package codifies # discrete distribution functions # Step function containing the cdf of the Pois(5). The "x" stands for the # location of the steps and "y" for the value of the steps. "y" needs to have # an extra point for the initial value of the function before the first step x_eval <- 0:20 ppois_stepfun <- stepfun(x = x_eval, y = c(0, ppois(q = x_eval, lambda = 5))) plot(ppois_stepfun, main = "Cdf of a Pois(5)") # Kolmogorov-Smirnov test for H_0: F = Pois(5) adapted for discrete data, # with data coming from a Pois(5) dgof::ks.test(x = x, y = ppois_stepfun) # If data is normally distributed, the test rejects H_0 dgof::ks.test(x = rnorm(n = n, mean = 5), y = ppois_stepfun) ## ----ks-4, fig.margin = FALSE, fig.fullwidth = TRUE, fig.asp = 1/2, fig.cap = '(ref:ks-4-title)', out.width = .tex_web('80%', '100%')---- # Simulation of p-values when H_0 is true set.seed(131231) n <- 100 M <- 1e4 pvalues_H0 <- sapply(1:M, function(i) { x <- rnorm(n) # N(0, 1) ks.test(x, "pnorm")$p.value }) # Simulation of p-values when H_0 is false -- the data does not # come from a N(0, 1) but from a N(0, 2) pvalues_H1 <- sapply(1:M, function(i) { x <- rnorm(n, mean = 0, sd = sqrt(2)) # N(0, 2) ks.test(x, "pnorm")$p.value }) # Comparison of p-values par(mfrow = 1:2) hist(pvalues_H0, breaks = seq(0, 1, l = 20), probability = TRUE, main = latex2exp::TeX("$H_0$"), ylim = c(0, 4)) abline(h = 1, col = 2) hist(pvalues_H1, breaks = seq(0, 1, l = 20), probability = TRUE, main = latex2exp::TeX("$H_1$"), ylim = c(0, 4)) abline(h = 1, col = 2) ## ----cvm-1------------------------------------------------------------------------------------ # Sample data from a N(0, 1) set.seed(3245678) n <- 50 x <- rnorm(n = n) # Cramér-von Mises test for H_0: F = N(0, 1). Does not reject goftest::cvm.test(x = x, null = "pnorm") # Comparison with Kolmogorov-Smirnov ks.test(x = x, y = "pnorm") # Sample data from a Pois(5) set.seed(3245678) n <- 100 x <- rpois(n = n, lambda = 5) # Cramér-von Mises test for H_0: F = Pois(5) without specifying that the # distribution is discrete. Rejects (!?) without giving a warning message goftest::cvm.test(x = x, null = "ppois", lambda = 5) # We rely on dgof::cvm.test and run a Cramér-von Mises test for H_0: F = Pois(5) # adapted for discrete data, with data coming from a Pois(5) x_eval <- 0:20 ppois_stepfun <- stepfun(x = x_eval, y = c(0, ppois(q = x_eval, lambda = 5))) dgof::cvm.test(x = x, y = ppois_stepfun) # Plot the asymptotic null distribution function curve(goftest::pCvM(x), from = 0, to = 1, n = 300) ## ----ad-1------------------------------------------------------------------------------------- # Sample data from a N(0, 1) set.seed(3245678) n <- 50 x <- rnorm(n = n) # Anderson-Darling test for H_0: F = N(0, 1). Does not reject goftest::ad.test(x = x, null = "pnorm") # Sample data from a Pois(5) set.seed(3245678) n <- 100 x <- rpois(n = n, lambda = 5) # Anderson-Darling test for H_0: F = Pois(5) without specifying that the # distribution is discrete. Rejects (!?) without giving a warning message goftest::ad.test(x = x, null = "ppois", lambda = 5) # We rely on dgof::cvm.test and run an Anderson-Darling test for H_0: F = Pois(5) # adapted for discrete data, with data coming from a Pois(5) x_eval <- 0:20 ppois_stepfun <- stepfun(x = x_eval, y = c(0, ppois(q = x_eval, lambda = 5))) dgof::cvm.test(x = x, y = ppois_stepfun, type = "A2") # Plot the asymptotic null distribution function curve(goftest::pAD(x), from = 0, to = 5, n = 300) ## ----norm-1----------------------------------------------------------------------------------- # Sample data from a N(10, 1) set.seed(123456) n <- 100 x <- rnorm(n = n, mean = 10) # Normality tests -- do not reject H0 nortest::lillie.test(x = x) nortest::cvm.test(x = x) nortest::ad.test(x = x) # Sample data from a Student's t with 3 degrees of freedom x <- rt(n = n, df = 3) # Normality tests -- reject H0 nortest::lillie.test(x = x) nortest::cvm.test(x = x) nortest::ad.test(x = x) # Flawed normality tests -- do not reject because the null distribution # that is employed is wrong! ks.test(x = x, y = "pnorm", mean = mean(x), sd = sd(x)) goftest::cvm.test(x = x, null = "pnorm", mean = mean(x), sd = sd(x)) goftest::ad.test(x = x, null = "pnorm", mean = mean(x), sd = sd(x)) ## ----qq, fig.cap = '(ref:qq-title)'----------------------------------------------------------- n <- 100 mu <- 10 sigma <- 2 set.seed(12345678) x <- rnorm(n, mean = mu, sd = sigma) qqnorm(x) abline(a = mu, b = sigma, col = 2) ## ----conf-qq, fig.cap = '(ref:conf-qq-title)', fig.margin = FALSE----------------------------- M <- 1e3 n <- 100 plot(0, 0, xlim = c(-3.5, 3.5), ylim = c(-3.5, 3.5), type = "n", xlab = "Theoretical Quantiles", ylab = "Sample Quantiles", main = "Confidence bands for QQ-plot") x <- matrix(rnorm(M * n), nrow = n, ncol = M) matpoints(qnorm(ppoints(n)), apply(x, 2, sort), pch = 19, cex = 0.5, col = gray(0, alpha = 0.01)) abline(a = 0, b = 1) p <- seq(0, 1, l = 1e4) xi <- qnorm(p) lines(xi, xi - qnorm(0.975) / sqrt(n) * sqrt(p * (1 - p)) / dnorm(xi), col = 2, lwd = 2) lines(xi, xi + qnorm(0.975) / sqrt(n) * sqrt(p * (1 - p)) / dnorm(xi), col = 2, lwd = 2) ## ----norm-2----------------------------------------------------------------------------------- # Does not reject H0 set.seed(123456) n <- 100 x <- rnorm(n = n, mean = 10) nortest::sf.test(x) # Rejects H0 x <- rt(n = n, df = 3) nortest::sf.test(x) # Test statistic cor(x = sort(x), y = qnorm(ppoints(n, a = 3/8)))^2 ## ----gof-1------------------------------------------------------------------------------------ # A goodness-of-fit test of the exponential distribution using the # Cramér-von Mises statistic cvm_exp_gof <- function(x, B = 5e3, plot_boot = TRUE) { # Test statistic function (depends on the data only) Tn <- function(data) { # Maximum likelihood estimator -- MODIFY DEPENDING ON THE PROBLEM theta_hat <- 1 / mean(data) # Test statistic -- MODIFY DEPENDING ON THE PROBLEM goftest::cvm.test(x = data, null = "pexp", rate = theta_hat)$statistic } # Function to simulate bootstrap samples X_1^*, ..., X_n^*. Requires TWO # arguments, one being the data X_1, ..., X_n and the other containing # the parameter theta r_mod <- function(data, theta) { # Simulate from an exponential. In this case, the function only uses # the sample size from the data argument to estimate theta -- MODIFY # DEPENDING ON THE PROBLEM rexp(n = length(data), rate = 1 / theta) } # Estimate of theta -- MODIFY DEPENDING ON THE PROBLEM theta_hat <- 1 / mean(x) # Perform bootstrap resampling with the aid of boot::boot Tn_star <- boot::boot(data = x, statistic = Tn, sim = "parametric", ran.gen = r_mod, mle = theta_hat, R = B) # Test information -- MODIFY DEPENDING ON THE PROBLEM method <- "Bootstrap-based Cramér-von Mises test for exponentiality" alternative <- "any alternative to exponentiality" # p-value: modify if rejection does not happen for large values of the # test statistic pvalue <- mean(Tn_star$t > Tn_star$t0) # Construct an "htest" result result <- list(statistic = c("stat" = Tn_star$t0), p.value = pvalue, theta_hat = theta_hat, statistic_boot = drop(Tn_star$t), B = B, alternative = alternative, method = method, data.name = deparse(substitute(x))) class(result) <- "htest" # Plot the position of the original statistic with respect to the # bootstrap replicates? if (plot_boot) { hist(result$statistic_boot, probability = TRUE, main = paste("p-value:", result$p.value), xlab = latex2exp::TeX("$T_n^*$")) rug(result$statistic_boot) abline(v = result$statistic, col = 2) } # Return "htest" return(result) } # Check the test for H0 true set.seed(123456) x <- rgamma(n = 100, shape = 1, scale = 1) gof0 <- cvm_exp_gof(x = x, B = 1e3) gof0 # Check the test for H0 false x <- rgamma(n = 100, shape = 2, scale = 1) gof1 <- cvm_exp_gof(x = x, B = 1e3) gof1 ## ----gof-2------------------------------------------------------------------------------------ # A goodness-of-fit test of a mixture of m normals using the # Cramér-von Mises statistic cvm_nm_gof <- function(x, m, B = 1e3, plot_boot = TRUE) { # Test statistic function (depends on the data only) Tn <- function(data) { # EM algorithm for fitting normal mixtures. With trace = 0 we disable the # default convergence messages or otherwise they will saturate the screen # with the bootstrap loop. Be aware that this is a potentially dangerous # practice, as we may lose important information about the convergence of # the EM algorithm theta_hat <- nor1mix::norMixEM(x = data, m = m, trace = 0) # Test statistic goftest::cvm.test(x = data, null = nor1mix::pnorMix, obj = theta_hat)$statistic } # Function to simulate bootstrap samples X_1^*, ..., X_n^*. Requires TWO # arguments, one being the data X_1, ..., X_n and the other containing # the parameter theta r_mod <- function(data, theta) { nor1mix::rnorMix(n = length(data), obj = theta) } # Estimate of theta theta_hat <- nor1mix::norMixEM(x = x, m = m, trace = 0) # Perform bootstrap resampling with the aid of boot::boot Tn_star <- boot::boot(data = x, statistic = Tn, sim = "parametric", ran.gen = r_mod, mle = theta_hat, R = B) # Test information method <- "Bootstrap-based Cramér-von Mises test for normal mixtures" alternative <- paste("any alternative to a", m, "normal mixture") # p-value: modify if rejection does not happen for large values of the # test statistic pvalue <- mean(Tn_star$t > Tn_star$t0) # Construct an "htest" result result <- list(statistic = c("stat" = Tn_star$t0), p.value = pvalue, theta_hat = theta_hat, statistic_boot = drop(Tn_star$t), B = B, alternative = alternative, method = method, data.name = deparse(substitute(x))) class(result) <- "htest" # Plot the position of the original statistic with respect to the # bootstrap replicates? if (plot_boot) { hist(result$statistic_boot, probability = TRUE, main = paste("p-value:", result$p.value), xlab = latex2exp::TeX("$T_n^*$")) rug(result$statistic_boot) abline(v = result$statistic, col = 2) } # Return "htest" return(result) } # Check the test for H0 true set.seed(123456) x <- c(rnorm(n = 100, mean = 2, sd = 0.5), rnorm(n = 100, mean = -2)) gof0 <- cvm_nm_gof(x = x, m = 2, B = 1e3) gof0 # Graphical assessment that H0 (parametric fit) and data (kde) match plot(gof0$theta_hat, p.norm = FALSE, ylim = c(0, 0.5)) plot(ks::kde(x), col = 2, add = TRUE) # Check the test for H0 false x <- rgamma(n = 100, shape = 2, scale = 1) gof1 <- cvm_nm_gof(x = x, m = 1, B = 1e3) gof1 # Graphical assessment that H0 (parametric fit) and data (kde) do not match plot(gof1$theta_hat, p.norm = FALSE, ylim = c(0, 0.5)) plot(ks::kde(x), col = 2, add = TRUE) ## ----comp-ks-2, fig.cap = '(ref:comp-ks-2-title)'--------------------------------------------- # Sample data n <- 10; m <- 10 mu1 <- 0; sd1 <- 1 mu2 <- 0; sd2 <- 2 set.seed(1998) samp1 <- rnorm(n = n, mean = mu1, sd = sd1) samp2 <- rnorm(n = m, mean = mu2, sd = sd2) # Fn vs. Gm plot(ecdf(samp1), main = "", ylab = "Probability", xlim = c(-3.5, 3.5), ylim = c(0, 1.3)) lines(ecdf(samp2), main = "", col = 4) # Add Dnm1 samp1_sorted <- sort(samp1) samp2_sorted <- sort(samp2) Dnm_1 <- abs((1:n) / n - ecdf(samp2)(samp1_sorted)) i1 <- which.max(Dnm_1) lines(rep(samp2_sorted[i1], 2), c(i1 / m, ecdf(samp1_sorted)(samp2_sorted[i1])), col = 3, lwd = 2, type = "o", pch = 16, cex = 0.75) rug(samp1, col = 1) # Add Dnm2 Dnm_2 <- abs(ecdf(samp1)(samp2_sorted) - (1:m) / m) i2 <- which.max(Dnm_2) lines(rep(samp1_sorted[i2], 2), c(i2 / n, ecdf(samp2_sorted)(samp1_sorted[i2])), col = 2, lwd = 2, type = "o", pch = 16, cex = 0.75) rug(samp2, col = 4) legend("topleft", lwd = 2, col = c(1, 4, 3, 2), legend = latex2exp::TeX(c("$F_n$", "$G_m$", "$D_{n,m,1}$", "$D_{n,m,2}$"))) ## ----comp-ks-3-------------------------------------------------------------------------------- # Check the test for H0 true set.seed(123456) x0 <- rgamma(n = 50, shape = 1, scale = 1) y0 <- rgamma(n = 100, shape = 1, scale = 1) ks.test(x = x0, y = y0) # Check the test for H0 false x1 <- rgamma(n = 50, shape = 2, scale = 1) y1 <- rgamma(n = 75, shape = 1, scale = 1) ks.test(x = x1, y = y1) ## ----comp-ks-4-------------------------------------------------------------------------------- # Check the test for H0 true set.seed(123456) x0 <- rgamma(n = 50, shape = 1, scale = 1) y0 <- rgamma(n = 100, shape = 1, scale = 1) ks.test(x = x0, y = y0, alternative = "less") # H1: F < G ks.test(x = x0, y = y0, alternative = "greater") # H1: F > G # Check the test for H0 false x1 <- rnorm(n = 50, mean = 1, sd = 1) y1 <- rnorm(n = 75, mean = 0, sd = 1) ks.test(x = x1, y = y1, alternative = "less") # H1: F < G ks.test(x = x1, y = y1, alternative = "greater") # H1: F > G # Interpretations: # 1. Strong rejection of H0: F = G in favor of H1: F < G when # alternative = "less", as in reality x1 is stochastically greater than y1. # This outcome allows stating that "there is strong evidence supporting that # x1 is locally stochastically greater than y1" # 2. No rejection ("strong acceptance") of H0: F = G versus H1: F > G when # alternative = "greater". Even if in reality x1 is stochastically greater than # y1 (so H0 is false), the alternative to which H0 is confronted is even less # plausible. A p-value ~ 1 indicates one is probably conducting the test in # the uninteresting direction alternative! ## ----comp-cvm-1------------------------------------------------------------------------------- # Two-sample Cramér-von Mises statistic cvm2_stat <- function(x, y) { # Sample sizes n <- length(x) m <- length(y) # Pooled sample z <- c(x, y) # Statistic computation via ecdf() (n * m / (n + m)^2) * sum((ecdf(x)(z) - ecdf(y)(z))^2) } # Check the test for H0 true set.seed(123456) x0 <- rgamma(n = 50, shape = 1, scale = 1) y0 <- rgamma(n = 100, shape = 1, scale = 1) cvm0 <- cvm2_stat(x = x0, y = y0) pval0 <- 1 - goftest::pCvM(q = cvm0) c("statistic" = cvm0, "p-value"= pval0) # Check the test for H0 false x1 <- rgamma(n = 50, shape = 2, scale = 1) y1 <- rgamma(n = 75, shape = 1, scale = 1) cvm1 <- cvm2_stat(x = x1, y = y1) pval1 <- 1 - goftest::pCvM(q = cvm1) c("statistic" = cvm1, "p-value"= pval1) ## ----comp-ad-1-------------------------------------------------------------------------------- # Two-sample Anderson-Darling statistic ad2_stat <- function(x, y) { # Sample sizes n <- length(x) m <- length(y) # Pooled sample and pooled ecdf z <- c(x, y) z <- z[-which.max(z)] # Exclude the largest point H <- rank(z) / (n + m) # Statistic computation via ecdf() (n * m / (n + m)^2) * sum((ecdf(x)(z) - ecdf(y)(z))^2 / ((1 - H) * H)) } # Check the test for H0 true set.seed(123456) x0 <- rgamma(n = 50, shape = 1, scale = 1) y0 <- rgamma(n = 100, shape = 1, scale = 1) ad0 <- ad2_stat(x = x0, y = y0) pval0 <- 1 - goftest::pAD(q = ad0) c("statistic" = ad0, "p-value"= pval0) # Check the test for H0 false x1 <- rgamma(n = 50, shape = 2, scale = 1) y1 <- rgamma(n = 75, shape = 1, scale = 1) ad1 <- ad2_stat(x = x1, y = y1) pval1 <- 1 - goftest::pAD(q = ad1) c("statistic" = ad1, "p-value"= pval1) ## ----comp-wilcox-4---------------------------------------------------------------------------- # Check the test for H0 true set.seed(123456) n <- 50 m <- 100 x0 <- rgamma(n = n, shape = 1, scale = 1) y0 <- rgamma(n = m, shape = 1, scale = 1) wilcox.test(x = x0, y = y0) # H1: P[X >= Y] != 0.5 wilcox.test(x = x0, y = y0, alternative = "greater") # H1: P[X >= Y] > 0.5 wilcox.test(x = x0, y = y0, alternative = "less") # H1: P[X <= Y] > 0.5 # Check the test for H0 false x1 <- rnorm(n = n, mean = 0, sd = 1) y1 <- rnorm(n = m, mean = 1, sd = 2) wilcox.test(x = x1, y = y1) # H1: P[X >= Y] != 0.5 wilcox.test(x = x1, y = y1, alternative = "greater") # H1: P[X >= Y] > 0.5 wilcox.test(x = x1, y = y1, alternative = "less") # H1: P[X <= Y] > 0.5 # Exact pmf versus asymptotic pdf # Beware! dwilcox considers (m, n) as the sample sizes of (X, Y) x <- seq(1500, 3500, by = 100) plot(x, dwilcox(x = x, m = n, n = m), type = "h", ylab = "Density") curve(dnorm(x, mean = n * m / 2, sd = sqrt((n * m * (n + m + 1)) / 12)), add = TRUE, col = 2) ## ----comp-signed-1---------------------------------------------------------------------------- # Check the test for H0 true set.seed(123456) x0 <- rgamma(n = 50, shape = 1, scale = 1) y0 <- x0 + rnorm(n = 50) wilcox.test(x = x0, y = y0, paired = TRUE) # H1: P[X >= Y] != 0.5 wilcox.test(x = x0, y = y0, paired = TRUE, alternative = "greater") # H1: P[X >= Y] > 0.5 wilcox.test(x = x0, y = y0, paired = TRUE, alternative = "less") # H1: P[X <= Y] > 0.5 # Check the test for H0 false x1 <- rnorm(n = 50, mean = 0, sd = 1) y1 <- x1 + rnorm(n = 50, mean = 1, sd = 2) wilcox.test(x = x1, y = y1, paired = TRUE) # H1: P[X >= Y] != 0.5 wilcox.test(x = x1, y = y1, paired = TRUE, alternative = "greater") # H1: P[X >= Y] > 0.5 wilcox.test(x = x1, y = y1, paired = TRUE, alternative = "less") # H1: P[X <= Y] > 0.5 ## ----comp-perm-1------------------------------------------------------------------------------ # A homogeneity test using the Anderson-Darling statistic perm_comp_test <- function(x, y, B = 1e3, plot_boot = TRUE) { # Sizes of x and y n <- length(x) m <- length(y) # Test statistic function. Requires TWO arguments, one being the original # data (X_1, ..., X_n, Y_1, ..., Y_m) and the other containing the random # index for permuting the sample Tn <- function(data, perm_index) { # Permute sample by perm_index data <- data[perm_index] # Split into two samples x <- data[1:n] y <- data[(n + 1):(n + m)] # Test statistic -- MODIFY DEPENDING ON THE PROBLEM ad2_stat(x = x, y = y) } # Perform permutation resampling with the aid of boot::boot Tn_star <- boot::boot(data = c(x, y), statistic = Tn, sim = "permutation", R = B) # Test information -- MODIFY DEPENDING ON THE PROBLEM method <- "Permutation-based Anderson-Darling test of homogeneity" alternative <- "any alternative to homogeneity" # p-value: modify if rejection does not happen for large values of the # test statistic pvalue <- mean(Tn_star$t > Tn_star$t0) # Construct an "htest" result result <- list(statistic = c("stat" = Tn_star$t0), p.value = pvalue, statistic_perm = drop(Tn_star$t), B = B, alternative = alternative, method = method, data.name = deparse(substitute(x))) class(result) <- "htest" # Plot the position of the original statistic with respect to the # permutation replicates? if (plot_boot) { hist(result$statistic_perm, probability = TRUE, main = paste("p-value:", result$p.value), xlab = latex2exp::TeX("$T_{n,m}^*$")) rug(result$statistic_perm) abline(v = result$statistic, col = 2) } # Return "htest" return(result) } # Check the test for H0 true set.seed(123456) x0 <- rpois(n = 75, lambda = 5) y0 <- rpois(n = 75, lambda = 5) comp0 <- perm_comp_test(x = x0, y = y0, B = 1e3) comp0 # Check the test for H0 false x1 <- rpois(n = 50, lambda = 3) y1 <- rpois(n = 75, lambda = 5) comp1 <- perm_comp_test(x = x1, y = y1, B = 1e3) comp1 ## ----indep-assoc-1---------------------------------------------------------------------------- # Outliers fool correlation, but do not fool concordance set.seed(123456) n <- 200 x <- rnorm(n) y <- rnorm(n) y[n] <- x[n] <- 1e4 cor.test(x, y, method = "pearson") # Close to 1 due to outlier cor.test(x, y, method = "kendall") # Close to 0, as it should cor.test(x, y, method = "spearman") # Close to 0, as it should cor(rank(x), rank(y)) # Spearman's rho # Outliers fool the sign of correlation, but do not fool concordance x <- rnorm(n) y <- x x[n] <- 1e3 y[n] <- -1e4 cor.test(x, y, method = "pearson", alternative = "greater") # Change of sign! cor.test(x, y, method = "kendall", alternative = "greater") # Fine cor.test(x, y, method = "spearman", alternative = "greater") # Fine ## ----indep-assoc-2---------------------------------------------------------------------------- # Non-monotone dependence fools concordance set.seed(123456) n <- 200 x <- rnorm(n) y <- abs(x) + rnorm(n) cor.test(x, y, method = "kendall") cor.test(x, y, method = "spearman") # Dependence in terms of conditional variance fools concordance x <- rnorm(n) y <- rnorm(n, sd = abs(x)) cor.test(x, y, method = "kendall") cor.test(x, y, method = "spearman") ## ----indep-dcor-2----------------------------------------------------------------------------- # Distance correlation detects non-monotone dependence set.seed(123456) n <- 200 x <- rnorm(n) y <- abs(x) + rnorm(n) # Distance covariance and correlation tests. R is the number of permutations # and needs to be specified (the default is R = 0 -- no test is produced) energy::dcov.test(x, y, R = 1e3) energy::dcor.test(x, y, R = 1e3) # Distance correlation detects conditional variance as dependence x <- rnorm(n) y <- rnorm(n, sd = abs(x)) # Distance covariance and correlation tests energy::dcov.test(x, y, R = 1e3) energy::dcor.test(x, y, R = 1e3) ## ----indep-dcor-3----------------------------------------------------------------------------- # A multivariate case with independence set.seed(123456) n <- 200 p <- 5 x <- matrix(rnorm(n * p), nrow = n, ncol = p) y <- matrix(rnorm(n * p), nrow = n, ncol = p) energy::dcov.test(x, y, R = 1e3) energy::dcor.test(x, y, R = 1e3) # A multivariate case with dependence y <- matrix(0.2 * rnorm(n = n * p, mean = c(x)) + rnorm(n * p, sd = 1.25), nrow = n, ncol = p) energy::dcov.test(x, y, R = 1e3) energy::dcor.test(x, y, R = 1e3) ## ----ind-perm-1------------------------------------------------------------------------------- # A no-association test using the absolute value of Spearman's rho statistic perm_ind_test <- function(x, B = 1e3, plot_boot = TRUE) { # Test statistic function. Requires TWO arguments, one being the original # data (X_1, Y_1), ..., (X_n, Y_n) and the other containing the random # index for permuting the second component of sample Tn <- function(data, perm_index) { # Permute sample by perm_index -- only permute the second component data[, 2] <- data[, 2][perm_index] # Test statistic -- MODIFY DEPENDING ON THE PROBLEM abs(cor(x = data[, 1], y = data[, 2], method = "spearman")) } # Perform permutation resampling with the aid of boot::boot Tn_star <- boot::boot(data = x, statistic = Tn, sim = "permutation", R = B) # Test information -- MODIFY DEPENDING ON THE PROBLEM method <- "Permutation-based Spearman's rho test of no concordance" alternative <- "Spearman's rho is not zero" # p-value: modify if rejection does not happen for large values of the # test statistic pvalue <- mean(Tn_star$t > Tn_star$t0) # Construct an "htest" result result <- list(statistic = c("stat" = Tn_star$t0), p.value = pvalue, statistic_perm = drop(Tn_star$t), B = B, alternative = alternative, method = method, data.name = deparse(substitute(x))) class(result) <- "htest" # Plot the position of the original statistic with respect to the # permutation replicates? if (plot_boot) { hist(result$statistic_perm, probability = TRUE, main = paste("p-value:", result$p.value), xlab = latex2exp::TeX("$T_n^*$")) rug(result$statistic_perm) abline(v = result$statistic, col = 2) } # Return "htest" return(result) } # Check the test for H0 true set.seed(123456) x0 <- mvtnorm::rmvnorm(n = 100, mean = c(0, 0), sigma = diag(c(1:2))) ind0 <- perm_ind_test(x = x0, B = 1e3) ind0 # Check the test for H0 false x1 <- mvtnorm::rmvnorm(n = 100, mean = c(0, 0), sigma = toeplitz(c(1, 0.5))) ind1 <- perm_ind_test(x = x1, B = 1e3) ind1