#### Parameters ##### ## Population parameters mu1 <- 7 mu2 <- 10 sigma1 <- 2 sigma2 <- 3 ## Sample sizes n1 <- 8 n2 <- 8 ## Number of features m <- 1e4 ## Antoine's trick: : use eng notation to avoid spending hours to count the zeros #### Generate a table with random normal data #### ## Sampling from the first population ## Tip: we go very progressively for didactic purposes randVector <- rnorm(n = m * n1, mean = mu1, sd = sigma1) randMatrix <- matrix(data = randVector, nrow = m, ncol = n1) group1 <- data.frame(randMatrix) # hist(randVector, breaks = 100) ## Name the rows and columns rownames(group1) <- paste(sep = "", "ft", 1:m) colnames(group1) <- paste0("g1s", 1:n1) # head(group1) ## Sampling from the second population group2 <- data.frame( matrix(data = rnorm(n = m * n2, mean = mu2, sd = sigma2), nrow = m, ncol = n2)) rownames(group2) <- paste0("ft", 1:m) colnames(group2) <- paste0("g2s", 1:n2) # head(group2) #### Boxplot per column #### ## Live dangerously : use cbind() # dataTable <- cbind(group1, group2) # dim (dataTable) ## Safe life: use merge() dataTable <- merge(x = group1, y = group2, by = "row.names") rownames(dataTable) <- dataTable$Row.names dataTable <- dataTable[, 2:ncol(dataTable)] dim(dataTable) # View(dataTable) #### Define metadata table #### ## Instantiate a data.frame with the group corresponding to each individual metaData <- data.frame(row.names = colnames(dataTable), group = c(rep(x = "g1", length.out = n1), rep(x = "g2", length.out = n2))) ## Add the replicate number metaData$replicate <- c(1:n1, 1:n2) ## Associate a color to each group groupColors <- c("g1" = "#BBDDFF", "g2" = "#FFDDBB") metaData$color <- groupColors[as.vector(metaData$group)] ## Generate a verbosy description metaData$description <- paste0("Group: ", metaData$group, ", replicate: ", metaData$replicate) ## Draw a beautiful boxplot boxplot(dataTable, horizontal = TRUE, las = 1, col = metaData$color) #### Craftwork approach of the hypothesis test ##### i <- 271 ## Choose arbitrarily the index of one feature x1 <- group1[i, ] ## Select the corresponding samlple from population 1 x2 <- group2[i, ] ## Select the corresponding samlple from population 1 # ?t.test() tTestResult <- t.test(x = x1, y = x2, alternative = "two.sided", var.equal = FALSE) # View(tTestResult) resultVector <- c(tTestResult$statistic, tTestResult$parameter, "pValue" = tTestResult$p.value) #### Declare a function #### #' @title Run a t-test on random numbers #' @author Jacques van Helden #' @param n1=10 size of the first sample #' @param n2=10 size of the second sample #' @return a vector with the statistics of the test: t, p.value, df RandNumTest <- function(n1, n2, mu1 = 7, mu2 = 10, sigma1 = 2, sigma2 = 3, verbose = FALSE, ... ) { if (verbose) { message("Running random number test with n1 = ", n1, " and n2 = ", n2) } ## Print the message passed in the arguments x1 <- rnorm(n = n1, mean = mu1, sd = sigma1) x2 <- rnorm(n = n2, mean = mu2, sd = sigma2) testResult <- t.test(x1, x2, ...) m1 <- mean(x1) m2 <- mean(x2) d <- m1 - m2 resultVector <- c( m1 = m1, m2 = m2, d = d, t = testResult$statistic, p.value = testResult$p.value, df = testResult$parameter) return(resultVector) } ## Store the column names in a vector, we will use it several times below randNumColNames <- c("m1", "m2", "d", "t", "p.value", "df") # RandNumTest() ## this fails RandNumTest(n1 = 4, n2 = 7, var.equal = FALSE, alternative = "two.sided") RandNumTest(n1 = 40, n2 = 70) #### Run m tests (iterations) #### ## Define the number of tests m <- 10000 iterationResults <- data.frame(matrix(nrow = m, ncol = 6)) colnames(iterationResults) <- randNumColNames ## Run m experiments and measure time with a loop (inefficient) message("iterating the test ", m, " times") iterationTime <- system.time( for (i in 1:m) { iterationResults[i, ] <- RandNumTest(n1 = 16, n2 = 16, mu1 = 7, mu2 = 10, var.equal = FALSE, alternative = "two.sided") } ) print("Iteration time") print(iterationTime) ## Slightly better: run the same test with replicate() message("iterating the test ", m, " times") replicateTime <- system.time( repTests <- replicate( n = m, RandNumTest(n1 = 16, n2 = 16, mu1 = 7, mu2 = 10, var.equal = FALSE, alternative = "two.sided") ) ) class(repTests) ## this is a matrix dim(repTests) ## m columns, one per test (feature) repTests <- t(repTests) ## Transpose the result because the vectors were aggregated as columns repTests <- data.frame(repTests) ## Convert the matrix into a data.frame class(repTests) ## now a data.frame dim(repTests) ## we now have m rows, good ! rownames(repTests) <- paste0("r", 1:m) ## name the rows colnames(repTests) <- randNumColNames ## name the columns head(repTests) tail(repTests) print("Time with replicate()") print(replicateTime) colnames(repTests) ## Draw the p-value histogram hist(repTests$p.value, breaks = 20) ## Draw alpha alpha <- 0.05 abline(v = alpha, col = "red")