library(tidyverse) library(palmerpenguins) ### Part 1: demonstration of bootstrap # goal: compute the difference in median body mass between male and female Adelie penguins, # as well as a confidence interval around this estimated difference in medians # question: does the confidence interval overlap zero? # 1a) prepare data adelie_dt <- penguins %>% filter(species == "Adelie") %>% select(c("sex", "body_mass_g")) %>% drop_na() # 1b) Point estimate (difference in medians) obs_diff <- median(filter(adelie_dt, sex == "male")$body_mass_g) - median(filter(adelie_dt, sex == "female")$body_mass_g) # 1c) Bootstrap (resample within each sex, with replacement) bootstrap_replicates <- 5000 bootstrap_diffs <- rep(0, bootstrap_replicates) # create empty vector set.seed(1) for (i in 1:bootstrap_replicates) { male_b <- filter(adelie_dt, sex == "male") %>% slice_sample(n = nrow(.), replace = TRUE) female_b <- filter(adelie_dt, sex == "female") %>% slice_sample(n = nrow(.), replace = TRUE) d <- median(male_b$body_mass_g) - median(female_b$body_mass_g) bootstrap_diffs[i] <- d } # 1d) Compute 95% percentile CI ci <- quantile(bootstrap_diffs, c(0.025, 0.975)) ci ### Part 2: demonstration of permutation test # goal: compute a p-value for the difference in medians between male and female Adelie penguins set.seed(2) perm_replicates <- 10000 perm_diffs <- rep(0, perm_replicates) for (i in 1:perm_replicates) { # shuffle the sex labels within the tibble adelie_perm <- adelie_dt %>% mutate(sex = sample(sex, replace = FALSE)) # compute difference in medians using the same code as before d <- median(filter(adelie_perm, sex == "male")$body_mass_g) - median(filter(adelie_perm, sex == "female")$body_mass_g) perm_diffs[i] <- d } # One-sided p-value (male > female) p_one_sided <- mean(perm_diffs >= obs_diff) p_one_sided # Two-sided p-value (male > female or female > male) p_two_sided <- mean(abs(perm_diffs) >= abs(obs_diff)) p_two_sided # corresponding visualization ggplot(aes(x = perm_diffs), data = tibble(perm_diffs)) + geom_histogram() + geom_vline(xintercept = obs_diff, linewidth = 1, color = "red") + theme_classic() + xlab("Difference in medians (g)") + ylab("Count") ### Part 3: demonstration of power analysis # https://www.researchgate.net/profile/Rui-Zhang-377/publication/262018045/figure/fig1/AS:296950378319872@1447809895486/Schematic-of-allele-specific-expression-A-The-two-chromosomal-copies-alleles-of-a.png sample(c("R", "A"), size = 1, replace = TRUE) sample(c("R", "A"), size = 5, replace = TRUE) sample(c("R", "A"), size = 100, replace = TRUE) %>% table() sample(c("R", "A"), size = 100, replace = TRUE, prob = c(0.9, 0.1)) %>% table() binom.test(x = 55, n = 100, p = 0.5) #### # power analysis hist(rbinom(100000, size = 100, prob = 0.3), breaks = 0:100) table(rbinom(100000, size = 100, prob = 0.3)) # simulate allele-specific expression (allelic ratio [prob] = 0.3) # for 100,000 simulated sites with depth of 100 reads each, count how many times you see the ref allele set.seed(1) sim <- tibble(ref_reads = rbinom(100000, size = 100, prob = 0.3), total_reads = 100, prob_ref = 0.3) %>% rowwise() %>% mutate(pval = binom.test(ref_reads, total_reads, p = 0.5)$p.value) # compute p-values from binomial tests on each trial power <- mean(sim$pval < 0.05) # power is the proportion of trials in which we rejected the null hypothesis # note that we basically just need to set the number of trials high enough to ensure that our calculation of power # doesn't depend on the number of trials that we ran # now generalize to consider an arbitrary "fairness" of the coin and an arbitrary number of flips per trial get_power <- function(p_ref, n_reads) { sim <- tibble(ref_reads = rbinom(1000, size = n_reads, prob = p_ref), tot_reads = n_reads, prob_heads = p_ref) %>% rowwise() %>% mutate(pval = binom.test(ref_reads, tot_reads, p = 0.5)$p.value) power <- mean(sim$pval < 0.05) return(power) } power_sim <- expand_grid(prob_ref = c(0.5, 0.51, 0.55, 0.6, 0.7, 0.8, 0.9), num_reads = c(5, 10, 20, 50, 100, 200, 500, 1000)) %>% rowwise() %>% mutate(power = get_power(prob_ref, num_reads)) ggplot(data = power_sim, aes(x = prob_ref, y = power, color = factor(num_reads))) + geom_line() + ylab("Power to detect allele-specific expression") + xlab("Simulated allelic ratio (effect size)") + scale_color_brewer(palette = "Dark2", name = "Depth of coverage\n(total aligned reads)")