library(readr)
library(dplyr)
library(ggplot2)

# Get the data from the provided URL and remove any rows with missing values
lizards <- read_csv("https://whitlockschluter3e.zoology.ubc.ca/Data/chapter12/chap12e3HornedLizards.csv") %>% 
  na.omit()

#### Get the summaries
lizard_summaries <- lizards %>%
  group_by(Survival) %>%
  summarise(n = n(),                          # Sample size for each group
            df = n - 1,                       # Degrees of freedom for each group
            mean_horn = mean(squamosalHornLength),  # Mean horn length for each group
            var_horn = var(squamosalHornLength))    # Variance of horn length for each group

### Calculate pooled variance
lizard_summaries_2 <- lizard_summaries %>% 
  summarise(pooled_variance = sum(df * var_horn) / sum(df), # Calculate the pooled variance
            mean_horn_diff = diff(mean_horn))               # Calculate the difference in means between groups

### Bootstrapping to estimate the distribution of the difference in means
lizard_boot <- replicate(n = 10000, simplify = FALSE,
                         lizards %>%
                           group_by(Survival)                     %>%
                           slice_sample(prop = 1, replace = TRUE) %>%
                           summarise(mean_horn = mean(squamosalHornLength)) %>%
                           summarise(mean_horn_diff = diff(mean_horn))) %>%
  bind_rows()

#### Estimating uncertainty using confidence intervals
alpha <- 0.05
lizard_summaries_3 <- lizard_summaries %>%
  summarise(est_diff        = abs(diff(mean_horn)),             # Estimated difference in means
            pooled_variance = sum(df * var_horn) / sum(df),     # Pooled variance
            se              = sqrt(sum(pooled_variance/n)),     # Standard error
            crit_t_95       = qt(p = alpha/2, df = sum(df), lower.tail = FALSE), # Critical t-value for 95% confidence interval
            lower_95CI      = est_diff - se * crit_t_95,        # Lower bound of the 95% confidence interval
            upper_95CI      = est_diff + se * crit_t_95)        # Upper bound of the 95% confidence interval

lizard_summaries_3

### Plot comparing the sampling distribution estimated by bootstrapping to a normal distribution
ggplot(lizard_boot, aes(x = mean_horn_diff)) + 
  geom_histogram(aes(y =..density..), colour = "white", fill = "black") +
  stat_function(fun = dnorm, color = "red", linewidth = 2,
                args = list(mean = pull(lizard_summaries_3, est_diff), 
                            sd = pull(lizard_summaries_3, se))) +
  annotate(geom = "label", x = 2.3, y = 0.2, label = "Bootstrap\ndistribution\nof means.", size = 6) +
  annotate(geom = "label", x = 3.5, y = 0.6, label = "Normal distribution\nmean = est_diff = 2.29\nsd = se_est_diff = 0.528.", size = 6, color = "red")