---
title: "Solution — Day 1 microbiome (classical)"
format:
  html:
    toc: true
    code-tools: true
---

Tasks **1.1–1.8** on the [lab exercises page](../index.qmd). Classical R only (no `tidymodels`).

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
suppressPackageStartupMessages({
  library(dplyr)
  library(ggplot2)
  library(MASS)
  library(glmnet)
})
theme_set(theme_minimal())
source("_load_microbiome.R")
```

## 1.1 Load and filter

```{r load}
mic <- load_microbiome(prev = 0.10)
otu_cols <- mic_otu_cols(mic)
dim(mic)
table(mic$Label)
table(mic$Sex, useNA = "ifany")
length(unique(mic$Individual))
```

## 1.2 log1p transform

```{r log1p}
otu_mat <- as.matrix(mic[, otu_cols])
log_otu <- log1p(otu_mat)
mic$lib_size <- rowSums(otu_mat)
```

```{r lib-size, fig.width=7, fig.height=4}
ggplot(mic, aes(lib_size, fill = Label)) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  scale_x_log10() +
  labs(title = "Library size after prevalence filter", x = "Total counts")
```

## 1.3 PCA

We have far more OTUs than stable degrees of freedom for a plain `glm` on every column. **PCA** reduces the table to a few orthogonal scores that capture most of the variance; tasks **1.4–1.5** fit logistic regression in that low-dimensional space.

```{r pca, fig.width=7, fig.height=5}
pc <- prcomp(log_otu, center = TRUE, scale. = TRUE)
pc_df <- as.data.frame(pc$x[, 1:2]) |>
  mutate(sample_id = mic$sample_id, Label = mic$Label, Sex = mic$Sex)
ggplot(pc_df, aes(PC1, PC2, color = Label, shape = Sex)) +
  geom_point(alpha = 0.7) +
  labs(title = "PCA of log1p OTU abundances")
```

## 1.4–1.6 Label (Early vs Late)

```{r label-pcs}
n_pc <- 5
pc_scores <- as.data.frame(pc$x[, seq_len(n_pc), drop = FALSE])
colnames(pc_scores) <- paste0("PC", seq_len(n_pc))
dat_label <- cbind(Label = mic$Label, pc_scores)

m_glm <- glm(Label ~ ., data = dat_label, family = binomial)
summary(m_glm)$coefficients
```

```{r label-aic}
m_null <- glm(Label ~ 1, data = dat_label, family = binomial)
m_fwd <- stepAIC(m_null, scope = list(lower = m_null, upper = m_glm), direction = "forward", trace = 0)
m_bwd <- stepAIC(m_glm, direction = "backward", trace = 0)
list(forward = formula(m_fwd), backward = formula(m_bwd))
```

**Lasso** (below) works on **all** OTUs at once: the L1 penalty handles **p > n** by driving most coefficients to zero, so a separate PCA step is not required.

```{r label-lasso}
x_lasso <- scale(log_otu)
y_lasso <- ifelse(mic$Label == "Late", 1, 0)
set.seed(7)
cv_fit <- cv.glmnet(x_lasso, y_lasso, family = "binomial", alpha = 1, nfolds = 5)
coef(cv_fit, s = "lambda.min")
```

```{r label-metrics}
pred_glm <- predict(m_glm, type = "response")
pred_glm_cls <- factor(ifelse(pred_glm > 0.5, "Late", "Early"), levels = levels(mic$Label))
acc_glm <- mean(pred_glm_cls == mic$Label)
pred_lasso <- predict(cv_fit, newx = x_lasso, s = "lambda.min", type = "response")
pred_lasso_cls <- factor(ifelse(pred_lasso > 0.5, "Late", "Early"), levels = levels(mic$Label))
acc_lasso <- mean(pred_lasso_cls == mic$Label)
n_coef_lasso <- sum(coef(cv_fit, s = "lambda.min") != 0) - 1
c(accuracy_glm_pcs = acc_glm, accuracy_lasso = acc_lasso, n_nonzero_lasso = n_coef_lasso)
```

## 1.7 Sex (Female vs Male)

```{r sex-filter}
mic_sex <- mic |> filter(Sex %in% c("F", "M"))
otu_sex <- log1p(as.matrix(mic_sex[, otu_cols]))
zv_sex <- apply(otu_sex, 2, sd) > 1e-10
otu_sex <- otu_sex[, zv_sex, drop = FALSE]
pc_sex <- prcomp(otu_sex, center = TRUE, scale. = TRUE)
pc_s <- as.data.frame(pc_sex$x[, seq_len(n_pc), drop = FALSE])
colnames(pc_s) <- paste0("PC", seq_len(n_pc))
dat_sex <- cbind(Sex = factor(mic_sex$Sex, levels = c("F", "M")), pc_s)

m_sex <- glm(Sex ~ ., data = dat_sex, family = binomial)
m_sex_null <- glm(Sex ~ 1, data = dat_sex, family = binomial)
m_sex_fwd <- stepAIC(m_sex_null, scope = list(lower = m_sex_null, upper = m_sex), direction = "forward", trace = 0)
x_sex <- scale(otu_sex)
y_sex <- ifelse(mic_sex$Sex == "M", 1, 0)
set.seed(7)
cv_sex <- cv.glmnet(x_sex, y_sex, family = "binomial", alpha = 1, nfolds = 5)
```

```{r sex-metrics}
pred_sex_glm <- factor(ifelse(predict(m_sex, type = "response") > 0.5, "M", "F"), levels = c("F", "M"))
acc_sex_glm <- mean(pred_sex_glm == mic_sex$Sex)
pred_sex_lasso <- ifelse(predict(cv_sex, newx = x_sex, s = "lambda.min", type = "response") > 0.5, "M", "F")
acc_sex_lasso <- mean(factor(pred_sex_lasso, levels = c("F", "M")) == mic_sex$Sex)
c(accuracy_glm_pcs = acc_sex_glm, accuracy_lasso = acc_sex_lasso)
```

## 1.8 Summary table

```{r summary}
acc_fwd <- mean(
  factor(ifelse(predict(m_fwd, dat_label, type = "response") > 0.5, "Late", "Early"), levels = levels(mic$Label)) == mic$Label
)
acc_sex_fwd <- mean(
  factor(ifelse(predict(m_sex_fwd, dat_sex, type = "response") > 0.5, "Male", "Female"), levels = c("Female", "Male")) == mic_sex$Sex
)
summary_tbl <- data.frame(
  outcome = c("Label", "Label", "Label", "Sex", "Sex", "Sex"),
  method = c("GLM (5 PCs)", "stepAIC forward", "Lasso (all OTUs)", "GLM (5 PCs)", "stepAIC forward", "Lasso (all OTUs)"),
  n_predictors = c(
    n_pc,
    length(coef(m_fwd)) - 1,
    n_coef_lasso,
    n_pc,
    length(coef(m_sex_fwd)) - 1,
    sum(coef(cv_sex, s = "lambda.min") != 0) - 1
  ),
  train_accuracy = c(acc_glm, acc_fwd, acc_lasso, acc_sex_glm, acc_sex_fwd, acc_sex_lasso),
  row.names = NULL
)
knitr::kable(summary_tbl, digits = 3)
```

**Note:** In-sample accuracy only — same data used to fit and score (matches Monday gene demos).

```{r session, echo=FALSE}
sessionInfo()
```