---
title: "Palmer Penguins — predicting body mass"
author: "Aparna Pandey and Stephan Peischl"
format:
  html:
    toc: true
    code-tools: true
engine: knitr
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(palmerpenguins)
library(dplyr)
library(ggplot2)
library(GGally)
library(glmnet)
library(tidymodels)
library(tidyr)
theme_set(theme_classic())
```

# Overview

This notebook uses **[Palmer Penguins](../data/cards/palmer-penguins.qmd)** as a **continuous outcome** example: we predict **`body_mass_g`** from bill and flipper measurements (and simple factors).

It complements the **synthetic gene** linear / LASSO notebook (`linear-regression-lasso.Rmd`), where we know the ground truth. Here we work with **real measurements** and the usual messiness (correlation, nonlinear hints, missing values). Concepts for shrinkage sit in [Module 02](../modules/module-02-regularization.qmd); this file is the **long lab** to run in the IDE.

## Load and clean

```{r}
data("penguins", package = "palmerpenguins")
pg <- penguins |>
  tidyr::drop_na()
nrow(pg)
```

## Exploratory plots

```{r fig.width=8, fig.height=4.5}
pg_num <- pg |>
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
cor_m <- cor(pg_num)
cor_df <- as.data.frame(as.table(cor_m)) |>
  rename(v1 = Var1, v2 = Var2, r = Freq)
ggplot(cor_df, aes(v1, v2, fill = r)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick", midpoint = 0, limits = c(-1, 1)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  coord_fixed() +
  labs(title = "Correlation among numeric penguin measurements")
```

```{r fig.width=8, fig.height=5.5}
GGally::ggpairs(
  pg_num,
  upper = list(continuous = wrap("cor", size = 2.2)),
  aes(color = pg$species, alpha = 0.25)
) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

## Train / test split and matrix of predictors

We model `body_mass_g` from morphometrics plus `species`, `island`, `sex`, and `year` (as numeric).

```{r}
pred_df <- pg |>
  mutate(year = as.numeric(year)) |>
  select(body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm, species, island, sex, year)

y <- pred_df$body_mass_g
X <- model.matrix(body_mass_g ~ ., data = pred_df)[, -1] # drop intercept column; dummy coding

set.seed(42)
split <- initial_split(pred_df, prop = 0.8)
train_df <- training(split)
test_df <- testing(split)
rm(split)

rec_scale <- recipe(body_mass_g ~ ., data = train_df) |>
  step_normalize(all_numeric_predictors())
prep_fit <- prep(rec_scale, train_df)
train_b <- bake(prep_fit, train_df)
test_b <- bake(prep_fit, test_df)

y_train <- train_b$body_mass_g
y_test <- test_b$body_mass_g
X_train_s <- model.matrix(body_mass_g ~ ., data = train_b)[, -1]
X_test_s <- model.matrix(body_mass_g ~ ., data = test_b)[, -1]
```

## Ridge and LASSO with cross-validation

```{r fig.width=7, fig.height=4}
cv_ridge <- cv.glmnet(X_train_s, y_train, alpha = 0)
cv_lasso <- cv.glmnet(X_train_s, y_train, alpha = 1)
par(mfrow = c(1, 2))
plot(cv_ridge, main = "Ridge CV")
plot(cv_lasso, main = "LASSO CV")
```

```{r}
best_lam_ridge <- cv_ridge$lambda.min
best_lam_lasso <- cv_lasso$lambda.min
fit_ridge <- glmnet(X_train_s, y_train, alpha = 0, lambda = best_lam_ridge)
fit_lasso <- glmnet(X_train_s, y_train, alpha = 1, lambda = best_lam_lasso)

pred_ridge <- as.numeric(predict(fit_ridge, newx = X_test_s))
pred_lasso <- as.numeric(predict(fit_lasso, newx = X_test_s))

reg_metrics <- metric_set(rmse, rsq, mae)
tibble(truth = y_test, .pred = pred_ridge) |>
  reg_metrics(truth, .pred)
tibble(truth = y_test, .pred = pred_lasso) |>
  reg_metrics(truth, .pred)
```

## Coefficient path (LASSO)

```{r fig.width=7, fig.height=5}
fit_path <- glmnet(X_train_s, y_train, alpha = 1)
plot(fit_path, xvar = "lambda", label = TRUE, main = "LASSO coefficient paths (scaled predictors)")
```

## Takeaways

- Morphometrics and **species** explain much of the variation in mass — check whether a simpler scientific story (e.g. mass ~ flipper + species) is enough before chasing complexity.
- Compare this open-ended exercise to the **gene** notebook, where we know which features are truly causal.