--- title: "Palmer Penguins — PCA walkthrough" author: "Aparna Pandey and Stephan Peischl" format: html: toc: true code-tools: true engine: knitr --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) suppressPackageStartupMessages({ library(tidymodels) library(dplyr) library(ggplot2) library(palmerpenguins) }) source("../R/slide-viz-helpers.R") ``` # Overview Self-contained PCA example on **Palmer Penguins**. - Data: Adelie vs Gentoo slice - PCA: centered/scaled numeric predictors and two components - One model type: boosted tree (`xgboost`) with and without PCA Related: [Day 4 PCA block](../slides/day-04-thursday.html#/part-c-recipes) ## Load data ```{r} peng_pca <- palmerpenguins::penguins |> filter(species %in% c("Adelie", "Gentoo")) |> mutate( y = factor(species, levels = c("Adelie", "Gentoo")), year = as.numeric(year) ) |> select(-species) |> tidyr::drop_na() peng_pca |> count(y) |> knitr::kable(col.names = c("Species", "n")) ``` ## Correlation motivation ```{r fig.width=7, fig.height=5} num_cols <- c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g", "year") cor_df <- as.data.frame(as.table(cor(peng_pca[, num_cols]))) |> rename(var1 = Var1, var2 = Var2, r = Freq) ggplot(cor_df, aes(var1, var2, fill = r)) + geom_tile(color = "white") + scale_fill_gradient2(low = "steelblue", mid = "white", high = "firebrick", midpoint = 0, limits = c(-1, 1)) + coord_fixed() + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(title = "Correlated predictors motivate PCA", fill = "r") ``` ## PCA geometry and loadings ```{r fig.width=8, fig.height=5.5} plot_pca_biplot(peng_pca, num_cols = num_cols, y_col = "y") ``` ```{r fig.width=8, fig.height=5.5} plot_pca_loadings_bars(peng_pca, num_cols = num_cols) ``` ## Minimal model usage (with vs without PCA) ```{r} set.seed(8) folds <- vfold_cv(peng_pca, v = 5, strata = y) metrics_acc <- metric_set(accuracy) xgb_spec <- boost_tree(trees = 100, tree_depth = 3, learn_rate = 0.05) |> set_engine("xgboost") |> set_mode("classification") rec_pca <- recipe(y ~ ., data = peng_pca) |> step_zv(all_predictors()) |> step_dummy(all_nominal_predictors()) |> step_normalize(all_numeric_predictors()) |> step_pca(all_numeric_predictors(), num_comp = 2) rec_no_pca <- recipe(y ~ ., data = peng_pca) |> step_zv(all_predictors()) |> step_dummy(all_nominal_predictors()) |> step_normalize(all_numeric_predictors()) wf_pca <- workflow() |> add_recipe(rec_pca) |> add_model(xgb_spec) wf_no_pca <- workflow() |> add_recipe(rec_no_pca) |> add_model(xgb_spec) rs_pca <- fit_resamples(wf_pca, folds, metrics = metrics_acc) rs_no <- fit_resamples(wf_no_pca, folds, metrics = metrics_acc) ``` ```{r} bind_rows( collect_metrics(rs_pca) |> mutate(model = "XGB + PCA (2 comp.)"), collect_metrics(rs_no) |> mutate(model = "XGB without PCA") ) |> filter(.metric == "accuracy") |> select(model, mean, std_err) |> knitr::kable(caption = "5-fold CV accuracy") ``` ```{r fig.width=7, fig.height=4} bind_rows( collect_metrics(rs_pca) |> mutate(model = "With PCA"), collect_metrics(rs_no) |> mutate(model = "No PCA") ) |> filter(.metric == "accuracy") |> ggplot(aes(model, mean, fill = model)) + geom_col(show.legend = FALSE) + geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), width = 0.12) + theme_minimal() + labs(title = "Accuracy: with vs without PCA", x = NULL, y = "Mean CV accuracy") ``` ## Takeaway PCA can reduce correlated dimensions, but component-based models are often harder to explain biologically than models on original measurements.