---
title: "PCA and UMAP Analysis Workflow"
subtitle: "Biostat 212A"
author: "Dr. Jin Zhou @ UCLA"
date: today
format:
  html:
    theme: cosmo
    embed-resources: true
    number-sections: true
    toc: true
    toc-depth: 4
    toc-location: left
    code-fold: false
engine: knitr
knitr:
  opts_chunk:
    fig.align: 'center'
    # fig.width: 6
    # fig.height: 4
    message: FALSE
    cache: false
---

Display system information for reproducibility.

::: {.panel-tabset}

#### R

```{r}
# Attach every package used below first, so that sessionInfo() records
# their versions in the rendered document.
library(tidyverse)
library(tidymodels)
# install.packages("tidytext")
library(tidytext)
library(embed)

sessionInfo()
```

:::

## Gene expression data

The goal is to evaluate whether gene expression can separate three disease types.

```{r}
expression <- read_csv("expression.csv")
```

## PCA

```{r}
# ID and disease are given the "id" role so they are carried through the
# recipe but excluded from all_predictors(); every remaining column is
# normalized and then fed to PCA.
pca_rec <- recipe(~., data = expression) %>%
  update_role(ID, disease, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

pca_prep <- prep(pca_rec)

pca_prep
```

```{r}
# Step 2 of the recipe is step_pca(); tidying it returns the loadings.
tidied_pca <- tidy(pca_prep, number = 2)

# For each of the first four components, show the eight terms with the
# largest absolute loading, colored by the sign of the loading.
tidied_pca %>%
  filter(component %in% paste0("PC", 1:4)) %>%
  group_by(component) %>%
  slice_max(abs(value), n = 8) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL,
    fill = "Positive?"
  )
```

```{r}
# bake(new_data = NULL) returns the processed training data (PC scores
# plus the ID and disease columns) retained by prep().
bake(pca_prep, new_data = NULL) %>%
  ggplot(aes(PC1, PC2, label = ID)) +
  geom_point(aes(color = disease), alpha = 0.7, size = 2) +
  # geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)
```

## UMAP

```{r}
# UMAP is stochastic. Fix the RNG state before the step is specified so
# the embedding is the same on every render.
# NOTE(review): relies on step_umap() drawing its default `seed` from the
# R RNG at specification time -- confirm against the installed embed version.
set.seed(212)

umap_rec <- recipe(~., data = expression) %>%
  update_role(ID, disease, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_umap(all_predictors())

umap_prep <- prep(umap_rec)

umap_prep
```

```{r}
# Same scatter plot as for PCA, using the two UMAP coordinates.
bake(umap_prep, new_data = NULL) %>%
  ggplot(aes(UMAP1, UMAP2, label = ID)) +
  geom_point(aes(color = disease), alpha = 0.7, size = 2) +
  # geom_text(check_overlap = TRUE, hjust = "inward") +
  labs(color = NULL)
```