--- title: "Dimension Reduction and Clustering" author: "BIOS 635" date: "4/13/2021" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message=FALSE) ``` ```{r warning=FALSE} library(tidyverse) library(caret) library(factoextra) ``` ```{r} brain_data <- read_csv("../Data/IBIS_brain_data_ex.csv") brain_data <- brain_data %>% select(names(brain_data)[grepl("V24|RiskGroup|CandID", names(brain_data))]) %>% select(CandID:Uncinate_R_V24, RiskGroup:V24_VDQ) %>% drop_na() brain_data_x <- brain_data %>% select(EACSF_V24:Uncinate_R_V24) ``` # Dimension Reduction ## Principal Components Analysis ```{r} pca_brain <- princomp(x=brain_data_x, cor=TRUE,scores=TRUE) # Look at screen plot to view amount of variance explained by PC fviz_eig(pca_brain) # Just look at first 2 PCs load_subset <- data.frame(pca_brain$loadings[,1:2]) %>% rownames_to_column(var = "variable") %>% pivot_longer(cols=c("Comp.1", "Comp.2"), names_to="comp_number", values_to="loading") # Plot loadings ggplot(data=load_subset, mapping=aes(x=variable, y=loading))+ geom_bar(stat="identity")+ facet_grid(comp_number~.)+ theme_bw()+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Hard to interpret, need to reduce size load_subset_normed <- data.frame(pca_brain$loadings[,1:2]) %>% rownames_to_column(var = "variable") %>% mutate(Comp.1 = scale(Comp.1), Comp.2 = scale(Comp.2)) %>% pivot_longer(cols=c("Comp.1", "Comp.2"), names_to="comp_number", values_to="loading") ggplot(data=load_subset_normed, mapping=aes(x=loading))+ geom_histogram()+ facet_grid(comp_number~.)+ theme_bw() load_subset_normed <- load_subset_normed %>% filter(loading>0.5|loading< -0.5) %>% mutate(large_loading = factor(ifelse(loading>1.96|loading< -1.96, 1, 0))) ggplot(data=load_subset_normed, mapping=aes(x=variable, y=loading, fill=large_loading))+ geom_bar(stat="identity")+ facet_grid(comp_number~.)+ theme_bw()+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Can look at PCA scores for each person and plot as well fviz_pca_ind(pca_brain, col.ind = "cos2", # Color by the quality of representation gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE # Avoid text overlapping ) # Do these relate to diagnosis? fviz_pca_ind(pca_brain, col.ind = brain_data$RiskGroup, # color by groups palette = c("#0072B2", "#D55E00", "#CC79A7"), addEllipses = TRUE, # Concentration ellipses ellipse.type = "confidence", legend.title = "Groups", label = "none", repel = TRUE ) # Can look at high variables contribute to components, labels make this messy fviz_pca_var(pca_brain, col.var = "contrib", # Color by contributions to the PC gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE # Avoid text overlapping ) fviz_pca_var(pca_brain, col.var = "contrib", # Color by contributions to the PC gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), select.var = list("cos2"=10), repel = TRUE # Avoid text overlapping ) # Can extract scores, use for further analysis pc_scores <- pca_brain$scores[,1:2] brain_data_w_pcs <- data.frame(brain_data, pc_scores) ``` # Clustering ## K-Means ```{r} # Create function to compute AIC, BIC kmeansAICBIC = function(fit){ m = ncol(fit$centers) n = length(fit$cluster) k = nrow(fit$centers) D = fit$tot.withinss return(data.frame(AIC = D + 2*m*k, BIC = D + log(n)*m*k)) } # Run K-Means clustering, for various clusters clus <- list() aic_bic <- list() clus_num <- c() # FIRST, norm variables brain_data_x_norm <- scale(brain_data_x) for(j in 1:8){ clus[[j]] <- kmeans(x=brain_data_x_norm, centers=j, nstart=10) aic_bic[[j]] <- kmeansAICBIC(clus[[j]]) clus_num[j] <- j } clus_total_all_criterion <- data.frame("no_of_clus" = clus_num, do.call("rbind", aic_bic)) # Find min. BIC clus_min_bic <- which(clus_total_all_criterion$BIC==min(clus_total_all_criterion$BIC)) # Plot BIC ggplot(data=clus_total_all_criterion, mapping=aes(x=no_of_clus, y=BIC))+ geom_point()+ geom_line()+ geom_hline(yintercept = clus_total_all_criterion$BIC[clus_min_bic], linetype="dashed", color="red")+ geom_vline(xintercept = clus_total_all_criterion$no_of_clus[clus_min_bic], linetype="dashed", color="red")+ theme_bw() # Add in clusters brain_data_x_clus <- data.frame(brain_data_x, "clus"=factor(clus[[clus_min_bic]]$cluster), pc_scores, asd_diag = brain_data$RiskGroup) ggplot(data=brain_data_x_clus, mapping=aes(x=Comp.1, y=Comp.2, color=clus))+ geom_point()+ theme_bw() # Validate to external variables ggplot(data=brain_data_x_clus, mapping=aes(x=Comp.1, y=Comp.2, color=clus, shape=asd_diag))+ geom_point(size=2)+ theme_bw() ftable(brain_data_x_clus$clus, brain_data_x_clus$asd_diag) ```