--- title: "Precept 12" author: "wei" date: "April 28, 2016" output: html_document --- ```{r global_options, include=FALSE} knitr::opts_chunk$set(message=FALSE, warning=FALSE) ``` ```{r setup} library(dplyr) library(ggplot2) library(knitr) library(RColorBrewer) #source("https://github.com/SML201/precepts/blob/master/week12/setup.R") load(url("https://github.com/SML201/precepts/raw/master/week12/mnist_sample.RData")) pca <- function(x, space=c("rows", "columns"), center=TRUE, scale=FALSE) { space <- match.arg(space) if(space=="columns") {x <- t(x)} x <- t(scale(t(x), center=center, scale=scale)) s <- svd(x) loading <- s$u colnames(loading) <- paste0("Loading", 1:ncol(loading)) rownames(loading) <- rownames(x) pc <- diag(s$d) %*% t(s$v) rownames(pc) <- paste0("PC", 1:nrow(pc)) colnames(pc) <- colnames(x) pve <- s$d^2 / sum(s$d^2) if(space=="columns") {pc <- t(pc); loading <- t(loading)} return(list(pc=pc, loading=loading, pve=pve)) } plot_image <- function(image_vector, label=NULL){ DF <- data.frame(x=rep(1:28, 28), y=-sort(rep(1:28, 28))+28, val=image_vector) p <- ggplot(DF, aes(x=x, y=y, fill=image_vector)) + geom_tile(aes(width=1, height=1)) + scale_fill_gradient(low="white", high="black") + coord_fixed(ratio=1) + theme_void() + geom_segment(aes(x=0, xend=28, y=0, yend=0)) + geom_segment(aes(x=0, xend=28, y=28, yend=28)) + geom_segment(aes(x=0, xend=0, y=0, yend=28)) + geom_segment(aes(x=28, xend=28, y=0, yend=28)) + theme(legend.position="none") if(!is.null(label)) { p <- p + labs(title=as.character(label)) } p } ``` Visualize data: ```{r} IND <- sample(1:nrow(data), 1) plot_image(data[IND,], label[IND]) ``` k-means. Look at cluster assignments. Centers are barely meaningful since we're in this 784-dimensional space. ```{r} #about 20 sec system.time(km <- kmeans(x=data, centers=10, iter.max = 20)) names(km) km$cluster %>% head(n=50) table(km$cluster) table(label[which(km$cluster==7)]) #??? table(label[which(km$cluster==3)]) ``` Hierarchical clustering ```{r} #subset data sub_data <- data[1:2000,] sub_label <- label[1:2000] system.time(D <- dist(sub_data, method="euclidean")) hc <- hclust(D, method="ward.D") dhc <- as.dendrogram(hc) plot(hc) plot(dhc) ``` `ggdendro` dendrogram tricks. https://cran.r-project.org/web/packages/ggdendro/vignettes/ggdendro.html ```{r} library(ggdendro) ddata <- dendro_data(dhc, type = "rectangle") seg_data <- segment(ddata) %>% tbl_df ggplot(seg_data) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend)) + theme_bw() leaf_labels <- ddata$labels %>% select(label) %>% unlist %>% as.character %>% as.integer seg_data %>% filter(yend==0) seg_data <- seg_data %>% mutate(label=ifelse(yend==0, sub_label[leaf_labels[x]], 11)) %>% mutate(label=as.factor(label)) pal <- c(brewer.pal(10, "Paired"), "black") ggplot(seg_data) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend, color=label)) + theme_bw() + scale_y_continuous(trans="sqrt") + #just to mess with scale scale_colour_manual(values=pal, 0:9) ``` Let's try a difference distance metric ```{r} system.time(D <- dist(sub_data, method="manhattan")) manhattan_hc <- hclust(D, method="ward.D") dhc <- as.dendrogram(hc) ddata <- dendro_data(dhc, type = "rectangle") seg_data <- segment(ddata) %>% tbl_df leaf_labels <- ddata$labels %>% select(label) %>% unlist %>% as.character %>% as.integer seg_data %>% filter(yend==0) seg_data <- seg_data %>% mutate(label=ifelse(yend==0, sub_label[leaf_labels[x]], 11)) %>% mutate(label=as.factor(label)) ggplot(seg_data) + geom_segment(aes(x = x, y = y, xend = xend, yend = yend, color=label)) + theme_bw() + scale_y_continuous(trans="sqrt") + #just to mess with scale scale_colour_manual(values=pal, breaks=0:9) + guides(colour = guide_legend(override.aes = list(size=3))) ``` PCA: 1. Scree plot 2. Examine principal components 3. Reconstruct low rank images ```{r} system.time(pca_out <- pca(data)) DF <- data.frame(d=1:length(pca_out$pve), pve=pca_out$pve) ggplot(DF, aes(x=d, y=pve)) + geom_point() + theme_bw() DF <- data.frame(d=1:15, pve=pca_out$pve[1:15]) ggplot(DF, aes(x=d, y=pve)) + geom_point() + theme_bw() plot_image(pca_out$pc[1,]) plot_image(pca_out$pc[2,]) plot_image(pca_out$pc[3,]) plot_image(pca_out$pc[4,]) plot_image(pca_out$pc[5,]) plot_image(pca_out$pc[6,]) plot_image(data[1234,], label[1234]) projection <- as.integer(pca_out$loading[1234,] %*% pca_out$pc + mean(data[1234,])) plot_image(projection) projection <- as.integer(pca_out$loading[1234,1:3] %*% pca_out$pc[1:3,] + mean(data[1234,])) plot_image(projection) projection <- as.integer(pca_out$loading[1234,1:6] %*% pca_out$pc[1:6,] + mean(data[1234,])) projection[projection<0] <- 0 plot_image(projection) projection <- as.integer(pca_out$loading[1234,1:10] %*% pca_out$pc[1:10,] + mean(data[1234,])) projection[projection<0] <- 0 plot_image(projection) projection <- as.integer(pca_out$loading[1234,1:20] %*% pca_out$pc[1:20,] + mean(data[1234,])) projection[projection<0] <- 0 plot_image(projection) projection <- as.integer(pca_out$loading[1234,1:30] %*% pca_out$pc[1:30,] + mean(data[1234,])) projection[projection<0] <- 0 plot_image(projection) projection <- as.integer(pca_out$loading[1234,1:40] %*% pca_out$pc[1:40,] + mean(data[1234,])) projection[projection<0] <- 0 plot_image(projection) ```