--- title: "Gene clustering across multiple sample groups" author: "Jim Zhang" date: "`r Sys.Date()`" output: html_document: fig_caption: yes toc: yes --- ```{r global_setup, include=FALSE} knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, dev=c('png', 'pdf'), echo=FALSE, warning=FALSE, message=FALSE); library(awsomics); library(gplots); library(knitr); library(rmarkdown); # fn.yaml<-"ClReport_Gas1_vs_control.yml"; # comment it out if running through knitr::knit function # fn.yaml<-"/nas/is1/zhangz/projects/barr/2015-11_Rat_Brain_Development/result/gene_clustering/Juvenile_SC_injury/ClReport.yml"; # Loading inputs from yaml file # if (!exists('fn.yaml')) stop('Input file not found\n'); # writeLines(yaml::as.yaml(yml), fn.yaml); # yml <- yaml::yaml.load_file(fn.yaml); expr<-readRDS(yml$input$data); anno<-readRDS(yml$input$annotation); smpl<-readRDS(yml$input$sample); gset<-readRDS(yml$input$geneset); if (!setequal(rownames(expr), rownames(anno))) stop("Data matrix and gene annotation have different row names.\n"); if (!setequal(colnames(expr), rownames(smpl))) stop("Data matrix and sample metadata have different names.\n"); expr<-expr[rownames(anno), rownames(smpl)]; prms<-yml$parameter; ```

**_[Go back to project home](`r yml$project$Home`)_**

# Project background ```{r project, include=FALSE} lns<-lapply(names(yml$project), function(nm) { c(paste('##', nm), '\n', yml$project[[nm]], '\n'); }); lns<-paste(do.call('c', lns), collapse='\n'); ``` `r lns`

**_[Go back to project home](`r yml$project$Home`)_**

# Samples and methods ```{r methods, include=FALSE} CreateDatatable(smpl, paste(yml$output, 'samples.html', sep='/')); lns<-lapply(names(yml$methods), function(nm) { c(paste('##', nm), '\n', yml$methods[[nm]], '\n'); }); lns<-paste(do.call('c', lns), collapse='\n'); ``` ## Samples There are a total of `r ncol(expr)` samples. Click [here](samples.html) to view full table of samples `r lns` # Analysis and results

**_[Go back to project home](`r yml$project$Home`)_**

## ANOVA analysis of each gene across sample ```{r anova, include=FALSE} f<-as.factor(smpl[, prms$term[1]]); l<-unique(smpl[, prms$term[1]]); grp<-split(rownames(smpl), as.vector(f))[l]; m<-sapply(grp, function(s) rowMeans(expr[, s, drop=FALSE])); mn<-apply(expr, 1, min); mx<-apply(expr, 1, max); p<-apply(expr, 1, function(d) summary(aov(d ~ f))[[1]][1, 5]); q<-p.adjust(p, method='BH'); stat<-cbind(m, Min=mn, Max=mx, Range=mx-mn, pANOVA=p, FDR=q); CreateDatatable(cbind(anno, FormatNumeric(stat)), paste(yml$output, 'anova.html', sep='/'), caption="ANOVA results") ``` We applied ANOVA analysis on each gene to test its differential expression across samples of different groups. - There are `r nrow(expr)` total genes - There are `r ncol(expr)` total samples - There are `r length(l)` sample groups: `r paste(l, collapse=', ')` Click [here](anova.html) to view full ANOVA result table.

**_[Go back to project home](`r yml$project$Home`)_**

## Gene clustering ### Select genes differentially expressed across groups Select genes using the following steps: 1. Select genes with FDR less than `r prms$selection$fdr` 2. Stop if less than `r prms$selection$min` genes were left; otherwise, select those with ANOVA p values less than `r prms$selection$p` 3. Stop if less than `r prms$selection$min` genes were left; otherwise, select those with range (_max-min_) greater than `r prms$selection$range` 4. If there are still more than `r prms$selection$max` genes left, select the top `r prms$selection$max` with the biggest ranges ```{r selection, include=FALSE} e<-expr[q<=prms$selection$fdr, , drop=FALSE]; if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum s<-stat[rownames(e), , drop=FALSE]; s<-s[order(s[, 'pANOVA']), , drop=FALSE]; e<-e[rownames(s), , drop=FALSE]; e<-e[1:max(prms$selection$min, max(which(s[, 'pANOVA']<=prms$selection$p))), , drop=FALSE]; if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum s<-stat[rownames(e), , drop=FALSE]; s<-s[rev(order(s[, 'Range'])), , drop=FALSE]; e<-e[rownames(s), , drop=FALSE]; e<-e[1:max(prms$selection$min, max(which(s[, 'Range']>=prms$selection$range))), , drop=FALSE]; } if (nrow(e) > prms$selection$max) e<-e[1:prms$selection$max, , drop=FALSE]; # trim if more selected genes than maximum } ``` A total of **`r nrow(e)`** genes were selected. The genes were used for the next step to identify gene clusters ```{r hc_all, include=TRUE, out.width='800px'} plot(hclust(as.dist(1-cor(expr))), main='Sample clustering using all genes', xlab='Sample', sub=''); ``` **Figure 1** Hierarchical clustering of samples using all genes. ```{r hc_clustered, include=TRUE, out.width='800px'} plot(hclust(as.dist(1-cor(e))), main='Sample clustering using selected significant genes', xlab='Sample', sub=''); ``` **Figure 2** Hierarchical clustering of samples using just selected genes. ### Cluster selected genes Genes selected by the last step were clustering using the following steps: - Create a hierchical tree based on gene-gene correlation - Cut the tree at height `r prms$cluster$height`, which will classify genes into clusters. Then apply the following steps to refine the clusters - Calculate correlation of each gene to the centroid (median) of its cluster. Remove the genes if the correlation is less than `r prms$cluster$corr` - Remove clusters with size less than `r 100*prms$cluster$size`% of the expected size (the expected size is 50 if there are 500 genes and 10 clusters) - If there are less than `r length(grp)` (the number of sample groups) clusters left, reduce the height cutoff by 0.05 and repeat this step until there are enough clusters - Merge the 2 most similar clusters if the correlation of their centroids is greater than or equal to `r prms$cluster$corr`. Repeat this step until no 2 clusters are that similar ```{r cluster_seeding, include=FALSE} h<-prms$cluster$height; sz<-prms$cluster$size; # minimal size of a cluster, ratio against expected size (ex. if size=0.2, minimal cluster size is 0.2*500/10 when there are 500 genes and 10 clusters in total) # Hierarchical clustering hc<-hclust(as.dist(1-cor(t(e)))); ct<-cutree(hc, h=h); cl<-split(names(ct), ct); cl<-sapply(cl, function(c) { x<-e[c, ]; md<-apply(x, 2, median); r<-as.vector(cor(md, t(x))[1, ]); # remove outliers c[r>=prms$cluster$corr] }); cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)]; # Reduce cutoff if number of clusters is less than number of groups while(length(cl) < length(grp)) { h<-h-0.05; ct<-cutree(hc, h=h); cl<-split(names(ct), ct); cl<-sapply(cl, function(c) { x<-e[c, ]; md<-apply(x, 2, median); r<-as.vector(cor(md, t(x))[1, ]); # remove outliers c[r>=prms$cluster$corr] }); cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)]; } # merge similar clusters flag<-TRUE; while(flag) { cat('Number of clusters ', length(cl), '\n'); ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); tr<-cutree(hclust(as.dist(1-cor(ms))), k=length(cl)-1); # find the 2 most similar clusters i<-tr[duplicated(tr)]; c<-ms[, tr==i]; r<-cor(c[, 1], c[, 2]); p<-cor.test(c[, 1], c[, 2])$p.value[[1]]; if (r>prms$cluster$merge$corr & p=r & (corr[, i]-mx)>dif]; }); c; } cls<-list(cl, reCl(d, cl, prms$recluster$r, prms$recluster$diff)); while (!identical(cls[[length(cls)]], cls[[length(cls)-1]]) & length(cls)<=prms$recluster$times) { cls[[length(cls)+1]]<-reCl(d, cls[[length(cls)]], prms$recluster$r, prms$recluster$diff); cat("Reclustering #", length(cls)-1, '\n'); } cl<-cls[[length(cls)]]; names(cl)<-paste('Cluster', 1:length(cl), sep='_'); # summary cluster sizes n<-sapply(cls, function(c) sapply(c, length)); n<-data.frame(n); colnames(n)<-paste('Cycle #', 0:(ncol(n)-1), sep=''); ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); ms<-data.frame(t(ms)); colnames(ms)<-colnames(expr); rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length), ')', sep=''); if(identical(cls[[length(cls)]], cls[[length(cls)-1]])) msg<-paste("The reclustering converged after", length(cls)-1, 'cycles') else msg<-paste("The reclustering didn't converge after", length(cls)-1, 'cycles') ``` `r msg`. A total of **`r sum(sapply(cls[[length(cls)]], length))`** genes were clustered. ```{r plot_recluster_mean, include=TRUE, fig.width=CalculateColoredBlockSize(ms, 1, 6)[1], fig.height=CalculateColoredBlockSize(ms, 1, 6)[2], out.width='600px'} if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms; x<-sortColumn(x, grp); PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp); ``` **Figure 4** This plot shows below the average expression levels of each cluster after `r ncol(n)-1` cycles of reclustering. Values indicate number of standard deviation from mean of all samples. ```{r summary_clusters, include=FALSE} ids<-as.vector(unlist(cls[[length(cls)]])); c<-rep(names(cl), sapply(cl, length)); tbl1<-cbind(anno[ids, ], Cluster=c, FormatNumeric(expr[ids, ])); tbl2<-cbind(anno[ids, ], Cluster=c, FormatNumeric(stat[ids, ])); fn1<-CreateDatatable(tbl1, paste(yml$output, 'clustered_data.html', sep='/'), caption="Expression data of clustered genes"); fn2<-CreateDatatable(tbl2, paste(yml$output, 'clustered_stat.html', sep='/'), caption="ANOVA statistical results of clustered genes"); write.csv2(tbl1, paste(yml$output, 'clustered_data.csv', sep='/')); write.csv2(tbl2, paste(yml$output, 'clustered_stat.csv', sep='/')); write.csv2(cbind(expr, anno), paste(yml$output, 'all_genes_data.csv', sep='/')); write.csv2(cbind(stat, anno), paste(yml$output, 'all_genes_stat.csv', sep='/')); saveRDS(tbl1, paste(yml$output, 'clustered_data.rds', sep='/')); saveRDS(tbl2, paste(yml$output, 'clustered_stat.rds', sep='/')); saveRDS(cbind(expr, anno), paste(yml$output, 'all_genes_data.rds', sep='/')); saveRDS(cbind(stat, anno), paste(yml$output, 'all_genes_stat.rds', sep='/')); saveRDS(gset, paste(yml$output, 'all_genesets.rds', sep='/')); ``` Result tables: - Normalized expressiond data of clustered genes - Click [here](clustered_data.html) to view online - Click [here](clustered_data.csv) to download table - Statistical results of clustered genes - Click [here](clustered_stat.html) to view online - Click [here](clustered_stat.csv) to download table

**_[Go back to project home](`r yml$project$Home`)_**

## Analysis of individual clusters ```{r cluster_detail, include=FALSE} fn.temp<-paste(yml$output, 'ClDetail.Rmd', sep='/'); if (yml$input$remote) { if (!RCurl::url.exists(yml$input$subtemplate)) stop("Template Rmd file ', yml$input$subtemplate, ' not exists\n"); writeLines(RCurl::getURL(yml$input$subtemplate)[[1]], fn.temp); } else { file.copy(yml$input$subtemplate, fn.temp); } fns<-sapply(1:length(cl), function(i) { mtrx<-expr[cl[[i]], , drop=FALSE]; name<-names(cl)[i]; grps<-grp; univ<-rownames(anno); path<-paste(yml$output, name, sep='/'); home<-"../index.html"; if (prms$plot$zero) mtrx<-mtrx-rowMeans(mtrx[, grp[[1]], drop=FALSE]); if (!file.exists(path)) dir.create(path, recursive = TRUE); fn.html<-paste(name, '.html', sep=''); rmarkdown::render(fn.temp, output_file=fn.html, output_dir=paste(yml$output, name, sep='/'), quiet=TRUE); }); ``` ```{r cluster_link, include=FALSE} lnk<-paste(names(cl), paste(names(cl), '.html', sep=''), sep='/'); lns<-paste(' - [', names(cl), '](', lnk, ')', sep=''); lns<-paste(lns, collapse='\n'); ``` Click cluster names to see more detaied information about individual clusters `r lns` **_[Go back to project home](yml$project$Home)_** ## Extra plots ```{r plot_series, include=TRUE, out.width='750px'} ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); if (prms$plot$rescale) ms<-t(scale(t(ms))); m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE])); se<-sapply(grp, function(g) apply(ms[g, , drop=FALSE], 2, function(x) sd(x)/sqrt(length(x)))); if (prms$plot$zero) m<-m-m[, 1]; PlotSeries(m, se, c('', prms$plot$ylab)); ``` **Figure 5** Plot the group means and standard errors of all cluster as data series ```{r plot_means, include=TRUE, out.width='600px'} ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); if (prms$plot$rescale) ms<-t(scale(t(ms))); m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE])); colnames(m)<-names(grp); rownames(m)<-names(cl); PlotColoredBlock(m, min=-1*max(abs(m), na.rm=TRUE), max=max(abs(m), na.rm=TRUE), key='Cluster average', groups=split(colnames(m), colnames(m))); ``` **Figure 6** Heatmap of group and cluster means ```{r plot_group_means, include=TRUE, out.width='600px'} d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE]; d<-sapply(grp, function(g) rowMeans(d[, g, drop=FALSE])); if (prms$plot$rescale) d<-t(scale(t(d))); colnames(d)<-names(grp); PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=split(colnames(d), colnames(d)), key='Group average'); abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1)); ``` **Figure 7** Group means of all clustered genes ```{r plot_heatmap, include=TRUE, out.width='900px', out.height='1200px'} d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE]; d<-sortColumn(d, grp); #rownames(d)<-paste(rownames(d), anno[rownames(d), 1], sep=':'); if (prms$plot$rescale) d<-t(scale(t(d))); PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=grp, key='Expression level'); abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1)); abline(v=c(0, cumsum(sapply(grp, length))), lwd=1); ``` **Figure 8** All samples and all clustered genes ```{r plot_recluster_size, include=TRUE, fig.width=CalculateColoredBlockSize(n, 1, 6)[1], fig.height=CalculateColoredBlockSize(n, 1, 6)[2], out.width='750px'} rownames(n)<-paste(rownames(n), ' (n=', n[, 1], ' to ', n[, ncol(n)], ')', sep='') PlotColoredBlock(n, key='Cluster size'); ``` **Figure 9** This plot traces the change of cluster sizes after each reclustering cycle. ```{r wrap_up, eval=FALSE, include=FALSE} x<-strsplit(yml$output, '/')[[1]]; fn.zip<-paste(x[length(x)], '.zip', sep=''); fn.zip.fn<-paste(yml$output, fn.zip, sep='/'); if (file.exists(fn.zip.fn)) file.remove(fn.zip.fn); zip(fn.zip, yml$output, "-rj9X", zip='zip'); file.rename(fn.zip, fn.zip.fn); ```

**_[Go back to project home](`r yml$project$Home`)_**

*** _END OF DOCUMENT_