--- title: "Analysis of 2-group differential gene expression" author: "Jim Zhang" date: '`r Sys.Date()`' output: html_document: toc: yes --- ```{r global_setup, include=FALSE} cat("Setting up and loading data\n"); knitr::opts_chunk$set(eval=TRUE, dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE); library(MASS); library(gplots); library(knitr); library(rmarkdown); library(yaml); library(awsomics); library(rchive); library(DEGandMore); library(pathview); #fn.yaml<-"/nas/is1/zhangz/projects/falk/2015-01_Worm_Antioxidant_Drugs/R/pairwise/MS010_A-vs-dTPP_A.yml"; # comment it out if running through knitr::knit function # Loading inputs from yaml file # if (!exists('fn.yaml')) stop('Input file not found\n'); # writeLines(yaml::as.yaml(yml), fn.yaml); #yml <- yaml.load_file(fn.yaml); inputs<-yml$input; if (class(inputs$anno) == 'character') inputs$anno<-readRDS(inputs$anno); if (class(inputs$expr) == 'character') inputs$expr<-readRDS(inputs$expr); if (class(inputs$geneset) == 'character') inputs$geneset<-readRDS(inputs$geneset); # All input variables anno<-inputs$anno; expr<-as.matrix(inputs$expr); geneset<-inputs$geneset; genome<-inputs$genome; paired<-inputs$paired; penalty<-inputs$penalty; grps<-inputs$comparison; g1.ind<-grps[[1]]; g2.ind<-grps[[2]]; g1.name<-names(grps)[1] g2.name<-names(grps)[2]; g1.name<-gsub('-', '_', g1.name); g2.name<-gsub('-', '_', g2.name); # parameters of differential expression deg.method<-inputs$deg$method; cutoff.l2r<-inputs$deg$cutoff.l2r; cutoff.p<-inputs$deg$cutoff.p; cutoff.fdr<-inputs$deg$cutoff.fdr; num.top<-inputs$deg$num.top; nperm<-inputs$deg$nperm; logged<-inputs$deg$logged; # Check validity of inputs if (!identical(rownames(expr), rownames(anno))) stop('Row names of annotation and gene expression data are not identical'); if (is.null(paired)) paired<-FALSE; if (is.null(g1.ind)) stop('Error: Index of samples in group 1 unknown'); if (is.null(g2.ind)) stop('Error: Index of samples in group 2 unknown'); if (is.null(g1.name)) stop('Error: Name of group 1 unknown'); if (is.null(g2.name)) stop('Error: Name of group 2 unknown'); if (length(g1.ind)<2) stop('Error: Not enough samples in group ', g1.name, ', minimum=2'); if (length(g2.ind)<2) stop('Error: Not enough samples in group ', g2.name, ', minimum=2'); if (paired & length(g1.ind)!=length(g2.ind)) stop('Error: Unequal sample sizes for paired comparison, ', length(g1.ind), ' vs. ', length(g2.ind)); # path to output files path.output<-yml$output; path0<-paste(g1.name, g2.name, sep='-vs-'); path<-paste(path.output, path0, sep='/'); path.figure<-paste(path, 'figure', sep='/'); path.table <-paste(path, 'table', sep='/'); if (!file.exists(path.output)) dir.create(path.output, recursive = TRUE); if (!file.exists(path)) dir.create(path, recursive = TRUE); if (!file.exists(path.figure)) dir.create(path.figure, recursive = TRUE); if (!file.exists(path.table )) dir.create(path.table , recursive = TRUE); # Re-process gene expression matrix e1<-expr[, g1.ind, drop=FALSE]; e2<-expr[, g2.ind, drop=FALSE]; e1.2<-cbind(e1, e2); pctl<-apply(e1.2, 2, function(e) 100*rank(e)/length(e)); # percentile inputs$expr<-e1.2; fn.fig<-c(); # Figure file name and index res<-list(inputs=inputs); # Result set ``` ```{r analysis_variance, include=FALSE} cat('Analyze data variance\n'); fn.fig['variance']<-paste(path.figure, 'variance.pdf', sep='/'); pdf(fn.fig['variance'], width=6.4, height=8); par(mfrow=c(3,1), mar=c(4,5,2,2)); # Distribution of average expression level m<-rowMeans(e1.2); d<-density(m); x<-d$x; y<-d$y; plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='A. Distribution of Expression Measurement', cex.lab=2, cex.main=1.5); title(xlab='Gene Expression Measurement (sample average)', line=2, cex=2); x0<-as.vector(summary(m))[2:5]; y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y); col<-c('blue', 'red', 'orange', 'green'); segments(x0, 0, x0, y0, lty=1, col=col, lwd=1); lines(d, col='darkgrey', lwd=4); text(x0, y0/2, srt=90, labels=round(x0, 3)); legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2); # Distribution of standard deviation across samples sd<-apply(e1.2, 1, sd); d<-density(sd); x<-d$x; y<-d$y; plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='B. Distribution of Variance', cex.lab=2, cex.main=1.5); title(xlab='Gene Standard Deviation (between samples)', line=2, cex=2); x0<-as.vector(summary(sd))[2:5]; y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y); col<-c('blue', 'red', 'orange', 'green'); segments(x0, 0, x0, y0, lty=1, col=col, lwd=1); lines(d, col='darkgrey', lwd=4); text(x0, y0/2, srt=90, labels=round(x0, 3)); legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2); plot(m, sd, xlab='', ylab='Standard Deviation', main='C. Variance vs. Expression Measurement', cex.lab=2, cex.main=1.5, cex=0.5, col='darkgrey'); title(xlab='Expression Measurement', line=2, cex=2); lines(lowess(m, sd), lwd=3, col=2); dev.off(); ``` ```{r analysis_boxplot, include=FALSE} cat('Plot variance boxplot\n'); fn.fig['boxplot']<-paste(path.figure, 'boxplot.pdf', sep='/'); pdf(fn.fig['boxplot'], width=6.4, height=4); par(mfrow=c(1,1), mai=c(0.7, 0.6, .1,.1)) plot(0, type='n', xlim=c(0.5, 0.5+ncol(e1.2)), ylim=c(min(e1.2), max(e1.2)), ylab='', xlab='', xaxt='n'); lines(1:ncol(e1.2), apply(e1.2, 2, median), col='green', lwd=1); title(ylab='Expression Level', line=2); cex<-max(strwidth(colnames(e1.2), unit='in')); boxplot(data.frame(e1.2), las=3, notch=TRUE, boxwex=.6, col=c(rep('lightgrey', ncol(e1)), rep('lightblue', ncol(e2))), names=colnames(e1.2), cex.axis=min(1, 0.4/cex), yaxt='n', add=TRUE, pch=19, cex=0.4)->x; dev.off(); ``` ```{r analysis_clustering, include=FALSE} cat('Unsupervised hierarchical clustering\n'); fn.fig['clustering']<-paste(path.figure, 'clustering.pdf', sep='/'); pdf(fn.fig['clustering'], width=6.4, height=4.8); par(mfrow=c(1,1), mai=c(0.2, 0.8, .2 , .2)); plot(hclust(dist(t(e1.2))), main='', ylab='', xlab='', sub='', cex=min(1, 5/ncol(e1.2)/0.12), frame.plot=TRUE); title(ylab='Distance', line=2.5, cex.lab=2); dev.off(); ``` ```{r analysis_pca, include=FALSE} cat('Plot PCA\n'); fn.fig['pca']<-paste(path.figure, 'pca.pdf', sep='/'); pdf(fn.fig['pca'], width=6.4, height=4.8); pca<-prcomp(t(e1.2)); X<-pca$x[,1]; Y<-pca$x[,2]; per<-round(summary(pca)$importance[2, 1:2]*100, 2); pca$importance<-summary(pca)$importance; res$pca<-pca; col<-rep(c('darkgreen', 'deeppink1'), c(length(g1.ind), length(g2.ind))); layout(matrix(1:2, nrow=1), width=c(3,1)); par(mai=c(1,1,.25,.25)); cx<-max(1.25, min(4, 24/length(X))) plot(X, Y, col=col, pch=19, cex=cx, xlim=c(min(X)*1.1, max(X)*1.1), ylim=c(min(Y)*1.1, max(Y)*1.1), xlab=paste('PC1', ', ', per[1], '%', sep=''), ylab=paste('PC2', ', ', per[2], '%', sep=''), cex.lab=1.5); text(X, Y, label=1:length(X), col='white', cex=.3*cx); par(mai=c(1, 0, 0.25, 0)); plot(0, type='n', xlim=c(0, 100), ylim=c(1, 100), axes=FALSE, bty='n', xaxs='i', yaxs='i', xlab='', ylab=''); w<-strwidth(1:length(X), cex=1.2); w0<-1.2*80/max(w, 80); h0<-1.2*(100/length(X))/4.0; cex<-min(1.2, w0, h0); points(rep(5, length(X)), 100-(1:length(X))*4.0*cex, col=col, pch=19, cex=1.5*cex); text(5+cex*5, 100-(1:length(X))*4.0*cex, labels=colnames(e1.2), adj=0, col=col, pch=19, cex=cex); text(rep(5, length(labels)), 100-(1:length(X))*4.0*cex, labels=1:length(X), col='white', cex=0.8*cex); dev.off(); ``` ```{r analysis_deg, include=FALSE} cat('DEG analysis\n'); pth.deg<-paste(path, '/DEG', sep=''); if (!file.exists(pth.deg)) dir.create(pth.deg); means<-cbind(rowMeans(expr[, g1.ind]), rowMeans(expr[, g2.ind])); colnames(means)<-c(g1.name, g2.name); if (logged) l2r<-means[,2]-means[,1] else l2r<-log2(means[,2]/means[,1]); l2r[is.na(l2r)]<-0; fc<-exp(l2r*log(2)); mn<-cbind(means, l2r, fc); colnames(mn)<-c(paste('Mean', names(grps), sep='_'), 'LogFC', 'FoldChange'); # Adjust expr matrix if a penalty is given for large between-sample variance if(penalty>0) { if (penalty>1) penalty<-1; colnames(means)<-paste('Mean_', c(g1.name, g2.name), sep=''); if (paired) sd<-apply(expr[, g2.ind]-expr[, g1.ind], 1, sd) else { v1<-apply(expr[, g1.ind], 1, var); v2<-apply(expr[, g2.ind], 1, var); df1<-length(g1.ind)-1; df2<-length(g2.ind)-1; sd<-sqrt((df1*v1+df2*v2)/(df1+df2)); sd[sd==0]<-min(sd[sd>0])/2; } pnl<-quantile(sd, probs=seq(0, 1, 0.01))[100*round(1-penalty,2)+1]; e<-expr; # save the original e[, g2.ind]<-apply(e[, g2.ind], 2, function(d) means[,1]+(d-means[,1])/(pnl+sd)); } else e<-expr; ############################################################################################ de<-DeWrapper(e, grps, deg.method, list(nperm=nperm, paired=paired, logged=logged)); ############################################################################################ de$results$stat[, c(1,2,4)]<-mn[, 1:3]; de$results$stat[, 3]<-de$results$stat[, 2]-de$results$stat[, 1]; de$data<-expr; saveRDS(de, file=paste(pth.deg, 'deg.rds', sep='/')); if (deg.method == DeMethods()[3]) { # Rank Product methods s<-de$results$stat; rp<-de$results$rp$summarized; out<-lapply(rp, function(x) x[, c('Rank', 'Pvalue', 'FDR')]); out[[3]]<-cbind(out[[1]], out[[2]]); cnm<-rep(paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), each=3); colnames(out[[3]])<-paste(colnames(out[[3]]), cnm, sep='_') out<-lapply(out, function(out) cbind(mn[rownames(out), , drop=FALSE], out)); names(out)<-c(paste(c('Higher', 'Lower'), '_in_', g2.name, sep=''), 'Single_rank'); degs<-lapply(out[1:2], function(x) { deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE]; if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'Rank']<=num.top, , drop=FALSE]; deg; }); degs<-lapply(degs, function(d) d[order(d[, 'Rank']), ]); stat<-out[[3]]; } else { out<-cbind(mn[rownames(de$results$stat), , drop=FALSE], de$results$stat[, 5:ncol(s)]); degs<-lapply(c(1, -1), function(di) { x<-out[sign(out[, 'LogFC'])==di, , drop=FALSE]; deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE]; if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & abs(x[, 'LogFC'])>=sort(abs(x[, 'LogFC']), decreasing = TRUE)[min(num.top, nrow(x))], , drop=FALSE]; deg; }); degs<-lapply(degs, function(d) d[order(-1*abs(d[, 'LogFC'])), , drop=FALSE]); stat<-out; } de$DEG<-degs; up<-degs[[1]]; dn<-degs[[2]]; stat.table<-cbind(anno[rownames(stat), ], stat); stat.formatted<-awsomics::FormatNumeric(cbind(ID=rownames(stat.table), stat.table)); stat.formatted$ID<-awsomics::AddHref(stat.formatted$ID, awsomics::UrlEntrezGene(stat.formatted$ID)); CreateDatatable(stat.formatted, fn = paste(path, 'DEG', 'all_genes.html', sep='/'), rownames = FALSE, caption = "Differential expression of all genes"); ``` ```{r summary_deg, include=FALSE} cat('Summarizing DEG analysis\n'); fn.fig['deg']<-paste(path.figure, 'deg.pdf', sep='/'); pdf(fn.fig['deg'], width=6.4, height=4.8); par(mai=c(0.4, 0.5, 0.3, 0.05), mfrow=c(2,2)); #if (class(out)=='list') d<-out[[length(out)]] else d<-out; d<-cbind(mn, de$results$stat[, 5:6]); # M-A Plot plot(rowMeans(d[,1:2]), d[,'LogFC'], main='M-A Plot', pch=19, col='darkgrey', cex=.25, xlab='', ylab='', cex.axis=.75); abline(h=0, lwd=1, col=1); lines(lowess(d[,'LogFC']~rowMeans(d[,1:2])), lwd=2, col=3); ylab<-paste('Log2(', g2.name, '/', g1.name, ')', sep=''); xlab<-paste('Log2(', g1.name, ')/2 + Log2(', g2.name, ')/2', sep=''); title(xlab=xlab, ylab=ylab, line=1.6, cex.lab=.75); #Volcano Plot x<-d[,'LogFC']; p<-d[,'Pvalue']; p[p==0]<-min(p[p>0])/exp(abs(d[p==0,3])*log(2)); y<--1*log10(p); z<-sqrt(abs(x*y)); cx<-z/(max(z)); plot(x, y, main='Volcano Plot', pch=19, col='darkgrey', cex=0.8*cx, xlab='', ylab='', cex.axis=.75, ylim=c(0, 1.1*max(y)), yaxs='i', xlim=c(-1*max(abs(x)), max(abs(x))), yaxt='n'); xlab<-paste('Log2(', g2.name, '/', g1.name, ')', sep=''); title(xlab=xlab, ylab='P value', line=1.6, cex.lab=.75); axis(2, at=0:ceiling(max(y)), labels=10^(-1*(0:ceiling(max(y))))); abline(h=-1*log10(0.05), v=c(-1,1), col=4, lty=3); box(); # P Value Distribution hist(p[p<1], br=seq(0, 1, 0.01), cex.axis=.75, main='P Value Distribution'); title(xlab='P value', ylab='Count', line=1.6, cex.lab=.75); box(); # FDR q<-round(d[, 'FDR'], 2); n<-sapply(seq(min(q), 1, 0.01), function(x) length(q[q<=x])); plot(1, type='n', log='y', xlab='', ylab='', cex.axis=.75, xlim=0.01*c(1, 100), ylim=c(1, nrow(expr)), main='FDR'); abline(v=seq(0, 1, .05), lty=3, col=8); lines(seq(min(q), 1, 0.01), n, lwd=2, col=2); title(xlab='FDR cutoff', ylab='Count', line=1.6, cex.lab=.75); box(); dev.off(); #FDR counts c<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25); n.fdr<-sapply(c, function(c) sapply(c(1, -1), function(di) nrow(d[sign(d[, 'LogFC'])==di & d[, 'FDR']<=c, , drop=FALSE]))); fdr.table<-cbind(c, t(n.fdr), colSums(n.fdr)); colnames(fdr.table)<-c('FDR', paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), 'Total') # Plot top genes fn.fig['top']<-paste(path.figure, 'top.pdf', sep='/'); pdf(fn.fig['top'], width=6.4, height=2.4); par(mfrow=c(1,2), mai=c(0.7, 0.35, 0.3, 0.05)); ind<-c(rownames(up)[1], rownames(dn)[1]); cex<-min(0.75, min(0.5/max(strwidth(colnames(e1.2), unit='in')))); col<-rep(c('grey', 'lightblue'), c(length(g1.ind), length(g2.ind))); for (ii in 1:2) barplot(e1.2[ind[ii], ], las=3, cex.names=cex, main=CleanHtmlTags(anno[ind[ii], 1], FALSE), cex.main=.75, cex.axis=.75, col=col); dev.off(); # Plot heatmap of top genes fn.fig['heatmap']<-paste(path.figure, 'heatmap.pdf', sep='/'); pdf(fn.fig['heatmap'], width=6.4, height=6.4); par(mfrow=c(1,1)); goi<-e1.2[c(rownames(up), rownames(dn)), ]; rownames(goi)<-as.vector(anno[rownames(goi), 1]); DegHeatmap(goi, col=rep(c('chartreuse1', 'deeppink1'), c(length(g1.ind), length(g2.ind))), plot.new=FALSE); dev.off(); ## Gene counts by FDR cutoffs fdr<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25); if (deg.method == DeMethods()[3]) { c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]>0 & stat[, 7]<=fdr, , drop=FALSE])); c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]<0 & stat[, 10]<=fdr, , drop=FALSE])); } else { c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']>0 & stat[, 'FDR']<=fdr, , drop=FALSE])); c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']<0 & stat[, 'FDR']<=fdr, , drop=FALSE])); } fdr.table<-cbind(fdr, c1, c2, c1+c2); colnames(fdr.table)<-c('FDR cutoff', names(degs), 'Total'); # pre-ranking of genes rnk<-sign(d[, 'LogFC'])*sqrt(d[,'LogFC']^2+log10(p)^2); ``` ```{r write_deg, include=FALSE} cat('Write out DEG results\n'); pth.deg1<-paste(pth.deg, '/Higher_in_', g2.name, sep=''); pth.deg2<-paste(pth.deg, '/Lower_in_', g2.name, sep=''); pth.deg1.bars<-paste(pth.deg1, '/bars', sep=''); pth.deg2.bars<-paste(pth.deg2, '/bars', sep=''); if (!exists(pth.deg)) dir.create(pth.deg, showWarnings = FALSE); if (!exists(pth.deg1.bars)) dir.create(pth.deg1.bars, showWarnings = FALSE, recursive = TRUE); if (!exists(pth.deg2.bars)) dir.create(pth.deg2.bars, showWarnings = FALSE, recursive = TRUE); ids<-c(up=rownames(up), dn=rownames(dn)); names(ids)<-paste(rep(c(pth.deg1.bars, pth.deg2.bars), c(nrow(up), nrow(dn))), '/', ids, '.pdf', sep=''); col<-rep(c('lightgrey', 'lightblue'), c(length(g1.ind), length(g2.ind))); wid<-min(1.8, 1.2/(0.1*max(nchar(colnames(expr[, c(g1.ind, g2.ind)]))))); fn.barplot<-sapply(names(ids), function(nm) { pdf(nm, w=8, h=6); par(mai=c(1.2, 1, 0.6, 0.2)); barplot(expr[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Expression level', cex.lab=wid, cex.names=wid); title(main=paste(ids[nm], CleanHtmlTags(as.vector(anno[ids[nm], 1])), sep=' - '), cex.main=2); #plot.new(); par(mai=c(1.2, 1, 0.6, 0.2)); barplot(pctl[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Percentile (%)', ylim=c(0, 100), cex.lab=2, cex.names=wid); title(main=paste(ids[nm], CleanHtmlTags(as.vector(anno[ids[nm], 1])), sep=' - '), cex.main=2); dev.off(); nm; }); # Write index tables of DEGs cnm<-c(colnames(anno), colnames(up), 'Samples'); up.tbl<-data.frame(anno[rownames(up), , drop=FALSE], up, rep('Figure', nrow(up))); colnames(up.tbl)<-cnm; up.tbl$Samples<-AddHref(up.tbl$Samples, paste('./bars/', rownames(up), '.pdf', sep='')); up.formatted<-GeneList2Datatable(FormatNumeric(up.tbl), paste(pth.deg1, 'index.html', sep='/'), col.symbol=1, genome=genome, title=paste('Genes with higher expression in', g2.name)); dn.tbl<-data.frame(anno[rownames(dn), , drop=FALSE], dn, rep('Figure', nrow(dn))); colnames(dn.tbl)<-cnm; dn.tbl$Samples<-AddHref(dn.tbl$Samples, paste('./bars/', rownames(dn), '.pdf', sep='')); dn.formatted<-GeneList2Datatable(FormatNumeric(dn.tbl), paste(pth.deg2, 'index.html', sep='/'), col.symbol=1, genome=genome, title=paste('Genes with lower expression in', g2.name)); ``` ```{r analysis_ora, include=FALSE} cat('ORA analysis\n'); ######################################################################################################## ora<-lapply(degs, function(gs) awsomics::TestGSE(rownames(gs), rownames(expr), geneset$list)); ######################################################################################################## path.ora<-paste(path, 'ORA', sep='/'); if (!file.exists(path.ora)) dir.create(path.ora); sapply(names(degs), function(nm) if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'), recursive = TRUE))->x; names(ora)<-names(degs); saveRDS(ora, paste(path.ora, 'ora_full.rds', sep='/')); ora.table<-lapply(ora, function(ora) { g<-ora$stat; lst<-ora$list[rownames(g)]; rand<-sapply(lst, function(l) { mtrx<-e1.2[rownames(e1.2) %in% l, , drop=FALSE]; flexclust::randIndex(table(kmeans(t(mtrx), 2)[[1]], rep(1:2, sapply(grps, length)))); }) an<-geneset[[1]][rownames(g), ]; data.frame(row.names = rownames(g), stringsAsFactors = TRUE, Source=an[,1], Collection=an[,2], Term=an[, 'Name'], Total=rowSums(ora$size[rownames(g), 3:4]), Within=ora$size[rownames(g), 4], Enrichment=g[, 'Odds_Ratio'], Rand=rand, Pvalue=g[, 'P_HyperGeo'], FDR=g[, 'FDR_BH']); }); ######################################################################################################## # Biclustering bic<-lapply(ora, function(g) if (nrow(g[[1]])<5) NA else BiclusterFromList(g[[2]][rownames(g[[1]])[1:min(250, nrow(g[[1]]))]])); ora$bicluster<-bic; bic2gs<-lapply(bic, function(bic) lapply(bic[[2]], rownames)); gs2bic<-lapply(bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x)))); gs2bic<-lapply(gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';'))); ######################################################################################################## # Output tables ora.table<-lapply(names(ora.table), function(nm) { b<-gs2bic[[nm]]; b<-b[rownames(ora.table[[nm]])]; b[is.na(b)]<-''; ora.table[[nm]]$Cluster<-b; ora.table[[nm]]; }); names(ora.table)<-names(bic); ``` ```{r write_ora, include=FALSE} cat('Write out ORA results\n'); path.ora<-paste(path, 'ORA', sep='/'); if (!file.exists(path.ora)) dir.create(path.ora, recursive = TRUE); # Split results into tables by collections ora.wrapped<-lapply(names(ora.table), function(nm) { pth<-paste(path.ora, nm, 'table', sep='/'); if (!file.exists(pth)) dir.create(pth, recursive = TRUE); WrapGSE(ora.table[[nm]][, -(1:3)], geneset$meta, pth, TRUE); }); names(ora.wrapped)<-names(ora.table); # create index.html file ora.tbl<-lapply(ora.wrapped, function(o) { n<-lapply(o$formatted, function(o) sapply(o, nrow)); fn<-unlist(o$file, use.names=FALSE); fn<-sub(paste(path.ora, '/', sep=''), '', fn); s<-rep(names(n), sapply(n, length)); c<-unlist(lapply(n, names), use.names=FALSE); n<-as.vector(unlist(n, use.names=FALSE)); tbl<-data.frame(Source=s, Collection=c, N=n, stringsAsFactors = FALSE); tbl$N<- AddHref(tbl$N, fn); rownames(tbl)<-paste(tbl[[1]], tbl[[2]], sep='_'); tbl }); u<-sort(union(rownames(ora.tbl[[1]]), rownames(ora.tbl[[2]]))); ora.tbl<-lapply(ora.tbl, function(t) { t<-t[u, ]; t[is.na(t[,3]), 3]<-0; rownames(t)<-u; t; }); ora.ind<-cbind(ora.tbl[[1]], N2=ora.tbl[[2]]$N); ora.ind[is.na(ora.ind[,1]), 1]<-ora.tbl[[2]][is.na(ora.ind[,1]), 1]; ora.ind[is.na(ora.ind[,2]), 2]<-ora.tbl[[2]][is.na(ora.ind[,2]), 2]; names(ora.ind)<-c('Source', 'Collection', names(ora.wrapped)); CreateDatatable(ora.ind, paste(path.ora, 'index.html', sep='/'), rownames = FALSE, caption = "Click on number to see list"); # All tested gene sets gs.table<-lapply(split(geneset[[1]][, 'Collection'], geneset[[1]][, 'Source']), table); gs.table<-data.frame(stringsAsFactors = FALSE, Source=rep(names(gs.table), sapply(gs.table, length)), Collection=unlist(lapply(gs.table, names), use.names=FALSE), Geneset=as.numeric(unlist(gs.table))); awsomics::CreateDatatable(gs.table, paste(path.ora, 'geneset.html', sep='/'), rownames = FALSE, caption = "Gene set collections"); # Formatted tables of ORA results stat.slim<-data.frame(anno[rownames(stat), 1, drop=FALSE], awsomics::FormatNumeric(stat)); ora.formatted<-lapply(names(ora.table), function(nm) { if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/')); if (!file.exists(paste(path.ora, nm, 'term', sep='/'))) dir.create(paste(path.ora, nm, 'term', sep='/')); t<-ora.table[[nm]]; an<-geneset[[1]][rownames(t), ]; t$Term<-awsomics::AddHref(t$Term, an[, 'URL']); t$Total[1:min(250, nrow(t))]<-awsomics::AddHref(t$Total[1:min(250, nrow(t))], paste('./term/term_', 1:min(250, nrow(t)), '.html', sep='')); #t$Within<-awsomics::AddHref(t$Within, paste('./term/term_', rownames(t), '_within.html', sep='')); t<-awsomics::FormatNumeric(t); awsomics::CreateDatatable(t, fn=paste(path.ora, nm, 'term.html', sep='/'), rownames = FALSE); sapply(1:min(250, nrow(t)), function(i) { g<-geneset$list[rownames(t)[i]][[1]]; t1<-stat.slim[rownames(stat.slim) %in% g, , drop=FALSE]; t1<-t1[order(t1[, paste('Rank', nm, sep='_')]), , drop=FALSE]; fn<-paste(path.ora, nm, 'term', paste('term_', i, '.html', sep=''), sep='/'); GeneList2Datatable(t1, fn, genome = genome, title = rownames(t)[i]); }); t; }); names(ora.formatted)<-names(ora.table); # Formatted table of biclustering bic.formatted<-lapply(names(bic), function(nm) { if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/')); if (!file.exists(paste(path.ora, nm, 'bicluster', sep='/'))) dir.create(paste(path.ora, nm, 'bicluster', sep='/')); t<-awsomics::FormatNumeric(bic[[nm]]$summary); # write individual files of biclusters fns<-sapply(1:nrow(t), function(i) { fn<- paste(path.ora, nm, 'bicluster', paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/'); t1<-ora.formatted[[nm]][rownames(bic[[nm]][[2]][[i]]), ]; CreateDatatable(t1, fn[1], FALSE, caption=paste('Terms of', t[i,1])); t2<-stat.slim[colnames(bic[[nm]][[2]][[i]]), , drop=FALSE]; t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE]; GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); t3<-bic[[nm]][[2]][[i]]; rownames(t3)<-paste(ora.table[[nm]][rownames(t3), 2], ora.table[[nm]][rownames(t3), 3], sep=' - '); colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), CleanHtmlTags(anno[colnames(t3), 1]), sep=' - '); if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999'; PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF'); fn; }); bic.fn<-t(fns); # write index table t$Num_Terms<-AddHref(t$Num_Terms, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 1])); t$Num_Genes<-AddHref(t$Num_Genes, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 2])); t$ID<-AddHref(t$ID, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 3])); CreateDatatable(t, fn=paste(path.ora, nm, 'bicluster.html', sep='/'), rownames = FALSE); t; }); ``` ```{r prepare_gsea, include=FALSE} cat('Preparing GSEA analysis\n'); path.gsea<-paste(path, 'GSEA', sep='/'); if (file.exists(path.gsea)) unlink(path.gsea, recursive = TRUE); # Path must not exist tmp<-as.character(as.integer(Sys.time())); if (file.exists(tmp)) unlist(tmp, recursive = TRUE); dir.create(tmp, recursive = TRUE); # loading template yaml input.gsea<-yml$input$gsea; gsea.yml<-input.gsea$template; if (input.gsea$remote) gsea.yml<-yaml.load(RCurl::getURL(gsea.yml)) else gsea.yml<-yaml.load_file(gsea.yml) # If Rank Product was used for DE, adjust the data for GSEA so the group mean difference reflects RP results e1.2_gsea<-e1.2; if (deg.method[1] == 'DeRankP') { log.rp<-log10(de$results$stat[, 'RP2']/de$results$stat[, 'RP1']); m1<-rowMeans(e1.2[, g1.ind]); m2<-rowMeans(e1.2[, g2.ind]); m0<-m1+log.rp; e1.2_gsea[, g2.ind]<-e1.2[, g2.ind] - (m2-m0); }; fn.gsea<-PrepareGSEA(e1.2_gsea, grps, paste(tmp, 'input', sep='/'), desc = anno[[1]]); gsea.yml$jar<-input.gsea$jar; gsea.yml$preranked<-FALSE; gsea.yml$thread<-input.gsea$thread; gsea.yml$name<-tmp; gsea.yml$groups$control<-g1.name; gsea.yml$groups$case<-g2.name; gsea.yml$out<-path; gsea.yml$input<-fn.gsea[1]; gsea.yml$class<-fn.gsea[2]; gsea.yml$chip<-c(); gsea.yml$gmt<-input.gsea$gmt; ``` ```{r analysis_gsea, include=FALSE} cat('GSEA analysis\n'); gsea.cmmd<-GSEAviaJava(gsea.yml); file.rename(paste(gsea.yml$out, tmp, sep='/'), path.gsea); gsea.tbl<-readRDS(paste(path.gsea, 'full_table.rds', sep='/')); for (i in 1:ncol(gsea.tbl)) gsea.tbl[[i]]<-as.vector(gsea.tbl[[i]]); saveRDS(gsea.tbl, file=paste(path.gsea, 'full_table.rds', sep='/')); # Load gene lists of gene sets from GMT files fn.gmt<-gsea.yml$gmt gmts<-lapply(fn.gmt, PGSEA::readGmt); gsea.lst<-lapply(gmts, function(gmt) { nm<-sub(' $', '', sapply(gmt, function(g) g@reference)); lst<-lapply(gmt, function(g) g@ids); names(lst)<-toupper(nm); lst; }); ######################################################################################################## # Biclustering gsea.tbl.sig<-gsea.tbl[gsea.tbl$PValue<=0.05, ]; gsea.tbl.sig<-gsea.tbl.sig[order(gsea.tbl.sig$PValue), , drop=FALSE]; gsea.lst.sig<-lapply(1:nrow(gsea.tbl.sig), function(i) gsea.lst[[gsea.tbl.sig[i, 1]]][[gsea.tbl.sig[i, 2]]]); gsea.lst.sig<-lapply(gsea.lst.sig, function(l) l[l %in% unlist(lapply(degs, rownames), use.names=FALSE)]); names(gsea.lst.sig)<-rownames(gsea.tbl.sig); gsea.lst.sig<-gsea.lst.sig[sapply(gsea.lst.sig, length)>1]; gsea.gs<-split(gsea.lst.sig, gsea.tbl.sig[names(gsea.lst.sig), ncol(gsea.tbl.sig)]); #gsea.gs<-rev(gsea.gs); gsea.gs<-lapply(rev(gsea.gs), function(x) x[!duplicated(x)]); names(gsea.gs)<-names(degs); gsea.bic<-lapply(gsea.gs, function(g) BiclusterFromList(g[1:min(250, length(g))])); gsea.bic2gs<-lapply(gsea.bic, function(bic) if (identical(NA, bic)) NA else lapply(bic[[2]], rownames)); gsea.gs2bic<-lapply(gsea.bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x)))); gsea.gs2bic<-lapply(gsea.gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';'))); ``` ```{r bicluster_gsea, include=FALSE} cat('Biclustering GSEA results\n'); # Formatted table of biclustering gsea.bic.formatted<-lapply(names(gsea.bic), function(nm) { path.bic<-paste(path.gsea, 'bicluster', nm, sep='/'); if (!file.exists(path.bic)) dir.create(path.bic, recursive = TRUE); if (identical(NA, gsea.bic[[nm]])) t<-data.frame(ID='No clusters found', stringsAsFactors = FALSE) else { t<-FormatNumeric(gsea.bic[[nm]]$summary); t$ID<-as.vector(t$ID); # write individual files of biclusters fns<-sapply(1:nrow(t), function(i) { fn<-paste(path.bic, paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/'); t1<-gsea.tbl[rownames(gsea.bic[[nm]][[2]][[i]]), , drop=FALSE]; CreateDatatable(FormatNumeric(t1[, 1:6]), fn[1], FALSE, caption=paste('Terms of', t[i,1])); t2<-stat.slim[colnames(gsea.bic[[nm]][[2]][[i]]), , drop=FALSE]; t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE]; GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); t3<-gsea.bic[[nm]][[2]][[i]]; rownames(t3)<-paste(gsea.tbl[rownames(t3), 1], gsea.tbl[rownames(t3), 2], sep=' - '); colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), CleanHtmlTags(anno[colnames(t3), 1]), sep=' - '); if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999'; PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF'); fn; }); gsea.bic.fn<-t(fns); # write index table t$Num_Terms<-AddHref(t$Num_Terms, paste(t$ID, '_terms.html', sep='')); t$Num_Genes<-AddHref(t$Num_Genes, paste(t$ID, '_genes.html', sep='')); t$ID<-AddHref(t$ID, paste(t$ID, '_heatmap.pdf', sep='')); } CreateDatatable(t, fn=paste(path.bic, 'index.html', sep='/'), rownames = FALSE); t; }); gsea<-list(path=gsea.yml$out, geneset=gsea.lst, stat=gsea.tbl, bicluster=gsea.bic); ``` ```{r write_bic, include=FALSE} cat('Write out biclustering results\n'); t<-data.frame(matrix('table', nr=2, nc=2), stringsAsFactors = FALSE); t[1, ]<-AddHref(t[1, ], paste("../ORA", names(degs), 'bicluster.html', sep='/')); t[2, ]<-AddHref(t[2, ], paste("../GSEA/bicluster", names(degs), 'index.html', sep='/')); colnames(t)<-names(degs); rownames(t)<-c('Over-representation analysis', 'Gene set enrichment analysis'); CreateDatatable(t, paste(path, 'DEG', 'bicluster_index.html', sep='/')); ``` ```{r kegg_gsea, include=FALSE} cat('GSEA analysis of KEGG pathways\n'); path.kegg<-paste(path.gsea, 'kegg', sep='/'); if (file.exists(path.kegg)) unlink(path.kegg, recursive = TRUE); dir.create(path.kegg, recursive = TRUE); kegg.stat<-gsea.tbl[gsea.tbl$Collection=='KEGG_pathway' & gsea.tbl$PValue<=0.05, , drop=FALSE]; # Run pathview, generate figures in a temp directory and copy them to output directory path.temp<-as.character(as.integer(Sys.time())); if (!file.exists(path.temp)) dir.create(path.temp); setwd(path.temp); if (nrow(kegg.stat) == 0) kegg.ind<-kegg.stat[, -1] else { kegg.id<-tolower(substr(kegg.stat$Gene_set, 1, 8)); kegg.dff<-rowMeans(e1.2_gsea[, g2.ind])-rowMeans(e1.2_gsea[, g1.ind]); fn.kegg<-sapply(kegg.id, function(id) { capture.output(kegg<-pathview(kegg.dff, pathway.id=id, species = GetGenomeAlias(genome, "code"), low = list(gene = "blue"), kegg.dir=yml$input$gsea$kegg))->x; paste(id, '.pathview.png', sep=''); }); names(fn.kegg)<-kegg.id; fn.kegg<-fn.kegg[file.exists(fn.kegg)]; if (length(fn.kegg)>0) file.rename(fn.kegg, paste(path.kegg, fn.kegg, sep='/')); kegg.ind<-FormatNumeric(kegg.stat[kegg.id %in% names(fn.kegg), -1]); if (nrow(kegg.ind) > 0) kegg.ind$Gene_set<-AddHref(kegg.ind$Gene_set, fn.kegg); kegg.ind<-kegg.ind[order(kegg.ind$NES), , drop=FALSE]; } setwd('..'); unlink(path.temp, recursive = TRUE); CreateDatatable(kegg.ind, paste(path.kegg, 'index.html', sep='/')); ``` ```{r write_excel, include=FALSE} cat('Writing tables to Excel file\n'); # Excel spreadsheets xls<-list(stat.table, stat.table[rownames(up), , drop=FALSE], stat.table[rownames(dn), , drop=FALSE], ora.table[[1]], ora.table[[2]], gsea.tbl); xls<-lapply(xls, function(x) { for (i in 1:ncol(x)) x[[i]]<-CleanHtmlTags(x[[i]], FALSE); x; }); names(xls)<-c('Complete List', paste('Top', nrow(up), ', higher in ', g2.name, sep=''), paste('Top', nrow(dn), ', lower in ', g2.name, sep=''), paste('ORA, higher in ', g2.name, sep=''), paste('ORA, lower in ', g2.name, sep=''), "GSEA" ); xls<-lapply(xls, awsomics::FormatNumeric); tbl<-sapply(names(xls), function(nm) write.csv(xls[[nm]], paste(path.table, paste(nm, '.csv', sep=''), sep='/'))); fn.xls<-paste(path, paste(g1.name, g2.name, sep='-vs-'), sep='/'); WriteExcel(xls, fn.xls); ``` ```{r convert_figure, include=FALSE} cat('Converting figures to png format\n'); fn.png<-lapply(fn.fig, function(f) ConvertPdf2Png(f, resize=80, density=150)); fn.png[[length(fn.png)+1]]<-sapply(names(bic), function(nm) { t<-bic[[nm]]$summary; n<-t[,2]*t[,3]; f<-paste('bicluster/', t[n==max(n)[1], 1], '_heatmap.pdf', sep=''); awsomics::ConvertPdf2Png(paste(path.ora, nm, f, sep='/'), resize=80, density=150); }); # fn.png[[length(fn.png)+1]]<-sapply(kegg$Formatted, function(t) { # ids<-as.vector(t$Pathway_ID); # fn<-paste(path, 'KEGG', 'Files', 'figures', 'colored_plots', paste(ids, '.pathview.png', sep=''), sep='/'); # fn[file.exists(fn)][1]; # }); ``` ```{r write_out, include=FALSE} cat('Writing out all results\n'); res$de<-de; res$ora<-ora; res$gsea<-gsea; saveRDS(res, file=paste(path, 'result.rds', sep='/')); ```