library(gdata)
library(oligo)
library(pd.mogene.2.1.st)
library(mogene21sttranscriptcluster.db)
library(Heatplus)
library(ggplot2)
library(reshape2)

## Help Documentation
describe = function(obj) {
  if ('help' %in% names(attributes(obj))) {
    writeLines(attr(obj, 'help'))
  }
}
attr(describe, 'help') = "
This function prints the contents of the 'help' attribute of any R object. 
It is meant to provide help documentation in the same vein as Docstrings in Python. 
"

make.colors.boxplot <- function(col.strains)
{
  use.cols <- darkColors(length(unique(col.strains)))
  names(use.cols) <- unique(col.strains)
  return(as.character(use.cols[col.strains]))
}

make.boxplot <- function(use.exprs, type=c("raw", "norm"), make.pdf=F, base.name="test", order.by=c("Mating", "Sex", "Number"), color.by="Mating", highlight.names=NULL, ...)
{
  type <- match.arg(type)
  
  #use.exprs <- switch(type, raw=rma(raw.exprs, normalize=FALSE, target="core"), norm=rma(raw.exprs, normalize=TRUE, target="core"))
  
  basic.ord <- do.call("order", pData(use.exprs)[,order.by,drop=F])
  
  use.exprs <- use.exprs[,basic.ord]
  
  layout(c(1,1,1,1,2,3,3,3))
  par(mar=c(0,4,4,2))
  
  if (type == 'raw') {title = "Raw Expression (Background Corrected)"}
  if (type == 'norm') {title = "Normalized Expression"}
  boxplot(exprs(use.exprs), xaxt="n", ylab="Expression", main=title, outline=FALSE, col=make.colors.boxplot(pData(use.exprs)[,color.by]))
  
  #RIN subplot
  if ('RIN' %in% names(pData(use.exprs)))
  {
    par(mar=c(0,4,1,2))
    pData(use.exprs)$RIN = as.numeric(as.character(pData(use.exprs)$RIN))
    plot(x=1:ncol(use.exprs), y=pData(use.exprs)$RIN, xlim=c(1, ncol(use.exprs)) + c(-.5, .5), ylim=range(pData(use.exprs)$RIN, na.rm=T), xaxt="n", type="p", ylab="RIN", xlab="", col=make.colors.boxplot(pData(use.exprs)[,color.by]))
    abline(h=7, lty="dashed")
  } else {
    warning("RIN values were not found in the sample annotations.")
  }
  
  #labels and such
  if (missing(highlight.names) || is.null(highlight.names))
  {
    Axis(at=1:ncol(use.exprs), side=1, labels=colnames(use.exprs), las=2, ...)
  }
  else if(is.character(highlight.names) && all(highlight.names %in% colnames(use.exprs)))
  {
    should.col <- ifelse(colnames(use.exprs) %in% highlight.names, 'red', 'black')
    rle.col <- Rle(should.col)
    
    lo.pos <- 1
    
    for (i in 1:nrun(rle.col))
    {
      positions <- lo.pos:(runLength(rle.col)[i] + lo.pos-1)
      Axis(at=positions, side=1, labels=colnames(use.exprs)[positions], las=2, col.axis=runValue(rle.col)[i], ...)
      lo.pos <- max(positions) + 1
    }
    
    Axis(at=1:ncol(use.exprs), side=1, labels=colnames(use.exprs), las=2, col.axis=ifelse(should.col, 'red', 'black'), ...)
    
  }
  
  if (make.pdf) {
    d = dev.copy2pdf(file=paste0(base.name, '_', type, '_boxplot.pdf'), width=16, height=10)
    writeLines(paste0("Figure saved to: ", paste0(base.name, '_', type, '_boxplot.pdf')))
    d = dev.off()
  }
}
attr(make.boxplot, 'help') = "
This function creates a boxplot of the raw or normalized expression values. If RIN values 
are available a subplot will be added.

Parameters:
use.exprs: An ExpressionSet returned by the rma() function.
type: The type of plot to create, either 'raw' or 'norm'. 
make.pdf: A logical indicating if a PDF should be created. 
base.name: The base filename of the PDF to create (default is 'test') 
order.by: A character vector containing the column names that will be used to order 
the samples (default is c('Mating', 'Sex', 'Number')).
color.by: The column name used to assign colors to the samples (default is 'Mating'). 
highlight.names: A character vector containing sample names (default=NULL). Can be used 
to highlight samples in the plot. 
...: Additional parameters can be passed to format the x-axis labels.
"


#A convienience function for making bacterial spike plots
#Input:
#       raw.exprs: A GeneFeatureSet or similar object which has an rma method returning an ExpressionSet
#       base.name: The base name of the PDF to generate (if make.pdf == T)
#       pgf.file: probe group file from Affy for the appropriate array type.
#Output:
#       A PDF named as base.name_bac_spikes.pdf

plot.bac.spikes <- function(use.exprs, pgf.file, base.name="test", make.pdf=F)
{
  require(ggplot2)
  require(affxparser)
  #use.exprs <- rma(raw.exprs)
  
  if (file.exists(pgf.file)) {
    pgf.list <- readPgf(pgf.file)
  } else {
    stop("Probe group file not found!")
  }
  probeset.types <- data.frame(fsetid=pgf.list$probesetId, fsetName=pgf.list$probesetName, type=pgf.list$probesetType, stringsAsFactors=FALSE)
  
  spike.probes <- probeset.types[probeset.types$type == 'control->affx->bac_spike',]
  
  spike.exprs <- exprs(use.exprs)[as.character(unique(spike.probes$fsetid)),]
  
  spike.dta <- melt(spike.exprs)
  spike.dta.merge <- merge(spike.dta, spike.probes, by.x="Var1", by.y="fsetid", all=F, incomparables=NA, sort=FALSE)
  
  spike.dta.merge$`Bac. Spike` <- sapply(strsplit(spike.dta.merge$fsetName, "-"), "[[", 4)
  #should be BioB<BioC<BioD<Cre
  spike.dta.merge$`Bac. Spike` <- factor(spike.dta.merge$`Bac. Spike`, levels=c("cre", "bioD", "bioC", "bioB"), labels=c("Cre", "BioD", "BioC", "BioB"), ordered=TRUE)
  
  #bac.plot <- qplot(x=Var2, y=value, group=`Bac. Spike`, color=`Bac. Spike`, data=spike.dta.merge, geom="line", ylab="log2(Expression)", xlab="", main="Bacterial Spikes", method="summary")
  #bac.plot <- bac.plot + theme(axis.text.x=element_text(size=8, angle=90, hjust=1))
  
  bac.plot <- ggplot(spike.dta.merge, aes(x=Var2, y=value, group=`Bac. Spike`, color=`Bac. Spike`)) +
    stat_summary(fun.y=mean, geom="line") +
    ylab("log2(Expression)") +
    xlab("") +
    ggtitle("Bacterial Spikes") + 
    #theme(plot.margin = unit(c(1,1,5,1), "cm")) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))  
  
  plot(bac.plot)
  
  if (make.pdf) {
    dev.copy2pdf(file=paste0(base.name, "_bac_spikes.pdf"), width=16, height=10)
    writeLines(paste0("Figure saved to: ", paste0(base.name, "_bac_spikes.pdf")))
    d = dev.off()
  }
}
attr(plot.bac.spikes, 'help') = "
This function creates a bacterial spike plot.

Parameters:
use.exprs: An ExpressionSet returned by the rma() function.
pgf.file: A probe group file for the array.
make.pdf: A logical indicating if a PDF should be created. 
base.name: The base filename of the PDF to create (default is 'test').
"

#A convienience function for making polya spike plots
#Input:
#       raw.exprs: A GeneFeatureSet or similar object which has an rma method returning an ExpressionSet
#       base.name: The base name of the PDF to generate (if make.pdf == T)
#       plot.type: Either "AFFX-r2-Bs" or "AFFX" which are the two types of probesets, defaults to "AFFX-r2-Bs"
#       pgf.file: probe group file from Affy for the appropriate array type.
#Output:
#       A PDF named as base.name_polya_spikes.pdf
plot.polya.spikes <- function(use.exprs, pgf.file, plot.type=c("AFFX-r2-Bs", "AFFX"), base.name="test", make.pdf=F)
{
  require(ggplot2)
  require(affxparser)
  
  plot.type <- match.arg(plot.type)
  
  #use.exprs <- rma(raw.exprs)
  
  pgf.list <- readPgf(pgf.file)
  probeset.types <- data.frame(fsetid=pgf.list$probesetId, fsetName=pgf.list$probesetName, type=pgf.list$probesetType, stringsAsFactors=FALSE)
  
  poly.a.spikes <- probeset.types[probeset.types$type == "control->affx->polya_spike",]
  
  spike.exprs <- exprs(use.exprs)[as.character(unique(poly.a.spikes$fsetid)),]
  
  spike.dta <- melt(spike.exprs)
  spike.dta.merge <- merge(spike.dta, poly.a.spikes, by.x="Var1", by.y="fsetid", all=F, incomparables=NA, sort=FALSE)
  
  spike.dta.merge$type <- ifelse(grepl("AFFX-r2-Bs", spike.dta.merge$fsetName), "AFFX-r2-Bs", "AFFX")
  spike.dta.merge$pos <- sub("_*s*_[as]t", "", sapply(strsplit(spike.dta.merge$fsetName, "-"), function(x) x[length(x)]))
  spike.dta.merge$pos <- factor(spike.dta.merge$pos, levels=c("5", "M", "3"), ordered=T)
  
  spike.dta.merge$direction <- sapply(strsplit(spike.dta.merge$fsetName, "[-_]"), function(x) x[length(x)])
  
  
  split.fset <- strsplit(spike.dta.merge$fsetName, "-")
  spike.dta.merge$Spike <- ifelse(spike.dta.merge$type == "AFFX", sapply(split.fset, "[", 2), sapply(split.fset, "[", 4))
  spike.dta.merge$Spike <- sub("X", "", spike.dta.merge$Spike)
  substr(spike.dta.merge$Spike, 1, 1) <- toupper(substr(spike.dta.merge$Spike, 1, 1))
  
  #from the affy exon/gene array whitepaper: lys<phe<thr<dap, not sure about Trpn
  spike.dta.merge$Spike <- factor(spike.dta.merge$Spike, levels=c("Dap", "Thr", "Phe", "Lys", "Trpn"), ordered=T)
  
  #pa.basic.plot <- qplot(x=Var2, y=value, group=Spike, color=Spike, data=spike.dta.merge[spike.dta.merge$type == plot.type,], stat="summary",
  #                       fun.y=mean, geom="line", ylab="log2(Expression)", xlab="", main="PolyA Spikes", facets=pos~direction) + theme(axis.text.x=element_text(size=8, angle=90, hjust=1))
  
  pa.basic.plot <- ggplot(spike.dta.merge[spike.dta.merge$type == plot.type,], aes(x=Var2, y=value, group=Spike, color=Spike)) +
    stat_summary(fun.y=mean, geom="line") +
    ylab("log2(Expression)") +
    xlab("") +
    ggtitle("PolyA Spikes") + 
    #theme(plot.margin = unit(c(1,1,5,1), "cm")) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    facet_grid(pos~direction)
  
  plot(pa.basic.plot)
  
  if (make.pdf) {
    dev.copy2pdf(file=paste0(base.name, "_polya_spikes.pdf"), width=16, height=10)
    writeLines(paste0("Figure saved to: ", paste0(base.name, "_polya_spikes.pdf")))
    d = dev.off()
  }
}
attr(plot.polya.spikes, 'help') = "
This function creates a polyA spike plot.

Parameters:
use.exprs: An ExpressionSet returned by the rma() function.
pgf.file: A probe group file for the array.
plot.type: A string indicating the type of control probesets for the array (default is 'AFFX-r2-Bs').
make.pdf: A logical indicating if a PDF should be created. 
base.name: The base filename of the PDF to create (default is 'test').
"


#A function that provides a consistent way of processing expression data for heatmaps etc.
#Input:
#   raw.exprs needs to be FeatureSet
#   num.genes should either be an integer value indicating the 'num.genes' most variable genes to return
#           or should not be specified at all.       
#Output:
#   An ExpressionSet
heatmap.process <- function(use.exprs, num.genes)
{
  norm.exprs.basic <- use.exprs
  norm.exprs.basic.mat <- exprs(norm.exprs.basic)
  if(missing(num.genes) || is.null(num.genes) || is.na(num.genes))
  {
    return(norm.exprs.basic.mat)
  } else {
    exprs.var <- apply(norm.exprs.basic.mat, 1, var)
    norm.exprs.basic.var <- norm.exprs.basic.mat[order(exprs.var, decreasing=TRUE),][1:num.genes,]
    return(norm.exprs.basic.var)
  }
  
}
attr(heatmap.process, 'help') = "
This function processes expression data for heatmaps.

Parameters:
use.exprs: An ExpressionSet returned by the rma() function.
num.genes: A number indicating the number of most variable genes to include. If NULL, all genes
will be included (default is 1000).

Returns:
A matrix of expression values.
"

make.heatmap <- function(use.exprs, cut.dist=NULL, num.genes=1000, base.factors=c("Sex"), rin.breaks=c(0, 5, 7, 10), make.pdf=F, base.name="test")
{
  
  norm.exprs.basic.var <- heatmap.process(use.exprs, num.genes)
  
  all.dta <- pData(use.exprs)
  
  stopifnot(all(rownames(all.dta) == colnames(norm.exprs.basic.var)))
  
  if (missing(cut.dist) || is.null(cut.dist) || is.na(cut.dist))
  {
    cluster <- NULL
  } else {
    cluster <- list(cuth=cut.dist)
  }
  
  ## Add option of plotting RIN as continuous variable
  rin.discrete=T
  if (rin.discrete) {
    if (!is.null(rin.breaks) && 'RIN' %in% names(all.dta)) {
      all.dta$RIN = as.numeric(as.character(all.dta$RIN))
      all.dta$RIN <- cut(all.dta$RIN, rin.breaks)
      use.dta <- all.dta[,append(base.factors, "RIN", after=0)]
    } else {
      warning("RIN values were not found in the sample annotations.")
      use.dta <- all.dta[,base.factors]
    }
  } else {
    all.dta$RIN = as.numeric(as.character(all.dta$RIN))
    use.dta <- all.dta[,append(base.factors, "RIN", after=0)]
  }
  
  new.heat <- annHeatmap(norm.exprs.basic.var, annotation=use.dta, dendrogram = list(clustfun = hclust, distfun = dist, Col = list(status = "yes"), Row = list(status = "hidden")),
                         labels=list(Row=list(labels=rep("", nrow(norm.exprs.basic.var))), Col=list(nrow=0, labels=rep("", ncol(norm.exprs.basic.var)))), legend = TRUE, cluster = cluster)
  
  # write colnames of clustered IDs to file
  #write.table(file="./bat_clustered.txt",x=colnames(new.heat$data$x2),quote=F)
  
  #cuth was chosen by examination of plot(new.heat$dendrogram$Col$dendro)
  
  plot(new.heat)
  
  if (make.pdf) {
    d = dev.copy2pdf(file=paste0(base.name, '_heatmap.pdf'), width=16, height=10)
    writeLines(paste0("Figure saved to: ", paste0(base.name, '_heatmap.pdf')))
    d = dev.off()
  }
}
attr(make.heatmap, 'help') = "
This function creates an annotated heatmap for expression data.

Parameters:
use.exprs: An ExpressionSet returned by the rma() function.
cut.dist: The height at which to cut the dendrogram (default is NULL; no cutting).
num.genes: A number indicating the number of most variable genes to include. If NULL, all genes
will be included (default is 1000).
base.factors: A character vector containing column names that will be used to annotate the
heatmap (default is c('Sex')).
rin.breaks: A numeric vector indicating the break points for discretizing the RIN 
scores (default is c(0, 5, 7, 10)).
make.pdf: A logical indicating if a PDF should be created. 
base.name: The base filename of the PDF to create (default is 'test').
"