################################################################################
# Canadian Bioinformatics Workshop 2017
# Bioinformatics for Genomic Medicine: Module 6
#
# Created:  11-May-2017.
# Author: Andrei Turinsky
###############################################################################

source("http://bioconductor.org/biocLite.R");

biocLite(FlowSorted.Blood.450k);
library(GEOquery);
library(gplots);
library(lumi);
biocLite(minfi);
library(matrixStats);
biocLite(minfiData);
biocLite(sva);

dataPath = '.'

##########################################################
# Load data matrix

gse_list = getGEO(GEO = "GSE52588", GSEMatrix=TRUE, getGPL=FALSE)

class(gse_list)
length(gse_list)

gse_eset = gse_list[[1]]
class(gse_eset)

# Alternatively:
# gse_eset = getGEO(
#		filename= file.path(dataPath, 'GSE52588_series_matrix.txt.gz'),
#		GSEMatrix=TRUE, getGPL=FALSE)


##########################################################
# Load Illumina HumanMethylation450 array annotation: pre-load from
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&acc=GPL13534&form=text&view=full

gpl = getGEO( filename=file.path(dataPath, 'GPL13534.txt'))

annotationTable = Table(gpl)
head(annotationTable)
rownames(annotationTable) = annotationTable$ID

##########################################################
# Extract phenotype data	

pheno = pData(gse_eset)

head(pheno)
pheno = pheno[, grep("characteristics_ch1", colnames(pheno))]
head(pheno)

levels(pheno$characteristics_ch1)

groups = as.character(pheno$characteristics_ch1)
groups = gsub("su", "ds", substr(groups, 8,9))

family = as.character(pheno$characteristics_ch1.2)
family = gsub("family: ", "", family)

pheno$Sample_Name  = paste(groups, family, sep="")
pheno$Sample_Group  = groups
pheno$gsm = rownames(pheno)

head(pheno)

##########################################################
# Create a minfi methylation data object	

beta_matrix = exprs(gse_eset)
head(beta_matrix)

# exclude "mother" samples

isNotMother = pheno$Sample_Group != "mo"
isNotMother

beta_matrix = beta_matrix[, isNotMother]
pheno = pheno[isNotMother,]

# remove probes with missing and extreme values

hasValueNA = rowSums( is.na(beta_matrix) ) > 0
hasValue0  = rowSums( beta_matrix == 0 , na.rm = TRUE) > 0  
hasValue1  = rowSums( beta_matrix == 1 , na.rm = TRUE) > 0 

# remove probes on sex chromosomes

probes = rownames(beta_matrix)
isChrXY = as.character(annotationTable[ probes, 'CHR']) %in% c('X', 'Y')
isNonCG = substr(probes, 1, 2) != 'cg'

cat( sum(hasValueNA), "probes have too many missing values\n")
cat( sum(hasValue0),  "probes have too many values = 0\n")
cat( sum(hasValue1),  "probes have too many values = 1\n")
cat( sum(isChrXY),    "probes are on sex chromosomes\n")
cat( sum(isNonCG),    "probes are on not CpG\n")

isGoodProbe = !hasValueNA & !hasValue0 & !hasValue1 & !isChrXY & !isNonCG
beta_matrix = beta_matrix[isGoodProbe,]

dim(beta_matrix)

colnames(beta_matrix) = pheno$Sample_Name
rownames(pheno) = pheno$Sample_Name

# Make a GenomeRatioSet object, see https://support.bioconductor.org/p/73941

require(minfiData)

tmpRSet = RatioSet(Beta = beta_matrix) 
annotation(tmpRSet) = annotation(MsetEx) 
pData(tmpRSet) = pheno

grSet = mapToGenome(tmpRSet)	

# Note: can also do: annotationTable = getAnnotation(MsetEx) 
# but beware of slightly different column names, some missing probes

##########################################################
# Find disease signature

beta_matrix = getBeta(grSet)
m_matrix = getM(grSet)
pheno = pData(grSet)

require(lumi)
m2beta
beta2m

samplesDisease = rownames(pheno)[ pheno$Sample_Group == 'ds']
samplesControl = rownames(pheno)[ pheno$Sample_Group == 'si']

# Find differentially methylated positions

dmp = dmpFinder(m_matrix, pheno$Sample_Group, type = "categorical")
head(dmp)

adjustMethod = 'bonf'

dmpCpgs = rownames(dmp)
dmp$betaDisease = rowMeans( beta_matrix[dmpCpgs, samplesDisease, drop=F]) 
dmp$betaControl = rowMeans( beta_matrix[dmpCpgs, samplesControl, drop=F]) 
dmp$deltaBeta = dmp$betaDisease - dmp$betaControl
dmp$adjPval = p.adjust(dmp$pval, method = adjustMethod)
dmp$genes = annotationTable[dmpCpgs, "UCSC_RefGene_Name"]
dmp$chr = annotationTable[dmpCpgs, "CHR"]

head(dmp)

# extract signature

pvalThresh = 0.01
deltaBetaThresh = 0.20

sigCpgs = dmpCpgs[ (dmp$adjPval < pvalThresh) & (abs(dmp$deltaBeta) > deltaBetaThresh) ]

cat("Found", length(sigCpgs), "signature CpGs:", "\n")
cat("\t", sum((dmp$pval < pvalThresh), na.rm=T), "CpGs with pval <", pvalThresh, "\n")
cat("\t", sum((dmp$adjPval < pvalThresh), na.rm=T), "CpGs with pval <", pvalThresh, "after", adjustMethod, "adjustment\n")
cat("\t", sum(abs(dmp$deltaBeta) > deltaBetaThresh, na.rm=T), "CpGs with abs(deltaBeta) >", deltaBetaThresh, "\n")
cat("\t", sum((dmp$adjPval < pvalThresh) & (abs(dmp$deltaBeta) > deltaBetaThresh), na.rm=T), "CpGs satisfy both\n")

# check signature CpG chromosomes

summary(annotationTable[sigCpgs,'CHR'])

##########################################################
# hierarchical clustering

beta_matrix = getBeta(grSet)
beta_matrix_sig = beta_matrix[sigCpgs,]


dist_global = dist( t(beta_matrix) )
hc_global = hclust(dist_global)
plot(hc_global)

dist_signature = dist( t(beta_matrix_sig))
hc_signature = hclust(dist_signature)
plot(hc_signature)

plot( as.dendrogram(hc_signature))

hc_signature = hclust(dist_signature, method="ward.D2")
plot(hc_signature)

dist_signature = dist( t(beta_matrix_sig), method = 'manhattan')
hc_signature = hclust(dist_signature, method="ward.D2")
plot(hc_signature)

# extract cluster membership

clusters_signature = cutree(hc_signature, k = 2)	
clusters_signature

##########################################################
# heat maps

# basic heatmap

heatmap(beta_matrix_sig)

# better heatmaps

require(gplots)

heatmap.2(beta_matrix_sig)
heatmap.2(beta_matrix_sig, trace="none")

# make a pretty heatmap

dendrogram = "column" # c("both","row","column","none")
method = 'manhattan' 
clustMethod = "ward.D2" 

colorRampPalette = colorRampPalette(c("blue", "white",  "orange"))(n = 100)
sampleColors = ifelse(pheno$Sample_Group == 'ds', 'red', 'turquoise')

symmetric = FALSE
reorder = TRUE


heatmap.2( 
		beta_matrix_sig, 
		dendrogram = dendrogram,
		Rowv = reorder,
		Colv=if(symmetric) "Rowv" else reorder,
		symm=symmetric, 
		symkey=F, 
		symbreaks=F,
		trace="none", 
		density.info="density", #"density" "none"
		distfun = function(x) { dist(x, method = method) },
		hclustfun = function(x) hclust(x, method = clustMethod),
		col =  colorRampPalette, 
		key.title = "DNAm scale", 
		ColSideColors = sampleColors,
		labRow = ''
)

##########################################################
# Principal component analysis plots

pca_global = prcomp(t(beta_matrix))
pca_signature = prcomp(t(beta_matrix_sig))

samples = rownames(pheno)
sampleColors = ifelse(pheno$Sample_Group == 'ds', 'red', 'turquoise')
		
par(mfrow = c(1,2))

plot(pca_global$x[,1:2], type='n', main="PCA - global")
text(pca_global$x[,1:2], samples, col = sampleColors)

plot(pca_signature$x[,1:2], type='n', main="PCA - signature")
text(pca_signature$x[,1:2], samples, col = sampleColors)

par(mfrow = c(1,1))


##########################################################
# Influence of cell type composition

require(FlowSorted.Blood.450k)
data(FlowSorted.Blood.450k.compTable)
data(FlowSorted.Blood.450k.JaffeModelPars)

head(FlowSorted.Blood.450k.JaffeModelPars)

isSuspicious = p.adjust(FlowSorted.Blood.450k.compTable$p.value, 'bonf') < 0.05
suspiciousBloodCpgs_all = rownames(FlowSorted.Blood.450k.compTable)[ isSuspicious ]

mean( rownames(grSet) %in% suspiciousBloodCpgs_all)
mean( sigCpgs %in% suspiciousBloodCpgs_all)

sigCpgs_clean = setdiff(sigCpgs, suspiciousBloodCpgs_all)
sigCpgs_clean

# Top 600 CpGs variable used in Jaffe-Irizarry cell subtype model

suspiciousBloodCpgs_top600 = rownames(FlowSorted.Blood.450k.JaffeModelPars)

mean(rownames(grSet) %in% suspiciousBloodCpgs_top600)
mean(sigCpgs %in% suspiciousBloodCpgs_top600)


##########################################################
# Classification plot - similar to (Choufani et al., 2015)

# split data into training and test sets, find a training-set signature

set.seed(1234)
trainSamples = sample( colnames(grSet), size = 20)
testSamples = setdiff(colnames(grSet), trainSamples)

dmp_train = dmpFinder(m_matrix[,trainSamples], pheno[trainSamples,]$Sample_Group, type = "categorical")
head(dmp_train)

train_sigCpgs = rownames(dmp_train)[ dmp_train$qval < 0.01]
train_sigCpgs = setdiff(train_sigCpgs, suspiciousBloodCpgs_all)

# build DNAm profiles and score the training set

trainingDisease = intersect(trainSamples, samplesDisease)
trainingControl = intersect(trainSamples, samplesControl)

sigProfileDisease = rowMedians(beta_matrix[train_sigCpgs, trainingDisease])
sigProfileControl = rowMedians(beta_matrix[train_sigCpgs, trainingControl])

coords = sapply(testSamples, function(newSample) {
    sigProfile = beta_matrix[train_sigCpgs, newSample]
    x = cor(sigProfile, sigProfileControl)
    y = cor(sigProfile, sigProfileDisease)
    c(x,y)
} )
coords = t(coords)

# plot the classification score

minCorr = .8
plot( NULL, xlim=c(minCorr,1), ylim=c(minCorr,1), 
      xlab = "Similarity to control", ylab = "Similarity to disease")
abline(a=0, b=1, col = 'pink')

colors = ifelse(pheno[testSamples, "Sample_Group"] == 'ds', 'red', 'turquoise')
text(coords, testSamples, col = colors)


##########################################################
# Batch correction

require(minfiData)

stopifnot( all(rownames(grSet) %in% rownames(MsetEx)))

# simulate a contaminant: use the last sample of MsetEx dataset

NUM_BATCH_CPGS = 10000

batchCpgs = rownames(grSet)[1:NUM_BATCH_CPGS]
contaminant = getBeta(MsetEx)[batchCpgs, 6]

# add 15-20% contamination to randomly chosen samples

set.seed(1234)
compromisedSamples = sample( colnames(grSet), size = 30)

batch_beta = getBeta(grSet)[batchCpgs,] 
for(j in compromisedSamples) {
	q = runif(n=1, min = .15, max = .20)
	batch_beta[, j] = q * contaminant + (1-q) * batch_beta[,j]
}

# PCA plot

sampleNames = colnames(batch_beta)
isCompromized = (sampleNames %in% compromisedSamples)

pca_batch_compr = prcomp(t(batch_beta))
plot(pca_batch_compr$x[,1:2], type='n', main="PCA - batch")
text(pca_batch_compr$x[,1:2], sampleNames, col = ifelse(isCompromized, 'red', 'blue'))

# deviations of contaminated data from the original data

boxplot( getBeta(grSet)[batchCpgs,] - batch_beta, col = 'grey', las=2)

# batch correction

require(sva)

batch = as.numeric(isCompromized)
modcombat = model.matrix(~ Sample_Group, data=pheno)
combat_beta = ComBat(dat=batch_beta, batch=batch, mod=modcombat)


# deviations of batch-corrected data from the original data

boxplot( getBeta(grSet)[batchCpgs,] - combat_beta,  col = 'grey', las=2)

##########################################################
# DMRs Bumps

designMatrix = model.matrix(~ Sample_Group, data = pheno)

# Explore the number of DMRs with different cutoffs, without permutations 

clean_probes = setdiff( rownames(grSet), suspiciousBloodCpgs_all)

for( cutoff in seq(.1, .5, by = .1)) {
	print(cutoff)
	dmrs = bumphunter(grSet[clean_probes,], design = designMatrix, cutoff = cutoff, B=0, type="Beta")
}

# run the bump hunting

date()
dmrs = bumphunter(grSet[clean_probes,], design = designMatrix, cutoff = .3, B=10, type="Beta")
date()

head(dmrs$table)

selected_dmr_table = subset(dmrs$table, L >= 3) 

# plot DMR -  similar to Bioconductor package DMRcate

plotDMR = function(selectedDMR, grSet) {
	
	dataTable = getBeta(grSet)
	
	# retrieve array annotations
	
	arrayCpgs = rownames(annotationTable)
	chromosomes = paste0('chr',annotationTable$CHR)
	positions = annotationTable$MAPINFO
	names(positions) = arrayCpgs
	
	# find matching array probes
	
	chr = selectedDMR[,'chr'] 
	start = selectedDMR[,'start']
	end = selectedDMR[,'end']
	
	isMatchingCpg =  (chromosomes == chr) & (positions >= start) & (positions <= end)
	
	# match to the CpGs in the data and order by chromosomal position
	
	matchingCpgs = arrayCpgs [isMatchingCpg]
	matchingCpgs = intersect(matchingCpgs, rownames(dataTable))
	matchingCpgs = matchingCpgs[ order(positions[matchingCpgs]) ]
	
	matchingPositions = positions[matchingCpgs]
	
	# get a list of matching genes
	
	genes = as.character(annotationTable[matchingCpgs,'UCSC_RefGene_Name'])
	geneName = paste(sort(unique(unlist(strsplit(genes, ';')))), collapse = ', ')

	cat("Found ", length(matchingCpgs), "CpGs in the DMR\n")
	cat("REGION:", chr, start, end, '\n', "\tCPGS:", matchingCpgs, '\n')
	cat("\tGenes: ", geneName)
	
	# plot the DMR
	
	colorDisease = rgb(1,0,0, .5)
	colorControl = rgb(0,0,1, .5)
		
	plot(NULL, xlim=c(start, end), ylim=c(0,1), xlab  = paste0("DMR on ", chr, ": ", geneName) , ylab = "DNAm")
	
	# plot each CpG
	
	for(i in 1:length(matchingCpgs)) {
		cpg = matchingCpgs[i]
		pos = matchingPositions[i]
		points( cbind( pos, dataTable[cpg, samplesDisease]), pch=19, col=colorDisease)
		points( cbind( pos, dataTable[cpg, samplesControl]), pch=4,  col=colorControl)
	}
	
	# plot averages as lines
	
	lines(matchingPositions, rowMeans(dataTable[matchingCpgs, samplesDisease]), col=colorDisease)
	lines(matchingPositions, rowMeans(dataTable[matchingCpgs, samplesControl]), col=colorControl)
}

plotDMR( selected_dmr_table[1,], grSet)
plotDMR( selected_dmr_table[2,], grSet)

##########################################################
# End of script