### Tutorial 1 Answers.R ######################################################
# We are analyzing the GSE17172 dataset and looking to understand univariate
# data-analysis techniques.

### PREAMBLE ##################################################################
# step one is to tell R where the data is actually stored
setwd('C:/Documents and Settings/pboutros/Desktop/GSE17172/');

# next we load the affy library from BioConductor
library(affy);

### LOAD DATA #################################################################
# I personally find it easiest to make a vector of the files to be loaded
files.to.load <- c(
	'GSM429557_ST486_Fox1.CEL',
	'GSM429558_ST486_Fox2.CEL',
	'GSM429559_ST486_Fox3.CEL',
	'GSM429560_ST486_MYB1.CEL',
	'GSM429561_ST486_MYB2.CEL',
	'GSM429562_ST486_MYB3.CEL',
	'GSM429563_ST486_NT1.CEL',
	'GSM429564_ST486_NT2.CEL',
	'GSM429565_ST486_NT3.CEL'
	);

# You can then pass this character vector to ReadAffy to load the files. This
# approach makes life much easier if you want to only load a small fraction of
# the dataset at a later time.
raw.data <- ReadAffy(
	filenames = files.to.load,
	phenoData = 'phenodata.txt'
	);
# Note that R may go and download the CDF file, which annotates CEL files with
# useful information here. Also note that I specified the phenoData argument
# to tell R which samples are controls and which are experimental.

### PRE-PROCESS DATA: RMA #####################################################
# We will use the expresso() function here, although there are actually a
# couple of alternative ways to use RMA on a dataset like this (justRMA, rma).
# expresso() is by far the most flexible technique, though.  R may need to
# download another file at this step.
preprocessed.data <- expresso(
	afbatch = raw.data,
	bgcorrect.method = 'rma',
	normalize.method = 'quantiles',
	pmcorrect.method = 'pmonly',
	summary.method = 'medianpolish',
	);

# Note that all the arguments in this function are case-sensitive. Also note the
# way we are systematically ordering our code -- consistent indenting and variable
# naming is critical to making your work easy to interpret in the future. It reduces
# the likelihood of bugs and programming errors, and makes it easier for a stranger
# to read and maintain your work.

### USING AN ALTERNATIVE CDF ##################################################
# To reprocess the data using an alternative CDF we have to reload it using that CDF
# The first step is to go to the Brainarray website, download the CDF, and install
# it into your R session. Next, you reload the raw data, specifying this new mapping:
raw.data.alt <- ReadAffy(
	filenames = files.to.load,
	phenoData = 'phenodata.txt',
	cdfname = 'hgu95av2hsentrezgcdf'
	);

# And we can re-pre-process this new dataset using expresso() again:
preprocessed.data.alt <- expresso(
	afbatch = raw.data.alt,
	bgcorrect.method = 'rma',
	normalize.method = 'quantiles',
	pmcorrect.method = 'pmonly',
	summary.method = 'medianpolish',
	);

### ASSESS DATA QUALITY #######################################################
# We can start off by making a simple histogram of the raw data
hist(raw.data);

# But in general we want to save all figures to file. This has the advantage
# of providing a permanent record of our results. It only takes a couple of
# lines of code, but it can often save re-running scripts multiple times.
jpeg(
	filename = 'GSE17172-defaultCDF-rawdata-histogram.jpg',
	height = 800,
	width = 800
	);
hist(raw.data);
dev.off();

# Similarly we can look at the normalized data
jpeg(
	filename = 'GSE17172-defaultCDF-RMAdata-density.jpg',
	height = 800,
	width = 800
	);
plotDensity(exprs(preprocessed.data));
dev.off();

# And we can do the same for data processed using the alternative CDF
jpeg(
	filename = 'GSE17172-alternativeCDF-rawdata-histogram.jpg',
	height = 800,
	width = 800
	);
hist(raw.data.alt);
dev.off();

jpeg(
	filename = 'GSE17172-alternativeCDF-RMAdata-density.jpg',
	height = 800,
	width = 800
	);
plotDensity(exprs(preprocessed.data.alt));
dev.off();

# You can also combine these two plots quite easily
jpeg(
	filename = 'GSE17172-alternativeCDF-alldata-density.jpg',
	height = 800,
	width = 1600
	);
par(mfrow = c(1,2));
hist(raw.data.alt);
plotDensity(exprs(preprocessed.data.alt));
dev.off();

# Making a heatmap can be tricky -- there are a lot of parameters and
# options that can be set. In general heatmap.2() and levelplot() do
# a slightly better job than the default heatmap function in R, but
# there are a lot of customizations possible with any of these approaches.
# The first step to creating the heatmap is to create the correlation
# matrix:
correlation.matrix <- cor(exprs(preprocessed.data), method = 'spearman');
# Note that here we used a Spearman correlation, but it is trivial to change
# to a different correlation metric.  A rank-order correlation (like Spearman)
# tends to do a better job in discriminating large vectors.
# Note also that we used the exprs() function -- this is a function that
# takes the object format produces by expresso() and extracts just the
# preprocessed data in a more user-friendly format. In particular, the format
# it generates is often called an "expression matrix".  An expression matrix
# has samples as columns and genes/features as rows.

# Next we can make a simple heatmap directly with:
heatmap(correlation.matrix);

# However there are a number of improvements we can make here:

# First we stop scaling the data so the principle diagonal is equivalent
heatmap(correlation.matrix, scale = 'none');

# Next we add a main title
heatmap(
	correlation.matrix,
	scale = 'none',
	main = 'Correlation Map - RMA + Default CDF'
	);

# Then we add easier labels to the rows
heatmap(
	correlation.matrix,
	scale = 'none',
	main = 'Correlation Map - RMA + Default CDF',
	labRow = pData(preprocessed.data)$Type
	);

# And we can remove the column labels
heatmap(
	correlation.matrix,
	scale = 'none',
	main = 'Correlation Map - RMA + Default CDF',
	labRow = pData(preprocessed.data)$Type,
	labCol = NA
	);

# Lastly save this to file
jpeg(
	filename = 'GSE17172-defaultCDF-RMAdata-heatmap.jpg',
	height = 800,
	width = 1600
	);
heatmap(
	correlation.matrix,
	scale = 'none',
	main = 'Correlation Map - RMA + Default CDF',
	labRow = pData(preprocessed.data)$Type,
	labCol = NA
	);
dev.off();

# And we can make the same for the alternative CDF
correlation.matrix.alt <- cor(exprs(preprocessed.data.alt), method = 'spearman');

jpeg(
	filename = 'GSE17172-alternativeCDF-RMAdata-heatmap.jpg',
	height = 800,
	width = 1600
	);
heatmap(
	correlation.matrix.alt,
	scale = 'none',
	main = 'Correlation Map - RMA + Default CDF',
	labRow = pData(preprocessed.data.alt)$Type,
	labCol = NA
	);
dev.off();

### UNIVARIATE STATISTICAL ANALYSIS ###########################################
# The first step in doing this analysis is to create a place to store the
# resulting set of p-values. You can do this with a vector, but first we
# "localize" the data -- that is get rid of the function-call syntax:
expression.data <- exprs(preprocessed.data);
pvalues.fox.default.rma <- rep(NA, nrow(expression.data));

# We can then do a simple loop over the data
for (i in 1:nrow(expression.data)) {
	pvalues.fox.default.rma[i] <- t.test(
		x = expression.data[i,pData(preprocessed.data)$Type == 'Fox1'],
		y = expression.data[i,pData(preprocessed.data)$Type == 'Control']
		)$p.value;
	}

# We can repeat this almost exactly for Myb:
pvalues.myb.default.rma <- rep(NA, nrow(expression.data));

for (i in 1:nrow(expression.data)) {
	pvalues.myb.default.rma[i] <- t.test(
		x = expression.data[i,pData(preprocessed.data)$Type == 'Myb1'],
		y = expression.data[i,pData(preprocessed.data)$Type == 'Control']
		)$p.value;
	}

# we repeat this for the alternative CDF, but now merging things into one loop
expression.data.alt <- exprs(preprocessed.data.alt);
pvalues.fox.alternative.rma <- rep(NA, nrow(expression.data.alt));
pvalues.myb.alternative.rma <- rep(NA, nrow(expression.data.alt));

for (i in 1:nrow(expression.data.alt)) {
	pvalues.fox.alternative.rma[i] <- t.test(
		x = expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Fox1'],
		y = expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Control']
		)$p.value;
	pvalues.myb.alternative.rma[i] <- t.test(
		x = expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Myb1'],
		y = expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Control']
		)$p.value;
	}

# adjusting for multiple testing is straight-forward
qvalues.fox.default.rma <- p.adjust(pvalues.fox.default.rma, method = 'fdr');
qvalues.myb.default.rma <- p.adjust(pvalues.myb.default.rma, method = 'fdr');
qvalues.fox.alternative.rma <- p.adjust(pvalues.fox.alternative.rma, method = 'fdr');
qvalues.myb.alternative.rma <- p.adjust(pvalues.myb.alternative.rma, method = 'fdr');

# plots are also relatively simple: let's put four histograms on one page
jpeg(
	filename = 'GSE17172-defaultCDF-statistics-histograms.jpg',
	height = 1600,
	width = 1600
	);
par(mfrow = c(2,2));
hist(pvalues.fox.default.rma, main = 'FoxM1 P-values', las = 1, breaks = seq(0,1,0.1));
hist(qvalues.fox.default.rma, main = 'FoxM1 Q-values', las = 1, breaks = seq(0,1,0.1));
hist(pvalues.myb.default.rma, main = 'Myb P-Values', las = 1, breaks = seq(0,1,0.1));
hist(qvalues.myb.default.rma, main = 'Myb Q-Values', las = 1, breaks = seq(0,1,0.1));
dev.off();

# and next let's compare naive to adjusted p-values
jpeg(
	filename = 'GSE17172-defaultCDF-statistics-scatterplots.jpg',
	height = 800,
	width = 1600
	);
par(mfrow = c(1,2));
plot(
	x = pvalues.fox.default.rma,
	y = qvalues.fox.default.rma,
	main = 'FoxM1',
	las = 1
	);
plot(
	x = pvalues.myb.default.rma,
	y = qvalues.myb.default.rma,
	main = 'Myb',
	las = 1
	);
dev.off();

### COMPARE EXPERIMENTAL CONDITIONS ###########################################
# To compare the data directly is similar to what we did above. We simply
# create scatterplots that compare p-values between the two conditions.
jpeg(
	filename = 'GSE17172-bothCDF-RMA-conditioncomparison.jpg',
	height = 800,
	width = 1600
	);
par(mfrow = c(1,2));
plot(
	x = pvalues.fox.default.rma,
	y = pvalues.myb.default.rma,
	main = 'Default',
	las = 1
	);
plot(
	x = pvalues.fox.alternative.rma,
	y = pvalues.myb.alternative.rma,
	main = 'Alternative',
	las = 1
	);
dev.off();

# Similarly it is straight-forward to create Venn diagrams
library(VennDiagram);
venn.diagram(
	list(
		Fox = rownames(expression.data)[pvalues.fox.default.rma < 0.05],
		Myb = rownames(expression.data)[pvalues.myb.default.rma < 0.05]
		),
	filename = 'GSE17172-defaultCDF-RMA-conditionVenn-05.tiff'
	);

venn.diagram(
	list(
		Fox = rownames(expression.data)[pvalues.fox.default.rma < 0.01],
		Myb = rownames(expression.data)[pvalues.myb.default.rma < 0.01]
		),
	filename = 'GSE17172-defaultCDF-RMA-conditionVenn-01.tiff'
	);

venn.diagram(
	list(
		Fox = rownames(expression.data)[pvalues.fox.default.rma < 0.001],
		Myb = rownames(expression.data)[pvalues.myb.default.rma < 0.001]
		),
	filename = 'GSE17172-defaultCDF-RMA-conditionVenn-001.tiff'
	);

venn.diagram(
	list(
		Fox = rownames(expression.data.alt)[pvalues.fox.alternative.rma < 0.05],
		Myb = rownames(expression.data.alt)[pvalues.myb.alternative.rma < 0.05]
		),
	filename = 'GSE17172-alternativeCDF-RMA-conditionVenn-05.tiff'
	);

venn.diagram(
	list(
		Fox = rownames(expression.data.alt)[pvalues.fox.alternative.rma < 0.01],
		Myb = rownames(expression.data.alt)[pvalues.myb.alternative.rma < 0.01]
		),
	filename = 'GSE17172-alternativeCDF-RMA-conditionVenn-01.tiff'
	);

venn.diagram(
	list(
		Fox = rownames(expression.data.alt)[pvalues.fox.alternative.rma < 0.001],
		Myb = rownames(expression.data.alt)[pvalues.myb.alternative.rma < 0.001]
		),
	filename = 'GSE17172-alternativeCDF-RMA-conditionVenn-001.tiff'
	);

### CALCULATE FOLDCHANGES PER GENE ############################################
# Again, we pre-create variables to store the data
foldchanges.fox <- rep(NA, nrow(expression.data));
foldchanges.myb <- rep(NA, nrow(expression.data));

for (i in 1:nrow(expression.data)) {
	foldchanges.fox[i] <- mean(expression.data[i,pData(preprocessed.data)$Type == 'Fox1']) -
		mean(expression.data[i,pData(preprocessed.data)$Type == 'Control']);
	foldchanges.myb[i] <- mean(expression.data[i,pData(preprocessed.data)$Type == 'Myb1']) -
		mean(expression.data[i,pData(preprocessed.data)$Type == 'Control']);
	}

# And nearly identical code for the alternative CDF
foldchanges.fox.alt <- rep(NA, nrow(expression.data.alt));
foldchanges.myb.alt <- rep(NA, nrow(expression.data.alt));

for (i in 1:nrow(expression.data.alt)) {
	foldchanges.fox.alt[i] <- mean(expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Fox1']) -
		mean(expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Control']);
	foldchanges.myb.alt[i] <- mean(expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Myb1']) -
		mean(expression.data.alt[i,pData(preprocessed.data.alt)$Type == 'Control']);
	}

# Volcano plots can be easily created now
jpeg(
	filename = 'GSE17172-defaultCDF-statistics-volcanoplots.jpg',
	height = 800,
	width = 1600
	);
par(mfrow = c(1,2));
plot(
	x = foldchanges.fox,
	y = -log10(pvalues.fox.default.rma),
	main = 'FoxM1',
	las = 1
	);
plot(
	x = foldchanges.myb,
	y = -log10(pvalues.myb.default.rma),
	main = 'Myb',
	las = 1
	);
dev.off();

# And of course very similar code for the alternative CDF
jpeg(
	filename = 'GSE17172-alternativeCDF-statistics-volcanoplots.jpg',
	height = 800,
	width = 1600
	);
par(mfrow = c(1,2));
plot(
	x = foldchanges.fox.alt,
	y = -log10(pvalues.fox.alternative.rma),
	main = 'FoxM1',
	las = 1
	);
plot(
	x = foldchanges.myb.alt,
	y = -log10(pvalues.myb.alternative.rma),
	main = 'Myb',
	las = 1
	);
dev.off();