###########################################################################
##
## Basic Data Manipulation
##
###########################################################################

## ----echo=FALSE----------------------------------------------------------
fname <- "vignettes/ALLphenoData.tsv"
stopifnot(file.exists(fname))
pdata <- read.delim(fname)

## ----ALL-properties------------------------------------------------------
class(pdata)
colnames(pdata)
dim(pdata)
head(pdata)
summary(pdata$sex)
summary(pdata$cyto.normal)

## ----ALL-subset----------------------------------------------------------
pdata[1:5, 3:4]
pdata[1:5, ]
head(pdata[, 3:5])
tail(pdata[, 3:5], 3)
head(pdata$age)
head(pdata$sex)
head(pdata[pdata$age > 21,])

## ----ALL-subset-NA-------------------------------------------------------
idx <- pdata$sex == "F" & pdata$age > 40
table(idx)
dim(pdata[idx,])

## ----ALL-BCR/ABL-subset--------------------------------------------------
bcrabl <- pdata[pdata$mol.biol %in% c("BCR/ABL", "NEG"),]

## ----ALL-BCR/ABL-drop-unused---------------------------------------------
bcrabl$mol.biol <- factor(bcrabl$mol.biol)

## ----ALL-BT--------------------------------------------------------------
levels(bcrabl$BT)

## ----ALL-BT-recode-------------------------------------------------------
table(bcrabl$BT)
levels(bcrabl$BT) <- substring(levels(bcrabl$BT), 1, 1)
table(bcrabl$BT)

## ----ALL-BCR/ABL-BT------------------------------------------------------
xtabs(~ BT + mol.biol, bcrabl)

## ----ALL-aggregate-------------------------------------------------------
aggregate(age ~ mol.biol + sex, bcrabl, mean)

## ----ALL-age-------------------------------------------------------------
t.test(age ~ mol.biol, bcrabl)
boxplot(age ~ mol.biol, bcrabl)

##########################################################################
##
## Annotation and sequence data
##
##########################################################################

library(airway)
data(airway)
airway
class(airway)
colData(airway)
rowRanges(airway)
library(org.Hs.eg.db)
ensid <- head(rownames(airway))
mapIds(org.Hs.eg.db, ensid, "SYMBOL", "ENSEMBL")
keytypes(org.Hs.eg.db)

## ----txdb----------------------------------------------------------------
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
exons(txdb)
exonsBy(txdb, "tx")
p <- promoters(txdb)

## ----BSgenome------------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)
bsgenome <- BSgenome.Hsapiens.UCSC.hg19
ps <- getSeq(bsgenome, p)
ps
gccontent = letterFrequency(ps,'GC',as.prob=TRUE)
hist(gccontent)


###########################################################################
##
## GEOquery and gene expression dataset
##
###########################################################################


browseURL('http://www.ncbi.nlm.nih.gov/geo/')

browseURL('http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE2553')

## ----geoquery,eval=FALSE-------------------------------------------------
## source('http://bioconductor.org/biocLite.R')
## biocLite('GEOquery')


## ----geoqueryget, eval=FALSE---------------------------------------------
## require(GEOquery)
## eset = getGEO('GSE2553')[[1]]
## eset


## ----realgeoqueryget, echo=FALSE-----------------------------------------
require(GEOquery)
eset = getGEO('GSE2553',destdir=getwd())[[1]]
eset


## ----eset1---------------------------------------------------------------
class(eset)
dim(eset)


## ----assaydata,eval=FALSE------------------------------------------------
## head(exprs(eset))
## dim(exprs(eset))


## ----assaydata10,echo=FALSE----------------------------------------------
exprs(eset)[1:10,1:6]
dim(exprs(eset))


## ----corr----------------------------------------------------------------
# two samples of different histologies
cor(exprs(eset)[,1],exprs(eset)[,2],use='complete.obs')
# two samples of the same histology
cor(exprs(eset)[,2],exprs(eset)[,14],use='complete.obs')


## ----heatmap-------------------------------------------------------------
# calculate gene-wise standard deviations
sds = apply(exprs(eset),1,sd)
library(ComplexHeatmap)
annotation = HeatmapAnnotation(df = data.frame(histotype = pData(eset)$source_name_ch1))
png('heatmap.png',width=1500,height=900)
Heatmap(exprs(eset)[order(sds,decreasing=TRUE)[1:800],],top_annotation = annotation)
dev.off()
system('open heatmap.png')

###########################################################################
##
## FASTQ exercises
##
###########################################################################

## ----bioclite,eval=FALSE-------------------------------------------------
## source('http://bioconductor.org/biocLite.R')
## biocLite('ShortRead')


## ----loadshortread,warning=FALSE,message=FALSE---------------------------
library(ShortRead)


## ----findfastq-----------------------------------------------------------
# I just know from previous looking that the fastq files are here
fastqDir = system.file(package='ShortRead','extdata/E-MTAB-1147')


## ----readfastq-----------------------------------------------------------
fq = readFastq(dirPath = fastqDir,pattern='*.fastq.gz')
fq


## ----examinequals--------------------------------------------------------
quality(fq)


## ----matrixquals---------------------------------------------------------
m = as(quality(fq),'matrix')
head(m)


## ----heatmap-------------------------------------------------------------
boxplot(m[1:5000,],col=c('red','blue'))


## ----alphabetbycycle-----------------------------------------------------
m = alphabetByCycle(sread(fq))
head(m[,1:8])


## ----matplot-------------------------------------------------------------
ms = t(m[c('A','C','G','T','N'),])
matplot(ms,type='l',xlab='cycle',ylab='count',main='Alphabet by Cycle')


##########################################################################
##
## GVIZ
##
##########################################################################

library(Gviz)
data(cpgIslands)
chr <- as.character(unique(seqnames(cpgIslands)))
gen <- genome(cpgIslands)
atrack <- AnnotationTrack(cpgIslands, name = "CpG")
gtrack <- GenomeAxisTrack()
itrack <- IdeogramTrack(genome = gen, chromosome = chr)
data(geneModels)
grtrack <- GeneRegionTrack(geneModels, genome = gen,
                          chromosome = chr, name = "Gene Model")
plotTracks(list(itrack, gtrack, atrack, grtrack))