##Load the data library("airway") #to find out where on your computer the files from a package have been installed. indir <- system.file("extdata", package="airway", mustWork=TRUE) list.files(indir) #read the targets file of the experiment csvfile <- file.path(indir, "sample_table.csv") sampleTable <- read.csv(csvfile, row.names = 1) sampleTable ##store the bam files in object filenames filenames <- file.path(indir, paste0(sampleTable$Run, "_subset.bam")) file.exists(filenames) filenames ##Indicate that these are bam files library("Rsamtools") bamfiles <- BamFileList(filenames, yieldSize=2000000) ##Defining gene models library("GenomicFeatures") gtffile <- file.path(indir,"Homo_sapiens.GRCh37.75_subset.gtf") txdb <- makeTxDbFromGFF(gtffile, format = "gtf", circ_seqs = character()) txdb ##resume the exons by gene ebg <- exonsBy(txdb, by="gene") ebg ##Generating count Matrices library("GenomicAlignments") se <- summarizeOverlaps(features=ebg, reads=bamfiles, mode="Union", singleEnd=FALSE, ignore.strand=TRUE, fragments=TRUE ) ##See the count matrix se assay(se) #dimensions of the count matrix dim(se) ############################################################################# ##START FROM THE COMPLETE DATA ############################################################################# data("airway") se <- airway dim(se) #reorder the levels library(magrittr) se$dex %<>% relevel("untrt") se$dex ##Milions of fragments aligned to the genes round( colSums(assay(se)) / 1e6, 1 ) ##Check that the object is correct colData(se) ##Make a DESeqDataSet object library("DESeq2") dds <- DESeqDataSet(se, design = ~ cell + dex) ##Filter out those rows without any count nrow(dds) dds <- dds[ rowSums(counts(dds)) > 1, ] nrow(dds) ##variance stabilizing transformation (VST) vsd <- vst(dds, blind = FALSE) head(assay(vsd), 3) ##Sample distances. sampleDists <- dist(t(assay(vsd))) sampleDists ##Heatmap + Hierarchical clustering library("pheatmap") library("RColorBrewer") sampleDistMatrix <- as.matrix( sampleDists ) rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " ) colnames(sampleDistMatrix) <- NULL colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists, clustering_distance_cols = sampleDists, col = colors) ##Principal Component Analysis Plot plotPCA(vsd, intgroup = c("dex", "cell")) ##Differential Expression Analysis dds <- DESeq(dds) ##building results table res <- results(dds, contrast=c("dex","trt","untrt")) ##Toptable head(res[order(res$padj),]) ##Information about the columns of the results table mcols(res, use.names = TRUE) ##Subset the significant genes with strongest downregulation resSig <- subset(res, padj < 0.1) head(resSig[ order(resSig$log2FoldChange), ]) ##Subset the significant genes with strongest upregulation head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ]) ##Heatmap of genes library("genefilter") topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20) mat <- assay(vsd)[topVarGenes, ] mat <- mat - rowMeans(mat) anno <- as.data.frame(colData(vsd)[, c("cell","dex")]) pheatmap(mat, annotation_col = anno) ##Results annotation library("org.Hs.eg.db") library("AnnotationDbi") res$symbol <- mapIds(org.Hs.eg.db, keys=row.names(res), column="SYMBOL", keytype="ENSEMBL", multiVals="first") res$entrez <- mapIds(org.Hs.eg.db, keys=row.names(res), column="ENTREZID", keytype="ENSEMBL", multiVals="first") resOrdered <- res[order(res$pvalue),] head(resOrdered) ##Exporting the results resOrderedDF <- as.data.frame(resOrdered)[1:100, ] write.csv(resOrderedDF, file = "results.csv")