# ================================================== # # Canadian Bioinformatics Workshops Series # # Toronto, May 21 and 22 2015 # # Exploratory Analysis of Biological Data using R # # # # Faculty: Boris Steipe # # # # Some prior contributions by: # # Raphael Gottardo, FHCRC # # Sohrab Shah, UBC # # # # # # Module 1: EDA (Exploratory Data Analysis) # # # # ================================================== # # ================================================== # Setup a project and work-environment # ================================================== # Task: # 1 - Create a project directory on your computer, # call it "CBW". # 2 - Open R studio, and use the menu to create # a new project in this directory. # 3 - Open a new R script file. # 4 - Open this file on the workshop wiki. Copy its # contents and paste it into your script file. # 5 - Save the script under a name of 01-EDA.R # The code and comments you write during this # workshop will all be kept together in this file. # 6 - Also load PlottingReference.R # 7 - Open a third script file, call this "tmp.R"; # use this script as a scratchpad. # 8 - Set the current "working directory" to your CBW # workshop directory. # 9 - Confirm: getwd() # Confirm the correct directory dir() # Confirm that the right files are present. # Optional task: # If you are waiting for others to finish, try the # following: # 1 - Install/load the "RUnit" package. # 2 - Explore the vignettes it contains. # 3 - Explore the functions it contains. # ================================================== # Utilities # ================================================== # It's useful to keep some utility functions loaded # for your project. # In yesterday's course we have used the following # function a lot: typeInfo <- function(x) { print(x, digits=22) cat("str: ") str(x) cat("mode: ", mode(x), "\n") cat("typeof: ", typeof(x), "\n") cat("class: ", class(x), "\n") # if there are attributes, print them too if (! is.null(attributes(x))) { cat("attributes:\n") print(attributes(x)) } } # Task: # 1 - If you haven't done so already, put typeInfo() into # a file "utilities.R" in your working directory. # 2 - Then: source("utilities.R") # ================================================== # Load Data # ================================================== # Task: # In yesterday's workshop we have worked with a # a supplementary datafile from a 2014 publication # on single-cell RNAseq for the automatic definition # of tissue types. The data consists of gene-names, # Expression values of clustered cells in the # presence and absence of LPS stimulation, and # cluster assignment labels. The corresponding # paper is on th Wiki. () # Task: # 1 - If you don't have the file yet, download the # .csv version of the file from the Workshop # Wiki. # 2 - Read it into R with the following command: sup3 <- read.csv("table_S3.csv", skip = 6, header = FALSE, colClasses = c("character", rep("numeric", 15)), col.names = c("genes", "B.ctrl", "B.LPS", "MF.ctrl", "MF.LPS", "NK.ctrl", "NK.LPS", "Mo.ctrl", "Mo.LPS", "pDC.ctrl", "pDC.LPS", "DC1.ctrl", "DC1.LPS", "DC2.ctrl", "DC2.LPS", "cluster"), stringsAsFactors = FALSE) str(sup3) head(sup3) # ================================================== # Descriptive statistics # ================================================== set.seed(100) x <- rnorm(100, mean=0, sd=1) mean(x) median(x) IQR(x) var(x) sd(x) summary(x) # Task: # 1 - What would be interesting characterizations # of sup3? Explore them. # ============================================= # Quantiles: what is the threshold that has a # given fraction of values above/below it? x <- seq(-4, 4, 0.1) f <- dnorm(x, mean=0, sd=1) q90 <- qnorm(0.90, mean = 0, sd = 1) plot(x, f, xlab="x", ylab="density", type="l", lwd=5) abline(v=q90, col=2, lwd=5) # ============================================= # empirical quantiles set.seed(100) x <- rnorm(100, mean=0, sd=1) quantile(x) quantile(x, probs=c(.1, .2, .9)) # ============================================= set.seed(100) x <- rnorm(1000, mean=0, sd=1) boxplot(x) # Careful - a boxplot per se can obscure # important structure in your data. Consider # for example this bimodal distribution x <- rnorm(100, mean=-2, sd=1) x <- c(x, rnorm(100, mean=2, sd=1)) hist(x) boxplot(x) # Violin plot if (!require(ggplot2, quietly=TRUE)) { install.packages("ggplot2") library(ggplot2) } X <- as.data.frame(x) p <- ggplot(X, aes(1,x)) p + geom_violin() # See ggplot2 introductory tutorial at # http://www.r-bloggers.com/basic-introduction-to-ggplot2/ # and violin plot documentation at # http://docs.ggplot2.org/current/geom_violin.html # Plotting more than one column with a boxplot # places the plots side by side. colnames(sup3) boxplot(sup3[ ,c("MF.ctrl", "MF.LPS")]) boxplot(sup3[ ,2:14]) # Task: # 1 - What would be interesting quantiles # and boxplots in sup3? Explore this. # Interpret the result. # ================================================== # Explore plot types # (Section 1 of PlottingReference.R) # ================================================== # ================================================== # Probability distributions # ================================================== # The binomial distribution ?dbinom x <- 0:1 f <- dbinom(x, size=1, 0.1) plot(x, f, xlab="x", ylab="density", type="h", lwd=5) set.seed(100) x <- rbinom(100, size=1, prob=.1) hist(x) # ================================================== # The Normal distribution # ================================================== ?dnorm x <- seq(-4, 4, 0.1) f <- dnorm(x, mean=0, sd=1) plot(x, f, xlab="x", ylab="density", lwd=5, type="l") # ================================================== # Explore lines (Section 3 of PlottingReference.R) # ================================================== x <- seq(-4, 4, 0.1) f <- dnorm(x, mean=0, sd=1) plot(x, f, xlab="x", ylab="density", lwd=5, lty=3, type="l") # ============================================= # Overlay a histogram with a line set.seed(100) x <- rnorm(100, mean=0, sd=1) hist(x) lines(seq(-3,3,0.1),50*dnorm(seq(-3,3,0.1)), col="red") # ==== QQ PLOTS ==================================== set.seed(100) x <- rnorm(100, mean=0, sd=1) qqnorm(x) qqline(x, col=2) # ============================================= # Plotting lines and legends # Example: compare the normal distribution with # the t-distribution x <- seq(-4, 4, 0.1) f1 <- dnorm(x, mean=0, sd=1) f2 <- dt(x, df=2) plot(x, f1, xlab="x", ylab="density", lwd=5, type="l") lines(x, f2, lwd=5, col=2) legend(-4, .4, c("Normal", "t2"), col=1:2, lwd=5) # ============================================= # use qqplot to compare normally distributed samples with # t-distributed samples set.seed(100) t <- rt(100, df=2) qqnorm(t) qqline(t, col=2) # ============================================= # QQ- plot: sample against sample set.seed(100) x <- rnorm(100, mean=0, sd=1) t <- rt(100, df=2) qqplot(x, t) # Task: # 1 - What columns of sup3 could be compared with # a qqplot? Explore this. Interpret the result. # ================================================== # Flow cytometry data # ================================================== # Task: # 1 - Download the data file GvHD.txt and save it to # your project directory. # GvHD flow cytometry data gvhd <- read.table("GvHD.txt", header=TRUE) # Only extract the CD3 positive cells gvhdCD3p <- as.data.frame(gvhd[gvhd[, 5]>280, 3:6]) plot(gvhdCD3p[, 1:2]) # ================================================== # Explore scatter plots # Topics: # Section 6 - Plotting symbols and characters # 2 - Colors # 4 - Coordinates # ... of PlottingReference.R) # ================================================== # Scatter plots are extremely useful, but learning # a bit about R's plotting capabilities will help # tremendously to create INFORMATIVE plots. # Some special packages # The overlap in the GvHD data can obscure # data relationships. Here are some alternatives # for dense plots: # load "hexbin" package from CRAN if (!require(hexbin, quietly=TRUE)) { install.packages("hexbin") library(hexbin) } # variant one: hexbin hb <- hexbin(gvhdCD3p[, 1:2], xbins = 20) plot(hb, colramp = colorRampPalette(c("#FFFFDD", "#77EE99", "#3377AA", "#0033BB"))) # load "prada" package from BioConductor if (!require(prada, quietly=TRUE)) { source("http://www.bioconductor.org/biocLite.R") biocLite("prada") } # variant two: smoothScatter smoothScatter(gvhdCD3p[, 1:2], nrpoints=50, pch=20, cex=0.5, col="#6633BB55") # variant three: colors vary by density plot (gvhdCD3p[, c(1,2)], col=densCols(gvhdCD3p[, 1], gvhdCD3p[, 2]), pch=16) # Task: # 1 - Create a scatterplot from differences in # LPS activation between a pair of cell types in sup3. # 2 - Add a line that shows the situation if all # activations were equal. # 3 - Redo the plot with density Coloring plot(sup3[ ,"Mo.ctrl"] - sup3[ ,"Mo.LPS"], sup3[ ,"MF.ctrl"] - sup3[ ,"MF.LPS"]) abline(0,1,col = "red") # ============================================= # Trellis plots: all against all plot(gvhdCD3p, pch=".") # ============================================= boxplot(gvhdCD3p) # ============================================= oPar <- par(mfrow=c(2, 2)) hist(gvhdCD3p[, 1], 50, main=names(gvhdCD3p)[1], xlab="fluorescent intensity", ylim=c(0, 120)) hist(gvhdCD3p[, 2], 50, main=names(gvhdCD3p)[2], xlab="fluorescent intensity", ylim=c(0, 120)) hist(gvhdCD3p[, 3], 50, main=names(gvhdCD3p)[3], xlab="fluorescent intensity", ylim=c(0, 120)) hist(gvhdCD3p[, 4], 50, main=names(gvhdCD3p)[4], xlab="fluorescent intensity", ylim=c(0, 120)) par(oPar) # ============================================= # ================================================== # Explore 3D plots # # Section 8 - 8 - Plots of X-Y-Z coordinates # ... of PlottingReference.R) # ================================================== # [End]