--- title: "01: Introduction to _R_ and _Bioconductor_" author: - name: Martin Morgan affiliation: Roswell Park Comprehensive Cancer Center date: "`r format(Sys.time(), '%B %d, %Y')`" vignette: > %\VignetteIndexEntry{01: Introdction to R and Bioconductor} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} output: BiocStyle::html_document: number_sections: yes toc: true --- ```{r style, echo = FALSE, results = 'asis'} knitr::opts_chunk$set( eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")), cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")) ) options(width = 75) ``` # _R_ ## Vectors 'atomic' vectors - `logical()`, `integer()`, `numeric()`, `character()`, ... ```{r} x <- c(1, 3, 5) y <- 1:5 ``` - Interface: `length()`, `[`, `[<-` lists - Interface: `length()`, `[`, `[<-`, `[[`, `[[<-`, `$`, `$<-` Trouble! - `NA` ```{r} x <- c(TRUE, FALSE, NA) outer(x, x, `&`) outer(x, x, `|`) ``` - Factors ```{r} x <- c("Male", "Female", "male") gender <- factor(x, levels=c("Female", "Male")) gender ``` ## Objects The `data.frame` ```{r} x <- rnorm(100) hist(x) y <- x + rnorm(100) df <- data.frame(x, y) plot(y ~ x, df) ``` Basic `data.frame` operations - column access `$`, `[[` - Two-dimensional subsetting `[` ```{r} ridx <- df$x > 0 table(ridx) plot(y ~ x, df[ridx, ]) ``` More complicated objects and the methods that act on them ```{r} fit <- lm(y ~ x, df) class(fit) anova(fit) plot(y ~ x, df) abline(fit) ``` ## Packages ```{r, messgae = FALSE} library(ggplot2) ``` ```{r} ggplot(df, aes(x, y)) + geom_point() + geom_smooth(method="lm") ``` # Tidy _R_ ## Using the 'tidyverse' `readr` for data input ```{r, message = FALSE} library(readr) ``` ```{r, eval = FALSE} pdata_file <- file.choose() # ALL-sample-sheet.csv ``` ```{r, echo = FALSE} pdata_file <- system.file(package="BiocIntro", "extdata", "ALL-sample-sheet.csv") ``` ```{r} pdata <- read_csv(pdata_file) pdata ``` `dplyr` for data manipulation ```{r, message = FALSE} library(dplyr) ``` ```{r} pdata %>% select(sample, sex, age, mol.biol) pdata %>% filter(sex == "F", age < 50) pdata %>% mutate(sex = factor(sex, levels = c("F", "M"))) pdata %>% summarize( n = n(), ave_age = mean(age, na.rm=TRUE) ) ``` ```{r} pdata %>% group_by(sex) %>% summarize( n = n(), ave_age = mean(age, na.rm = TRUE) ) ``` ## Tidying data Input ```{r, eval = FALSE} pdata_file <- file.choose() # airway-sample-sheet.csv count_file <- file.choose() # airway-read-counts.csv ``` ```{r, echo = FALSE} pdata_file <- system.file(package="BiocIntro", "extdata", "airway-sample-sheet.csv") count_file <- system.file(package="BiocIntro", "extdata", "airway-read-counts.csv") ``` ```{r} pdata <- read_csv(pdata_file) pdata <- pdata %>% select(Run, cell, dex) ``` ```{r} counts <- read_csv(count_file) eg <- counts[, 1:6] # make it easy to work with eg ``` Joining ```{r} data <- left_join(pdata, eg) data ``` Gathering ```{r, message = FALSE} library(tidyr) ``` ```{r} tbl <- gather(data, "Gene", "Count", -(1:3)) tbl ``` ```{r} tbl %>% group_by(Run) %>% summarize(lib_size = sum(Count)) tbl %>% group_by(Gene) %>% summarize( ave_count = mean(Count), ave_log_count = mean(log(1 + Count)) ) ``` ## Visualization Tidy all the data. ```{r} counts_tbl <- gather(counts, "Gene", "Count", -Run) data_tbl <- left_join(pdata, counts_tbl) data_tbl ``` Summarize average 'expression' of each gene. ```{r} gene_summaries <- data_tbl %>% group_by(Gene) %>% summarize( ave_count = mean(Count), ave_log_count = mean(log(1 + Count)) ) gene_summaries ``` Visualize using `ggplot2` ```{r, message=FALSE} library(ggplot2) ``` ```{r} ggplot(gene_summaries, aes(ave_log_count)) + geom_density() ``` # Bioconductor ## Objects are important ```{r, message = FALSE} library(Biostrings) ``` ```{r} seq = c("AAACA", "CATGC") dna <- DNAStringSet(seq) reverseComplement(dna) ``` ```{r} dm3_upstream_file <- system.file(package="Biostrings", "extdata", "dm3_upstream2000.fa.gz") readLines(dm3_upstream_file, 10) ``` ```{r} dna <- readDNAStringSet(dm3_upstream_file) dna ``` ```{r} gc <- letterFrequency(dna, "GC", as.prob = TRUE) hist(gc) ``` ```{r, message = FALSE} library(BSgenome) library(BSgenome.Hsapiens.UCSC.hg38) ``` ```{r} BSgenome.Hsapiens.UCSC.hg38 chr17 <- BSgenome.Hsapiens.UCSC.hg38[["chr17"]] chr17 letterFrequency(chr17, "GC", as.prob=TRUE) ``` ## The interface to objects is important ```{r, message = FALSE} library(S4Vectors) ``` What is a vector? - `length()`, `[`, ... What is a `DataFrame`? - a column of vectors, as defined above ```{r} DataFrame(x = rnorm(100), y = rnorm(100)) ``` Is DNAStringSet a vector? ```{r} length(dna) dna[1:4] ``` So... ```{r} nms = names(dna) pos = sub(".* ", "", nms) df <- DataFrame( dna = unname(dna), pos = pos ) df$gc <- letterFrequency(df$dna, "GC", as.prob=TRUE)[,1] df ``` ## What's available? Web site: https://bioconductor.org Support site: https://support.bioconductor.org slack: https://community-bioc.slack.com (sign-up: ## Installing software ```{r, message = FALSE} library(BiocManager) ``` ```{r} BiocManager::available("BSgenome.Hsapiens") ``` ```{r, eval=FALSE} ## also possible to install CRAN, github packages BiocManager::install("BSgenome.Hsapiens.UCSC.hg38") ``` # Provenance ```{r} sessionInfo() ```