--- title: "An introduction to `ggplot`" author: "Federico Lopez" date: "October 23, 2015" output: pdf_document: number_sections: yes toc: yes --- Install the necessary packages ```{r, eval=FALSE} install.packages("RCurl") install.packages("ggplot2") install.packages("gridExtra") install.packages("ggmap") install.packages("maps") install.packages("rdryad") install.packages("rgbif") install.packages("plyr") install.packages("rWBclimate") install.packages("cati") ``` ```{r, warning=FALSE, message=FALSE} # Load libraries library(RCurl) # Download URLs library(ggplot2) # Create plots library(gridExtra) library(ggmap) # Create maps library(maps) # For map data library(rdryad) # Retrieve data sets from dryad library(rgbif) # Retrieve occurence data from gbif library(plyr) # Split data frames and apply functions library(rWBclimate) # Retrieve World Bank climate data library(cati) # For morphological data from Darwin's finches ``` # Getting started with `qplot` ggplot2 is a powerful plotting system for R. ggplot2 documentation is available at [docs.ggplot2.org](http://docs.ggplot2.org/current/). `qplot` is the basic plotting function in the `ggplot2` package. `qplot` is similar to the base R function `plot`; however, you cannot pass any type of R object to `qplot`. Basic usage: `qplot(x, y, data...)`, where data is the data frame to use and `...` means other aesthetics passed for each layer. ## Using qplot to create scatterplots, histograms and boxplots ### Scatterplots ```{r} # Retrieve a published dataset # Data source: http://onlinelibrary.wiley.com/doi/10.1111/evo.12793/abstract # http://datadryad.org/resource/doi:10.5061/dryad.0398p # Copy the file’s raw GitHub URL polistesdat.url <- "https://raw.githubusercontent.com/flopezo/atd/master/Tibbetts_et_al_2015_data.csv" polistesdat.url <- getURL(polistesdat.url) polistes.data <- read.csv(textConnection(polistesdat.url)) # Show the variables included in the dataset summary(polistes.data) ``` ```{r} # Create a scatter diagram qplot(logface, nest.size, data=polistes.data) + geom_point(size=3) qplot(logface, nest.size, data=polistes.data, shape=sfmf) + geom_point(size=3) # Add colors based on a categorical variable qplot(logface, nest.size, data=polistes.data, colour=sfmf) + geom_point(size=3) ``` In `ggplot`, geoms, short for geometric objects, describe the type of plot you will produce. For example, `geom_point` is used for scatterplots. `geom_point` is the default in `qplot` if x and y are specified. If only x is specified, `qplot` defaults to `geom_histogram`. In the scatter diagrams above, the `size` argument in `geom_point` controls the size of data points. ### Fit a linear model Add a smoothed line and fit linear models. `lm` is used to fit linear models and carry out regressions. A typical `lm` model has the form `response ~ terms` where `response` is the (numeric) response vector and `terms` specifies a linear predictor for response. ```{r} # Fit a linear model (by default includes 95% confidence region) qplot(logface, nest.size, data=polistes.data, geom=c("point", "smooth"), method="lm") + geom_point(size=3) # Create separate regressions for each factor and add labels qplot(logface, nest.size, data=polistes.data, geom=c("point", "smooth"), method="lm", colour=sfmf, main="Relationship between facial pattern brokenness and nest size", xlab="Facial pattern brokenness", ylab="NUmber of nest cells") + geom_point(size=3) ``` ### Boxplots and jittered points ```{r} # Load the ChickWeight dataset from the base R packages # Results from an experiment on the effect of diet on early growth of chicks cw <- ChickWeight summary(cw) tapply(cw$weight, cw$Diet, FUN=mean) qplot(Diet, weight, data=cw, geom="boxplot", colour=Diet) qplot(Diet, weight, data=cw, geom="jitter", colour=Diet) # Use I() to manually set the aesthetics, e.g., colour = I("red") or size = I(3) qplot(Time, weight, data=cw, geom="jitter", colour=Diet, size=I(3)) qplot(Diet, weight, data=cw, geom=c("boxplot","jitter"), colour=Diet) ``` ### Histograms and density plots ```{r, warning=FALSE} qplot(weight, data=cw, geom="histogram", fill=Diet) qplot(weight, data=cw, geom="density", fill=Diet, alpha=I(0.5)) ``` # `ggplot` `qplot` does not show the power of `ggplot`. `ggplot` functions can be chained with "+" signs. All the options that can be chained are available at [docs.ggplot2.org](http://docs.ggplot2.org/current/). Let us remake the previous graphs using a few of the wide variety of options available in `ggplot`. ## Scatterplots ```{r} ggplot(polistes.data, aes(x=weight, y=nest.size, colour=nest.size)) + geom_point(size=3, alpha=0.8) + # Add main and axis titles labs(title="Relationship between weight and nest size", x="Weight", y="Number of nest cells") + # Add text to graph annotate("text", x=0.130, y=75, label="Group 1", size=6) + annotate("text", x=0.075, y=180, label="Group 2", size=6) + # Use a manually defined palette scale_colour_gradientn(colours=c("darkred", "orange", "yellow", "white")) ``` ## Boxplots ```{r} # The redundant legend can be removed with '+ guides(fill=FALSE)' ggplot(cw, aes(x=Diet, y=weight, colour=Diet)) + geom_boxplot() + stat_summary(fun.y="mean", geom="point", shape=5, size=4) + labs(title="Relationship between diet and weight", x="Diet", y="Weight") + theme(plot.title=element_text(size=rel(1.5))) + theme(axis.title.x=element_text(size=rel(1.2))) + theme(axis.title.y=element_text(size=rel(1.2))) + scale_fill_brewer(palette="Pastel1") ``` ## Bargraphs ```{r} # Use stat="identity" if you want the heights of the bars to represent # values in the data ggplot(cw, aes(x=Diet, y=weight, fill=Diet)) + geom_bar(stat="identity", width=0.6) + scale_fill_grey() ggplot(cw, aes(x=Diet, y=weight, fill=Diet)) + geom_bar(stat="identity", width=0.6) + scale_fill_brewer(palette="Blues") # Add error bars # Split data frame, apply function, and return results in a data frame library(plyr) cwse <- ddply(cw, "Diet", summarise, weightm=mean(weight, na.rm=TRUE), sd=sd(weight, na.rm=TRUE), n=sum(!is.na(weight)), se=sd/sqrt(n)) cwse cwse$Diet <- as.factor(cwse$Diet) ggplot(cwse, aes(x=Diet, y=weightm, fill=Diet)) + geom_bar(stat="identity", width=0.6) + geom_errorbar(aes(ymin=weightm-se, ymax=weightm+se), width=0.1) + scale_fill_brewer(palette="Set3") # Stacked bar graph ggplot(cw, aes(x=Time, y=weight, fill=Diet)) + geom_bar(stat="identity") ``` ## Line graphs and stacked area graphs ```{r} # If your line graph looks wrong, specify the grouping variable with `group`. # Problems occur with line graphs because ggplot() is unable to determine # how to group the variables # A sawtooth pattern results from improper grouping ggplot(cw, aes(x=Time, y=weight, group=Chick, colour=Diet)) + geom_point(size=3) + geom_line(size=1) cwd12 <- subset(cw, Diet==1:2) summary(cwd12) cwse12 <- ddply(cwd12, c("Diet", "Time"), summarise, weightm=mean(weight, na.rm=TRUE), sd=sd(weight, na.rm=TRUE), n=sum(!is.na(weight)), se=sd/sqrt(n)) cwse12 ggplot(cwse12, aes(x=Time, y=weightm, colour=Diet)) + geom_errorbar(aes(ymin=weightm-se, ymax=weightm+se), width=.4, size=0.8) + geom_line(size=1.5) + geom_point(size=3) # Stacked area graph # Area graphs represent cumulative totals over time #tapply(cw$weight, cw$Diet, FUN=sum) #cwd4 <- subset(cw, Diet==4) #tapply(cwd4$weight, cwd4$Time, FUN=sum) ggplot(cw, aes(x=Time, y=weight, group=Chick, fill=Diet)) + geom_area(alpha=0.8) ggplot(cw, aes(x=Time, y=weight, group=Chick, fill=Diet)) + geom_area(colour="black", size=0.2, alpha=0.8) + scale_fill_brewer(palette="Blues") ``` ## Histograms ```{r, warning=FALSE} ggplot(polistes.data, aes(x=weight, fill=sfmf)) + geom_histogram(alpha=0.8) ggplot(cw, aes(x=weight, fill=Diet)) + geom_histogram(alpha=0.6) # Find the mean of each group library(plyr) # For each subset of a data frame, ddply applies a function and then combines # results into a data frame mwt <- ddply(cw, "Diet", summarise, weight.mean=mean(weight)) mwt # Overlaid histograms with means ggplot(cw, aes(x=weight, fill=Diet)) + geom_histogram(alpha=0.5) + geom_vline(data=mwt, aes(xintercept=weight.mean, colour=Diet), linetype="dashed", size=1) # Use facets to display subsets of the dataset in different panels ggplot(cw, aes(x=weight, fill=Diet)) + geom_histogram(alpha=0.5) + facet_grid(. ~ Diet) ``` ## Scatterplots with marginal density plots ```{r} empty.plot <- ggplot() + geom_point(aes(1,1), colour="white") + theme(plot.background=element_blank(), panel.grid.major=element_blank(), panel.grid.minor=element_blank(), panel.border=element_blank(), panel.background=element_blank(), axis.title.x=element_blank(), axis.title.y=element_blank(), axis.text.x=element_blank(), axis.text.y=element_blank(), axis.ticks=element_blank()) xyscatter <- ggplot(polistes.data, aes(x=weight, y=nest.size, colour=sfmf)) + geom_point(size=3, alpha=.8) + scale_color_manual(values=c("orange", "cornflowerblue")) + theme(legend.position=c(1,1), legend.justification=c(1,1)) xdensity.top <- ggplot(polistes.data, aes(weight, fill=sfmf)) + geom_density(alpha=0.5) + scale_fill_manual(values=c("orange", "cornflowerblue")) + theme(legend.position="none", axis.title.x=element_blank()) ydensity.right <- ggplot(polistes.data, aes(nest.size, fill=sfmf)) + geom_density(alpha=0.5) + coord_flip() + scale_fill_manual(values=c("orange", "cornflowerblue")) + theme(legend.position="none", axis.title.x=element_blank()) # Arrange the plots together grid.arrange(xdensity.top, empty.plot, xyscatter, ydensity.right, ncol=2, nrow=2, widths=c(4, 1), heights=c(1, 4)) ``` # More `ggplot` and an introduction to rOpenSci packages rOpenSci is an inititative to create R packages for accessing data repositories. The full list of packages is available [here](https://ropensci.org/packages/) ```{r} # Here we will use the 'rdryad' package to retrieve a dataset from Dryad # Retrieve the data using the Dryad identifier '10255/dryad.34389' # which is at the end of the URL where the dataset is found # http://datadryad.org/handle/10255/dryad.34389 # the original publication is Kolbe et al. 2011, Evolution 65(12): 3608-3624 anolis.data <- download_url("10255/dryad.34389") anolis.data <- dryad_getfile(anolis.data) # create plot with discrete and continous variables anolis.scatter <- ggplot(anolis.data, aes(x=Ecomorph, y=Snout.vent.length, colour=Snout.vent.length)) + geom_point(size=3) anolis.scatter # create plot with continuous variables anolis.scatter <- ggplot(anolis.data, aes(x=Head.length, y=Snout.vent.length, colour=Ecomorph)) + geom_point(size=3) anolis.scatter ``` # `ggplot` and `rgbif` ## Plot GBIF occurrences of a species ```{r} # search for occurrences on GBIF ?occ_search apicea.occ <- occ_search(scientificName="Aphaenogaster picea", limit=1000, return='data', hasCoordinate=TRUE) # this dataset includes more than 100 columns # show first 10 lines for columns 1 to 4 head(apicea.occ, n=10L)[,1:4] # select four columns from the complete data set apicea.lat.lon <- apicea.occ[,c("name", "decimalLatitude", "decimalLongitude", "countryCode")] # show the unique country codes in the data set unique(apicea.lat.lon$countryCode) # select only US occurrences apicea.lat.lon <- subset(apicea.lat.lon, countryCode=="US") summary(apicea.lat.lon) # get map data for world #world_map <- map_data("world") # show unique regions #sort(unique(world_map$region)) # get US map data states.map <- map_data("state") head(states.map) #states.map <- subset(states.map, long > -100 & lat < 50) states.map <- ggplot(states.map, aes(x=long, y=lat, group=group)) + geom_polygon(fill="white", colour="black") + coord_map("mercator") #states.map states.map + geom_point(aes(x=decimalLongitude, y=decimalLatitude, group=name), colour="orange", size=3, data=apicea.lat.lon) #gbifmap(apicea.lat.lon) ``` ## Plot GBIF occurrences of two or more species ```{r} splist <- c("Bombus fraternus", "Bombus perplexus") # apply a function over a list or vector with sapply bombus.keys <- sapply(splist, function(x) name_backbone(name=x)$speciesKey, USE.NAMES=FALSE) bombus.occ <- occ_search(taxonKey=bombus.keys, limit=500, return="data", hasCoordinate=TRUE) summary(bombus.occ) head(bombus.occ$`1340432`, n=10L)[,1:4] bfraternus.lat.lon <- bombus.occ$`1340432`[,c("name", "decimalLatitude", "decimalLongitude", "countryCode")] head(bombus.occ$`1340406`, n=10L)[,1:4] bperplexus.lat.lon <- bombus.occ$`1340406`[,c("name", "decimalLatitude", "decimalLongitude", "countryCode")] bperplexus.lat.lon <- subset(bperplexus.lat.lon, decimalLatitude < 50) bombus.lat.lon <- rbind(bfraternus.lat.lon, bperplexus.lat.lon) bombus.lat.lon <- subset(bombus.lat.lon, countryCode=="US") states.map <- map_data("state") #states.map <- subset(states.map, long > -120 & lat < 50) states.map <- ggplot(states.map, aes(x=long, y=lat, group=group)) + geom_polygon(fill="white", colour="black") + coord_map("mercator") #states.map # create a title for ggplot that italicizes only the genus name bombus.occ.title <- expression(paste(italic("Bombus"), " occurrences")) bombus.map.occ <- geom_point(aes(x=decimalLongitude, y=decimalLatitude, group=name, colour=name, shape=name), size=2, data=bombus.lat.lon) bombus.maplabels <- labs(title=bombus.occ.title, x="Longitude", y="Latitude", colour="Species", shape="Species") bombus.maplegend <- theme(plot.title=element_text(face="italic", size=rel(1.2))) + theme(legend.title=element_text(size=rel(0.6))) + theme(legend.text=element_text(face="italic", size=rel(0.6))) + theme(axis.title.x=element_text(size=rel(0.8))) + theme(axis.title.y=element_text(size=rel(0.8))) states.map + bombus.map.occ + bombus.maplabels + bombus.maplegend bombus.map.den <- stat_density2d(aes(x=decimalLongitude, y=decimalLatitude, group=name, colour=name), size=0.6, data=bombus.lat.lon, geom="density2d") bombus.den.title <- expression(paste(italic("Bombus"), " distribution")) bombus.maplabels <- labs(title=bombus.den.title, x="Longitude", y="Latitude", colour="Species", shape="Species") states.map + bombus.map.den + bombus.maplabels + bombus.maplegend ``` ## Map environmental data ```{r warning=FALSE, results='hide'} # Create a directory for kml files dir.create("~/Desktop/kmltemp") options(kmlpath="~/Desktop/kmltemp") #http://data.worldbank.org/developers/climate-data-api #http://unstats.un.org/unsd/methods/m49/m49alpha.htm #http://data.worldbank.org/sites/default/files/climate_data_api_basins.pdf # Create a vector of World Bank basin IDs nam <- 328:365 # Download map for vector of basins nam.basin <- create_map_df(nam) ``` ```{r} # Retrieve historical precipitation data temp.dat <- get_historical_temp(nam, "decade") # Create a subset using only data from one year temp.dat <- subset(temp.dat, temp.dat$year==2000) # Create maps of climate data nam.map <- climate_map(nam.basin, temp.dat, return_map=TRUE) nam.map ``` # Other examples using morphological data for Darwin's finches ```{r, warning=FALSE} data(finch.ind) tr <- traits.finch; sp <- sp.finch; dist <- ind.plot.finch tr$sp <- sp; tr$dist <- dist head(tr) ggplot(tr, aes(x=BeakH, y=UBeakL, colour=dist)) + geom_point(size=4, alpha=0.4) + stat_smooth(method=lm, colour="black", level=0.95) + scale_colour_brewer(palette="Set3") mbh <- ddply(tr, "dist", summarise, bh.mean=mean(BeakH, na.rm=TRUE)) mbh ggplot(tr, aes(x=BeakH, fill=dist)) + geom_density(alpha=0.4) + scale_fill_brewer(palette="Set1") + geom_vline(data=mbh, aes(xintercept=bh.mean, colour=dist), linetype="dashed", size=1) ```