#' ---
#' title: "Data visualization"
#' author: Aaron A. King
#' output:
#'   html_document:
#'     toc: yes
#'     toc_depth: 4
#' bibliography: ../course.bib
#' csl: ../ecology.csl
#' ---
#' 
## ----prelims,include=FALSE,cache=FALSE-----------------------------------
# Session options used throughout the document.
options(keep.source = TRUE, encoding = "UTF-8")

# Fixed seed for the random number generator.
set.seed(594709947L)

# Load ggplot2 and make the black-and-white theme the default theme.
library(ggplot2)
theme_set(theme_bw())

#' 
#' ## How to use this document.
#' 
#' This is an extremely condensed introduction to **R**'s base graphics and---more importantly---the powerful data-visualization package **ggplot2**, developed by Hadley Wickham.
#' Run the codes shown and study the outputs to learn about these tools.
#' When questions are posed, do your best to answer them.
#' 
#' For your convenience, [the **R** codes for this document are provided in an **R** script](http://raw.githubusercontent.com/kingaa/short-course/master/hadley/viz.R) which you can download, edit, and run.
#' 
#' ## Getting started: **R**'s base graphics
#' 
#' ### Transgenic mosquito experiment
#' 
#' Let's load the data on transgenic mosquito survival time.
#' 
## ------------------------------------------------------------------------
# Read the mosquito survival data straight from the course website.
dat <- read.csv(file = "http://kingaa.github.io/short-course/hadley/mosquitoes.csv")

#' 
#' Let's compare the average lifespan of transgenic vs wildtype mosquitoes from this experiment.
#' The following splits the data into two subsets, one for each genetic type.
## ------------------------------------------------------------------------
# One data frame per genetic type.
# `wt` keeps only the `lifespan` column; `tg` keeps every column except `type`.
wt <- subset(dat, subset = type == "wildtype", select = lifespan)
tg <- subset(dat, subset = type == "transgenic", select = -type)

#' 
#' Let's try and visualize the data.
## ------------------------------------------------------------------------
# Default plot of the full data frame.
plot(dat)

# Histograms of lifespan for the two types, drawn in a 1-by-2 layout with
# identical bins and vertical range.
op <- par(mfrow = c(1, 2))
hist(tg$lifespan, breaks = seq(from = 0, to = 55, by = 5), ylim = c(0, 40))
hist(wt$lifespan, breaks = seq(from = 0, to = 55, by = 5), ylim = c(0, 40))
par(op)

#' 
#' **Question:** What does the second `par` command accomplish?
#' 
#' Another way to visualize a distribution is via the *empirical cumulative distribution plot*.
#' 
## ------------------------------------------------------------------------
# Empirical cumulative distribution of lifespan:
# sorted values on the horizontal axis, cumulative proportion on the vertical.
plot(sort(dat$lifespan), seq(1, nrow(dat)) / nrow(dat), type = "n")
# Step curves for each genetic type: wildtype in blue, transgenic in red.
lines(sort(wt$lifespan), seq(1, nrow(wt)) / nrow(wt), type = "s", col = "blue")
lines(sort(tg$lifespan), seq(1, nrow(tg)) / nrow(tg), type = "s", col = "red")

#' **Question:** What does `type="n"` do in the first line above?
#' 
#' ### Mammal body and brain sizes
#' 
#' The data on mammal body and brain sizes is included in the **MASS** package:
## ------------------------------------------------------------------------
library(MASS)

# The `mammals` data on linear axes, then with a log x axis, then log-log.
plot(mammals)
plot(mammals, log = "x")
plot(mammals, log = "xy")
# The log-log plot again, specified by two vectors and then by a formula.
plot(mammals$body, mammals$brain, log = "xy")
plot(brain ~ body, data = mammals, log = "xy")

#' 
#' ### Oil production
#' 
## ------------------------------------------------------------------------
# Read the oil production data; "#" starts a comment in the file.
oil <- read.csv(
  "http://kingaa.github.io/short-course/hadley/oil_production.csv",
  comment.char = "#"
)

# First look at the data.
head(oil)
summary(oil)
plot(oil)

# Production over time for North America, with Eurasia overlaid in red.
plot(Gbbl ~ year, data = oil, subset = region == "North.America", type = "l")
lines(Gbbl ~ year, data = oil, subset = region == "Eurasia", type = "l", col = "red")

library(reshape2)

# Cast to wide format: one row per year, one column per region.
# The value column is named explicitly so that dcast does not have to guess it.
wideOil <- dcast(oil, year ~ region, value.var = "Gbbl")
names(wideOil)

# Total production per year, first written out region by region ...
wideOil$total <- wideOil$Africa+wideOil$Asia+wideOil$Central+wideOil$Eurasia+wideOil$Europe+wideOil$Middle+wideOil$North.America

# ... and then computed generically as a row sum over the region columns.
# Both `year` and the `total` column created just above must be left out:
# summing every column except the first would include `total` itself and
# yield twice the intended value.
region_cols <- setdiff(names(wideOil), c("year", "total"))
wideOil$total <- rowSums(wideOil[, region_cols, drop = FALSE])

plot(wideOil$year,wideOil$total,type='l')

#' 
#' ## A systematic approach to visualization: the Grammar of Graphics
#' 
#' Parts of a graphic:
#' 
#' 1. ***Data***
#' 1. ***Geometrical object***: point, line, box, bar, density plot, contours, ribbons
#' 1. ***Statistical transformations***: bins, mean, median, quantile, ECDF, identity 
#' 1. ***Aesthetic attributes***: x and y position, color, fill, size, shape, line type, transparency
#' 1. ***Scales***: map the data onto the aesthetic attributes
#' 1. A ***coordinate system***: maps x and y position onto the page
#' 1. A ***faceting system***: multiple plots
#' 
#' You construct a graphical visualization by choosing the constituent parts.
#' This is implemented in the **ggplot2** package.
#' 
#' ### References
#' 
#' - [ggplot2.org](http://ggplot2.org)
#' - [ggplot2 documentation](http://docs.ggplot2.org/)
#' 
#' 
#' ## Examples
#' 
#' ### Energy production
#' 
## ------------------------------------------------------------------------
# Read the energy production data; "#" starts a comment in the file.
energy <- read.csv(
  "http://kingaa.github.io/short-course/hadley/energy_production.csv",
  comment.char = "#"
)

library(ggplot2)

# Production over time with region and source encoded three different ways:
# colour + line type, colour + one panel per source, colour + one panel per region.
ggplot(energy, aes(x = year, y = TJ, color = region, linetype = source)) +
  geom_line()
ggplot(energy, aes(x = year, y = TJ, color = region)) +
  geom_line() +
  facet_wrap(~source)
ggplot(energy, aes(x = year, y = TJ, color = source)) +
  geom_line() +
  facet_wrap(~region, ncol = 2)

#' 
#' What can you conclude from the above?
#' Try plotting these data on the log scale (`scale_y_log10()`).
#' How does your interpretation change?
#' 
## ------------------------------------------------------------------------
# The same line plot, first with no grouping and then grouped by source only.
ggplot(energy, aes(x = year, y = TJ)) +
  geom_line()
ggplot(energy, aes(x = year, y = TJ, group = source)) +
  geom_line()

#' 
#' **Question:** How do you account for the appearance of the two plots immediately above?
#' 
## ------------------------------------------------------------------------
# Group by every combination of source and region.
# interaction() is used rather than `source:region` because `:` builds an
# interaction only for factors and fails when the columns are character
# vectors (the default for read.csv since R 4.0).
ggplot(data=energy,mapping=aes(x=year,y=TJ,group=interaction(source,region)))+geom_line()

#' 
#' **Question:** What does the `group` aesthetic do?
#' 
#' Let's aggregate across regions by year and source of energy.
## ------------------------------------------------------------------------
library(reshape2)

# Sum TJ over all rows sharing the same year and source.
tot <- dcast(
  energy,
  year + source ~ "TJ",
  value.var = "TJ",
  fun.aggregate = sum
)

# The totals as lines and as stacked areas.
ggplot(tot, aes(x = year, y = TJ, color = source)) +
  geom_line()
ggplot(tot, aes(x = year, y = TJ, fill = source)) +
  geom_area()


#' 
#' Now let's aggregate across years by region and source.
#' 
## ------------------------------------------------------------------------
# Mean TJ over all rows sharing the same region and source.
reg <- dcast(
  energy,
  region + source ~ "TJ",
  value.var = "TJ",
  fun.aggregate = mean
)

# Stacked bars, one per region, flipped to run horizontally.
ggplot(reg, aes(x = region, y = TJ, fill = source)) +
  geom_bar(stat = "identity") +
  coord_flip()

#' 
#' An even better way to manipulate the data is to use the **plyr** package.
#' [See the data munging tutorial.](./data_munging.html)
#' 
## ------------------------------------------------------------------------
library(plyr)

# Mean TJ for every combination of region and source.
x <- ddply(energy, ~region+source, summarize, TJ=mean(TJ))

ggplot(data=x,mapping=aes(x=region,y=TJ,fill=source))+
   geom_bar(stat="identity")+coord_flip()

# Within each region, each source's share of that region's summed TJ.
y <- ddply(x, ~region, mutate, frac=TJ/sum(TJ))

# `frac` is mapped to the y aesthetic, so the axis title belongs to `y`;
# coord_flip() only changes where that axis is drawn.  Setting `x` here
# would put "fraction of production" on the region axis.
ggplot(data=y,mapping=aes(x=region,y=frac,fill=source))+
   geom_bar(stat="identity")+coord_flip()+labs(y="fraction of production")


#' 
#' In the above, we first average across years for every region and source.
#' Then, for each region, we compute the fraction of the total production due to each source.
#' Finally, we plot the fractions using a barplot.
#' The `coord_flip` coordinate specification gives us horizontal bars instead of the default vertical bars.
#' Fancy!
#' 
#' Let's compare fossil fuel production to renewable.
## ------------------------------------------------------------------------
library(plyr)

# Convert `source` to character and add `source1`, in which the listed
# sources are relabelled as either "Renewable" or "Carbon".
energy <- mutate(
  energy,
  source = as.character(source),
  source1 = mapvalues(
    source,
    from = c("Hydro", "Other Renewables", "Coal", "Oil", "Gas"),
    to = c("Renewable", "Renewable", "Carbon", "Carbon", "Carbon")
  )
)

# Summed TJ per category, region and year: one panel per region, each with
# its own vertical scale.
x <- ddply(energy, ~ source1 + region + year, summarize, TJ = sum(TJ))

ggplot(x, aes(x = year, y = TJ, fill = source1)) +
  geom_area() +
  facet_wrap(~region, scales = "free_y", ncol = 2)

# Summed TJ per category and year, over all regions.
x <- ddply(energy, ~ source1 + year, summarize, TJ = sum(TJ))

ggplot(x, aes(x = year, y = TJ, fill = source1)) +
  geom_area()

#' --------------------------
#' 
#' ### Exercise
#' 
#' Ask a question regarding one of the datasets shown here and devise a visualization to answer it.
#' 
#' --------------------------
#' 
#' ## [Back to course homepage](http://kingaa.github.io/short-course)
#' ## [**R** codes for this document](http://raw.githubusercontent.com/kingaa/short-course/master/hadley/viz.R)
#' 
#' --------------------------