# Data Carpentry 6/13/16
# Session 2 Overview
# Lessons
## 0. Introduction to R and RStudio
## 1. Importing data and working with data.frames
## 2. Manipulating data with the dplyr package
## 3. Data visualization with the ggplot2 package


# Start the session from an empty workspace so leftover objects cannot
# interfere with the lesson (argument name written out in full rather than
# relying on partial matching of `all`).
rm(list = ls(all.names = TRUE))

# Lesson 1 - Importing data and working with data.frames ---------------------------------------------
# stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, and the factor steps further down
# (levels(), droplevels()) need species_id and sex to be factors.
svydat <- read.csv("Data/surveys.csv", header = TRUE, stringsAsFactors = TRUE)
if (interactive()) {
  View(svydat) # the spreadsheet viewer is only meaningful in an interactive session
}
head(svydat)
dim(svydat)
nrow(svydat)
ncol(svydat)
str(svydat) # character variables were read in as factors (because stringsAsFactors = TRUE)

# subsetting data
svydat[1:10, 1:4]
svydat[1:10, c("record_id", "plot_id", "species_id")]

svydat$sex
svydat$sex[1:10]

# plot_id of the rows whose weight is missing, then tabulated for rows with
# missing and with non-missing weight
svydat$plot_id[is.na(svydat$weight)]
table(svydat$plot_id[is.na(svydat$weight)])
table(svydat$plot_id[!is.na(svydat$weight)])

str(svydat)
# force categorical variables to factors
svydat$record_id <- as.factor(svydat$record_id)
svydat$plot_id <- as.factor(svydat$plot_id)
str(svydat)

summary(svydat)

table(svydat$plot_id)
table(svydat$plot_id, useNA = "ifany")
levels(svydat$plot_id)

levels(svydat$species_id)
# change "" values to NA
svydat$species_id[svydat$species_id == ""] <- NA
table(svydat$species_id, useNA = "ifany")
# the now-empty "" level is still listed until it is dropped explicitly
svydat$species_id <- droplevels(svydat$species_id)
table(svydat$species_id, useNA = "ifany")

levels(svydat$sex)
svydat$sex[svydat$sex == ""] <- NA
svydat$sex <- droplevels(svydat$sex)
levels(svydat$sex)

# could have saved ourselves a bunch of time had we known there were "" values that should be NA's:
svydat_2 <- read.csv("Data/surveys.csv", header = TRUE, na.strings = "",
                     stringsAsFactors = TRUE)
summary(svydat_2)

# what if we have a factor variable that we want to change back to a numerical variable?
svydat_2$hindfoot_length[1:10]
svydat_2$hindfoot_length <- as.factor(svydat_2$hindfoot_length)
svydat_2$hindfoot_length[1:10]
as.numeric(svydat_2$hindfoot_length)[1:10] # don't do this! it returns the internal level codes, not the values
as.numeric(as.character(svydat_2$hindfoot_length))[1:10] # use this!
# or even better, this: convert the (few) levels once, then index them by the factor codes
svydat_2$hindfoot_length <- as.numeric(levels(svydat_2$hindfoot_length))[svydat_2$hindfoot_length]


# Lesson 2 - Managing and analyzing data with the dplyr package --------------------------------------------
# Install dplyr only when it is missing, so that re-running the script does
# not download and reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr) # masks a few base R functions

# Why use dplyr? 
## Code is intuitive 
## Constrained number of options that correspond to the most common data manipulations
## Fast

# create a local dataframe (tibble); as_tibble() replaces the deprecated tbl_df()
svy <- as_tibble(svydat)
svydat
svy # shows only the first 10 rows and as many variables as will fit on the screen
    # shows dimension and variable types
    # can use either svydat or svy in dplyr functions, we will use svy since it prints so nicely
    # svy will print more nicely even if we're not using dplyr functions
print(svy, n = 20) # to see more rows

# The 5 core dplyr verbs are filter, select, arrange, mutate and summarise (plus group_by)
## FILTER - keep the rows that satisfy a condition -------------
## Goal: the observations recorded in January 1983
# base R version:
svy[svy$year == 1983 & svy$month == 1, ]
svydat[svydat$year == 1983 & svydat$month == 1, ][1:10, ] # the plain data.frame shows its row names; the local df does not keep them

# dplyr version:
filter(svy, year == 1983, month == 1)
filter(svy, year == 1983 & month == 1) # conditions may be joined with , or with &; svy itself is left untouched

# assign the result to keep it as a new dataframe
jan1983 <- filter(svy, year == 1983 & month == 1) # look at data

## Goal: the observations of species DS recorded in January 1983
# base R version:
svy[svy$year == 1983 & svy$month == 1 & svy$species_id == "DS", ]
print(svy[svy$year == 1983 & svy$month == 1 & svy$species_id == "DS", ], n = 22)
svy$species_id[svy$year == 1983 & svy$month == 1] # 4 obs with missing species_id in January 1983; need to remove
svy[svy$year == 1983 & svy$month == 1 & svy$species_id == "DS" & !is.na(svy$species_id), ]

# dplyr version:
filter(svy, year == 1983, month == 1, species_id == "DS") # rows where the condition is NA are dropped automatically

# | and %in% work inside filter as well
# Goal: the observations of species UP and UR
filter(svy, species_id == "UP" | species_id == "UR")
filter(svy, species_id %in% c("UP", "UR"))


# Challenge: Create a data.frame containing obs for species PF where weight is not missing.
# How many males and females are there?
pf_wts <- filter(svydat, species_id == "PF", !is.na(weight))
table(pf_wts$sex)


## SELECT - choose columns by name -------------
# base R version:
svy[, c("species_id", "sex", "hindfoot_length", "weight")]

# dplyr version:
select(svy, species_id, sex, hindfoot_length, weight) # same calling pattern as filter

select(svy, hindfoot_length:weight, contains("id")) # "starts_with", "ends_with" and "matches" are available too


# "chaining" / "pipes" - not one of the main verbs, but a way to avoid nested calls
# Goal: keep year, species_id and weight, then keep only weights below 5
# Option 1 - two separate steps:
sp_wgt <- select(svy, year, species_id, weight)
filter(sp_wgt, weight < 5) # the filter could equally be applied before the select
  # drawback: needs an intermediate data.frame that is of no further use

# Option 2 - nested calls:
filter(select(svy, year, species_id, weight), weight < 5)
  # still readable, but has to be read from the inside out

# Option 3 - pipes hand the output of one function straight to the next one
# the pipe operator is provided by the magrittr package (installed with dplyr)
# dplyr/magrittr
svy %>%
  select(year, species_id, weight) %>%
  filter(weight < 5)
  # read %>% as "then"
  # the dataframe name is written only once
  # much easier to read when several functions would otherwise be nested
  # the chain does not have to be spread over 3 lines

# the result of a chain can be assigned to a new dataframe:
sm_sp <- svy %>%
  select(year, species_id, weight) %>%
  filter(weight < 5)


# Challenge: Create a new data.frame that contains year, species_id, and weight for  
# observations from 2000.
# What is the mean weight?
sp2000 <- svy %>%
  select(year, species_id, weight) %>% # the two steps could be swapped here
  filter(year == 2000)
mean(sp2000$weight, na.rm = TRUE)

# alternative: drop the missing weights in the filter step instead
sp2000 <- svy %>%
  select(year, species_id, weight) %>%
  filter(year == 2000, !is.na(weight))
mean(sp2000$weight)


## ARRANGE - sort rows -------------
# Goal: year, month, sex and hindfoot_length, ordered by year and then month
# base R version:
svy[order(svy$year, svy$month), c("year", "month", "sex", "hindfoot_length")]

# dplyr version (ascending year, ascending month):
svy %>%
  select(year, month, sex, hindfoot_length) %>%
  arrange(year, month)

# desc() reverses the sort direction of a single variable (latest year first)
svy %>%
  select(year, month, sex, hindfoot_length) %>%
  arrange(desc(year), month)


## MUTATE - adds new variables (transmute keeps only the variables it is given) ---------
# create variable for ratio of weight/hindfoot_length
# base R:
svy$wl_ratio1 <- svy$weight / svy$hindfoot_length

# dplyr:
mutate(svy, wl_ratio2 = weight / hindfoot_length) # only prints the result; svy itself is unchanged
svy %>% mutate(wl_ratio2 = weight / hindfoot_length) # same thing using pipes
svy <- svy %>% mutate(wl_ratio2 = weight / hindfoot_length) # assign the result back to store the new variable

# transmute returns ONLY the variables named in the call and drops all others
svy %>% transmute(record_id, wl_ratio = weight / hindfoot_length)

# remove the two ratio variables again: select(-x) drops the named columns and
# keeps everything else (transmute must not be used for this, because it would
# discard every other column of svy and break the rest of the lesson)
svy <- svy %>% select(-wl_ratio1, -wl_ratio2)


## SUMMARISE - summarize variables --------------
# useful in conjunction with group_by
# calculate average weight for each species
# base R:
tapply(svy$weight, svy$species_id, mean) # NA for every species that has at least one missing weight
tapply(svy$weight, svy$species_id, mean, na.rm = TRUE)
aggregate(weight ~ species_id, svy, mean) # the formula interface drops rows with missing values by default

# dplyr:
svy %>%
  group_by(species_id) %>%
  summarise(mean(weight, na.rm = TRUE))
svy %>%
  filter(!is.na(weight)) %>%
  group_by(species_id) %>%
  summarise(mean(weight))
# now save it as a data.frame and add variable name
sp_av_wts <- svy %>%
  filter(!is.na(weight)) %>%
  group_by(species_id) %>%
  summarise(avg_wt = mean(weight))

# count number of observations per month
svy %>%
  group_by(month) %>%
  summarise(count = n())
# or use tally command
svy %>%
  group_by(month) %>%
  tally()
# now save it in a data.frame
monthly_counts <- svy %>%
  group_by(month) %>%
  tally()


# Challenge: For each species and year, count the number of observations per sex and calculate the
# mean weight per sex (hint: don't use tally).
# year is part of the grouping so that the result is per species, year AND sex,
# as the challenge asks
svy %>%
  filter(!is.na(weight)) %>%
  group_by(species_id, year, sex) %>%
  summarise(count = n(), mean_wt = mean(weight))


# dplyr cheatsheet:
# https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf


# Build the data file used by the next section: -------------------
# Step 1 - drop obs with a missing species_id, weight, hindfoot_length, or sex:
svy_complete <- filter(
  svy,
  species_id != "",
  !is.na(weight),
  !is.na(hindfoot_length),
  !is.na(sex)
)

# Step 2 - drop rare species (those with fewer than 50 obs):
# first collect the species IDs that have at least 50 obs
species_nonrare <- svy_complete %>%
  group_by(species_id) %>%
  summarise(count = n()) %>%
  filter(count >= 50) %>%
  select(species_id)

# then keep only the rows that belong to one of those species
svy_complete <- filter(svy_complete, species_id %in% species_nonrare$species_id)

dim(svy_complete) # should have 30463 rows and 9 variables

write.csv(svy_complete, file = "Data/svy_complete.csv", row.names = FALSE)


# Lesson 3 - Data visualization with the ggplot2 package -----------------------------------------

# empty the workspace, including hidden (dot-prefixed) objects
rm(list = ls(all.names = TRUE))

# packages used in this lesson
library(dplyr)
library(ggplot2)

# read back the cleaned data set written at the end of lesson 2
svy <- read.csv("Data/svy_complete.csv", header = TRUE)

# ggplot2: 
# ggplot function initializes the basic graph structure, then elements are added to the graph
# aes (aesthetics) - maps variables to the plot
# geom - adds geometric objects to the plot (geom_point, geom_line, geom_boxplot, geom_smooth)
# scales - to modify axes and labels
# facets - plot panels 

# Three examples: scatterplot, side-by-side boxplots, longitudinal (time series) plot


# Scatterplot of hindfoot length versus weight --------------
ggplot(data = svy) # empty canvas: no variables are mapped yet
ggplot(data = svy, aes(x = weight, y = hindfoot_length)) # axes are set up, but no geom draws anything yet
ggplot(data = svy, aes(x = weight, y = hindfoot_length)) +
  geom_point() # anything set up in aes can be seen by the geom layers, or these can be set up 
               # separately in the geom function
# note: can go to next line but + must be on prior line

# can also create a plot object
svy_plot <- ggplot(data = svy, aes(x = weight, y = hindfoot_length))

# and then add to it and render it later
svy_plot + geom_point()  # I'm not going to do this so my code doesn't run off the screen

# can customize at each level
ggplot(data = svy, aes(x = weight, y = hindfoot_length)) +
  geom_point(alpha = 0.5) # make points transparent; try 0.5 and then 0.1

# change color
ggplot(data = svy, aes(x = weight, y = hindfoot_length)) +
  geom_point(alpha = 0.1, color = "blue")

# color by species
ggplot(data = svy, aes(x = weight, y = hindfoot_length)) +
  geom_point(alpha = 0.1, aes(color = species_id))

# could have also put the color in the ggplot function aes function
ggplot(data = svy, aes(x = weight, y = hindfoot_length, color = species_id)) +
  geom_point(alpha = 0.1)

# add axis labels and change name on legend
# the legend belongs to the color aesthetic, so it has to be renamed through the
# color scale; a fill scale would have no effect because nothing is mapped to fill
ggplot(data = svy, aes(x = weight, y = hindfoot_length)) +
  geom_point(alpha = 0.1, aes(color = species_id)) +
  xlab("Weight (g)") +
  ylab("Hindfoot Length (mm)") +
  scale_color_discrete(name = "Species")


# Side-by-side boxplots: hindfoot length for each species ----------------------
ggplot(data = svy, aes(x = species_id, y = hindfoot_length)) +
  geom_boxplot()

# overlay the individual observations as jittered points
ggplot(data = svy, aes(x = species_id, y = hindfoot_length)) +
  geom_boxplot() +
  geom_jitter(alpha = 0.3, color = "tomato")

# layers are drawn in the order they are added: put the boxplots last so they stay visible
ggplot(data = svy, aes(x = species_id, y = hindfoot_length)) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot()

# label both axes
ggplot(data = svy, aes(x = species_id, y = hindfoot_length)) +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot() +
  labs(x = "Species ID", y = "Hindfoot Length (mm)")


# Challenge: Create violin plots of the weights of each species using the geom_violin function in 
# place of the geom_boxplot function (without jitter); add appropriate axis labels and add a main 
# title using the ggtitle function.
ggplot(data = svy, aes(x = species_id, y = weight)) +
  geom_violin() +
  labs(x = "Species ID", y = "Weight (g)") +
  ggtitle("Weights of Small Mammal Species")

# same plot with the y axis on a log10 scale
ggplot(data = svy, aes(x = species_id, y = weight)) +
  geom_violin() +
  labs(x = "Species ID", y = "Log Weight") +
  ggtitle("Weights of Small Mammal Species") +
  scale_y_log10()


# Time series plot: number of observations of each species per year ----------------
# count the observations for every year / species combination
yearly_counts <- svy %>%
  group_by(year, species_id) %>%
  tally()

# without a grouping all species end up on one single line
ggplot(data = yearly_counts, aes(x = year, y = n)) +
  geom_line()

# group = species_id draws one line per species
ggplot(data = yearly_counts, aes(x = year, y = n, group = species_id)) +
  geom_line()

# color the lines by species
ggplot(data = yearly_counts, aes(x = year, y = n, group = species_id, color = species_id)) +
  geom_line()

# facet_wrap gives every species its own panel
ggplot(data = yearly_counts, aes(x = year, y = n, group = species_id, color = species_id)) +
  geom_line() +
  facet_wrap(~ species_id)

# one line per sex within each species
# this needs counts by year, species_id and sex
yearly_sex_counts <- svy %>%
  group_by(year, species_id, sex) %>%
  tally()

ggplot(data = yearly_sex_counts, aes(x = year, y = n, color = species_id, group = sex)) +
  geom_line() +
  facet_wrap(~ species_id)

# coloring by sex is more informative than coloring by species here
ggplot(data = yearly_sex_counts, aes(x = year, y = n, color = sex, group = sex)) +
  geom_line() +
  facet_wrap(~ species_id)

# switch to the black-and-white background theme
ggplot(data = yearly_sex_counts, aes(x = year, y = n, color = sex, group = sex)) +
  geom_line() +
  facet_wrap(~ species_id) +
  theme_bw()


# Challenge: Create a plot that shows the average weight of each species plotted over time (years)
# (hint: need to use dplyr to create appropriate data.frame first).
# mean weight for every year / species combination
yearly_weight <- svy %>%
  group_by(year, species_id) %>%
  summarise(avg_weight = mean(weight))

ggplot(data = yearly_weight, aes(x = year, y = avg_weight, color = species_id, group = species_id)) +
  geom_line() +
  labs(x = "Year", y = "Average Weight (g)")

# the same averages, additionally split by sex
yearly_sex_weight <- svy %>%
  group_by(year, sex, species_id) %>%
  summarise(avg_weight = mean(weight))

# one panel per sex
ggplot(data = yearly_sex_weight, aes(x = year, y = avg_weight, color = species_id, group = species_id)) +
  geom_line() +
  facet_wrap(~ sex)

# facet_grid version of the same plot (sex across the columns)
ggplot(data = yearly_sex_weight, aes(x = year, y = avg_weight, color = species_id, group = species_id)) +
  geom_line() +
  facet_grid(. ~ sex)

# sex down the rows instead of across the columns
ggplot(data = yearly_sex_weight, aes(x = year, y = avg_weight, color = species_id, group = species_id)) +
  geom_line() +
  facet_grid(sex ~ .)

# ggplot2 cheatsheet:
# https://www.rstudio.com/wp-content/uploads/2015/03/ggplot2-cheatsheet.pdf