## diversity-lab.R
## A script to introduce you to diversity estimation and comparison

## Before starting this tutorial, please run
devtools::install_github("adw96/breakaway")
library(breakaway)
# Let's test whether we can use the apples dataset in breakaway
# to confirm everything is loading.
data(apples) 
head(apples) # Something should output
devtools::install_github("adw96/DivNet")
library(DivNet)
divnet ## something should output

# If this doesn't work, run install.packages("devtools") and try again
# If this doesn't work, read the error message carefully 
# and try to debug it yourself (this is what you have to do at home -- so
# it's good to practice here)
# If this doesn't work, call a TA over

## Lab authors: Amy Willis, Bryan Martin, Pauline Trinh

## breakaway authors: Amy Willis, Kathryn Barger, John Bunge, 
##                        Bryan Martin, 2012+

## Start by checking what directory you are in with
getwd()

# You will need to change your working directory to what you want to work in,
# probably your STAMPS folder!
## Set it to your local copy: setwd([your directory name here])

library(tidyverse)

## Let's use data distributed from phyloseq
library(phyloseq)
data("GlobalPatterns")
GlobalPatterns 
## We see that GlobalPatterns is a phyloseq object containing an 
## otu table, sample data, taxonomy table, and a phylogenetic tree

## Have a look through GlobalPatterns samples
GlobalPatterns %>% sample_data


# Let's just look at the water samples of GlobalPatterns
# To speed things up, let's just aggregate taxa to the 
# order level. So we're going to estimate *order* level 
# diversity 

# This will take a while, but it will speed things up later
water <- GlobalPatterns %>%
  subset_samples(SampleType %in% c("Freshwater", 
                                   "Freshwater (creek)", 
                                   "Ocean", 
                                   "Sediment (estuary)")) %>%
  tax_glom("Order")

water

# phyloseq has some built in tools for exploring alpha
# diversity, but they're not great. They underestimate richness,
# underestimate uncertainty, and don't allow hypothesis testing

# Enter: breakaway :D
# breakaway was specifically designed for these tasks

## Let's look at the observed richness of the water samples
observed_c <- sample_richness(water)
summary(observed_c)
plot(observed_c, water, color = "SampleType")

# Hmmmm, but what if we observed these samples at different depth?
# Depth may be confounded with observed richness. 
# Let's practice ggplot and phyloseq to look at this

data.frame("observed_richness" = (observed_c %>% summary)$estimate,
           "depth" = phyloseq::sample_sums(water), # Easter egg! Phyloseq's function to get depth
           "type" = water %>% sample_data %>% get_variable("SampleType")) %>%
  ggplot(aes(x = depth, y = observed_richness, color = type)) +
  geom_point()

# So our lowest depth samples (Sediment) had lower richness
# compared to Freshwater (creek)!
# Not surprising!

# Recall that the best way to adjust for this is by estimating the 
# number of missing species using a species richness estimate.

ba <- breakaway(water)
ba 
plot(ba, water, color = "SampleType")

## Cool! How do these estimates work?

## Let's look at just one sample
tr <- water %>% subset_samples(X.SampleID == "TRRsed1")
tr

# what's the structure of this dataset?
fc <- tr %>% otu_table %>% make_frequency_count_table
# this is the frequency count table for this dataset
fc %>% head(10)
# So there are 11 singletons (Orders observed once) here

# Quiz: how many times was the most common Order observed?
# (Hint: what does tail() do?)
# (A: 8863 times)

# Let's fit the breakaway to this sample
ba_tr <- breakaway(fc)
ba_tr
# this is an alpha diversity estimate -- a special class for
# alpha diversity estimates

# breakaway picks lots of models and chooses the best
# Which model did it pick?
ba_tr$model

# Kemp -- that's a non-mixed Poisson model. Cool!

# Kemp models work by fitting a legit probabilistic model to
# a transformation of the data. We can plot the transformation and the fit:
ba_tr %>% plot
# (Not super important -- but if you want to know more, check out
# the paper: Willis & Bunge (2015), Biometrics)

# breakaway is easy to use! You can run it on one frequency table, but
# in general you'll run it on a phyloseq object, like here:
ba <- breakaway(water)
ba 

# you can plot using our default plotting function...
plot(ba, water, color = "SampleType")

# ... or you can take the estimates and turn them into 
# data frames so you can plot them yourself
# these are in the same order as the phyloseq samples:
summary(ba) %>%
  add_column("SampleNames" = water %>% otu_table %>% sample_names)


# Also, since the breakaway package implements lots of
# species richness estimates, you could choose a different one
# e.g., if you wanted something more stable
water %>%
  chao_bunge %>%
  plot(water, color = "SampleType")

# However, note that species richness is a challenging problem, 
# and that error bars will generally be large

# grrrrr don't use this one
water %>%
  chao1 %>%
  plot(water, color = "SampleType")
# (but note that these are the real error bars on this estimate)

# In many cases the absolute number of species
# isn't as interesting as comparing ecosystems. 
# Let's test the hypothesis that different types of water systems
# have the same microbial diversity

# betta() works like a regression model
# but it accounts for the uncertainty in estimating
# diversity


bt <- betta(summary(ba)$estimate,
            summary(ba)$error,
            make_design_matrix(water, "SampleType"))
bt$table

# betta() estimates that the mean Order-level
# diversity in Freshwater is 154 orders.
# It estimates that the diversity in
# creeks is significantly higher
# (on average 23 orders) while oceans
# have significantly lower diversity
# (on average, 16 orders). However, 
# estuaries do not have significantly different diversity 
# than fresh water sources.

# Note that these estimates account for
# different sequencing depths!
# breakaway estimates the number of missing species
# based on the sequence depth and
# number of rare taxa in the data

# The citation for betta() is
# Willis, Bunge & Whitman, 2016, JRSS-C

# Please use it! It's very important that
# you account for the error bars in
# diversity when doing hypothesis testing.
# t.test, lm(), aov(), anova() do not
# account for this -- betta does!

#### #### #### #### #### #### #### #### 
#### OTHER DIVERSITY INDICES
#### #### #### #### #### #### #### #### 

# Species richness counts all species equally.
# However, if a species is rare, you may think that
# it doesn't play a role in the community.
#  Another alpha diversity index, called the Shannon
# index, works similarly to species richness but it
# downweights the importance of rare taxa
# i.e. if a taxon is present but in low abundance, 
# it doesn't count for "1" taxon, but something less 
# (to reflect its low abundance)

# Since rare taxa may be dubious, the Shannon index
# is very popular in microbial ecology

# For the reasons discussed in the lecture, it's important to
# estimate the Shannon diversity using a valid
# estimator of the Shannon diversity.
# It's even more important to come up with
# the standard error, and use that
# standard error in testing using betta()

# Shockingly, until recently, there were no tools
# to estimate Shannon diversity
# in the presence of an ecological/microbial network!

# DivNet is a new tool that allows you to estimate this.
# It also adjusts for different sequencing depths
# so you don't have to throw away any data
# (you don't need to rarefy)!

# Check out our preprint: Willis & Martin (2018+), bioRxiv

# Let's load the package DivNet
library(DivNet)

# to test if DivNet loaded correctly, try
dv_water_testing <- divnet(water, ncores = 1, tuning = "test")
# If this command loads but other commands
# don't load, the problem is not DivNet,
# but a package that DivNet depends on

# DivNet is very flexible, but by default
# it estimates the
# microbial network and uses it only to
# adjust the standard errors on diversity estimates

# *If you looked at the package parallel*
# in the "more R" tutorials, 
# you should be able to run DivNet in parallel: 
dv_water <- divnet(water, ncores = 4)
# This might take a minute; if it errors,
# try running it in series (slower
# but more portable):
# dv_water <- divnet(water, ncores = 1)

# DivNet outputs a list of the estimates
# shannon, simpson (alpha diversity)
# bray-curtis, euclidean (beta diversity)
dv_water %>% names

# You can pull them out individually:
dv_water$shannon %>%
  summary %>%
  add_column("SampleNames" = water %>% otu_table %>% sample_names)

# or plot them:
plot(dv_water$shannon, 
     water, 
     col = "SampleType")

# Let's compare this to the naive approach of
# just "plugging in" the observed proportions
# to the Shannon diversity  formula

plot(water %>% sample_shannon, 
     water, 
     col = "SampleType") + ylim(0, 3.5)

# You will notice that the estimates are almost the same
# as previously, but the error bars differ significantly
# i.e., *there are error bars*

# The error bars were the same as previously
# because we haven't told DivNet anything about the experimental
# design. Here we observed
# 4 different water systems, so we will add this
# as a covariate
dv_water_st <- water %>%
  divnet(X = "SampleType", ncores = 8)

plot(dv_water_st$shannon, 
     water, 
     col = "SampleType")
# Now we see that organisms in the same group 
# are estimated to have the same diversity.
# We are now estimating the diversity 
# of this *type* of ecosystem. We're
# focusing on the ecosystem, not
# just the samples. If we want to 
# reproduce the results of our study,
# it's better to focus on the populations that
# the samples come from, not the samples themselves

# You can analyse sex, time, disease status in this way.
# It's actually ideal for longitudinal/epigenetic
# studies, because you can say something about the
# groups, not just the people/mice in your study

# Let's look at hypothesis testing for DivNet

testDiversity(dv_water_st, "shannon")
# So we have significantly lower Shannon diversity
# in creeks, oceans and estuaries than in
# freshwater. 

# This is just a wrapper for betta(), btw

# We can also do the same thing for Simpson, of course:
plot(dv_water_st$simpson, 
     water, 
     col = "SampleType")
testDiversity(dv_water_st, "simpson")

## To test hypotheses about beta diversity
# using DivNet, let's pull out our
# estimated distance matrix:
bc <- dv_water_st$`bray-curtis`

# You'll notice that all samples with the same
# SampleType at the same estimate. DivNet uses
# covariate information to share strength
# across samples and obtain an estimate
# about the beta diversity of the *ecosystems*
#   not the samples

# We can consider the  unique rows using
bc %>% unique

# Uniquely, DivNet also has variance estimates:
simplifyBeta(dv_water_st, water, "bray-curtis", "SampleType")
simplifyBeta(dv_water_st, water, "euclidean", "SampleType")

# You can plot this easily
simplifyBeta(dv_water_st, water, "bray-curtis", "SampleType") %>%
  ggplot(aes(x = interaction(Covar1, Covar2), 
             y = beta_est)) +
  geom_point() +
  geom_linerange(aes(ymin = lower, ymax = upper)) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("") + ylab("Estimates of Bray-Curtis distance")

# Finally, a teaser! Soon we will have
# support for phylodivnet(), to estimate UniFrac
# in the presence of a network. Subscribe
# to our github/Amy's Twitter feed
# for updates. 


## THANKS FOR STAYING THROUGH TO THE END! 
#
# I'm always happy to take questions
# either today, later, or after the course.
# 
# I'd also  love to hear if you
# hated it/loved it/worst day ever -- 
# please give me feedback to 
# better help you!! 
# 
# I want these tools to be used so if you
# have feedback on how to make them
# easier to use, please let me know.
#
# :) Amy