#' Script: Beta-Diversity
#' Version: 1.1
#' Last modified on 05/7/2022
#' Author: Ilias Lagkouvardos
#' Contributions by: Thomas Clavel, Sandra Reitmeier
#'
#' Calculate beta-diversity for microbial communities
#' based on permutational mulitvariate analysis of variances (PERMANOVA) using multiple distance matrices
#' computed from phylogenetic distances between observed organisms
#'
#' Input:
#' 1. Set the path to the directory where the file is stored 
#' 2. Write the name of the normalized OTU table without taxonomy information 
#' 3. Write the name of the mapping file that includes the samples groups
#' 4. Write the name of the OTU tree
#' 5. Write the name of the variable (sample group) used for comparison 
#' 
#'
#' Output: 
#' The script generates three graphical outputs (pdf), one text file and a newick tree
#' 1. A phylogram with colour-coded group clustering
#' 2. MDS and NMDS plots showing information about beta-diversity across all sample groups
#' 3. MDS and NMDS plots of all pairwise comparisons
#' 4. The distance matrix
#' 5. Plot showing the optimal number of clusters  
#' 6. Dendogram for all samples in a newick tree file
#'
#' Concept:
#' A distance matrix is calculated based on the generalized UniFrac approach
#' (Chen J, et al. Associating microbiome composition with environmental covariates using generalized UniFrac distances. 2012)
#' Samples are clustered based on the distance matrix using the Ward's hierarchical clustering method
#' To determine similarities between samples, a multivariate analysis is applied
#' and sample distribution is illustrated by means of MDS and NMDS (non-metric) plots
#' The Calinski-Harabasz (CH) and Silhouette Index are used to assess the optimal number of clusters the dataset was most robustly partitioned into  

##################################################################################
######             Set parameters in this section manually                  ######
##################################################################################

#' Please set the directory of the script as the working folder (e.g D:/studyname/NGS-Data/Rhea/beta-diversity/)
#' Note: the path is denoted by forward slash "/"
setwd("D:/path/to/Rhea/3.Beta-Diversity/")  #<--- CHANGE ACCORDINGLY !!!

#' Please give the file name of the normalized OTU-table without taxonomic classification
input_otu = "OTUs_Table-norm.tab"              #<--- CHANGE ACCORDINGLY !!!

#' Please give the name of the meta-file that contains individual sample information
input_meta = "mapping_file.tab"                #<--- CHANGE ACCORDINGLY !!!

#' Please give the name of the phylogenetic tree constructed from the OTU sequences
input_tree = "OTUs-Tree.tre"                   #<--- CHANGE ACCORDINGLY !!!

#' Please give the column name (in the mapping file) of the categorical variable to be used for comparison (e.g. Genotype)
#' Please make sure that your name do not contain hyphens "-" as they will cause problems in the parsing of the names.
group_name = "Diet"                            #<--- CHANGE ACCORDINGLY !!!

##################################################################################
######                  Additional parameters                               ######
##################################################################################

#' Turn on sample labeling
#' 0 = Samples are not labeled in the MDS/NMDS plots
#' 1 = All Samples are labeled in the MDS/NMDS plots
label_samples = 0

#' Determine which sample label should appear
#' Write the name of samples (in quotation marks), which should appear in the MDS/NMDS plots, in the vector (c) below
#' If more than one sample should be plotted, please separate their IDs by comma (e.g. c("sample1","sample2"))
label_id =c("")

#' De-Novo Clustering will be performed for the number of samples or maximal for the set limit
#' Default Limit is 100
kmers_limit=20

######                  NO CHANGES ARE NEEDED BELOW THIS LINE               ######

##################################################################################
######                             Main Script                              ######
##################################################################################

###################       Load all required libraries     ########################

# Check if required packages are already installed, and install if missing
packages <-c("ade4","GUniFrac","phangorn","cluster","fpc","vegan","clusterSim") 

# Function to check whether the package is installed
InsPack <- function(pack)
{
  if ((pack %in% installed.packages()) == FALSE) {
    install.packages(pack,repos ="http://cloud.r-project.org/")
  } 
}

# Applying the installation on the list of packages
lapply(packages, InsPack)

# Make the libraries
lib <- lapply(packages, require, character.only = TRUE)

# Check if it was possible to install all required libraries
flag <- all(as.logical(lib))


###################       Read all required input files      ####################
# Load the mapping file containing individual sample information (sample names in the first column)
meta_file <- read.table (file = input_meta, check.names = FALSE, header = TRUE, dec = ".", sep = "\t", row.names = 1, comment.char = "")

# Clean table from empty lines
meta_file <- data.frame(meta_file[!apply(is.na(meta_file) | meta_file=="",1,all),,drop=FALSE])

# Order the mapping file by sample names (ascending)
meta_file <- data.frame(meta_file[order(row.names(meta_file)),,drop=FALSE])

# Save the position of the target group name in the mapping file
meta_file_pos <- which(colnames(meta_file) == group_name)

# Select metadata group based on the pre-set group name
all_groups <- as.factor(meta_file[,meta_file_pos])

#------------------------------------------------------------------------

# Load the tab-delimited file containing the values to be analyzed (samples names in the first column)
otu_file <- read.table (file = input_otu, check.names = FALSE, header = TRUE, dec = ".", sep = "\t", row.names = 1, comment.char = "")

# Clean table from empty lines
otu_file <- otu_file[!apply(is.na(otu_file) | otu_file =="",1,all),]

# keep only those rows that appear in the mapping file
otu_file <- otu_file[,rownames(meta_file)]

# OTU-table and mapping file should have the same order and number of sample names
# Order the OTU-table by sample names (ascending)
otu_file <- otu_file[,order(names(otu_file))]

# Transpose OTU-table and convert format to a data frame
otu_file <- data.frame(t(otu_file), check.names = FALSE)

#------------------------------------------------------------------------

# Load the phylogenetic tree calculated from the OTU sequences 
tree_file <- read.tree(input_tree)

# Remove single quotes from the tips of the tree
tree_file$tip.label <- gsub("'", "", tree_file$tip.label)

# Root the OTU tree at midpoint 
rooted_tree <- midpoint(tree_file)


####################       Calculate beta-diversity          ###################

# Create the directory where all output files are saved (is named after the target group name set above for comparisons)
dir.create(group_name)

# Calculate the UniFrac distance matrix for comparing microbial communities
unifracs <- GUniFrac(otu_file, rooted_tree, alpha = c(0.0,0.5,1.0))$unifracs

# Weight on abundant lineages so the distance is not dominated by highly abundant lineages with 0.5 having the best power
unifract_dist <- unifracs[, , "d_0.5"]

################ Generate tree #######################

# Save the UniFrac output as distance object
all_dist_matrix <- as.dist(unifract_dist)

# Apply a hierarchical cluster analysis on the distance matrix based on the Ward's method
all_fit <- hclust(all_dist_matrix, method = "ward.D2")

# Generates a tree from the hierarchically generated object
tree <- as.phylo(all_fit)
my_tree_file_name <- paste(group_name,"/phylogram.pdf",sep="")
plot_color<-rainbow(length(levels(all_groups)))[all_groups]

# Save the generated phylogram in a pdf file
pdf(my_tree_file_name)

# The tree is visualized as a Phylogram color-coded by the selected group name
plot(tree, type = "phylogram",use.edge.length = TRUE, tip.color = (plot_color), label.offset = 0.01)
print.phylo(tree)
axisPhylo()
tiplabels(pch = 16, col = plot_color)
dev.off()

#################            Build NMDS plot           ########################

# Generated figures are saved in a pdf file 
file_name <- paste(group_name,"beta-diversity.pdf",sep="_")
pdf(paste(group_name,"/",file_name,sep=""))

# Calculate the significance of variance to compare multivariate sample means (including two or more dependent variables)
# Omit cases where there isn't data for the sample (NA)
all_groups_comp <- all_groups[!is.na(all_groups)]
unifract_dist_comp <- unifract_dist[!is.na(all_groups), !is.na(all_groups)]
adonis<-adonis2(as.dist(unifract_dist_comp) ~ all_groups_comp)
permdisp <- permutest(betadisper(as.dist(unifract_dist_comp),as.factor(all_groups_comp),type="median"))
all_groups_comp<-factor(all_groups_comp,levels(all_groups_comp)[unique(all_groups_comp)])

if(nrow(unifract_dist_comp)>2){
  # Calculate and display the MDS plot (Multidimensional Scaling plot)
  s.class(
    cmdscale(unifract_dist_comp, k = 2), col = unique(plot_color), cpoint =
      2, fac = all_groups_comp, sub = paste("MDS plot of Microbial Profiles\nPERMDISP     p=",permdisp[["tab"]][["Pr(>F)"]][1],"\n",
                                            "PERMANOVA  p=",adonis[1,5],sep="")
  )
  if (label_samples==1) {
    lab_samples <- row.names(cmdscale(unifract_dist_comp, k = 2))
    ifelse (label_id != "",lab_samples <- replace(lab_samples, !(lab_samples %in% label_id), ""), lab_samples)
    text(cmdscale(unifract_dist_comp, k = 2),labels=lab_samples,cex=0.7,adj=c(-.1,-.8))
  }
  
  # Calculate and display the NMDS plot (Non-metric Multidimensional Scaling plot)
  meta <- metaMDS(unifract_dist_comp,k = 2)
  s.class(
    meta$points, col = unique(plot_color), cpoint = 2, fac = all_groups_comp,
    sub = paste("metaNMDS plot of Microbial Profiles\nPERMDISP     p=",permdisp[["tab"]][["Pr(>F)"]][1],"\n",
                "PERMANOVA  p=",adonis[1,5],sep="")
  )
  if (label_samples==1){
    lab_samples <- row.names(meta$points)
    ifelse (label_id != "",lab_samples <- replace(lab_samples, !(lab_samples %in% label_id), ""), lab_samples)
    text(meta$points,labels=lab_samples,cex=0.7,adj=c(-.1,-.8))
  }
}

#close the pdf file
dev.off()

###############          NMDS for pairwise analysis        ###################

# This plot is only generated if there are more than two groups included in the comparison
# Calculate the pairwise significance of variance for group pairs
# Get all groups contained in the mapping file
unique_groups <- levels(all_groups_comp)
if (dim(table(unique_groups)) > 2) {
  
  # Initialise vector and lists
  pVal = NULL
  permdisppval=NULL
  pairedMatrixList <- list(NULL)
  pair_1_list <- NULL
  pair_2_list <- NULL
  
  for (i in 1:length(combn(unique_groups,2)[1,])) {
    
    # Combine all possible pairs of groups
    pair_1 <- combn(unique_groups,2)[1,i]
    pair_2 <- combn(unique_groups,2)[2,i]
    
    # Save pairs information in a vector
    pair_1_list[i] <- pair_1
    pair_2_list[i] <- pair_2
    
    # Generate a subset of all samples within the mapping file related to one of the two groups
    inc_groups <-
      rownames(subset(meta_file, meta_file[,meta_file_pos] == pair_1
                      |
                        meta_file[,meta_file_pos] == pair_2))
    
    # Convert UniFrac distance matrix to data frame
    paired_dist <- as.data.frame(unifract_dist_comp)
    
    # Save all row names of the mapping file
    row_names <- rownames(paired_dist)
    
    # Add row names to the distance matrix
    paired_dist <- cbind(row_names,paired_dist)
    
    # Generate distance matrix with samples of the compared groups (column-wise)
    paired_dist <- paired_dist[sapply(paired_dist[,1], function(x) all(x %in% inc_groups)),]
    
    # Remove first column with unnecessary group information
    paired_dist[,1] <- NULL
    paired_dist <- rbind(row_names,paired_dist)
    
    # Generate distance matrix with samples of the compared group (row-wise)
    paired_dist <- paired_dist[,sapply(paired_dist[1,], function(x) all(x %in% inc_groups))]
    
    # Remove first row with unnecessary group information 
    paired_dist <- paired_dist[-1,]
    
    # Convert generated distance matrix to data type matrix (needed by multivariate analysis)
    paired_matrix <- as.matrix(paired_dist)
    class(paired_matrix) <- "numeric"
    
    # Save paired matrix in list
    pairedMatrixList[[i]] <- paired_matrix
    
    # Applies multivariate analysis to a pair out of the selected groups
    adonis <- adonis2(paired_matrix ~ all_groups_comp[all_groups_comp == pair_1 |
                                                        all_groups_comp == pair_2])
    
    permdisp <- permutest(betadisper(as.dist(paired_matrix),as.factor(all_groups_comp[all_groups_comp == pair_1 |
                                                                                        all_groups_comp == pair_2]),type="median"),pairwise = T)
    
    # List p-values
    pVal[i] <- adonis[1,5]
    permdisppval[i] <- permdisp$pairwise[2]
    
  }
  
  # Adjust p-values for multiple testing according to Benjamini-Hochberg method
  pVal_BH <- round(p.adjust(pVal,method="BH", n=length(pVal)),4)
  permdisppval_BH <- round(p.adjust(permdisppval,method="BH", n=length(permdisppval)),4)
  
  
  # Generated NMDS plots are stored in one pdf file called "pairwise-beta-diversity-nMDS.pdf"
  file_name <- paste(group_name,"pairwise-beta-diversity-NMDS.pdf",sep="_")
  pdf(paste(group_name,"/",file_name,sep=""))
  
  for(i in 1:length(combn(unique_groups,2)[1,])){
    if (nrow(pairedMatrixList[[i]])>2){
      meta <- metaMDS(pairedMatrixList[[i]], k = 2)
      s.class(
        meta$points,
        col = rainbow(length(levels(all_groups_comp))), cpoint = 2,
        fac = as.factor(all_groups_comp[all_groups_comp == pair_1_list[i] |
                                          all_groups_comp == pair_2_list[i]]),
        sub = paste("NMDS plot of Microbial Profiles\n ",pair_1_list[i]," - ",pair_2_list[i], "\n PERMDISP     p=",permdisppval[[i]],",","  p.adj=", permdisppval_BH[i],"\n",
                    " PERMANOVA  p=",pVal[i],","," p.adj=",pVal_BH[i],sep="")
      )
    }
  }
  dev.off()
  
  # Generated MDS plots are stored in one pdf file called "pairwise-beta-diversity-MDS.pdf"
  file_name <- paste(group_name,"pairwise-beta-diversity-MDS.pdf",sep="_")
  pdf(paste(group_name,"/",file_name,sep=""))
  
  for(i in 1:length(combn(unique_groups,2)[1,])){
    if (nrow(pairedMatrixList[[i]])>2){
      # Calculate and display the MDS plot (Multidimensional Scaling plot)
      s.class(
        cmdscale(pairedMatrixList[[i]], k = 2), col = rainbow(length(levels(all_groups_comp))), cpoint =
          2, fac = as.factor(all_groups_comp[all_groups_comp == pair_1_list[i] |
                                               all_groups_comp == pair_2_list[i]]), 
        sub = paste("MDS plot of Microbial Profiles\n ",pair_1_list[i]," - ",pair_2_list[i], "\n PERMDISP     p=",permdisppval[[i]],",","  p.adj=", permdisppval_BH[i],"\n",
                    " PERMANOVA  p=",pVal[i],","," p.adj=",pVal_BH[i],sep="")
      )
    }
  }
  dev.off()                                     
  
}

######                        Determine number of clusters                           ######

ch_nclusters=NULL
sil_nclusters=NULL
dunn_nclusters=NULL
db_nclusters=NULL

if (dim(otu_file)[1]-1 <= kmers_limit) {
  kmers_limit=dim(otu_file)[1]-1
}
for (k in 1:kmers_limit) { 
  if (k==1) {
    ch_nclusters[k]=NA 
    sil_nclusters[k]=NA
    dunn_nclusters=NA
    db_nclusters=NA
  } else {
    # Partitioning the data into k clusters (max k is number of samples within the dataset)
    data_cluster=as.vector(pam(as.dist(unifract_dist_comp), k, diss=TRUE)$clustering)
    
    # Calculate Calinski-Harabasz and silhouette Index 
    index=cluster.stats(as.dist(unifract_dist_comp),data_cluster)
    index_db=index.DB(x=otu_file,cl=data_cluster,d=as.dist(unifract_dist_comp), centrotypes="medoids")
    ch_nclusters[k] <- index[["ch"]]
    sil_nclusters[k] <- index[["avg.silwidth"]]
    dunn_nclusters[k] <- index[["dunn2"]]
    db_nclusters[k] <-  index_db[["DB"]]
    print(k)
  }
}

# Generated plot showing the optimal number of clusters
for (i in 1:2){
  if (i==1) { pdf("De-novo-clustering.pdf")}
  if (i==2) { pdf(paste0(group_name,"/","De-novo-clustering.pdf"))}
  
  plot(ch_nclusters, type="h", xlab="k clusters", ylab="CH index",main="Optimal number of clusters (CH index)")
  title(sub="*The higher the value the better", adj=0, cex.sub=0.9)
  plot(sil_nclusters, type="h", xlab="k clusters", ylab="Average silhouette width",main="Optimal number of clusters (Silhouette index)")
  title(sub="*The higher the value the better", adj=0, cex.sub=0.9)
  plot(dunn_nclusters, type="h", xlab="k clusters", ylab="Dunn Index",main="Optimal number of clusters (Dunn Index)")
  title(sub="*The higher the value the better", adj=0, cex.sub=0.9)
  plot(db_nclusters, type="h", xlab="k clusters", ylab="Davies-Bouldin Index",main="Optimal number of clusters (Davies-Bouldin Index)")
  title(sub="*The lower the value the better", adj=0, cex.sub=0.9)
  
  dev.off()
}

#################################################################################
######                        Write Output Files                           ######
#################################################################################

# Write the distance matrix table in a file
file_name <- paste(group_name,"distance-matrix-gunif.tab",sep="_")
write.table( unifract_dist_comp, paste(group_name,"/",file_name,sep=""), sep = "\t", col.names = NA, quote = FALSE)
write.table( unifract_dist_comp, "distance-matrix-gunif.tab", sep = "\t", col.names = NA, quote = FALSE)
write.tree(tree,"samples-Tree.nwk",tree.names = FALSE)

# Graphical output files are generated in the main part of the script
if(!flag) { stop("
                 It was not possible to install all required R libraries properly.
                 Please check the installation of all required libraries manually.\n
                 Required libaries:ade4, GUniFrac, phangorn")
}

#################################################################################
######                           End of Script                             ######
#################################################################################