---
title: "08.DESeq2_DEG_analysis"
output: html_document
date: "2023-11-11"
---

```{r}
library(tidyverse)
library(cowplot)
library(edgeR)
library(reshape2)
library(SingleCellExperiment)
library(pheatmap)
library(apeglm)
library(png)
library(DESeq2)
library(RColorBrewer)
library(data.table)
library(FactoMineR)
library(factoextra)  
library(tidyverse)
library(pheatmap)
library(DESeq2)
library(RColorBrewer)
library(viridis)  
library(ggplot2)
library(grid)
library(ggrepel)
library(ggrepel)
library(Cairo)
```

#PCA
```{r}
# Transform counts for data visualization
rld <- vst(DESeq, blind=TRUE)

# Plot PCA
DESeq2::plotPCA(rld, ntop = 500, intgroup = c("group.id","sample.id"))
```

```{r}
#plot the PCA
pcaPlot <- plotPCA(rld, ntop = 500, intgroup = c("group.id"), returnData = TRUE)
# Convert to ggplot object

# Set your desired figure size and font size
fig_width <- 3
fig_height <- 2
font_size <- 3 # Adjust as needed
point_size <- 3 # Adjust as needed

# Define colors for group.id (adjust the colors as needed)
#group_colors <- c("ctrl" = "blue", "MφOC" = "red", "TregTex"= "orange", "Mo"="black") # Replace with actual group names and desired colors
group_colors <- c("low" = "blue", "high"= "orange") # Replace with actual group names and desired colors
#group_colors <- c("LC" = "blue", "KC"= "orange", "BC"="red", "ctrl"="black", "CC"="purple","EC"="pink","TC"="green") # Replace with actual group names and desired colors

# Generate the plot with ggrepel for non-overlapping text labels
ggplot(pcaPlot, aes(x = PC1, y = PC2, color = group.id, label = group.id)) +
    geom_point(size = point_size) +
    geom_text_repel(size = font_size, 
                    box.padding = 0.5, 
                    point.padding = 0.3,
                    segment.size = 0.2) +
    scale_color_manual(values = group_colors) +
    theme(legend.position="none", 
          text = element_text(size = font_size),
          panel.background = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    theme_minimal() +
    theme(axis.text.x = element_text(size = font_size),
          axis.text.y = element_text(size = font_size),
          axis.title = element_text(size = font_size))


#ggsave(filename = "pca_by_group_id.pdf",device = cairo_pdf, width = 6, height = 4, dpi = 300, path = './outs/epithelium/by_muts_by_pheno/KC/')
```

#Hierarchical clustering
```{r}
# Extract the rlog matrix from the object and compute pairwise correlation values
rld_mat <- assay(rld)
rld_cor <- cor(rld_mat)

# Plot heatmap
h <- pheatmap(rld_cor, annotation = cluster_metadata[, c("group.id", "sample.id"), drop=F])
#ggsave(h, filename = "clustering.pdf",  device = cairo_pdf, width = 12, height = 9, dpi = 300, path = './outs/epithelium/by_muts_by_pheno/KC/')
```

#Running DESeq2
```{r}
# Run DESeq2 differential expression analysis
DESeq <- DESeq(DESeq)
# Plot dispersion estimates
#We can check the fit of the DESeq2 model to our data by looking at the plot of dispersion estimates.
plotDispEsts(DESeq)
#ggsave(filename = "DEGseq_plot.pdf", width = 12, height = 9, dpi = 300, path = '../outs/pheno_B/MφOC_vs_TregTex/')
```

#Results
```{r}
#Let’s compare the stimulated group relative to the control:
# Output results of Wald test for contrast for pos vs neg
#method 1 design the exp and ctl groups -----------------------------------------------------------------------------------------
levels(cluster_metadata$group.id)[1]
levels(cluster_metadata$group.id)[2]
levels(cluster_metadata$group.id)[3]
levels(cluster_metadata$group.id)[4]
levels(cluster_metadata$group.id)[5]
levels(cluster_metadata$group.id)[6]
levels(cluster_metadata$group.id)[7]

#the following contrast matrix can only compare two groups
contrast <- c("group.id", 
              levels(cluster_metadata$group.id)[1],
              #levels(cluster_metadata$group.id)[1],
              levels(cluster_metadata$group.id)[2]
              )

res <- results(DESeq, 
               contrast = contrast,
               alpha = 0.05)

res
#First let’s generate the results table for all of our results:
# Turn the results object into a tibble for use with tidyverse functions
res_tbl <- res %>%
        data.frame() %>%
        rownames_to_column(var="gene") %>%
        as_tibble()

# Check results output
res_tbl

# Write all results to file
write.csv(res_tbl,
          paste0("results/", clusters[1], "_", levels(cluster_metadata$sample)[2], "_vs_", levels(cluster_metadata$sample)[1], "_all_genes.csv"),
          quote = FALSE, 
          row.names = FALSE)


#method 2, design exp and ctl group by default comparisons----------------------------------------------------------------------------------------------
# Check the coefficients for the comparison
resultsNames(DESeq)
#[1] "Intercept"                 "group.id_DSS_vs_ctrl"      "group.id_DSS.B109_vs_ctrl"

# Generate results object
res <- results(DESeq, 
               name = "group.id_pos_vs_neg", 
               alpha = 0.05)

# Shrink the log2 fold changes to be more appropriate using the apeglm method - should cite [paper]() when using this method
res <- lfcShrink(DESeq, 
                 coef = "group.id_DSS_vs_ctrl",
                 res=res,
                 type = "apeglm")
```

#Table of results for all genes
```{r}
# Turn the results object into a tibble for use with tidyverse functions
res_tbl <- res %>%
        data.frame() %>%
        rownames_to_column(var="gene") %>%
        as_tibble()


# Check results output
res_tbl
res_tbl <- na.omit(res_tbl)
# Write all results to file
#write.csv(res_tbl,
#          quote = FALSE, 
#          row.names = FALSE,
#          './outs/epithelium/by_cancer_by_pheno/BC/res_tbl.csv')

#saveRDS(res_tbl, './outs/epithelium/by_cancer_by_pheno/BC/res_tbl.rds')
```

Table of results for significant genes
```{r}
# Set thresholds
pvalue_cutoff <- 0.05

# Subset the significant results
sig_res <- dplyr::filter(res_tbl, pvalue < pvalue_cutoff) %>%
        dplyr::arrange(pvalue)

# Check significant genes output
sig_res

#write.csv(sig_res,
#          quote = FALSE, 
#          row.names = FALSE,
#          './outs/epithelium/by_cancer_by_pheno/BC/sig_res.csv')

#saveRDS(sig_res, './outs/epithelium/by_cancer_by_pheno/BC/sig_res.rds')
```

```{r, normolize}
normalized_counts <- counts(DESeq, normalized = TRUE)
```

#Scatterplot of normalized expression of top 20 most significant genes
##Now that we have identified the significant genes, we can plot a scatterplot of the top 20 significant genes. This plot is a good check to make sure that we are interpreting our fold change values correctly, as well.
```{r}
## ggplot of top genes
##fitering parameters
log2FC_cutoff = log2(1.5)
pvalue_cutoff = 0.05

##select the differential gene list 
need_DEG <- sig_res[,c(1,3,6)] #select log2FoldChange, p value information
colnames(need_DEG) <- c('gene','log2FoldChange','p_value') 

need_DEG$significance  <- as.factor(ifelse(need_DEG$p_value < pvalue_cutoff & abs(need_DEG$log2FoldChange) > log2FC_cutoff, ifelse(need_DEG$log2FoldChange > log2FC_cutoff ,'UP','DOWN'),'NOT'))

title <- paste0(' Up :  ',nrow(need_DEG[need_DEG$significance =='UP',]) ,
                     '\n Down : ',nrow(need_DEG[need_DEG$significance =='DOWN',]),
                     '\n FoldChange >= ',round(2^log2FC_cutoff,3))

need_DEG$delabel <- NA
need_DEG$delabel[need_DEG$significance != "NOT"] <- row.names(need_DEG)[need_DEG$significance != "NOT"]

g <- ggplot(data=need_DEG, 
            aes(x=log2FoldChange, y=-log10(p_value), 
                color=significance, label=delabel)) +
  #dots and background
  geom_point(alpha=0.4, size=1) +
  theme_classic()+ #grid
  xlab("log2 ( FoldChange )") + 
  ylab("-log10 ( P.value )") +
  ggtitle( title ) +
  scale_colour_manual(values = c('blue','grey'
                                 ,'red'))+ 
  scale_fill_brewer(palette="Set1")+
  geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=4,col="grey",lwd=0.5) +
  geom_hline(yintercept = -log10(pvalue_cutoff),lty=4,col="grey",lwd=0.5) +
  
  theme(plot.title = element_text(hjust = 0.5), 
        plot.margin=unit(c(2,2,2,2),'lines'), #top, right, bottom, left
        legend.title = element_blank(), 
        legend.position="right")
g

#ggsave(g,filename = './outs/epithelium/by_cancer_by_pheno/BC/vocanol_no_labeling.pdf',width =6,height =4, dpi= 300)


```

```{r, with dot labeled}
need_DEG$significance<- "NOT"

need_DEG$significance[need_DEG$log2FoldChange > 0.58 & need_DEG$p_value < 0.05] <- "UP"

need_DEG$significance[need_DEG$log2FoldChange < -0.58 & need_DEG$p_value < 0.05] <- "DOWN"


need_DEG %>% remove_rownames %>% column_to_rownames(var="gene")

need_DEG$delabel <- NA
need_DEG$delabel[need_DEG$significance != "NOT"] <- need_DEG$gene[need_DEG$significance != "NOT"]

v <- ggplot(data=need_DEG, aes(x=log2FoldChange, y=-log10(p_value), col=significance, label=delabel)) +
        geom_point() + 
        theme_minimal() +
        geom_text_repel() +
        scale_color_manual(values=c("blue"
                                    , "grey"
                                    , "red")) +
        geom_vline(xintercept=c(-0.6, 0.6),lty=2,col="grey",lwd=0.5) +
        #geom_hline(yintercept=-log10(0.05), lty=4,col="grey",lwd=0.8) +
    geom_point(alpha=0.4, size=1) +
  theme_classic()+ #grid
  xlab("log2 ( Fold Change )") + 
  ylab("-log10 ( P value )") +
  ggtitle( title ) +
  scale_colour_manual(values = c('blue'
                                 ,'grey'
                                 ,'red'))+ 
  geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=2,col="grey",lwd=0.5) +
  geom_hline(yintercept = -log10(pvalue_cutoff),lty=2,col="grey",lwd=0.5) +
  
  theme(plot.title = element_text(hjust = 0.5), 
        plot.margin=unit(c(2,2,2,2),'lines'), #top, right, bottom, left
        legend.title = element_blank(), 
        legend.position="right")

v

#ggsave(v,filename = './outs/epithelium/by_cancer_by_pheno/BC/vocanol_labeling.pdf',width =6,height =4, dpi= 300)

```

```{r, Heatmap}
# Heatmap
## Extract normalized counts for significant genes only
sig_counts <- normalized_counts[rownames(normalized_counts) %in% sig_res$gene, ]

## Set a color-blind friendly palette
heat_colors <- rev(brewer.pal(10, "PuOr"))
breaks <- seq(-1.5, 1.5, length.out = length(heat_colors) + 1)

## Run pheatmap using the metadata data frame for the annotation
hp1 <- pheatmap(sig_counts, 
         color = heat_colors, 
        breaks = breaks,  # Use the defined breaks
         cluster_rows = TRUE, 
         show_rownames = FALSE,
         annotation = cluster_metadata[, c("sample.id", "group.id", "cell.id")], 
         border_color = NA, 
         fontsize = 7, 
         scale = "row", 
         fontsize_row = 10, 
         height = 20,
         angle_col = 45)

#ggsave(hp1, filename = 'heatmap_sig_genes.pdf',width =10,height =18, dpi = 300, path = './outs/epithelium/by_cancer_by_pheno/BC/')
```


```{r}
#make the plot color darker, adjust the color scale
# Function to make colors darker
sig_counts <- normalized_counts[rownames(normalized_counts) %in% sig_res$gene, ]

## Set a color-blind friendly palette
heat_colors <- rev(brewer.pal(10, "PuOr"))

#assign the colors for annotation
#group_colors <- c(#"ctrl" = "black", "MφOC" = "red", 
                  #"high"= "orange", "low"="purple")
group_colors <- c("MφΟC" = "blue", "ΤregTex"= "orange", "Mo" = "green") # Replace with actual group names and desired colors

cluster_metadata$group.id <- factor(cluster_metadata$group.id)
# Create a list of annotation colors
annotation_colors <- list(group.id = group_colors)
# Define the breaks for the colors, ensuring to cover the range -4 to 4
breaks <- seq(-1, 1, length.out = length(heat_colors) + 0.5)

# Run pheatmap using the metadata data frame for the annotation
hp1 <- pheatmap(sig_counts, 
         color = heat_colors, 
         breaks = breaks,  # Use the defined breaks
         cluster_rows = TRUE, 
         show_rownames = FALSE,
         annotation_colors = annotation_colors, 
         annotation = cluster_metadata[, c("sample.id", "group.id", "cell.id")], 
         border_color = NA, 
         fontsize = 7, 
         scale = "row", 
         fontsize_row = 5, 
         height = 15,
         angle_col = 90)
#ggsave(hp1, filename = 'heatmap_sig_genes_.pdf',device = cairo_pdf, width =8,height =10, dpi = 300, path = './outs/stroma/OB_macs_vs_t/')

```