---
title: "08.DESeq2_DEG_analysis"
output: html_document
date: "2023-11-11"
---
```{r}
library(tidyverse)
library(cowplot)
library(edgeR)
library(reshape2)
library(SingleCellExperiment)
library(pheatmap)
library(apeglm)
library(png)
library(DESeq2)
library(RColorBrewer)
library(data.table)
library(FactoMineR)
library(factoextra)
library(tidyverse)
library(pheatmap)
library(DESeq2)
library(RColorBrewer)
library(viridis)
library(ggplot2)
library(grid)
library(ggrepel)
library(ggrepel)
library(Cairo)
```
#PCA
```{r}
# Transform counts for data visualization
rld <- vst(DESeq, blind=TRUE)
# Plot PCA
DESeq2::plotPCA(rld, ntop = 500, intgroup = c("group.id","sample.id"))
```
```{r}
#plot the PCA
pcaPlot <- plotPCA(rld, ntop = 500, intgroup = c("group.id"), returnData = TRUE)
# Convert to ggplot object
# Set your desired figure size and font size
fig_width <- 3
fig_height <- 2
font_size <- 3 # Adjust as needed
point_size <- 3 # Adjust as needed
# Define colors for group.id (adjust the colors as needed)
#group_colors <- c("ctrl" = "blue", "MφOC" = "red", "TregTex"= "orange", "Mo"="black") # Replace with actual group names and desired colors
group_colors <- c("low" = "blue", "high"= "orange") # Replace with actual group names and desired colors
#group_colors <- c("LC" = "blue", "KC"= "orange", "BC"="red", "ctrl"="black", "CC"="purple","EC"="pink","TC"="green") # Replace with actual group names and desired colors
# Generate the plot with ggrepel for non-overlapping text labels
ggplot(pcaPlot, aes(x = PC1, y = PC2, color = group.id, label = group.id)) +
geom_point(size = point_size) +
geom_text_repel(size = font_size,
box.padding = 0.5,
point.padding = 0.3,
segment.size = 0.2) +
scale_color_manual(values = group_colors) +
theme(legend.position="none",
text = element_text(size = font_size),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
theme_minimal() +
theme(axis.text.x = element_text(size = font_size),
axis.text.y = element_text(size = font_size),
axis.title = element_text(size = font_size))
#ggsave(filename = "pca_by_group_id.pdf",device = cairo_pdf, width = 6, height = 4, dpi = 300, path = './outs/epithelium/by_muts_by_pheno/KC/')
```
#Hierarchical clustering
```{r}
# Extract the rlog matrix from the object and compute pairwise correlation values
rld_mat <- assay(rld)
rld_cor <- cor(rld_mat)
# Plot heatmap
h <- pheatmap(rld_cor, annotation = cluster_metadata[, c("group.id", "sample.id"), drop=F])
#ggsave(h, filename = "clustering.pdf", device = cairo_pdf, width = 12, height = 9, dpi = 300, path = './outs/epithelium/by_muts_by_pheno/KC/')
```
#Running DESeq2
```{r}
# Run DESeq2 differential expression analysis
DESeq <- DESeq(DESeq)
# Plot dispersion estimates
#We can check the fit of the DESeq2 model to our data by looking at the plot of dispersion estimates.
plotDispEsts(DESeq)
#ggsave(filename = "DEGseq_plot.pdf", width = 12, height = 9, dpi = 300, path = '../outs/pheno_B/MφOC_vs_TregTex/')
```
#Results
```{r}
#Let’s compare the stimulated group relative to the control:
# Output results of Wald test for contrast for pos vs neg
#method 1 design the exp and ctl groups -----------------------------------------------------------------------------------------
levels(cluster_metadata$group.id)[1]
levels(cluster_metadata$group.id)[2]
levels(cluster_metadata$group.id)[3]
levels(cluster_metadata$group.id)[4]
levels(cluster_metadata$group.id)[5]
levels(cluster_metadata$group.id)[6]
levels(cluster_metadata$group.id)[7]
#the following contrast matrix can only compare two groups
contrast <- c("group.id",
levels(cluster_metadata$group.id)[1],
#levels(cluster_metadata$group.id)[1],
levels(cluster_metadata$group.id)[2]
)
res <- results(DESeq,
contrast = contrast,
alpha = 0.05)
res
#First let’s generate the results table for all of our results:
# Turn the results object into a tibble for use with tidyverse functions
res_tbl <- res %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
# Check results output
res_tbl
# Write all results to file
write.csv(res_tbl,
paste0("results/", clusters[1], "_", levels(cluster_metadata$sample)[2], "_vs_", levels(cluster_metadata$sample)[1], "_all_genes.csv"),
quote = FALSE,
row.names = FALSE)
#method 2, design exp and ctl group by default comparisons----------------------------------------------------------------------------------------------
# Check the coefficients for the comparison
resultsNames(DESeq)
#[1] "Intercept" "group.id_DSS_vs_ctrl" "group.id_DSS.B109_vs_ctrl"
# Generate results object
res <- results(DESeq,
name = "group.id_pos_vs_neg",
alpha = 0.05)
# Shrink the log2 fold changes to be more appropriate using the apeglm method - should cite [paper]() when using this method
res <- lfcShrink(DESeq,
coef = "group.id_DSS_vs_ctrl",
res=res,
type = "apeglm")
```
#Table of results for all genes
```{r}
# Turn the results object into a tibble for use with tidyverse functions
res_tbl <- res %>%
data.frame() %>%
rownames_to_column(var="gene") %>%
as_tibble()
# Check results output
res_tbl
res_tbl <- na.omit(res_tbl)
# Write all results to file
#write.csv(res_tbl,
# quote = FALSE,
# row.names = FALSE,
# './outs/epithelium/by_cancer_by_pheno/BC/res_tbl.csv')
#saveRDS(res_tbl, './outs/epithelium/by_cancer_by_pheno/BC/res_tbl.rds')
```
Table of results for significant genes
```{r}
# Set thresholds
pvalue_cutoff <- 0.05
# Subset the significant results
sig_res <- dplyr::filter(res_tbl, pvalue < pvalue_cutoff) %>%
dplyr::arrange(pvalue)
# Check significant genes output
sig_res
#write.csv(sig_res,
# quote = FALSE,
# row.names = FALSE,
# './outs/epithelium/by_cancer_by_pheno/BC/sig_res.csv')
#saveRDS(sig_res, './outs/epithelium/by_cancer_by_pheno/BC/sig_res.rds')
```
```{r, normolize}
normalized_counts <- counts(DESeq, normalized = TRUE)
```
#Scatterplot of normalized expression of top 20 most significant genes
##Now that we have identified the significant genes, we can plot a scatterplot of the top 20 significant genes. This plot is a good check to make sure that we are interpreting our fold change values correctly, as well.
```{r}
## ggplot of top genes
##fitering parameters
log2FC_cutoff = log2(1.5)
pvalue_cutoff = 0.05
##select the differential gene list
need_DEG <- sig_res[,c(1,3,6)] #select log2FoldChange, p value information
colnames(need_DEG) <- c('gene','log2FoldChange','p_value')
need_DEG$significance <- as.factor(ifelse(need_DEG$p_value < pvalue_cutoff & abs(need_DEG$log2FoldChange) > log2FC_cutoff, ifelse(need_DEG$log2FoldChange > log2FC_cutoff ,'UP','DOWN'),'NOT'))
title <- paste0(' Up : ',nrow(need_DEG[need_DEG$significance =='UP',]) ,
'\n Down : ',nrow(need_DEG[need_DEG$significance =='DOWN',]),
'\n FoldChange >= ',round(2^log2FC_cutoff,3))
need_DEG$delabel <- NA
need_DEG$delabel[need_DEG$significance != "NOT"] <- row.names(need_DEG)[need_DEG$significance != "NOT"]
g <- ggplot(data=need_DEG,
aes(x=log2FoldChange, y=-log10(p_value),
color=significance, label=delabel)) +
#dots and background
geom_point(alpha=0.4, size=1) +
theme_classic()+ #grid
xlab("log2 ( FoldChange )") +
ylab("-log10 ( P.value )") +
ggtitle( title ) +
scale_colour_manual(values = c('blue','grey'
,'red'))+
scale_fill_brewer(palette="Set1")+
geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=4,col="grey",lwd=0.5) +
geom_hline(yintercept = -log10(pvalue_cutoff),lty=4,col="grey",lwd=0.5) +
theme(plot.title = element_text(hjust = 0.5),
plot.margin=unit(c(2,2,2,2),'lines'), #top, right, bottom, left
legend.title = element_blank(),
legend.position="right")
g
#ggsave(g,filename = './outs/epithelium/by_cancer_by_pheno/BC/vocanol_no_labeling.pdf',width =6,height =4, dpi= 300)
```
```{r, with dot labeled}
need_DEG$significance<- "NOT"
need_DEG$significance[need_DEG$log2FoldChange > 0.58 & need_DEG$p_value < 0.05] <- "UP"
need_DEG$significance[need_DEG$log2FoldChange < -0.58 & need_DEG$p_value < 0.05] <- "DOWN"
need_DEG %>% remove_rownames %>% column_to_rownames(var="gene")
need_DEG$delabel <- NA
need_DEG$delabel[need_DEG$significance != "NOT"] <- need_DEG$gene[need_DEG$significance != "NOT"]
v <- ggplot(data=need_DEG, aes(x=log2FoldChange, y=-log10(p_value), col=significance, label=delabel)) +
geom_point() +
theme_minimal() +
geom_text_repel() +
scale_color_manual(values=c("blue"
, "grey"
, "red")) +
geom_vline(xintercept=c(-0.6, 0.6),lty=2,col="grey",lwd=0.5) +
#geom_hline(yintercept=-log10(0.05), lty=4,col="grey",lwd=0.8) +
geom_point(alpha=0.4, size=1) +
theme_classic()+ #grid
xlab("log2 ( Fold Change )") +
ylab("-log10 ( P value )") +
ggtitle( title ) +
scale_colour_manual(values = c('blue'
,'grey'
,'red'))+
geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=2,col="grey",lwd=0.5) +
geom_hline(yintercept = -log10(pvalue_cutoff),lty=2,col="grey",lwd=0.5) +
theme(plot.title = element_text(hjust = 0.5),
plot.margin=unit(c(2,2,2,2),'lines'), #top, right, bottom, left
legend.title = element_blank(),
legend.position="right")
v
#ggsave(v,filename = './outs/epithelium/by_cancer_by_pheno/BC/vocanol_labeling.pdf',width =6,height =4, dpi= 300)
```
```{r, Heatmap}
# Heatmap
## Extract normalized counts for significant genes only
sig_counts <- normalized_counts[rownames(normalized_counts) %in% sig_res$gene, ]
## Set a color-blind friendly palette
heat_colors <- rev(brewer.pal(10, "PuOr"))
breaks <- seq(-1.5, 1.5, length.out = length(heat_colors) + 1)
## Run pheatmap using the metadata data frame for the annotation
hp1 <- pheatmap(sig_counts,
color = heat_colors,
breaks = breaks, # Use the defined breaks
cluster_rows = TRUE,
show_rownames = FALSE,
annotation = cluster_metadata[, c("sample.id", "group.id", "cell.id")],
border_color = NA,
fontsize = 7,
scale = "row",
fontsize_row = 10,
height = 20,
angle_col = 45)
#ggsave(hp1, filename = 'heatmap_sig_genes.pdf',width =10,height =18, dpi = 300, path = './outs/epithelium/by_cancer_by_pheno/BC/')
```
```{r}
#make the plot color darker, adjust the color scale
# Function to make colors darker
sig_counts <- normalized_counts[rownames(normalized_counts) %in% sig_res$gene, ]
## Set a color-blind friendly palette
heat_colors <- rev(brewer.pal(10, "PuOr"))
#assign the colors for annotation
#group_colors <- c(#"ctrl" = "black", "MφOC" = "red",
#"high"= "orange", "low"="purple")
group_colors <- c("MφΟC" = "blue", "ΤregTex"= "orange", "Mo" = "green") # Replace with actual group names and desired colors
cluster_metadata$group.id <- factor(cluster_metadata$group.id)
# Create a list of annotation colors
annotation_colors <- list(group.id = group_colors)
# Define the breaks for the colors, ensuring to cover the range -4 to 4
breaks <- seq(-1, 1, length.out = length(heat_colors) + 0.5)
# Run pheatmap using the metadata data frame for the annotation
hp1 <- pheatmap(sig_counts,
color = heat_colors,
breaks = breaks, # Use the defined breaks
cluster_rows = TRUE,
show_rownames = FALSE,
annotation_colors = annotation_colors,
annotation = cluster_metadata[, c("sample.id", "group.id", "cell.id")],
border_color = NA,
fontsize = 7,
scale = "row",
fontsize_row = 5,
height = 15,
angle_col = 90)
#ggsave(hp1, filename = 'heatmap_sig_genes_.pdf',device = cairo_pdf, width =8,height =10, dpi = 300, path = './outs/stroma/OB_macs_vs_t/')
```