---
title: "additional data analysis"
output: html_document
date: "2024-09-16"
---
```{r}
# Load necessary libraries
rm(list=ls())
library(GSVA)
library(GSEABase)
library(pheatmap)
```
```{r}
# Step 1: Load expression data
# The expression matrix should have genes as rows and patients as columns
# Replace 'path/to/expression_matrix.txt' with your actual file path
load("/Volumes/Fengshuo_14T/Lab/Yunfeng/cancer_comparsions/Zenodo/data/Bulk_Microarray_Data(published)/PMID-28878133.RData")
marker <- readRDS("./data/Bulk_Microarray_Data(published)/cell_type_markers_from_scRNA_data_for_estimation.rds")
expression_matrix <- adrain.clps
# Step 2: Use your marker data
# Assuming 'marker' is your data frame with cell type markers
# If 'marker' is not already in your R environment, read it from a file
# marker <- read.table("path/to/marker_data.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)
# Create a list of gene sets (one per cell type)
cell_type_markers <- split(marker$gene, marker$cluster)
cell_type_markers <- lapply(cell_type_markers, function(genes) {
toupper(genes[1:min(20, length(genes))]) # Keep top 20 genes
})
# Step 3: Ensure gene names match between expression data and cell markers
# Convert gene names to uppercase (or lowercase) if necessary to match
rownames(expression_matrix) <- toupper(rownames(expression_matrix))
cell_type_markers <- lapply(cell_type_markers, toupper)
# Step 4: Run ssGSEA to calculate enrichment scores
ssgsea_scores <- gsva(expression_matrix, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE)
# ssgsea_scores is a matrix with cell types as rows and patients as columns
# Step 5: Generate heatmap
# Optionally, scale the scores across cell types (rows) or patients (columns)
# Step 7: Generate heatmap for patients of type "M"
breaks <- seq(-1.5, 1.5, length.out = 100)
# Generate heatmap
pheatmap(ssgsea_scores,
scale = "row",
breaks = breaks,
clustering_distance_rows = 'correlation',
clustering_distance_cols = 'correlation',
clustering_method = 'average',
show_colnames = TRUE,
show_rownames = TRUE,
cluster_rows = T,
cluster_cols = TRUE,
fontsize_row = 10,
fontsize_col = 10)
```
```{r}
# Step 5: Subset the expression data to include only patients where type == "M"
# Get the sample names where type == "M"
samples_M <- rownames(adrain.ann)[adrain.ann$type == "M"]
# Ensure that these sample names are present in the expression matrix
samples_M <- intersect(samples_M, colnames(expression_matrix))
# Subset the expression matrix
expression_matrix_M <- expression_matrix[, samples_M]
# Step 6: Run ssGSEA to calculate enrichment scores using the subsetted data
ssgsea_scores_M <- gsva(expression_matrix_M, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE)
# ssgsea_scores_M is a matrix with cell types as rows and patients as columns (only type "M")
# Step 7: Generate heatmap for patients of type "M"
breaks <- seq(-1.5, 1.5, length.out = 100)
# Generate heatmap
pheatmap(ssgsea_scores_M,
scale = "row",
breaks = breaks,
clustering_distance_rows = 'correlation',
clustering_distance_cols = 'correlation',
clustering_method = 'average',
show_colnames = TRUE,
show_rownames = TRUE,
cluster_rows = T,
cluster_cols = TRUE,
fontsize_row = 10,
fontsize_col = 10)
```
```{r}
# Prepare gene names
rownames(expression_matrix) <- toupper(rownames(expression_matrix))
cell_type_markers <- split(marker$gene, marker$cluster)
cell_type_markers <- lapply(cell_type_markers, toupper)
cell_type_markers <- lapply(cell_type_markers, function(genes) {
toupper(genes[1:min(10, length(genes))]) # Keep top 20 genes
})
# Run ssGSEA to calculate enrichment scores using all cell types
ssgsea_scores_all <- gsva(expression_matrix, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE)
# Filter out unwanted cell types after calculating ssGSEA scores
cell_types_to_keep <- c("pro Mono", "CD14hi Mono", "CD16hi Mono", "OC", "Mφ", "CD4 Treg", "CD8 Tex", "exhausting CD8 T")
rownames(ssgsea_scores_all) <- trimws(rownames(ssgsea_scores_all))
cell_types_to_keep <- trimws(cell_types_to_keep)
ssgsea_scores_filtered <- ssgsea_scores_all[cell_types_to_keep, , drop=FALSE]
# Filter patients where patient type is "M"
common_samples <- intersect(colnames(ssgsea_scores_filtered), rownames(adrain.ann))
adrain.ann <- adrain.ann[common_samples, ]
ssgsea_scores_filtered <- ssgsea_scores_filtered[, common_samples]
samples_type_M <- rownames(adrain.ann)[adrain.ann$type == "P"]
ssgsea_scores_filtered <- ssgsea_scores_filtered[, samples_type_M]
# Scale the data after filtering
ssgsea_scores_scaled <- t(scale(t(ssgsea_scores_filtered)))
# Generate heatmap
# Adjust the scale range to -1 to 1
# Adjust the scale range to -1 to 1
breaks <- seq(-1.5, 1.5, length.out = 100)
# Generate heatmap
pheatmap(ssgsea_scores_scaled,
breaks = breaks,
clustering_distance_rows = 'correlation',
clustering_distance_cols = 'correlation',
clustering_method = 'complete',
show_colnames = TRUE,
show_rownames = TRUE,
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize_row = 10,
fontsize_col = 10)
```
```{r}
# Load necessary libraries
library(GSVA)
library(GSEABase)
library(pheatmap)
# Assuming 'expression_matrix' is your expression data matrix with genes as rows and samples as columns
# Assuming 'marker' is your data frame with cell type markers
# Prepare gene names and keep top 10 genes for each cell type
rownames(expression_matrix) <- toupper(rownames(expression_matrix))
# Split the marker data into cell types and only keep the top 10 genes per cell type
cell_type_markers <- split(marker$gene, marker$cluster)
cell_type_markers <- lapply(cell_type_markers, function(genes) {
toupper(genes[1:min(10, length(genes))]) # Keep top 10 genes
})
# Run ssGSEA to calculate enrichment scores using all cell types
ssgsea_scores_all <- gsva(expression_matrix, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE)
# Filter out unwanted cell types after calculating ssGSEA scores
cell_types_to_keep <- c("pro Mono", "CD14hi Mono", "CD16hi Mono", "OC", "Mφ", "CD4 Treg", "CD8 Tex","exhausting CD8 T")
rownames(ssgsea_scores_all) <- trimws(rownames(ssgsea_scores_all))
cell_types_to_keep <- trimws(cell_types_to_keep)
ssgsea_scores_filtered <- ssgsea_scores_all[cell_types_to_keep, , drop=FALSE]
# Scale the data after filtering
ssgsea_scores_scaled <- t(scale(t(ssgsea_scores_filtered)))
# Generate heatmap
# Adjust the scale range to -1.5 to 1.5
breaks <- seq(-1.5, 1.5, length.out = 100)
pheatmap(ssgsea_scores_scaled,
breaks = breaks,
clustering_distance_rows = 'correlation',
clustering_distance_cols = 'correlation',
clustering_method = 'complete',
show_colnames = TRUE,
show_rownames = TRUE,
cluster_rows = FALSE,
cluster_cols = TRUE,
fontsize_row = 10,
fontsize_col = 10)
```
```{r}
# Load necessary libraries
library(reshape2)
library(ggplot2)
library(gridExtra)
# Assuming your data frame is named 'ssgsea_scores_filtered'
# Normalize the data from 0 to 1 per cell type (row-wise normalization)
normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) (x - min(x)) / (max(x) - min(x))))
normalized_data <- as.data.frame(normalized_data)
# Extract "P" and "M" samples
P_samples <- normalized_data[, grep("P$", colnames(normalized_data))]
M_samples <- normalized_data[, grep("M$", colnames(normalized_data))]
# Names of the samples
P_names <- colnames(P_samples)
M_names <- colnames(M_samples)
# Initialize a list to store the plots
plots <- list()
# Loop over each cell type to generate heatmaps
for (cell_type in rownames(normalized_data)) {
# Extract data for current cell type
P_values <- as.numeric(P_samples[cell_type, ])
M_values <- as.numeric(M_samples[cell_type, ])
# Create data frames for correlation
P_df <- data.frame(P_values)
M_df <- data.frame(M_values)
# Assign sample names to the rows
rownames(P_df) <- P_names
rownames(M_df) <- M_names
# Combine P and M values into one data frame
combined_df <- data.frame(Sample = c(P_names, M_names),
Group = c(rep("P", length(P_values)), rep("M", length(M_values))),
Value = c(P_values, M_values))
# Compute the correlation matrix (since we have only one value per sample, correlation isn't meaningful here)
# Instead, we'll create a confusion matrix-style plot of the values
# Create a matrix of P vs M sample values
value_matrix <- outer(P_values, M_values, function(x, y) (x + y) / 2)
rownames(value_matrix) <- P_names
colnames(value_matrix) <- M_names
# Melt the matrix for ggplot2
value_melted <- melt(value_matrix)
# Generate the heatmap
p <- ggplot(value_melted, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "red", name="Value") +
theme_minimal() +
ggtitle(cell_type) +
xlab("P Samples") +
ylab("M Samples") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Add the plot to the list
plots[[cell_type]] <- p
}
# Combine all the heatmaps into one figure
do.call(grid.arrange, c(plots, ncol = 2))
```
```{r}
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(gridExtra)
# Assuming your data frame is named 'ssgsea_scores_filtered'
# Step 1: Normalize the data from 0 to 1 per cell type (row-wise normalization)
normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) {
(x - min(x)) / (max(x) - min(x))
}))
normalized_data <- as.data.frame(normalized_data)
# Step 2: Transpose the data so that samples are rows and cell types are columns
normalized_data_t <- t(normalized_data)
normalized_data_t <- as.data.frame(normalized_data_t)
# Step 3: Add sample group information ("P" or "M") to each sample
normalized_data_t$SampleName <- rownames(normalized_data_t)
normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M")
# Step 4: Process each cell type
# Since we need to plot one confusion matrix per cell type, we'll loop over each cell type
# Initialize a list to store the plots
plots <- list()
cell_types <- colnames(normalized_data_t)[!colnames(normalized_data_t) %in% c("SampleName", "Group")]
for (cell_type in cell_types) {
# Extract data for the current cell type
data_cell_type <- normalized_data_t[, c(cell_type, "SampleName", "Group")]
# Extract "M" samples and perform hierarchical clustering to get sample order
M_samples <- data_cell_type[data_cell_type$Group == "M", ]
M_data <- M_samples[, cell_type, drop = FALSE]
rownames(M_data) <- M_samples$SampleName
# Check if we have enough "M" samples to cluster
if (nrow(M_data) > 2) {
# Compute distance matrix and perform clustering
dist_M <- dist(M_data)
hc <- hclust(dist_M)
sample_order_M <- M_samples$SampleName[hc$order]
} else {
# If not enough samples, use existing order
sample_order_M <- M_samples$SampleName
}
# Get sample IDs without the group suffix
sample_ids_ordered <- sub("[PM]$", "", sample_order_M)
# Now, for all samples (both "P" and "M"), create the sample order based on sample IDs
sample_order <- c()
for (id in sample_ids_ordered) {
# Check for "P" sample
P_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "P")]
if (length(P_sample_name) > 0) {
sample_order <- c(sample_order, P_sample_name)
}
# Check for "M" sample
M_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "M")]
if (length(M_sample_name) > 0) {
sample_order <- c(sample_order, M_sample_name)
}
}
# Reorder the data
data_ordered <- data_cell_type[match(sample_order, data_cell_type$SampleName), ]
data_vector <- data_ordered[, cell_type]
names(data_vector) <- data_ordered$SampleName
group_labels <- data_ordered$Group
# Remove "P" and "M" suffixes from sample names for plotting
sample_order_no_suffix <- sub("[PM]$", "", sample_order)
# Create a matrix to hold the differences
n <- length(data_vector)
diff_matrix <- outer(data_vector, data_vector, FUN = function(x, y) abs(x - y))
rownames(diff_matrix) <- colnames(diff_matrix) <- sample_order_no_suffix
# Create a mask for the upper-left and lower-right triangles
mask <- matrix(NA, n, n)
rownames(mask) <- colnames(mask) <- sample_order_no_suffix
for (i in 1:n) {
for (j in 1:n) {
if (group_labels[i] == "P" && group_labels[j] == "P" && i >= j) {
# Upper-left triangle: P vs P
mask[i, j] <- diff_matrix[i, j]
} else if (group_labels[i] == "M" && group_labels[j] == "M" && i <= j) {
# Lower-right triangle: M vs M
mask[i, j] <- diff_matrix[i, j]
}
# Else, leave as NA
}
}
# Melt the masked matrix for plotting
masked_melted <- melt(mask, na.rm = TRUE)
# Generate the heatmap
p <- ggplot(masked_melted, aes(x = Var2, y = Var1, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "white", high = "red", name="Abs Difference") +
theme_minimal() +
xlab("Samples") +
ylab("Samples") +
ggtitle(cell_type) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# Add the plot to the list
plots[[cell_type]] <- p
}
# Step 5: Combine all the plots into one figure
do.call(grid.arrange, c(plots, ncol = 2))
```
```{r}
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(gridExtra)
library(RColorBrewer) # For enhanced color palettes
# Assuming your data frame is named 'ssgsea_scores_filtered'
# Step 1: Normalize the data per cell type using Z-score normalization
normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) {
(x - mean(x)) / sd(x)
}))
normalized_data <- as.data.frame(normalized_data)
# Step 2: Transpose the data so that samples are rows and cell types are columns
normalized_data_t <- t(normalized_data)
normalized_data_t <- as.data.frame(normalized_data_t)
# Step 3: Add sample group information ("P" or "M") to each sample
normalized_data_t$SampleName <- rownames(normalized_data_t)
normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M")
# Step 4: Process each cell type
# Loop over each cell type to generate heatmaps
plots <- list()
cell_types <- colnames(normalized_data_t)[!colnames(normalized_data_t) %in% c("SampleName", "Group")]
for (cell_type in cell_types) {
# Extract data for the current cell type
data_cell_type <- normalized_data_t[, c(cell_type, "SampleName", "Group")]
# Extract "M" samples and perform hierarchical clustering to get sample order
M_samples <- data_cell_type[data_cell_type$Group == "M", ]
M_data <- M_samples[, cell_type, drop = FALSE]
rownames(M_data) <- M_samples$SampleName
# Check if we have enough "M" samples to cluster
if (nrow(M_data) > 2) {
# Compute distance matrix and perform clustering
dist_M <- dist(M_data)
hc <- hclust(dist_M)
sample_order_M <- M_samples$SampleName[hc$order]
} else {
# If not enough samples, use existing order
sample_order_M <- M_samples$SampleName
}
# Get sample IDs without the group suffix
sample_ids_ordered <- sub("[PM]$", "", sample_order_M)
# Create the sample order based on sample IDs
sample_order <- c()
for (id in sample_ids_ordered) {
# Add "P" and "M" samples if they exist
P_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "P")]
if (length(P_sample_name) > 0) {
sample_order <- c(sample_order, P_sample_name)
}
M_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "M")]
if (length(M_sample_name) > 0) {
sample_order <- c(sample_order, M_sample_name)
}
}
# Reorder the data
data_ordered <- data_cell_type[match(sample_order, data_cell_type$SampleName), ]
data_vector <- data_ordered[, cell_type]
names(data_vector) <- data_ordered$SampleName
group_labels <- data_ordered$Group
# Remove "P" and "M" suffixes from sample names for plotting
sample_order_no_suffix <- sub("[PM]$", "", sample_order)
# Step 5: Create a matrix of differences (without absolute value)
n <- length(data_vector)
diff_matrix <- outer(data_vector, data_vector, FUN = function(x, y) x - y)
rownames(diff_matrix) <- colnames(diff_matrix) <- sample_order_no_suffix
# Step 6: Create a mask for the upper-left and lower-right triangles
mask <- matrix(NA, n, n)
rownames(mask) <- colnames(mask) <- sample_order_no_suffix
for (i in 1:n) {
for (j in 1:n) {
if (group_labels[i] == "P" && group_labels[j] == "P" && i >= j) {
# Upper-left triangle: P vs P
mask[i, j] <- diff_matrix[i, j]
} else if (group_labels[i] == "M" && group_labels[j] == "M" && i <= j) {
# Lower-right triangle: M vs M
mask[i, j] <- diff_matrix[i, j]
}
# Else, leave as NA
}
}
# Melt the masked matrix for plotting
masked_melted <- melt(mask, na.rm = TRUE)
# Step 7: Adjust the color scale limits to enhance contrast
max_abs_value <- max(abs(masked_melted$value))
# Step 8: Generate the heatmap with enhanced color contrast
p <- ggplot(masked_melted, aes(x = Var2, y = Var1, fill = value)) +
geom_tile(color = "grey80") +
scale_fill_gradientn(colors = rev(brewer.pal(11, "PiGN")),
limits = c(-max_abs_value, max_abs_value),
name = "Difference (Z-score)") +
theme_minimal() +
xlab("Samples") +
ylab("Samples") +
ggtitle(cell_type) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# Add the plot to the list
plots[[cell_type]] <- p
}
# Step 9: Combine all the plots into one figure
do.call(grid.arrange, c(plots, ncol = 2))
```
```{r}
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(RColorBrewer)
library(tidyr)
library(dplyr)
# Assuming your data frame is named 'ssgsea_scores_filtered'
# Step 1: Normalize the data per cell type using Z-score normalization
normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) {
(x - mean(x)) / sd(x)
}))
normalized_data <- as.data.frame(normalized_data)
# Step 2: Transpose the data so that samples are rows and cell types are columns
normalized_data_t <- t(normalized_data)
normalized_data_t <- as.data.frame(normalized_data_t)
# Add sample information
normalized_data_t$SampleName <- rownames(normalized_data_t)
normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M")
normalized_data_t$SampleID <- sub("[PM]$", "", normalized_data_t$SampleName)
# Step 3: Reshape the data to long format
data_long <- normalized_data_t %>%
gather(key = "CellType", value = "Value", -SampleName, -Group, -SampleID)
# Step 4: Create a combined identifier for each sample-cell type
data_long$Sample_CellType <- paste(data_long$SampleID, data_long$CellType, sep = "_")
# Step 5: Prepare data for computing pairwise differences
comb_data <- data_long %>%
select(Sample_CellType, Value) %>%
distinct()
# Step 6: Create a vector of values with names
value_vector <- comb_data$Value
names(value_vector) <- comb_data$Sample_CellType
# Step 7: Compute the pairwise differences
diff_matrix <- outer(value_vector, value_vector, FUN = function(x, y) x - y)
# Step 8: Assign row and column names
labels <- comb_data$Sample_CellType
rownames(diff_matrix) <- labels
colnames(diff_matrix) <- labels
# Step 9: Cluster the matrix to enhance visualization
# Compute distance matrix
dist_matrix <- dist(value_vector)
# Perform hierarchical clustering
hc <- hclust(dist_matrix)
# Reorder the matrix
ordered_labels <- labels[hc$order]
diff_matrix_ordered <- diff_matrix[ordered_labels, ordered_labels]
# Step 10: Melt the ordered matrix for plotting
diff_melted <- melt(diff_matrix_ordered, varnames = c("Var1", "Var2"), value.name = "Difference")
# Step 11: Adjust the color scale limits
max_abs_value <- max(abs(diff_melted$Difference))
# Step 12: Generate the heatmap
p <- ggplot(diff_melted, aes(x = Var2, y = Var1, fill = Difference)) +
geom_tile(color = "grey80") +
scale_fill_gradientn(colors = rev(brewer.pal(11, "RdBu")),
limits = c(-max_abs_value, max_abs_value),
name = "Difference (Z-score)") +
theme_minimal() +
xlab("Sample and Cell Type") +
ylab("Sample and Cell Type") +
ggtitle("Combined Confusion Matrix") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5),
axis.text.y = element_text(size = 5),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
# Print the plot
print(p)
```
```{r}
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(gridExtra)
library(RColorBrewer) # For enhanced color palettes
library(pheatmap) # For advanced heatmap plotting
# Assuming your data frame is named 'ssgsea_scores_filtered'
ssgsea_scores_filtered <- ssgsea_scores
# Step 1: Normalize the data per cell type using Z-score normalization
normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) {
(x - mean(x)) / sd(x)
}))
normalized_data <- as.data.frame(normalized_data)
# Step 2: Transpose the data so that samples are rows and cell types are columns
normalized_data_t <- t(normalized_data)
normalized_data_t <- as.data.frame(normalized_data_t)
# Step 3: Add sample group information ("P" or "M") to each sample
normalized_data_t$SampleName <- rownames(normalized_data_t)
normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M")
# Step 4: Split the data into "M" and "P" groups
data_P <- normalized_data_t[normalized_data_t$Group == "P", ]
data_M <- normalized_data_t[normalized_data_t$Group == "M", ]
# Remove 'SampleName' and 'Group' columns to retain only cell type data
cell_types <- colnames(normalized_data_t)[!colnames(normalized_data_t) %in% c("SampleName", "Group")]
data_P_values <- data_P[, cell_types]
data_M_values <- data_M[, cell_types]
# Step 5: Compute correlation matrices for each group
# Transpose the data to have cell types as rows and patients as columns
data_P_values_t <- t(data_P_values)
data_M_values_t <- t(data_M_values)
# Compute correlation matrices (cell types vs. cell types across patients)
cor_matrix_P <- cor(data_P_values_t, use = "pairwise.complete.obs")
cor_matrix_M <- cor(data_M_values_t, use = "pairwise.complete.obs")
# Step 6: Visualize the correlation matrices using heatmaps
# Use the same color scale for both heatmaps for comparison
# Define color palette
my_palette <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100)
# Create heatmaps
pheatmap_P <- pheatmap(cor_matrix_P,
color = my_palette,
breaks = seq(-0.5, 0.5, length.out = 100),
cluster_rows = TRUE,
cluster_cols = TRUE,
main = "Cell Type Correlation in P Group",
fontsize = 10)
pheatmap_M <- pheatmap(cor_matrix_M,
color = my_palette,
breaks = seq(-0.5, 0.5, length.out = 100),
cluster_rows = TRUE,
cluster_cols = TRUE,
main = "Cell Type Correlation in M Group",
fontsize = 10)
# Optional Step: Compute the difference between the two correlation matrices
cor_matrix_diff <- cor_matrix_M - cor_matrix_P
# Visualize the difference matrix
pheatmap_diff <- pheatmap(cor_matrix_diff,
color = my_palette,
breaks = seq(-0.5, 0.5, length.out = 100),
cluster_rows = TRUE,
cluster_cols = TRUE,
main = "Difference in Correlations (M - P)",
fontsize = 10)
# Since pheatmap outputs are grid objects, we can arrange them using grid.arrange
# Extract the grobs
gP <- pheatmap_P$gtable
gM <- pheatmap_M$gtable
gDiff <- pheatmap_diff$gtable
# Arrange the heatmaps side by side
grid.arrange(gP, gM, gDiff, ncol = 3)
```
```{r}
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(gridExtra)
library(RColorBrewer) # For color palettes
library(pheatmap) # For heatmap plotting
# Assuming your data frame is named 'ssgsea_scores_filtered'
# If you have 'ssgsea_scores', assign it to 'ssgsea_scores_filtered'
# ssgsea_scores_filtered <- ssgsea_scores
# Step 1: Transpose the data to have samples as rows and cell types as columns
data_transposed <- as.data.frame(t(ssgsea_scores_filtered))
# Add SampleName and Group columns
data_transposed$SampleName <- rownames(data_transposed)
data_transposed$Group <- ifelse(grepl("P$", data_transposed$SampleName), "P", "M")
# Remove "M" and "P" suffix to get base SampleID
data_transposed$SampleID <- sub("[PM]$", "", data_transposed$SampleName)
# Identify the cell type columns (exclude 'SampleName', 'Group', 'SampleID')
cell_types <- setdiff(colnames(data_transposed), c("SampleName", "Group", "SampleID"))
# Step 2: Normalize the data for each sample (row-wise Z-score normalization)
# Note: Since we are comparing samples, it's better to normalize across cell types (columns)
normalized_data <- as.data.frame(scale(data_transposed[, cell_types]))
# Add back SampleID and Group information
normalized_data$SampleID <- data_transposed$SampleID
normalized_data$Group <- data_transposed$Group
# Step 3: Split the data into "M" and "P" groups
data_M <- normalized_data[normalized_data$Group == "M", ]
data_P <- normalized_data[normalized_data$Group == "P", ]
# Step 4: Set SampleID as row names to align samples between groups
rownames(data_M) <- data_M$SampleID
rownames(data_P) <- data_P$SampleID
# Remove non-numeric columns for correlation computation
data_M_values <- data_M[, cell_types]
data_P_values <- data_P[, cell_types]
# Step 5: Compute correlation matrices between samples for each group
# Since samples are rows and cell types are columns, we compute correlations between samples
cor_matrix_M <- cor(t(data_M_values), use = "pairwise.complete.obs")
cor_matrix_P <- cor(t(data_P_values), use = "pairwise.complete.obs")
# Step 6: Obtain sample order from the "M" group correlation matrix
# Perform hierarchical clustering on the "M" group's correlation matrix
hc_M <- hclust(as.dist(1 - cor_matrix_M))
# Get the sample order from the clustering
sample_order <- hc_M$labels[hc_M$order]
# Reorder the correlation matrices according to the sample order
cor_matrix_M <- cor_matrix_M[sample_order, sample_order]
cor_matrix_P <- cor_matrix_P[sample_order, sample_order]
# Step 7: Plot the correlation matrices using the same sample order
# Define a consistent color palette
my_palette <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100)
# Create heatmaps without clustering (since we already ordered the samples)
pheatmap_M <- pheatmap(cor_matrix_M,
color = my_palette,
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
main = "Bone Metastasis TME",
fontsize = 10)
pheatmap_P <- pheatmap(cor_matrix_P,
color = my_palette,
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
main = "Primary Tumor TME",
fontsize = 10)
# Optional Step: Compute the difference between the two correlation matrices
cor_matrix_diff <- cor_matrix_M - cor_matrix_P
# Visualize the difference matrix
pheatmap_diff <- pheatmap(cor_matrix_diff,
color = my_palette,
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
main = "Difference in Correlations (M - P)",
fontsize = 10)
# Arrange the heatmaps side by side
gM <- pheatmap_M$gtable
gP <- pheatmap_P$gtable
gDiff <- pheatmap_diff$gtable
grid.arrange(gM, gP, gDiff, ncol = 3)
# Open a PDF device to save the plots
pdf("~/Desktop/correlation_heatmaps.pdf", width = 8, height = 2) # Adjust width and height as needed
# Arrange and draw the plots into the PDF
grid.arrange(gM, gP, gDiff, ncol = 3)
# Close the PDF device
dev.off()
```
```{r}
# Load necessary libraries
library(ggplot2)
library(reshape2)
library(gridExtra)
library(RColorBrewer) # For color palettes
library(pheatmap) # For heatmap plotting
# Assuming your data frame is named 'ssgsea_scores_filtered'
# If you have 'ssgsea_scores', assign it to 'ssgsea_scores_filtered'
# ssgsea_scores_filtered <- ssgsea_scores
# Step 1: Transpose the data to have samples as rows and cell types as columns
data_transposed <- as.data.frame(t(ssgsea_scores_filtered))
# Add SampleName and Group columns
data_transposed$SampleName <- rownames(data_transposed)
data_transposed$Group <- ifelse(grepl("P$", data_transposed$SampleName), "P", "M")
# Remove "M" and "P" suffix to get base SampleID
data_transposed$SampleID <- sub("[PM]$", "", data_transposed$SampleName)
# Identify the cell type columns (exclude 'SampleName', 'Group', 'SampleID')
cell_types <- setdiff(colnames(data_transposed), c("SampleName", "Group", "SampleID"))
# Step 2: Normalize the data for each sample (row-wise Z-score normalization)
# Note: Since we are comparing samples, it's better to normalize across cell types (columns)
normalized_data <- as.data.frame(scale(data_transposed[, cell_types]))
# Add back SampleID and Group information
normalized_data$SampleID <- data_transposed$SampleID
normalized_data$Group <- data_transposed$Group
# Step 3: Split the data into "M" and "P" groups
data_M <- normalized_data[normalized_data$Group == "M", ]
data_P <- normalized_data[normalized_data$Group == "P", ]
# Step 4: Set SampleID as row names to align samples between groups
rownames(data_M) <- data_M$SampleID
rownames(data_P) <- data_P$SampleID
# Remove non-numeric columns for correlation computation
data_M_values <- data_M[, cell_types]
data_P_values <- data_P[, cell_types]
# Step 5: Compute correlation matrices between samples for each group using Spearman correlation
# Since samples are rows and cell types are columns, we compute correlations between samples
cor_matrix_M <- cor(t(data_M_values), method = "spearman", use = "pairwise.complete.obs")
cor_matrix_P <- cor(t(data_P_values), method = "spearman", use = "pairwise.complete.obs")
# Step 6: Obtain sample order from the "M" group correlation matrix
# Perform hierarchical clustering on the "M" group's correlation matrix
hc_M <- hclust(as.dist(1 - cor_matrix_M))
# Get the sample order from the clustering
sample_order <- hc_M$labels[hc_M$order]
# Reorder the correlation matrices according to the sample order
cor_matrix_M <- cor_matrix_M[sample_order, sample_order]
cor_matrix_P <- cor_matrix_P[sample_order, sample_order]
# Step 7: Plot the correlation matrices using the same sample order
# Define a consistent color palette
my_palette <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100)
# Create heatmaps without clustering (since we already ordered the samples)
pheatmap_M <- pheatmap(
cor_matrix_M,
color = my_palette,
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
border_color = NA, # Remove grid lines if desired
main = "Bone Metastasis TME (Spearman)",
fontsize = 10
)
pheatmap_P <- pheatmap(
cor_matrix_P,
color = my_palette,
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
border_color = NA, # Remove grid lines if desired
main = "Primary Tumor TME (Spearman)",
fontsize = 10
)
# Optional Step: Compute the difference between the two correlation matrices
cor_matrix_diff <- cor_matrix_M - cor_matrix_P
# Visualize the difference matrix
pheatmap_diff <- pheatmap(
cor_matrix_diff,
color = my_palette,
breaks = seq(-1, 1, length.out = 100),
cluster_rows = FALSE,
cluster_cols = FALSE,
border_color = NA, # Remove grid lines if desired
main = "Difference in Correlations (M - P, Spearman)",
fontsize = 10
)
# Arrange the heatmaps side by side
gM <- pheatmap_M$gtable
gP <- pheatmap_P$gtable
gDiff <- pheatmap_diff$gtable
# Open a PDF device to save the plots
#pdf("~/Desktop/correlation_heatmaps_spearman.pdf", width = 8, height = 2) # Adjust width and height as needed
# Arrange and draw the plots into the PDF
grid.arrange(gM, gP, gDiff, ncol = 3)
# Close the PDF device
#dev.off()
```