--- title: "additional data analysis" output: html_document date: "2024-09-16" --- ```{r} # Load necessary libraries rm(list=ls()) library(GSVA) library(GSEABase) library(pheatmap) ``` ```{r} # Step 1: Load expression data # The expression matrix should have genes as rows and patients as columns # Replace 'path/to/expression_matrix.txt' with your actual file path load("/Volumes/Fengshuo_14T/Lab/Yunfeng/cancer_comparsions/Zenodo/data/Bulk_Microarray_Data(published)/PMID-28878133.RData") marker <- readRDS("./data/Bulk_Microarray_Data(published)/cell_type_markers_from_scRNA_data_for_estimation.rds") expression_matrix <- adrain.clps # Step 2: Use your marker data # Assuming 'marker' is your data frame with cell type markers # If 'marker' is not already in your R environment, read it from a file # marker <- read.table("path/to/marker_data.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE) # Create a list of gene sets (one per cell type) cell_type_markers <- split(marker$gene, marker$cluster) cell_type_markers <- lapply(cell_type_markers, function(genes) { toupper(genes[1:min(20, length(genes))]) # Keep top 20 genes }) # Step 3: Ensure gene names match between expression data and cell markers # Convert gene names to uppercase (or lowercase) if necessary to match rownames(expression_matrix) <- toupper(rownames(expression_matrix)) cell_type_markers <- lapply(cell_type_markers, toupper) # Step 4: Run ssGSEA to calculate enrichment scores ssgsea_scores <- gsva(expression_matrix, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE) # ssgsea_scores is a matrix with cell types as rows and patients as columns # Step 5: Generate heatmap # Optionally, scale the scores across cell types (rows) or patients (columns) # Step 7: Generate heatmap for patients of type "M" breaks <- seq(-1.5, 1.5, length.out = 100) # Generate heatmap pheatmap(ssgsea_scores, scale = "row", breaks = breaks, clustering_distance_rows = 'correlation', clustering_distance_cols = 'correlation', clustering_method = 'average', show_colnames = TRUE, show_rownames = TRUE, cluster_rows = T, cluster_cols = TRUE, fontsize_row = 10, fontsize_col = 10) ``` ```{r} # Step 5: Subset the expression data to include only patients where type == "M" # Get the sample names where type == "M" samples_M <- rownames(adrain.ann)[adrain.ann$type == "M"] # Ensure that these sample names are present in the expression matrix samples_M <- intersect(samples_M, colnames(expression_matrix)) # Subset the expression matrix expression_matrix_M <- expression_matrix[, samples_M] # Step 6: Run ssGSEA to calculate enrichment scores using the subsetted data ssgsea_scores_M <- gsva(expression_matrix_M, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE) # ssgsea_scores_M is a matrix with cell types as rows and patients as columns (only type "M") # Step 7: Generate heatmap for patients of type "M" breaks <- seq(-1.5, 1.5, length.out = 100) # Generate heatmap pheatmap(ssgsea_scores_M, scale = "row", breaks = breaks, clustering_distance_rows = 'correlation', clustering_distance_cols = 'correlation', clustering_method = 'average', show_colnames = TRUE, show_rownames = TRUE, cluster_rows = T, cluster_cols = TRUE, fontsize_row = 10, fontsize_col = 10) ``` ```{r} # Prepare gene names rownames(expression_matrix) <- toupper(rownames(expression_matrix)) cell_type_markers <- split(marker$gene, marker$cluster) cell_type_markers <- lapply(cell_type_markers, toupper) cell_type_markers <- lapply(cell_type_markers, function(genes) { toupper(genes[1:min(10, length(genes))]) # Keep top 20 genes }) # Run ssGSEA to calculate enrichment scores using all cell types ssgsea_scores_all <- gsva(expression_matrix, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE) # Filter out unwanted cell types after calculating ssGSEA scores cell_types_to_keep <- c("pro Mono", "CD14hi Mono", "CD16hi Mono", "OC", "Mφ", "CD4 Treg", "CD8 Tex", "exhausting CD8 T") rownames(ssgsea_scores_all) <- trimws(rownames(ssgsea_scores_all)) cell_types_to_keep <- trimws(cell_types_to_keep) ssgsea_scores_filtered <- ssgsea_scores_all[cell_types_to_keep, , drop=FALSE] # Filter patients where patient type is "M" common_samples <- intersect(colnames(ssgsea_scores_filtered), rownames(adrain.ann)) adrain.ann <- adrain.ann[common_samples, ] ssgsea_scores_filtered <- ssgsea_scores_filtered[, common_samples] samples_type_M <- rownames(adrain.ann)[adrain.ann$type == "P"] ssgsea_scores_filtered <- ssgsea_scores_filtered[, samples_type_M] # Scale the data after filtering ssgsea_scores_scaled <- t(scale(t(ssgsea_scores_filtered))) # Generate heatmap # Adjust the scale range to -1 to 1 # Adjust the scale range to -1 to 1 breaks <- seq(-1.5, 1.5, length.out = 100) # Generate heatmap pheatmap(ssgsea_scores_scaled, breaks = breaks, clustering_distance_rows = 'correlation', clustering_distance_cols = 'correlation', clustering_method = 'complete', show_colnames = TRUE, show_rownames = TRUE, cluster_rows = FALSE, cluster_cols = TRUE, fontsize_row = 10, fontsize_col = 10) ``` ```{r} # Load necessary libraries library(GSVA) library(GSEABase) library(pheatmap) # Assuming 'expression_matrix' is your expression data matrix with genes as rows and samples as columns # Assuming 'marker' is your data frame with cell type markers # Prepare gene names and keep top 10 genes for each cell type rownames(expression_matrix) <- toupper(rownames(expression_matrix)) # Split the marker data into cell types and only keep the top 10 genes per cell type cell_type_markers <- split(marker$gene, marker$cluster) cell_type_markers <- lapply(cell_type_markers, function(genes) { toupper(genes[1:min(10, length(genes))]) # Keep top 10 genes }) # Run ssGSEA to calculate enrichment scores using all cell types ssgsea_scores_all <- gsva(expression_matrix, cell_type_markers, method='ssgsea', ssgsea.norm=TRUE) # Filter out unwanted cell types after calculating ssGSEA scores cell_types_to_keep <- c("pro Mono", "CD14hi Mono", "CD16hi Mono", "OC", "Mφ", "CD4 Treg", "CD8 Tex","exhausting CD8 T") rownames(ssgsea_scores_all) <- trimws(rownames(ssgsea_scores_all)) cell_types_to_keep <- trimws(cell_types_to_keep) ssgsea_scores_filtered <- ssgsea_scores_all[cell_types_to_keep, , drop=FALSE] # Scale the data after filtering ssgsea_scores_scaled <- t(scale(t(ssgsea_scores_filtered))) # Generate heatmap # Adjust the scale range to -1.5 to 1.5 breaks <- seq(-1.5, 1.5, length.out = 100) pheatmap(ssgsea_scores_scaled, breaks = breaks, clustering_distance_rows = 'correlation', clustering_distance_cols = 'correlation', clustering_method = 'complete', show_colnames = TRUE, show_rownames = TRUE, cluster_rows = FALSE, cluster_cols = TRUE, fontsize_row = 10, fontsize_col = 10) ``` ```{r} # Load necessary libraries library(reshape2) library(ggplot2) library(gridExtra) # Assuming your data frame is named 'ssgsea_scores_filtered' # Normalize the data from 0 to 1 per cell type (row-wise normalization) normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) (x - min(x)) / (max(x) - min(x)))) normalized_data <- as.data.frame(normalized_data) # Extract "P" and "M" samples P_samples <- normalized_data[, grep("P$", colnames(normalized_data))] M_samples <- normalized_data[, grep("M$", colnames(normalized_data))] # Names of the samples P_names <- colnames(P_samples) M_names <- colnames(M_samples) # Initialize a list to store the plots plots <- list() # Loop over each cell type to generate heatmaps for (cell_type in rownames(normalized_data)) { # Extract data for current cell type P_values <- as.numeric(P_samples[cell_type, ]) M_values <- as.numeric(M_samples[cell_type, ]) # Create data frames for correlation P_df <- data.frame(P_values) M_df <- data.frame(M_values) # Assign sample names to the rows rownames(P_df) <- P_names rownames(M_df) <- M_names # Combine P and M values into one data frame combined_df <- data.frame(Sample = c(P_names, M_names), Group = c(rep("P", length(P_values)), rep("M", length(M_values))), Value = c(P_values, M_values)) # Compute the correlation matrix (since we have only one value per sample, correlation isn't meaningful here) # Instead, we'll create a confusion matrix-style plot of the values # Create a matrix of P vs M sample values value_matrix <- outer(P_values, M_values, function(x, y) (x + y) / 2) rownames(value_matrix) <- P_names colnames(value_matrix) <- M_names # Melt the matrix for ggplot2 value_melted <- melt(value_matrix) # Generate the heatmap p <- ggplot(value_melted, aes(Var1, Var2, fill = value)) + geom_tile(color = "white") + scale_fill_gradient(low = "white", high = "red", name="Value") + theme_minimal() + ggtitle(cell_type) + xlab("P Samples") + ylab("M Samples") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Add the plot to the list plots[[cell_type]] <- p } # Combine all the heatmaps into one figure do.call(grid.arrange, c(plots, ncol = 2)) ``` ```{r} # Load necessary libraries library(ggplot2) library(reshape2) library(gridExtra) # Assuming your data frame is named 'ssgsea_scores_filtered' # Step 1: Normalize the data from 0 to 1 per cell type (row-wise normalization) normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) { (x - min(x)) / (max(x) - min(x)) })) normalized_data <- as.data.frame(normalized_data) # Step 2: Transpose the data so that samples are rows and cell types are columns normalized_data_t <- t(normalized_data) normalized_data_t <- as.data.frame(normalized_data_t) # Step 3: Add sample group information ("P" or "M") to each sample normalized_data_t$SampleName <- rownames(normalized_data_t) normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M") # Step 4: Process each cell type # Since we need to plot one confusion matrix per cell type, we'll loop over each cell type # Initialize a list to store the plots plots <- list() cell_types <- colnames(normalized_data_t)[!colnames(normalized_data_t) %in% c("SampleName", "Group")] for (cell_type in cell_types) { # Extract data for the current cell type data_cell_type <- normalized_data_t[, c(cell_type, "SampleName", "Group")] # Extract "M" samples and perform hierarchical clustering to get sample order M_samples <- data_cell_type[data_cell_type$Group == "M", ] M_data <- M_samples[, cell_type, drop = FALSE] rownames(M_data) <- M_samples$SampleName # Check if we have enough "M" samples to cluster if (nrow(M_data) > 2) { # Compute distance matrix and perform clustering dist_M <- dist(M_data) hc <- hclust(dist_M) sample_order_M <- M_samples$SampleName[hc$order] } else { # If not enough samples, use existing order sample_order_M <- M_samples$SampleName } # Get sample IDs without the group suffix sample_ids_ordered <- sub("[PM]$", "", sample_order_M) # Now, for all samples (both "P" and "M"), create the sample order based on sample IDs sample_order <- c() for (id in sample_ids_ordered) { # Check for "P" sample P_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "P")] if (length(P_sample_name) > 0) { sample_order <- c(sample_order, P_sample_name) } # Check for "M" sample M_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "M")] if (length(M_sample_name) > 0) { sample_order <- c(sample_order, M_sample_name) } } # Reorder the data data_ordered <- data_cell_type[match(sample_order, data_cell_type$SampleName), ] data_vector <- data_ordered[, cell_type] names(data_vector) <- data_ordered$SampleName group_labels <- data_ordered$Group # Remove "P" and "M" suffixes from sample names for plotting sample_order_no_suffix <- sub("[PM]$", "", sample_order) # Create a matrix to hold the differences n <- length(data_vector) diff_matrix <- outer(data_vector, data_vector, FUN = function(x, y) abs(x - y)) rownames(diff_matrix) <- colnames(diff_matrix) <- sample_order_no_suffix # Create a mask for the upper-left and lower-right triangles mask <- matrix(NA, n, n) rownames(mask) <- colnames(mask) <- sample_order_no_suffix for (i in 1:n) { for (j in 1:n) { if (group_labels[i] == "P" && group_labels[j] == "P" && i >= j) { # Upper-left triangle: P vs P mask[i, j] <- diff_matrix[i, j] } else if (group_labels[i] == "M" && group_labels[j] == "M" && i <= j) { # Lower-right triangle: M vs M mask[i, j] <- diff_matrix[i, j] } # Else, leave as NA } } # Melt the masked matrix for plotting masked_melted <- melt(mask, na.rm = TRUE) # Generate the heatmap p <- ggplot(masked_melted, aes(x = Var2, y = Var1, fill = value)) + geom_tile(color = "white") + scale_fill_gradient(low = "white", high = "red", name="Abs Difference") + theme_minimal() + xlab("Samples") + ylab("Samples") + ggtitle(cell_type) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # Add the plot to the list plots[[cell_type]] <- p } # Step 5: Combine all the plots into one figure do.call(grid.arrange, c(plots, ncol = 2)) ``` ```{r} # Load necessary libraries library(ggplot2) library(reshape2) library(gridExtra) library(RColorBrewer) # For enhanced color palettes # Assuming your data frame is named 'ssgsea_scores_filtered' # Step 1: Normalize the data per cell type using Z-score normalization normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) { (x - mean(x)) / sd(x) })) normalized_data <- as.data.frame(normalized_data) # Step 2: Transpose the data so that samples are rows and cell types are columns normalized_data_t <- t(normalized_data) normalized_data_t <- as.data.frame(normalized_data_t) # Step 3: Add sample group information ("P" or "M") to each sample normalized_data_t$SampleName <- rownames(normalized_data_t) normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M") # Step 4: Process each cell type # Loop over each cell type to generate heatmaps plots <- list() cell_types <- colnames(normalized_data_t)[!colnames(normalized_data_t) %in% c("SampleName", "Group")] for (cell_type in cell_types) { # Extract data for the current cell type data_cell_type <- normalized_data_t[, c(cell_type, "SampleName", "Group")] # Extract "M" samples and perform hierarchical clustering to get sample order M_samples <- data_cell_type[data_cell_type$Group == "M", ] M_data <- M_samples[, cell_type, drop = FALSE] rownames(M_data) <- M_samples$SampleName # Check if we have enough "M" samples to cluster if (nrow(M_data) > 2) { # Compute distance matrix and perform clustering dist_M <- dist(M_data) hc <- hclust(dist_M) sample_order_M <- M_samples$SampleName[hc$order] } else { # If not enough samples, use existing order sample_order_M <- M_samples$SampleName } # Get sample IDs without the group suffix sample_ids_ordered <- sub("[PM]$", "", sample_order_M) # Create the sample order based on sample IDs sample_order <- c() for (id in sample_ids_ordered) { # Add "P" and "M" samples if they exist P_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "P")] if (length(P_sample_name) > 0) { sample_order <- c(sample_order, P_sample_name) } M_sample_name <- data_cell_type$SampleName[data_cell_type$SampleName == paste0(id, "M")] if (length(M_sample_name) > 0) { sample_order <- c(sample_order, M_sample_name) } } # Reorder the data data_ordered <- data_cell_type[match(sample_order, data_cell_type$SampleName), ] data_vector <- data_ordered[, cell_type] names(data_vector) <- data_ordered$SampleName group_labels <- data_ordered$Group # Remove "P" and "M" suffixes from sample names for plotting sample_order_no_suffix <- sub("[PM]$", "", sample_order) # Step 5: Create a matrix of differences (without absolute value) n <- length(data_vector) diff_matrix <- outer(data_vector, data_vector, FUN = function(x, y) x - y) rownames(diff_matrix) <- colnames(diff_matrix) <- sample_order_no_suffix # Step 6: Create a mask for the upper-left and lower-right triangles mask <- matrix(NA, n, n) rownames(mask) <- colnames(mask) <- sample_order_no_suffix for (i in 1:n) { for (j in 1:n) { if (group_labels[i] == "P" && group_labels[j] == "P" && i >= j) { # Upper-left triangle: P vs P mask[i, j] <- diff_matrix[i, j] } else if (group_labels[i] == "M" && group_labels[j] == "M" && i <= j) { # Lower-right triangle: M vs M mask[i, j] <- diff_matrix[i, j] } # Else, leave as NA } } # Melt the masked matrix for plotting masked_melted <- melt(mask, na.rm = TRUE) # Step 7: Adjust the color scale limits to enhance contrast max_abs_value <- max(abs(masked_melted$value)) # Step 8: Generate the heatmap with enhanced color contrast p <- ggplot(masked_melted, aes(x = Var2, y = Var1, fill = value)) + geom_tile(color = "grey80") + scale_fill_gradientn(colors = rev(brewer.pal(11, "PiGN")), limits = c(-max_abs_value, max_abs_value), name = "Difference (Z-score)") + theme_minimal() + xlab("Samples") + ylab("Samples") + ggtitle(cell_type) + theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust=1), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) # Add the plot to the list plots[[cell_type]] <- p } # Step 9: Combine all the plots into one figure do.call(grid.arrange, c(plots, ncol = 2)) ``` ```{r} # Load necessary libraries library(ggplot2) library(reshape2) library(RColorBrewer) library(tidyr) library(dplyr) # Assuming your data frame is named 'ssgsea_scores_filtered' # Step 1: Normalize the data per cell type using Z-score normalization normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) { (x - mean(x)) / sd(x) })) normalized_data <- as.data.frame(normalized_data) # Step 2: Transpose the data so that samples are rows and cell types are columns normalized_data_t <- t(normalized_data) normalized_data_t <- as.data.frame(normalized_data_t) # Add sample information normalized_data_t$SampleName <- rownames(normalized_data_t) normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M") normalized_data_t$SampleID <- sub("[PM]$", "", normalized_data_t$SampleName) # Step 3: Reshape the data to long format data_long <- normalized_data_t %>% gather(key = "CellType", value = "Value", -SampleName, -Group, -SampleID) # Step 4: Create a combined identifier for each sample-cell type data_long$Sample_CellType <- paste(data_long$SampleID, data_long$CellType, sep = "_") # Step 5: Prepare data for computing pairwise differences comb_data <- data_long %>% select(Sample_CellType, Value) %>% distinct() # Step 6: Create a vector of values with names value_vector <- comb_data$Value names(value_vector) <- comb_data$Sample_CellType # Step 7: Compute the pairwise differences diff_matrix <- outer(value_vector, value_vector, FUN = function(x, y) x - y) # Step 8: Assign row and column names labels <- comb_data$Sample_CellType rownames(diff_matrix) <- labels colnames(diff_matrix) <- labels # Step 9: Cluster the matrix to enhance visualization # Compute distance matrix dist_matrix <- dist(value_vector) # Perform hierarchical clustering hc <- hclust(dist_matrix) # Reorder the matrix ordered_labels <- labels[hc$order] diff_matrix_ordered <- diff_matrix[ordered_labels, ordered_labels] # Step 10: Melt the ordered matrix for plotting diff_melted <- melt(diff_matrix_ordered, varnames = c("Var1", "Var2"), value.name = "Difference") # Step 11: Adjust the color scale limits max_abs_value <- max(abs(diff_melted$Difference)) # Step 12: Generate the heatmap p <- ggplot(diff_melted, aes(x = Var2, y = Var1, fill = Difference)) + geom_tile(color = "grey80") + scale_fill_gradientn(colors = rev(brewer.pal(11, "RdBu")), limits = c(-max_abs_value, max_abs_value), name = "Difference (Z-score)") + theme_minimal() + xlab("Sample and Cell Type") + ylab("Sample and Cell Type") + ggtitle("Combined Confusion Matrix") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 5), axis.text.y = element_text(size = 5), panel.grid.major = element_blank(), panel.grid.minor = element_blank()) # Print the plot print(p) ``` ```{r} # Load necessary libraries library(ggplot2) library(reshape2) library(gridExtra) library(RColorBrewer) # For enhanced color palettes library(pheatmap) # For advanced heatmap plotting # Assuming your data frame is named 'ssgsea_scores_filtered' ssgsea_scores_filtered <- ssgsea_scores # Step 1: Normalize the data per cell type using Z-score normalization normalized_data <- t(apply(ssgsea_scores_filtered, 1, function(x) { (x - mean(x)) / sd(x) })) normalized_data <- as.data.frame(normalized_data) # Step 2: Transpose the data so that samples are rows and cell types are columns normalized_data_t <- t(normalized_data) normalized_data_t <- as.data.frame(normalized_data_t) # Step 3: Add sample group information ("P" or "M") to each sample normalized_data_t$SampleName <- rownames(normalized_data_t) normalized_data_t$Group <- ifelse(grepl("P$", normalized_data_t$SampleName), "P", "M") # Step 4: Split the data into "M" and "P" groups data_P <- normalized_data_t[normalized_data_t$Group == "P", ] data_M <- normalized_data_t[normalized_data_t$Group == "M", ] # Remove 'SampleName' and 'Group' columns to retain only cell type data cell_types <- colnames(normalized_data_t)[!colnames(normalized_data_t) %in% c("SampleName", "Group")] data_P_values <- data_P[, cell_types] data_M_values <- data_M[, cell_types] # Step 5: Compute correlation matrices for each group # Transpose the data to have cell types as rows and patients as columns data_P_values_t <- t(data_P_values) data_M_values_t <- t(data_M_values) # Compute correlation matrices (cell types vs. cell types across patients) cor_matrix_P <- cor(data_P_values_t, use = "pairwise.complete.obs") cor_matrix_M <- cor(data_M_values_t, use = "pairwise.complete.obs") # Step 6: Visualize the correlation matrices using heatmaps # Use the same color scale for both heatmaps for comparison # Define color palette my_palette <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100) # Create heatmaps pheatmap_P <- pheatmap(cor_matrix_P, color = my_palette, breaks = seq(-0.5, 0.5, length.out = 100), cluster_rows = TRUE, cluster_cols = TRUE, main = "Cell Type Correlation in P Group", fontsize = 10) pheatmap_M <- pheatmap(cor_matrix_M, color = my_palette, breaks = seq(-0.5, 0.5, length.out = 100), cluster_rows = TRUE, cluster_cols = TRUE, main = "Cell Type Correlation in M Group", fontsize = 10) # Optional Step: Compute the difference between the two correlation matrices cor_matrix_diff <- cor_matrix_M - cor_matrix_P # Visualize the difference matrix pheatmap_diff <- pheatmap(cor_matrix_diff, color = my_palette, breaks = seq(-0.5, 0.5, length.out = 100), cluster_rows = TRUE, cluster_cols = TRUE, main = "Difference in Correlations (M - P)", fontsize = 10) # Since pheatmap outputs are grid objects, we can arrange them using grid.arrange # Extract the grobs gP <- pheatmap_P$gtable gM <- pheatmap_M$gtable gDiff <- pheatmap_diff$gtable # Arrange the heatmaps side by side grid.arrange(gP, gM, gDiff, ncol = 3) ``` ```{r} # Load necessary libraries library(ggplot2) library(reshape2) library(gridExtra) library(RColorBrewer) # For color palettes library(pheatmap) # For heatmap plotting # Assuming your data frame is named 'ssgsea_scores_filtered' # If you have 'ssgsea_scores', assign it to 'ssgsea_scores_filtered' # ssgsea_scores_filtered <- ssgsea_scores # Step 1: Transpose the data to have samples as rows and cell types as columns data_transposed <- as.data.frame(t(ssgsea_scores_filtered)) # Add SampleName and Group columns data_transposed$SampleName <- rownames(data_transposed) data_transposed$Group <- ifelse(grepl("P$", data_transposed$SampleName), "P", "M") # Remove "M" and "P" suffix to get base SampleID data_transposed$SampleID <- sub("[PM]$", "", data_transposed$SampleName) # Identify the cell type columns (exclude 'SampleName', 'Group', 'SampleID') cell_types <- setdiff(colnames(data_transposed), c("SampleName", "Group", "SampleID")) # Step 2: Normalize the data for each sample (row-wise Z-score normalization) # Note: Since we are comparing samples, it's better to normalize across cell types (columns) normalized_data <- as.data.frame(scale(data_transposed[, cell_types])) # Add back SampleID and Group information normalized_data$SampleID <- data_transposed$SampleID normalized_data$Group <- data_transposed$Group # Step 3: Split the data into "M" and "P" groups data_M <- normalized_data[normalized_data$Group == "M", ] data_P <- normalized_data[normalized_data$Group == "P", ] # Step 4: Set SampleID as row names to align samples between groups rownames(data_M) <- data_M$SampleID rownames(data_P) <- data_P$SampleID # Remove non-numeric columns for correlation computation data_M_values <- data_M[, cell_types] data_P_values <- data_P[, cell_types] # Step 5: Compute correlation matrices between samples for each group # Since samples are rows and cell types are columns, we compute correlations between samples cor_matrix_M <- cor(t(data_M_values), use = "pairwise.complete.obs") cor_matrix_P <- cor(t(data_P_values), use = "pairwise.complete.obs") # Step 6: Obtain sample order from the "M" group correlation matrix # Perform hierarchical clustering on the "M" group's correlation matrix hc_M <- hclust(as.dist(1 - cor_matrix_M)) # Get the sample order from the clustering sample_order <- hc_M$labels[hc_M$order] # Reorder the correlation matrices according to the sample order cor_matrix_M <- cor_matrix_M[sample_order, sample_order] cor_matrix_P <- cor_matrix_P[sample_order, sample_order] # Step 7: Plot the correlation matrices using the same sample order # Define a consistent color palette my_palette <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100) # Create heatmaps without clustering (since we already ordered the samples) pheatmap_M <- pheatmap(cor_matrix_M, color = my_palette, breaks = seq(-1, 1, length.out = 100), cluster_rows = FALSE, cluster_cols = FALSE, main = "Bone Metastasis TME", fontsize = 10) pheatmap_P <- pheatmap(cor_matrix_P, color = my_palette, breaks = seq(-1, 1, length.out = 100), cluster_rows = FALSE, cluster_cols = FALSE, main = "Primary Tumor TME", fontsize = 10) # Optional Step: Compute the difference between the two correlation matrices cor_matrix_diff <- cor_matrix_M - cor_matrix_P # Visualize the difference matrix pheatmap_diff <- pheatmap(cor_matrix_diff, color = my_palette, breaks = seq(-1, 1, length.out = 100), cluster_rows = FALSE, cluster_cols = FALSE, main = "Difference in Correlations (M - P)", fontsize = 10) # Arrange the heatmaps side by side gM <- pheatmap_M$gtable gP <- pheatmap_P$gtable gDiff <- pheatmap_diff$gtable grid.arrange(gM, gP, gDiff, ncol = 3) # Open a PDF device to save the plots pdf("~/Desktop/correlation_heatmaps.pdf", width = 8, height = 2) # Adjust width and height as needed # Arrange and draw the plots into the PDF grid.arrange(gM, gP, gDiff, ncol = 3) # Close the PDF device dev.off() ``` ```{r} # Load necessary libraries library(ggplot2) library(reshape2) library(gridExtra) library(RColorBrewer) # For color palettes library(pheatmap) # For heatmap plotting # Assuming your data frame is named 'ssgsea_scores_filtered' # If you have 'ssgsea_scores', assign it to 'ssgsea_scores_filtered' # ssgsea_scores_filtered <- ssgsea_scores # Step 1: Transpose the data to have samples as rows and cell types as columns data_transposed <- as.data.frame(t(ssgsea_scores_filtered)) # Add SampleName and Group columns data_transposed$SampleName <- rownames(data_transposed) data_transposed$Group <- ifelse(grepl("P$", data_transposed$SampleName), "P", "M") # Remove "M" and "P" suffix to get base SampleID data_transposed$SampleID <- sub("[PM]$", "", data_transposed$SampleName) # Identify the cell type columns (exclude 'SampleName', 'Group', 'SampleID') cell_types <- setdiff(colnames(data_transposed), c("SampleName", "Group", "SampleID")) # Step 2: Normalize the data for each sample (row-wise Z-score normalization) # Note: Since we are comparing samples, it's better to normalize across cell types (columns) normalized_data <- as.data.frame(scale(data_transposed[, cell_types])) # Add back SampleID and Group information normalized_data$SampleID <- data_transposed$SampleID normalized_data$Group <- data_transposed$Group # Step 3: Split the data into "M" and "P" groups data_M <- normalized_data[normalized_data$Group == "M", ] data_P <- normalized_data[normalized_data$Group == "P", ] # Step 4: Set SampleID as row names to align samples between groups rownames(data_M) <- data_M$SampleID rownames(data_P) <- data_P$SampleID # Remove non-numeric columns for correlation computation data_M_values <- data_M[, cell_types] data_P_values <- data_P[, cell_types] # Step 5: Compute correlation matrices between samples for each group using Spearman correlation # Since samples are rows and cell types are columns, we compute correlations between samples cor_matrix_M <- cor(t(data_M_values), method = "spearman", use = "pairwise.complete.obs") cor_matrix_P <- cor(t(data_P_values), method = "spearman", use = "pairwise.complete.obs") # Step 6: Obtain sample order from the "M" group correlation matrix # Perform hierarchical clustering on the "M" group's correlation matrix hc_M <- hclust(as.dist(1 - cor_matrix_M)) # Get the sample order from the clustering sample_order <- hc_M$labels[hc_M$order] # Reorder the correlation matrices according to the sample order cor_matrix_M <- cor_matrix_M[sample_order, sample_order] cor_matrix_P <- cor_matrix_P[sample_order, sample_order] # Step 7: Plot the correlation matrices using the same sample order # Define a consistent color palette my_palette <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100) # Create heatmaps without clustering (since we already ordered the samples) pheatmap_M <- pheatmap( cor_matrix_M, color = my_palette, breaks = seq(-1, 1, length.out = 100), cluster_rows = FALSE, cluster_cols = FALSE, border_color = NA, # Remove grid lines if desired main = "Bone Metastasis TME (Spearman)", fontsize = 10 ) pheatmap_P <- pheatmap( cor_matrix_P, color = my_palette, breaks = seq(-1, 1, length.out = 100), cluster_rows = FALSE, cluster_cols = FALSE, border_color = NA, # Remove grid lines if desired main = "Primary Tumor TME (Spearman)", fontsize = 10 ) # Optional Step: Compute the difference between the two correlation matrices cor_matrix_diff <- cor_matrix_M - cor_matrix_P # Visualize the difference matrix pheatmap_diff <- pheatmap( cor_matrix_diff, color = my_palette, breaks = seq(-1, 1, length.out = 100), cluster_rows = FALSE, cluster_cols = FALSE, border_color = NA, # Remove grid lines if desired main = "Difference in Correlations (M - P, Spearman)", fontsize = 10 ) # Arrange the heatmaps side by side gM <- pheatmap_M$gtable gP <- pheatmap_P$gtable gDiff <- pheatmap_diff$gtable # Open a PDF device to save the plots #pdf("~/Desktop/correlation_heatmaps_spearman.pdf", width = 8, height = 2) # Adjust width and height as needed # Arrange and draw the plots into the PDF grid.arrange(gM, gP, gDiff, ncol = 3) # Close the PDF device #dev.off() ```