--- title: "07.Pseudobulk_processing for DESeq2" output: html_document date: "2023-09-07" --- ```{r, read in data} rm(list = setdiff(ls(), "df")) library(tidyverse) library(dplyr) library(cowplot) library(edgeR) library(Matrix) library(reshape2) library(S4Vectors) library(SingleCellExperiment) library(pheatmap) library(apeglm) library(png) library(DESeq2) library(RColorBrewer) library(data.table) ``` ```{r} #read i # Extract raw counts and metadata to create SingleCellExperiment object counts <- df@assays$RNA@counts metadata <- df@meta.data # Set up metadata as desired for aggregation and DE analysis convert to factors df@active.ident <- factor(df$celltype_C) metadata$cell.id <- factor(df@active.ident) metadata$group.id <- factor(df$archetype) metadata$sample.id <- factor(df$cancer.id) # Create single cell experiment object sce <- SingleCellExperiment(assays = list(counts = counts), colData = metadata) ################################################################################################################################################# #Preparing the single-cell dataset for pseudobulk analysis ##Extracting necessary metrics for aggregation by cell type in a sample # Extract unique names of clusters (= levels of cluster_id factor variable) cluster_names <- levels(colData(sce)$cell.id) # Extract unique names of samples (= levels of sample_id factor variable) sample_names <- levels(colData(sce)$sample.id) ################################################################################################################################################# ##Aggregating counts to the sample level for each cluster # Identify groups for aggregation of counts groups <- colData(sce)[, c("cell.id","sample.id")] # Aggregate across cluster-sample groups # transposing row/columns to have cell_ids as row names matching those of groups aggr_counts <- aggregate.Matrix(t(counts(sce)), groupings = groups, fun = "sum") ################################################################################################################################################# ##Splitting the counts matrix by cell type # Transpose aggregated matrix to have genes as rows and samples as columns aggr_counts <- t(aggr_counts) # Understanding tstrsplit() ## Exploring structure of function output (list) tstrsplit(colnames(aggr_counts), "_") %>% str() ## Comparing the first 10 elements of our input and output strings head(colnames(aggr_counts), n = 10) head(tstrsplit(colnames(aggr_counts), "_")[[1]], n = 10) # Loop over all cell types to extract corresponding counts, and store information in a list ## Initiate empty list counts_ls <- list() for (i in 1:length(cluster_names)) { ## Extract indexes of columns in the global matrix that match a given cluster column_idx <- which(tstrsplit(colnames(aggr_counts), "_")[[1]] == cluster_names[i]) ## Store corresponding sub-matrix as one element of a list counts_ls[[i]] <- aggr_counts[, column_idx] names(counts_ls)[i] <- cluster_names[i] } # Explore the different components of the list str(counts_ls) ################################################################################################################################################# #Generating matching metadata at the sample-level # Reminder: explore structure of metadata head(colData(sce)) # Extract sample-level variables metadata <- colData(sce) %>% as.data.frame() %>% dplyr::select(group.id,sample.id) dim(metadata) head(metadata) # Exclude duplicated rows metadata <- metadata[!duplicated(metadata), ] dim(metadata) head(metadata) # Rename rows rownames(metadata) <- metadata$sample.id head(metadata) # Number of cells per sample and cluster t <- table(colData(sce)$sample.id, colData(sce)$cell.id) ################################################################################################################################################# # Creating metadata list ## Initiate empty list metadata_ls <- list() for (i in 1:length(counts_ls)) { ## Initiate a data frame for cluster i with one row per sample (matching column names in the counts matrix) df <- data.frame(cell_sample.id = colnames(counts_ls[[i]])) ## Use tstrsplit() to separate cluster (cell type) and sample IDs df$cell.id <- tstrsplit(df$cell_sample.id, "_")[[1]] df$sample.id <- tstrsplit(df$cell_sample.id, "_")[[2]] ## Retrieve cell count information for this cluster from global cell count table idx <- which(colnames(t) == unique(df$cell.id)) cell_counts <- t[, idx] ## Remove samples with zero cell contributing to the cluster cell_counts <- cell_counts[cell_counts > 0] ## Match order of cell_counts and sample_ids sample_order <- match(df$sample.id, names(cell_counts)) cell_counts <- cell_counts[sample_order] ## Append cell_counts to data frame df$cell_count <- cell_counts ## Join data frame (capturing metadata specific to cluster) to generic metadata df <- plyr::join(df, metadata, by = intersect(names(df), names(metadata))) ## Update rownames of metadata to match colnames of count matrix, as needed later for DE rownames(df) <- df$cell_sample.id ## Store complete metadata for cluster i in list metadata_ls[[i]] <- df names(metadata_ls)[i] <- unique(df$cell.id) } # Explore the different components of the list str(metadata_ls) ``` #Creating a DESeq2 object ```{r} # Select cell type of interest cluster_names # Double-check that both lists have same names all(names(counts_ls) == names(metadata_ls)) idx <- which(names(counts_ls) == "OC")##any cell type cluster_counts <- counts_ls[[idx]] cluster_metadata <- metadata_ls[[idx]] # Check contents of extracted objects #cluster_counts[1:6, 1:9] head(cluster_metadata) # Check matching of matrix columns and metadata rows all(colnames(cluster_counts) == rownames(cluster_metadata)) # Create DESeq2 object DESeq <- DESeqDataSetFromMatrix(cluster_counts, colData = cluster_metadata, design = ~ group.id) #counts <- DESeq@assays@data #saveRDS(counts, './outs/pseudobulk/counts.rds') #write.csv(DESeq@assays@data, './outs/pseudobulk/counts.csv') #saveRDS(DESeq, "./outs/pseudobulk/DESeq.rds") #saveRDS(cluster_metadata, './outs/pseudobulk/counts_metadata.rds') ``` ```{r, export the pseudubulk matrix} # Identify the numeric patient IDs numeric_patient_ids <- grep("^\\d+$", df1$patient.id, value = TRUE) # Add the prefix "MDACC" to these numeric patient IDs df1$patient.id <- ifelse(df1$patient.id %in% numeric_patient_ids, paste0("MDACC_", df1$patient.id), df1$patient.id) # Check the modified patient IDs unique(df1$patient.id) library(SingleCellExperiment) library(tidyverse) library(dplyr) library(cowplot) library(edgeR) library(Matrix) library(reshape2) library(S4Vectors) library(SingleCellExperiment) library(pheatmap) library(apeglm) library(png) library(DESeq2) library(RColorBrewer) library(data.table) # Step 1: Extract raw counts and metadata from the Seurat object counts <- df@assays$RNA@counts metadata <- df@meta.data # Step 2: Ensure sample.id is correctly formatted as a factor metadata$sample.id <- factor(df$cancer.id) # Step 3: Create a SingleCellExperiment object (optional, but useful for metadata) sce <- SingleCellExperiment(assays = list(counts = counts), colData = metadata) # Step 4: Aggregate counts by sample (ignoring cell types) # Group by sample.id groups <- colData(sce)[, "sample.id"] # Aggregate counts matrix by sample.id, summing counts for each gene across cells for each sample aggr_counts <- aggregate.Matrix(t(counts(sce)), groupings = groups, fun = "sum") # Step 5: Transpose the matrix to have genes as rows and samples as columns aggr_counts <- t(aggr_counts) # Step 6: Save the aggregated counts matrix as a CSV file write.csv(aggr_counts, "./outs/all.aggregated_gene_counts_by_sample.csv", row.names = TRUE) # Optional: Check the resulting aggregated matrix head(aggr_counts) ```