--- title: "06.inferCNV" output: html_document date: "2023-12-16" --- ```{r} #rm(list = ls()) library(infercnv) library(dplyr) library(Seurat) library(ggplot2) library(scCustomize) library(RColorBrewer) library(viridis) ``` ```{r} options(scipen = 100) options(error = function() traceback(2)) options(bitmapType="Xlib") # if you are using linux system ``` ```{r, read data} All.merge % tibble::rownames_to_column(var = "cellbarcode") write.table(cellanno, "./data/infercnv/cnv_cellanno.txt", sep = "\t", col.names = F,row.names =FALSE, quote =F ) head(cellanno) gtf = "./data/infercnv/GRCh38.genes.gtf" gtf = plyranges::read_gff(gtf) head(gtf) table(gtf$type) gene_order_anno = gtf %>% plyranges::filter(type == "gene" & gene_name %in% rownames(seurat_obj)) %>% as.data.frame() %>% dplyr::select(gene_name, seqnames, start, end) %>% dplyr::distinct(gene_name, .keep_all=T) head(gene_order_anno) write.table(gene_order_anno, "./data/infercnv/gene_order_anno.txt", col.names =F, row.names =F, sep = "\t", quote =F ) head(gene_order_anno) getwd() head(cellanno) print("=======run======here==========") table(cellanno$celltype) 4 # create the infercnv object infercnv_obj = CreateInfercnvObject(raw_counts_matrix=counts_matrix, annotations_file="./data/infercnv/cnv_cellanno.txt", delim="\t", gene_order_file="./data/infercnv/gene_order_anno.txt", ref_group_names=c(#'immune' '0', '1', '4', '6', '7', '8', '10' )) # 5 # perform infercnv operations to reveal cnv signal infercnv_obj = infercnv::run(infercnv_obj, cutoff=0.1, # use 1 for smart-seq, 0.1 for 10x-genomics out_dir="./outs/infercnv/", analysis_mode="subclusters", cluster_by_groups=T, denoise=T, # noise_logistic=TRUE, # turns gradient filtering on num_threads=20, HMM=T) #2 infercnv_obj = infercnv::run(infercnv_obj, cutoff=1, # cutoff=1 works well for Smart-seq2, and cutoff=0.1 works well for 10x Genomics out_dir=out_dir, cluster_by_groups=T, plot_steps=F, denoise=T, sd_amplifier=3, # sets midpoint for logistic noise_logistic=TRUE # turns gradient filtering on ) #3 infercnv_obj_medianfiltered = infercnv::apply_median_filtering(infercnv_obj) infercnv::plot_cnv(infercnv_obj_medianfiltered, out_dir='./outs/inferCNV_stroma/', output_filename='infercnv.median_filtered', cluster_by_groups = TRUE, #k_obs_groups = 3, x.range="auto", x.center=1, title = "infercnv", output_format = "pdf", write_phylo = FALSE, color_safe_pal = FALSE) save(infercnv_obj,file = "./data/infercnv/infercnv_obj.rds") save(infercnv_obj_medianfiltered,file = "./data/infercnv/infercnv_obj_medianfiltered.rds") ``` #apply infercnv result ```{r} All.merge <- readRDS("./data/infercnv/epi_stro.rds") infer_CNV_obj<-readRDS('./outs/infercsv_outs/run.final.infercnv_obj') expr<-infer_CNV_obj@expr.data expr[1:4,1:4] data_cnv<-as.data.frame(expr) dim(expr) colnames(data_cnv) rownames(data_cnv) meta <- All.merge@meta.data ``` ```{r} if(T){ tmp1 = expr[,infer_CNV_obj@reference_grouped_cell_indices$`ref-1`] tmp2 = expr[,infer_CNV_obj@reference_grouped_cell_indices$`ref-2`] tmp= cbind(tmp1,tmp2) down=mean(rowMeans(tmp)) - 2 * mean( apply(tmp, 1, sd)) up=mean(rowMeans(tmp)) + 2 * mean( apply(tmp, 1, sd)) oneCopy=up-down oneCopy a1= down- 2*oneCopy a2= down- 1*oneCopy down;up a3= up + 1*oneCopy a4= up + 2*oneCopy cnv_score_table<-infer_CNV_obj@expr.data cnv_score_table[1:4,1:4] cnv_score_mat <- as.matrix(cnv_score_table) # Scoring cnv_score_table[cnv_score_mat > 0 & cnv_score_mat < a2] <- "A" #complete loss. 2pts cnv_score_table[cnv_score_mat >= a2 & cnv_score_mat < down] <- "B" #loss of one copy. 1pts cnv_score_table[cnv_score_mat >= down & cnv_score_mat < up ] <- "C" #Neutral. 0pts cnv_score_table[cnv_score_mat >= up & cnv_score_mat <= a3] <- "D" #addition of one copy. 1pts cnv_score_table[cnv_score_mat > a3 & cnv_score_mat <= a4 ] <- "E" #addition of two copies. 2pts cnv_score_table[cnv_score_mat > a4] <- "F" #addition of more than two copies. 2pts # Check table(cnv_score_table[,1]) # Replace with score cnv_score_table_pts <- cnv_score_mat rm(cnv_score_mat) # cnv_score_table_pts[cnv_score_table == "A"] <- 2 cnv_score_table_pts[cnv_score_table == "B"] <- 1 cnv_score_table_pts[cnv_score_table == "C"] <- 0 cnv_score_table_pts[cnv_score_table == "D"] <- 1 cnv_score_table_pts[cnv_score_table == "E"] <- 2 cnv_score_table_pts[cnv_score_table == "F"] <- 3 cnv_score_table_pts[1:4,1:4] str( as.data.frame(cnv_score_table_pts[1:4,1:4])) cell_scores_CNV <- as.data.frame(colSums(cnv_score_table_pts)) colnames(cell_scores_CNV) <- "cnv_score" } ``` ```{r} head(cell_scores_CNV) score=cell_scores_CNV head(score) meta$totalCNV = score[match(colnames(All.merge), rownames(score)),1] All.merge$totalCNV = score[match(colnames(All.merge), rownames(score)),1] ggplot(meta, aes(x=celltype_C , y=totalCNV, fill=totalCNV )) + geom_boxplot()+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x-axis labels for readability #ggsave(filename = "score_per_infercnv_subcluster.pdf", width = 20,height = 5, dpi = 600, device=cairo_pdf, path = './outs/inferCNV_stroma//') ##ordered boxplot by subclusters # Calculate the median of totalCNV for each infercnv_subcluster ordering <- meta %>% group_by(infercnv_subcluster) %>% summarise(median_totalCNV = median(totalCNV)) %>% arrange(desc(median_totalCNV)) %>% mutate(infercnv_subcluster = factor(infercnv_subcluster, levels = infercnv_subcluster)) # Merge the ordering information back to the original data frame meta <- meta %>% left_join(ordering, by = "infercnv_subcluster") # Now plot with the ordered factor levels and remove outliers ggplot(meta, aes(x=infercnv_subcluster, y=totalCNV, fill=infercnv_subcluster)) + geom_boxplot(outlier.shape = NA) + # Remove outliers theme(axis.text.x = element_text(angle = 90, hjust = 1, ), legend.position = "none") + scale_x_discrete(limits = ordering$infercnv_subcluster) + # Order the x-axis based on median_totalCNV scale_fill_brewer(palette = "viridis") # Use a predefined color palette ggsave(filename = "score_per_infercnv_subcluster_ordered_by_infercnv_subcluster.pdf", width = 25,height = 5, dpi = 600, device=cairo_pdf, path = './outs/infercnv/') ##order boxplot by sample.ID # Calculate the median of totalCNV for each infercnv_subcluster ordering <- meta %>% group_by(cancer.id) %>% summarise(median_totalCNV = median(totalCNV)) %>% arrange(desc(median_totalCNV)) %>% mutate(infercnv_subcluster = factor(cancer.id, levels = cancer.id)) ordering <- meta %>% group_by(cancer.id) %>% summarise(sum_totalCNV = mean(totalCNV, na.rm = TRUE)) %>% arrange(desc(sum_totalCNV)) %>% mutate(infercnv_subcluster = factor(cancer.id, levels = cancer.id)) #export the ordering write.csv(ordering, './outs/cnv_score_cancer.id_new.csv') # Merge the ordering information back to the original data frame meta <- meta %>% left_join(ordering, by = "cancer.id") # Now plot with the ordered factor levels and remove outliers ggplot(meta, aes(x=cancer.id, y=totalCNV, fill=cancer.id)) + geom_boxplot(outlier.shape = NA) + # Remove outliers theme(axis.text.x = element_text(angle = 90, hjust = 1, ), legend.position = "none") + scale_x_discrete(limits = ordering$cancer.id) + # Order the x-axis based on median_totalCNV scale_fill_brewer(palette = "Set3") # Use a predefined color palette ggsave(filename = "score_per_infercnv_subcluster_ordered_by_sample.pdf", width = 15,height = 5, dpi = 600, device=cairo_pdf, path = './outs/infercnv/') # Calculate the median of totalCNV for each infercnv_subcluster ordering <- meta %>% group_by(infercnv_subcluster) %>% summarise(median_totalCNV = median(totalCNV)) %>% arrange(desc(median_totalCNV)) %>% mutate(infercnv_subcluster = factor(infercnv_subcluster, levels = infercnv_subcluster)) # Merge the ordering information back to the original data frame meta <- meta %>% left_join(ordering, by = "infercnv_subcluster") # Now plot with the ordered factor levels and remove outliers ggplot(meta, aes(x=infercnv_subcluster, y=totalCNV, fill=infercnv_subcluster)) + geom_boxplot(outlier.shape = NA) + # Remove outliers theme(axis.text.x = element_text(angle = 90, hjust = 1, ), legend.position = "none") + scale_x_discrete(limits = ordering$infercnv_subcluster) + # Order the x-axis based on median_totalCNV scale_fill_brewer(palette = "Set2") # Use a predefined color palette ggsave(filename = "score_per_infercnv_subcluster_ordered_by_sample.pdf", width = 15,height = 5, dpi = 600, device=cairo_pdf, path = './outs/infercnv/') # Example in R with Seurat VlnPlot(All.merge, features = marker_genes_of_celltype_A, group.by = "condition") ``` ```{r} library(dplyr) library(ggplot2) library(viridis) # Extract the metadata meta <- All.merge@meta.data # Step 1: Calculate the mean and standard deviation of totalCNV for reference clusters reference_clusters <- c(1, 2, 4, 5) reference_stats <- meta %>% filter(rpca_clusters %in% reference_clusters) %>% summarise( mean_totalCNV = mean(totalCNV), sd_totalCNV = sd(totalCNV) ) mean_ref <- reference_stats$mean_totalCNV sd_ref <- reference_stats$sd_totalCNV # Step 2: Normalize totalCNV for cells in rpca_clusters 0 and 3 # Correct the scaling to make sure 0 and 3 are higher meta <- meta %>% mutate( totalCNV_normalized = ifelse(rpca_clusters %in% c(0, 3), (totalCNV - mean_ref) / sd_ref, NA) ) # Step 3: Rescale totalCNV_normalized for clusters 0 and 3 to reflect higher scores # Set minimum to 0 and maximum to 10, ensuring 0 and 3 are properly scaled min_val <- min(meta$totalCNV_normalized[meta$rpca_clusters %in% c(0, 3)], na.rm = TRUE) max_val <- max(meta$totalCNV_normalized[meta$rpca_clusters %in% c(0, 3)], na.rm = TRUE) meta <- meta %>% mutate( totalCNV_scaled = ifelse(rpca_clusters %in% c(0, 3), 10 * (totalCNV_normalized - min_val) / (max_val - min_val), NA) ) # Correct scaling for reference clusters to ensure a proper comparison # Rescale reference clusters to have a range similar to the normalized 0 and 3 clusters meta <- meta %>% mutate(totalCNV_scaled = ifelse(is.na(totalCNV_scaled), (totalCNV - mean_ref) / sd_ref * 10, totalCNV_scaled)) # Store back in All.merge metadata All.merge@meta.data <- meta # Step 4: Recalculate ordering based on scaled totalCNV ordering_scaled <- meta %>% group_by(infercnv_subcluster) %>% summarise(median_totalCNV_scaled = median(totalCNV_scaled)) %>% arrange(desc(median_totalCNV_scaled)) %>% mutate(infercnv_subcluster = factor(infercnv_subcluster, levels = infercnv_subcluster)) # Merge the ordering information back to the metadata meta <- meta %>% left_join(ordering_scaled, by = "infercnv_subcluster") # Step 5: Plot the normalized and scaled totalCNV for all infercnv_subclusters # Color scale based on rank meta <- meta %>% mutate(rank_order = as.numeric(factor(infercnv_subcluster, levels = ordering_scaled$infercnv_subcluster))) ggplot(meta, aes(x = infercnv_subcluster, y = totalCNV_scaled, fill = rank_order)) + geom_boxplot(outlier.shape = NA) + # Remove outliers theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "none") + scale_x_discrete(limits = ordering_scaled$infercnv_subcluster) + # Order the x-axis based on median_totalCNV_scaled scale_fill_viridis_c(option = "D", direction = -1) + # Use viridis color palette with gradient based on rank labs(x = "Infercnv Subcluster", y = "Scaled Total CNV", title = "Scaled Total CNV per Infercnv Subcluster") ggsave(filename = "infercnv_subcluster.pdf", width = 15,height = 5, dpi = 600, device=cairo_pdf, path = './outs/') ``` ```{r} #re-classfying the sub_clusters # Load necessary library library(dplyr) # Assuming 'All.merge' is your Seurat object # Extract unique 'infercnv_subcluster' and their corresponding 'rank_order' cluster_info <- All.merge@meta.data %>% select(infercnv_subcluster, rank_order) %>% distinct() # Extract the cluster prefix (number before the underscore) cluster_info <- cluster_info %>% mutate(cluster_prefix = sub('_.*', '', infercnv_subcluster)) # Assign 'ref' to clusters starting with 1, 2, 4, or 5 clusters_ref <- cluster_info %>% filter(cluster_prefix %in% c('1', '2', '4', '5')) %>% mutate(cnv_burden = 'ref') # For clusters starting with 0 or 3 clusters_0_3 <- cluster_info %>% filter(cluster_prefix %in% c('0', '3')) # Calculate the median of 'rank_order' for clusters starting with 0 or 3 median_rank <- median(clusters_0_3$rank_order) # Assign 'high_cnv' or 'low_cnv' based on 'rank_order' compared to the median clusters_0_3 <- clusters_0_3 %>% mutate(cnv_burden = ifelse(rank_order <= median_rank, 'high_cnv', 'low_cnv')) # Combine the two groups cluster_cnv_burden <- bind_rows(clusters_ref, clusters_0_3) # Merge the 'cnv_burden' information back into the Seurat object's metadata All.merge@meta.data <- All.merge@meta.data %>% left_join(cluster_cnv_burden[, c('infercnv_subcluster', 'cnv_burden')], by = 'infercnv_subcluster') # Optional: Check the result table(All.merge@meta.data$cnv_burden) unique(All.merge$cnv_burden) saveRDS(obj, './data/Epi.rds') ``` ```{r} # Check if the number of cells (columns) in both objects is the same if (ncol(obj) != ncol(epi_stro)) { stop("The number of cells in obj and epi_stro do not match.") } # Assign the cell names from epi_stro to obj # Update column names in the assays (counts, data, scale.data) colnames(obj@assays$RNA@counts) <- colnames(epi_stro) colnames(obj@assays$RNA@data) <- colnames(epi_stro) # If scale.data exists, update its column names if (!is.null(obj@assays$RNA@scale.data)) { colnames(obj@assays$RNA@scale.data) <- colnames(epi_stro) } # Update row names in the meta.data rownames(obj@meta.data) <- colnames(epi_stro) # The following line is not needed and should be removed: # obj@cell.names <- colnames(epi_stro) # Optional: Verify that the cell names have been updated head(colnames(obj)) head(rownames(obj@meta.data)) ``` ```{r} # Assuming "score" is your dataframe and "All.merge" is your Seurat object with metadata # Convert the metadata from the Seurat object to a dataframe meta <- data.frame(barcode = colnames(All.merge), cancer.id = All.merge$cancer.id, cancer = All.merge$cancer, celltype_C = All.merge$celltype_C, row.names = NULL) # Merge "score" dataframe with metadata based on matching cell barcodes infercnv_score_patient <- merge(score, meta, by.x = "row.names", by.y = "barcode") # Rename the merged column appropriately colnames(infercnv_score_patient)[1] <- "barcode" infercnv_score_patient$barcode <- sub("_.*", "", infercnv_score_patient$barcode) # Check the result head(infercnv_score_patient) # Export the new dataframe to a CSV file write.csv(infercnv_score_patient, "./outs/infercnv_score_patient.csv", row.names = FALSE) # Normalize the cnv_score by subtracting 10000 infercnv_score_patient$cnv_score_scaled <- (infercnv_score_patient$cnv_score - 10000)/100 # Check the result head(infercnv_score_patient) ``` ```{r} library(sceasy) library(reticulate) DefaultAssay(All.merge) <- "RNA" sceasy::convertFormat(All.merge, from="seurat", to="anndata", outFile='./data/integrated_till_0712/epi_infercnv_clustered.h5ad') ``` ```{r} #annotate All.merge@active.ident <- factor(All.merge$infercnv_subcluster) malignant_low <- c("0_s1", "1_s9", "5_s4", "5_s15", "12_s8", "9_s1", "2_s30", "3_s6", "12_s6", "12_s3", "10_s3", "3_s5", "3_s1", "12_s1", "5_s14", "3_s3", "10_s4", "10_s8", "1_s17") All.merge@meta.data$epi_sub <-ifelse(All.merge@active.ident %in% malignant_low ,'malignant_low','malignant_high') ```