# MicroSEC: Microhomology-induced chimeric read-originating sequence error cleaning pipeline for FFPE samples # # Author: "Masachika Ikegami" # # This pipeline is designed for filtering mutations found in formalin-fixed and paraffin-embedded (FFPE) samples. # The MicroSEC filter utilizes a statistical analysis, and the results for mutations with less than 10 supporting reads are not reliable. # # Two files are necessary for the analysis: mutation information file, BAM file # An additional file is preferable: mutation supporting read ID information file. # A sample information tsv file is mandatory if multiple samples are processed simultaneously. # # File 1: mutation information file (mandatory) # This tsv or tsv.gz file should contain at least these columns: # Sample Mut_type Chr Pos Ref Alt SimpleRepeat_TRF Neighborhood_sequence # SL_1010-N6-B 1-snv chr1 108130741 C T N CTACCTGGAGAATGGGCCCATGTGTCCAGGTAGCAGTAAGC # SimpleRepeat_TRF: Whether the mutation locates at a simple repeat sequence or not ("Y" or "N"). # Neighborhood_sequence: [5'-20 bases] + [Alt sequence] + [3'-20 bases] # Sample, Mut_type, Chr, Pos, Ref, and Alt should be set exactly. # If you do not know the SimpleRepeat_TRF, Mut_type, or Neighborhood_sequence, enter "-". Automatically detected. # # File 2: BAM file (mandatory) # # File 3: sample information tsv file (mandatory, if multiple samples are processed in a batch) # Seven to ten columns are necessary (without column names). # Optional columns can be deleted if they are not applicable. # [sample name] [mutation information tsv file] [BAM file] [read length] [adapter sequence read 1] [optional: adapter sequence read 2] [sample type: Human or Mouse] [panel name] [optional: reference genome fasta file] [optional: simple repeat region bed file] # PC9 ./source/CCLE.tsv ./source/Cell_line/PC9.bam 127 AGATCGGAAGAGCACACGTCTGAACTCCAGTCA AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT Human TOP # A375 ./source/CCLE.tsv.gz ./source/Cell_line/A375.bam 127 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT Hg38 TOP ./reference/hg38.fa ./reference/simpleRepeat.bed.gz # # File 4: Reference genome: Human (hg38 or hg19) or Mouse (mm10) (optional, but mandatory with cram files) # # File 5: simple repeat region bed file (optional, but mandatory to detect simple repeat derived artifacts) # # This pipeline contains 8 filtering processes. # Filter 1 : Shorter-supporting lengths distribute too unevenly to occur (1-1 and 1-2). # Filter 1-1: P-values are less than the threshold_p(default: 10^(-6)). # Filter 1-2: The shorter-supporting lengths distributed over less than 75% of the read length. # Filter 2 : Hairpin-structure induced error detection (2-1 and 2-2). # Filter 2-1: Palindromic sequences exist within 150 bases. # Filter 2-2: >=50% mutation-supporting reads contains a reverse complementary sequence of the opposite strand consisting >= 15 bases. # Filter 3 : 3’-/5’-supporting lengths are too unevenly distributed to occur (3-1 and 3-3). # Filter 3-1: P-values are less than the threshold_p(default: 10^(-6)). # Filter 3-2: The distributions of 3’-/5’-supporting lengths are within 75% of the read length. # Filter 4 : >=15% mutations were called by chimeric reads comprising two distant regions. # Filter 5 : >=50% mutations were called by soft-clipped reads. # Filter 6 : Mutations locating at simple repeat sequences. # Filter 7 : Mutations locating at a >=15 homopolymer. # Filter 8 : >=10% low quality bases (Quality score <18) in the mutation supporting reads. # # Supporting lengths are adjusted considering small repeat sequences around the mutations. # # How to use # Rscript MicroSEC.R [working/output directory] [sample information tsv file] [progress bar Y/N] # # Example # Rscript MicroSEC.R /mnt/HDD8TB/MicroSEC /mnt/HDD8TB/MicroSEC/source/Sample_list_test.txt N # Rscript MicroSEC.R /mnt/HDD8TB/MicroSEC /mnt/HDD8TB/MicroSEC/source//sample_info_test.tsv.gz Y # # If you want to know the progress visually, [progress bar Y/N] should be Y. # # Results are saved in a tsv file. # load necessary packages library(MicroSEC) library(dplyr) library(readr) library(stringr) library(Rsamtools) library(BiocGenerics) library(Biostrings) # set arguments args <- commandArgs(trailingOnly = T) wd <- args[1] sample_list <- args[2] progress_bar <- args[3] setwd(wd) # load sample information tsv file sample_info <- read.csv(sample_list, header = FALSE, stringsAsFactors = FALSE, sep = "\t") # initialize msec <- NULL homology_search <- NULL mut_depth <- NULL for (sample in seq_len(dim(sample_info)[1])) { sample_name <- sample_info[sample, 1] mutation_file <- sample_info[sample, 2] bam_file <- sample_info[sample, 3] read_length <- as.integer(sample_info[sample, 4]) adapter_1 <- sample_info[sample, 5] if (sample_info[sample, 6] %in% c("Human", "Mouse", "hg19", "hg38", "mm10")) { adapter_2 <- adapter_1 organism <- sample_info[sample, 6] panel <- sample_info[sample, 7] if (dim(sample_info)[2] == 8) { reference_genome <- sample_info[sample, 8] } if (dim(sample_info)[2] == 9) { reference_genome <- sample_info[sample, 8] simple_repeat_list <- sample_info[sample, 9] } } else{ adapter_2 <- sample_info[sample, 6] organism <- sample_info[sample, 7] panel <- sample_info[sample, 8] if (dim(sample_info)[2] == 9) { reference_genome <- sample_info[sample, 9] } if (dim(sample_info)[2] == 10) { reference_genome <- sample_info[sample, 9] simple_repeat_list <- sample_info[sample, 10] } } bam_file_slim <- paste0(bam_file, ".SLIM.bam") bam_file_tmp = paste0(bam_file, ".tmp.bam") # load genomic sequence ref_genome <- fun_load_genome(organism) chr_no <- fun_load_chr_no(organism) if (ref_genome@user_seqnames[[1]] == "chr1") { chromosomes <- paste0("chr", c(seq_len(chr_no - 2),"X", "Y")) } if (ref_genome@user_seqnames[[1]] == "1") { chromosomes <- paste0("", c(seq_len(chr_no - 2),"X", "Y")) } # load mutation information df_mutation <- fun_load_mutation(mutation_file, sample_name, ref_genome, chr_no) if (tools::file_ext(bam_file) == "bam") { bam_file_bai <- paste0(bam_file, ".bai") } else if (tools::file_ext(bam_file) == "cram") { bam_file_bai <- paste0(bam_file, ".crai") } if (!file.exists(bam_file_bai) & !file.exists(bam_file_slim)) { print("Sorting a BAM/CRAM file...") if (tools::file_ext(bam_file) == "bam") { bam_file_sort <- paste0(bam_file, "_sort.bam") syscom <- paste0("samtools sort -@ 4 -o ", bam_file_sort, " ", bam_file) } else if (tools::file_ext(bam_file) == "cram") { bam_file_sort <- paste0(bam_file, "_sort.cram") syscom <- paste0("samtools sort -@ 4 -O cram -o ", bam_file_sort," ", bam_file) } system(syscom) syscom <- paste0("samtools index ", bam_file_sort) system(syscom) bam_file <- bam_file_sort } if (!file.exists(paste0(bam_file_slim, ".bai"))) { print("Trimming a BAM/CRAM file...") df_mutation_save <- df_mutation download_region <- data.frame(matrix(rep(NA,3),nrow=1))[numeric(0),] colnames(download_region) <- c("chrom", "chromStart", "chromEnd") print(paste0(sample_name, ": ", dim(df_mutation_save)[1], " mutations")) for (i in chromosomes) { df_mutation <- df_mutation_save[df_mutation_save$Chr == i,] continuous = FALSE for (mut_no in seq_len(dim(df_mutation)[1])) { if (mut_no == 1 & mut_no != dim(df_mutation)[1]) { if (df_mutation$Chr[mut_no + 1] != df_mutation$Chr[mut_no] | df_mutation$Pos[mut_no + 1] > df_mutation$Pos[mut_no] + 400) { download_region <- rbind( download_region, c(df_mutation$Chr[mut_no], max(1, df_mutation$Pos[mut_no] - 200) - 1, df_mutation$Pos[mut_no] + 200)) colnames(download_region) <- c("chrom", "chromStart", "chromEnd") continuous <- FALSE } else { continuous <- TRUE pos_last <- max(1, df_mutation$Pos[mut_no] - 200) } } else if (mut_no == 1 & mut_no == dim(df_mutation)[1]) { download_region <- rbind( download_region, c(df_mutation$Chr[mut_no], max(1, df_mutation$Pos[mut_no] - 200) - 1, df_mutation$Pos[mut_no] + 200)) colnames(download_region) <- c("chrom", "chromStart", "chromEnd") } else if (mut_no == dim(df_mutation)[1]) { if (continuous) { download_region <- rbind( download_region, c(df_mutation$Chr[mut_no], pos_last - 1, df_mutation$Pos[mut_no] + 200)) colnames(download_region) <- c("chrom", "chromStart", "chromEnd") } else { download_region = rbind( download_region, c(df_mutation$Chr[mut_no], max(1, df_mutation$Pos[mut_no] - 200) - 1, df_mutation$Pos[mut_no] + 200)) colnames(download_region) <- c("chrom", "chromStart", "chromEnd") } } else { if (continuous) { if (df_mutation$Chr[mut_no + 1] != df_mutation$Chr[mut_no] | df_mutation$Pos[mut_no + 1] > df_mutation$Pos[mut_no] + 400) { download_region = rbind( download_region, c(df_mutation$Chr[mut_no], pos_last - 1, df_mutation$Pos[mut_no] + 200)) colnames(download_region) <- c("chrom", "chromStart", "chromEnd") continuous = FALSE } } else { if (df_mutation$Chr[mut_no + 1] != df_mutation$Chr[mut_no] | df_mutation$Pos[mut_no + 1] > df_mutation$Pos[mut_no] + 400) { download_region <- rbind( download_region, c(df_mutation$Chr[mut_no], max(1, df_mutation$Pos[mut_no] - 200) - 1, df_mutation$Pos[mut_no] + 200)) colnames(download_region) <- c("chrom", "chromStart", "chromEnd") } else { continuous <- TRUE pos_last <- max(1, df_mutation$Pos[mut_no] - 200) } } } } } write_tsv(x = download_region, file = paste0(bam_file,".bed"), progress = F, col_names = F) rm(download_region) system_out <- 1 if (tools::file_ext(bam_file) == "bam") { syscom <- paste0("samtools view -h --no-PG ", bam_file, " -L ", paste0(bam_file,".bed"), " > ", bam_file_slim) } else if (tools::file_ext(bam_file) == "cram") { file_crai <- paste0(bam_file, ".crai") syscom <- paste0("samtools view -h --no-PG ", bam_file, " -T ", reference_genome, " ", "-X ", file_crai, " -L ", paste0(bam_file,".bed"), " > ", bam_file_slim) } system_out = (system(syscom)) if(system_out == 0){ system_out <- 1 syscom <- paste0("samtools sort -o ", bam_file_tmp, " ", bam_file_slim) print("Sorting BAM file...") system_out <- (system(syscom)) if(system_out == 0){ system_out <- 1 syscom = paste0("samtools view -bS ", bam_file_tmp, " > ", bam_file_slim) print("Compressing BAM file...") system_out <- (system(syscom)) if(system_out == 0){ system_out <- 1 syscom = paste0("samtools index ", bam_file_slim) print("Indexing BAM file...") system_out <- (system(syscom)) if(system_out == 0){ print(paste0("Slimmed BAM files were saved as ", bam_file_slim)) #file.remove(bam_file) file.remove(bam_file_tmp) } } } } df_mutation <- df_mutation_save rm(df_mutation_save) } bam_file <- bam_file_slim df_bam <- fun_load_bam(bam_file) # analysis result <- fun_read_check(df_mutation = df_mutation, df_bam = df_bam, ref_genome = ref_genome, sample_name = sample_name, read_length = read_length, adapter_1 = adapter_1, adapter_2 = adapter_2, short_homology_search_length = 4, min_homology_search = 40, progress_bar = progress_bar) msec <- rbind(msec, result[[1]]) homology_search <- rbind(homology_search, result[[2]]) mut_depth <- list(rbind(mut_depth[[1]], result[[3]][[1]]), rbind(mut_depth[[2]], result[[3]][[2]]), rbind(mut_depth[[3]], result[[3]][[3]])) } # search homologous sequences msec = fun_homology(msec, homology_search, min_homology_search = 40, ref_genome, chr_no, progress_bar = progress_bar) # statistical analysis msec = fun_summary(msec) msec = fun_analysis(msec, mut_depth, short_homology_search_length = 4, min_homology_search = 40, threshold_p = 10 ^ (-6), threshold_hairpin_ratio = 0.50, threshold_short_length = 0.75, threshold_distant_homology = 0.15, threshold_soft_clip_ratio = 0.50, threshold_low_quality_rate = 0.1, homopolymer_length = 15) # save the results fun_save(msec, paste0(wd, "/", sample_info[1,1], ".tsv.gz"))