--- knit: (function(inputFile, encoding) { rmarkdown::render(inputFile, encoding = encoding, output_file ='one-to-n-matches_around-CoV-2-plus-GISAID.html') }) title: "One-to-N sequence matches in coronavirus genomes" author: "Jacques van Helden" date: "`r Sys.Date()`" output: html_document: df_print: paged code_folding: hide fig_caption: yes highlight: zenburn self_contained: no theme: cerulean toc: yes toc_depth: 3 toc_float: yes ioslides_presentation: colortheme: dolphin fig_caption: yes fig_height: 6 fig_width: 7 fonttheme: structurebold highlight: tango smaller: yes toc: yes widescreen: yes beamer_presentation: colortheme: dolphin fig_caption: yes fig_height: 6 fig_width: 7 fonttheme: structurebold highlight: tango incremental: no keep_tex: no slide_level: 2 theme: Montpellier toc: yes word_document: toc: yes toc_depth: '3' slidy_presentation: fig_caption: yes fig_width: 7 highlight: tango incremental: no keep_md: yes smaller: yes theme: cerulean toc: yes widescreen: yes pdf_document: fig_caption: yes highlight: zenburn toc: yes toc_depth: 3 font-import: http://fonts.googleapis.com/css?family=Risque font-family: Garamond transition: linear --- ```{r libraries, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE} #### Install required packages #### required.packages <- c("knitr") for (pkg in required.packages) { if (!require(pkg, character.only = TRUE)) { message("Installing package ", pkg) install.packages(pkg, dependencies = TRUE) } require(pkg, character.only = TRUE) } #### Load libraries #### if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } requiredBioconductor <- c("Biostrings", "DECIPHER") for (lib in requiredBioconductor) { if (!require(lib, character.only = TRUE, quietly = TRUE)) { message("Installing bioconductor package ", lib) BiocManager::install(lib) require(lib, character.only = TRUE) } } ``` ```{r knitr_settings, include=FALSE, echo=FALSE, eval=TRUE} library(knitr) options(width = 300) knitr::opts_chunk$set( fig.path = 'figures/one-to-n-matches/', fig.width = 7, fig.height = 5, fig.align = "center", size = "tiny", echo = TRUE, eval = TRUE, warning = FALSE, message = FALSE, results = TRUE, comment = "") # knitr::asis_output("\\footnotesize") ## Store original graphic parameters to restore them after chunks par.ori <- par(no.readonly = TRUE) ``` ```{r parameters} #### General parameters for the analysis #### reloadImage <- FALSE quickTest <- FALSE # if True, only analyse the 2 first features #### Sequence collection #### ## Supported: collections <- c( "around-CoV-2", "selected", "around-CoV-2-plus-GISAID", "selected-plus-GISAID" ) # collection <- "around-CoV-2" # ~20 strains # collection <- "all" # ~60 strains # collection <- "selected" # ~50 strains collection <- "around-CoV-2-plus-GISAID" # 16 strains # collection <- "selected-plus-GISAID" # ~40 strains # collection <- "all-plus-GISAID" # ~60 strains ## Choose a collection-specific path for the figures knitr::opts_chunk$set(fig.path = paste0('figures/one-to-n-matches_', collection, '/', collection, "_")) ## Note about GIDAID sequences. ## ## A few genomes were not available in NCBI Genbank at the time of ## this analysis, and had to be downloaded from GISAID. These sequences ## can however not be redistributed, they should thus be downloaded ## manually to reproduce the full trees. ## Exclude incomplete genomes (i.e. those containing a lof of Ns) to avoid biases in the distance computations excludeIncomplete <- TRUE #### Define directories and files #### dir <- list(main = '..') dir$R <- file.path(dir$main, "scripts/R") ## Input directory for sequences dir$seqdata <- file.path(dir$main, "data") dir.create(dir$seqdata, showWarnings = FALSE, recursive = TRUE) ## Create output directory for aligned sequences dir$results <- file.path(dir$main, "results") dir.create(dir$results, showWarnings = FALSE, recursive = TRUE) ## Instantiate a list for output files outfiles <- vector() ## Memory image dir$images <- file.path(dir$main, "memory_images") dir.create(dir$images, recursive = TRUE, showWarnings = FALSE) outfiles["Memory image"] <- file.path( dir$images, paste0(collection, "_one-to-n_matches.Rdata")) ## Input files infiles <- list() ## Output tables # di$output <- file.path(dir.main, "") # dir$tables <- ## Load custom functions source(file.path(dir$R, "align_n_to_one.R")) source(file.path(dir$R, "plot_pip_profiles.R")) source(file.path(dir$R, "plot_pip_profiles_from_list.R")) ## A unequivocal pattern to identify the reference genome in the sequence names of the input file refPattern <- "HuCoV2_WH01_2019" ## Exclude some genomes with a lot of Ns, because they bias the PIP profiles and alignments and trees excludePatterns <- c("PnGu-P2S_2019") #### Features of interest in the reference genome #### features <- list() #### Specific features #### ## Cited from Sun et al (https://doi.org/10.1016/j.molmed.2020.02.008)in Lin et al. https://doi.org/10.20944/preprints202006.0044.v1 features[['Sun_2020']] <- c(start = 13522, end = 23686, isCDS = FALSE) ## S1, first cleavage product of the spike protein features[['S1']] <- c(start = 21599, end = 23617, isCDS = TRUE) ## Receptor binding domain features[['RBD']] <- c(start = 22517, end = 23185, isCDS = TRUE) ## S2, second cleavage product of the spike protein features[['S2']] <- c(start = 23618, end = 25381, isCDS = TRUE) ## Regions around the insertions features[['Ins1-pm120']] <- c(start = 21647, end = 21907, isCDS = FALSE) features[['Ins2-pm120']] <- c(start = 21899, end = 22156, isCDS = FALSE) features[['Ins3-pm120']] <- c(start = 21899, end = 23152, isCDS = FALSE) features[['Ins4-pm120']] <- c(start = 23483, end = 23734, isCDS = FALSE) features[['Ins4-m240']] <- c(start = 23363, end = 23614, isCDS = FALSE) ## Receptor-binding domain ## Potential Pangolin origin after Xiao (https://doi.org/10.1101/2020.02.17.951335) features[['Recomb-Xiao']] <- c(start = 22871, end = 23092, isCDS = FALSE) ## Recombinant region 1 seen on the PIP profiles features[['Recomb-reg-1']] <- c(start = 2900, end = 3800, isCDS = FALSE) ## Recombinant region 2 seen on the PIP profiles features[['Recomb-reg-2']] <- c(start = 21000, end = 24000, isCDS = FALSE) ## Recombinant region 3 seen on the PIP profiles features[['Recomb-reg-3']] <- c(start = 27500, end = 28550, isCDS = FALSE) ## Recombinant region 4 in the RBD ## Slighty wider than Xiao's limits in order to see the context features[['Recomb-RBD']] <- c(start = 22700, end = 23200, isCDS = FALSE) # features[['Recomb-RBD']] <- c(start = 22760, end = 23160, isCDS = FALSE) ## Annotated coding sequences features[['CDS-S']] <- c(start = 21563, end = 25384, isCDS = TRUE) ## Spike gene features[['CDS-ORF3a']] <- c(start = 25393, end = 26220, isCDS = TRUE) features[['CDS-E']] <- c(start = 26245, end = 26472, isCDS = TRUE) features[['CDS-M']] <- c(start = 26523, end = 27191, isCDS = TRUE) features[['CDS-ORF6']] <- c(start = 27202, end = 27387, isCDS = TRUE) features[['CDS-ORF7a']] <- c(start = 27394, end = 27759, isCDS = TRUE) features[['CDS-ORF8']] <- c(start = 27894, end = 28259, isCDS = TRUE) features[['CDS-N']] <- c(start = 28274, end = 29533, isCDS = TRUE) features[['CDS-ORF10']] <- c(start = 29558, end = 29674, isCDS = TRUE) ## Large ORF covering 2 thirds of the genome, coding for polyprotein features[['CDS-ORF1ab']] <- c(start = 266, end = 21555, isCDS = TRUE) ## All the sequences after the big polyprotein-coding ORF1ab features[['After-ORF1ab']] <- c(start = 21556, end = 29899, isCDS = FALSE) ## Only keep the two first features if quick test has been specified if (quickTest) { features <- features[1:2] } ## Convert feature list into a feature tbale featureTable <- as.data.frame(t(as.data.frame.list( features, stringsAsFactors = FALSE)), row.names = names(features) # to avoid replacement of "-" by "." ) # class(featureTable) featureTable$length <- featureTable$end - featureTable$start + 1 # row.names(featureTable) # featureTable["CDS.S", ] featureTable <- featureTable[order(featureTable$start), ] message("Analysing ", length(features), " features") message("\t", paste(collapse = ", ", names(features))) ## Report the parameters message("\tReference pattern: ", refPattern) ``` ## Strain collection: `r collection` ```{r load_sequences} ## Genome dir and files if (length(grep(pattern = "GISAID", x = collection)) > 0) { useGISAID <- TRUE dir$sequences <- file.path(dir$main, "data", "GISAID_genomes") # collections <- paste0(collections, "-plus-GISAID") # collection <- paste0(collection, "-plus-GISAID") } else { dir$sequences <- file.path(dir$main, "data", "genomes") } ## Define the input sequences infiles$sequences <- file.path( dir$sequences, paste0("genomes_", collection, ".fasta")) ## Check if the input sequence file exists if (!file.exists(infiles$sequences)) { stop("Genome sequence file is missing", "\n", infiles$sequences) } else { message("Genome sequence file", "\n", infiles$sequences) } #### Load genome sequences #### genomes <- readDNAStringSet(filepath = infiles$sequences, format = "fasta") ## Shorten sequence names by suppressing the fasta comment (after the space) names(genomes) <- sub(pattern = " .*", replacement = "", x = names(genomes), perl = TRUE) ## Exclude genomes if (excludeIncomplete) { excludePattern = paste0("(", paste(collapse = ")|(", excludePatterns), ")") excludedstrainNames <- grep(pattern = excludePattern, x = names(genomes), value = TRUE, invert = FALSE) filteredGenomeIndices <- grep(pattern = excludePattern, x = names(genomes), value = FALSE, invert = TRUE) message("\tExcluded ", length(excludedstrainNames)," genomes: ", paste(collapse = ", ", excludedstrainNames)) message("\tRemaining genomes: ", length(filteredGenomeIndices)) genomes <- genomes[filteredGenomeIndices] # names(genomes) } ## Report the number of genoomes strainNames <- names(genomes) nbStrains <- length(strainNames) message("\tLoaded ", nbStrains, " genomes from file ", infiles$sequences) # View(genomes) #### Define reference and query genomes #### refStrainName <- grep(pattern = refPattern, x = names(genomes), ignore.case = TRUE, value = TRUE) if (is.null(refStrainName)) { stop("Could not identify reference genome with pattern ", refPattern) } message("\tReference genome name: ", refStrainName) ``` ```{r reload_image} allFeatureAlignments <- list() ## Reload memory image if specified in parameters if (reloadImage) { load(outfiles["Memory image"]) reloadImage <- TRUE } ``` ## Parameters ```{r print_parameters} #### Define a list of parameters #### parameters <- list() parameters$collection <- collection parameters$refStrainName <- refStrainName parameters$nbStrains <- nbStrains parameters$sequenceDir <- dir$sequences parameters$sequenceFile <- infiles$sequences parameters$memoryImage <- outfiles["Memory image"] ## Exclude metagenomic assemblies parameters$pasDeTambouille <- FALSE if (parameters$pasDeTambouille) { excludePatterns <- append(excludePatterns, "BtYu-RmYN02_2019") } kable(t(as.data.frame.list(parameters)), caption = "Parameters of the analysis", col.names = "Parameter") ``` ## Strain statistics ```{r strain_stat} #### Compute statistics about sequence sizes ### strainStats <- data.frame( n = 1:length(genomes), row.names = names(genomes), status = rep("Query", length.out = length(strainNames)) ) strainStats[,"status"] <- as.vector(strainStats[,"status"]) strainStats[refStrainName,"status"] <- "Reference" g <- 1 for (g in strainNames) { strainStats[g, "length"] <- length(genomes[[g]]) } ``` ```{r strain_colors} #### Define the color associated to each sequence #### ## Color palette per species speciesPalette <- list( Human = "#880000", Bat = "#888888", Pangolin = "#447700", Camel = "#BB8800", Pig = "#FFBBBB", Civet = "#00BBFF" ) ## Species prefix in the tip labels speciesPrefix <- c("Hu" = "Human", "Bt" = "Bat", "Pn" = "Pangolin", "Cm" = "Camel", "Pi" = "Pig", "Cv" = "Civet") ## Strain-specific colors strainColor <- c( "HuCoV2_WH01_2019" = "red", "HuSARS-Frankfurt-1_2003" = "#0044BB", "PnGu1_2019" = "#00CC00", "PnMP789" = "#00FF88", "BtRaTG13_" = "#FF6600", "BtYu-RmYN" = "#FFBB22", "BtZXC21" = "black", "BtZC45" = "black") ## Identify species per tip for (prefix in names(speciesPrefix)) { strainStats[grep(pattern = paste0("^", prefix), x = row.names(strainStats), perl = TRUE), "species"] <- speciesPrefix[prefix] } ## Assign acolor to each species strainStats$color <- "grey" # default strainStats$color <- speciesPalette[as.vector(strainStats$species)] for (strain in names(strainColor)) { strainStats[grep(pattern = paste0("^", strain), x = row.names(strainStats), perl = TRUE), "color"] <- strainColor[strain] } ## Assign specific color to some nodes ## Define a color for each strain strainColors <- (unlist(strainStats$color)) names(strainColors) <- row.names(strainStats) ``` ```{r print_strain_stats} kable(strainStats, caption = "Reference and query genomes") ``` The collection `r collection` contains `r length(genomes)` virus genome sequences. ## Features ```{r feature_table} ## Print feature coordinates kable(featureTable, caption = "Feature coordinates. ") ``` ## Feature matches and alignments 1. We perform a global pairwise alignment (Needle-Waterman algorithm) between each feature of the reference strain (`r refStrainName`) and each one of the query genomes. 2. The collected nucleic sequences are then aligned based on their translated sequences. This ensures a more relevant placing of the gaps. These translation-based alignments are exported in aminoacids and DNA formats (the DNA sequences are obtained by reverse-mapping the aligned aminoacid sequences onto the input DNA sequences). 3. For each alignment (AA and DNA) wee compute a consensus matrix, indicating the frequency of each residue (row of the consensus matrix) in each position of the alignment (column of the matrix). ```{r one2N_feature_matches, results="asis", fig.width=10, fig.height=6, out.width="100%", fig.cap="Feature-specific Percent Identical Positions (PIP) profiles. "} #### One-to-N alignmemnt of user-speficied genomic features #### i <- 1 testFeature <- "Recomb-RBD" i <- which(names(features) == testFeature) ## Rune one-ton matches and translation-based mumti^ple alignment + PIP profiles for (i in 1:length(features)) { featureName <- names(features)[i] ## Compute feature length featureLimits <- features[[featureName]] featureStart <- featureLimits[["start"]] featureEnd <- featureLimits[["end"]] featureLength <- featureEnd - featureStart + 1 message("Searching matches for feature ", i, "/", length(features), "\t", featureName, " (", featureLimits[1], "-", featureLimits[2], ")", " in collection ", collection) ## Print a section title with the feature name and limits cat(sep = "", "\n### ", featureName, " (", featureLimits[1], "-", featureLimits[2], "; ", featureLength,"bp)", "\n") ## Choose a collection- and feature-specific path for the figures knitr::opts_chunk$set(fig.path = paste0( 'figures/one-to-n-matches_', collection, '/', collection, '_vs_', featureName)) #### Define feature-specific output directory #### featurePrefix <- paste0( featureName, "_", collection) dir[[featureName]] <- file.path(dir$results, featurePrefix) message("\tOutput directory: ", dir[[featureName]] ) dir.create(dir[[featureName]], showWarnings = FALSE, recursive = TRUE) #### Find feature matches by 1-to-1 alignment with each genome #### ## Define output file for unaligned matching sequences outfiles[paste0(featurePrefix, "_matches")] <- file.path( dir[[featureName]], paste0(featurePrefix, "_matches.fasta")) message("\tMatching sequences: ", outfiles[paste0(featurePrefix, "_matches")]) ## Get sequence of the feature from the reference genome refSeq <- subseq(genomes[refStrainName], start = featureLimits[1], end = featureLimits[2]) ## Match the reference feature with all the genomes if (reloadImage) { featureAlignmentsNto1 <- allFeatureAlignments[[featureName]] } else { featureAlignmentsNto1 <- alignNtoOne( refSequence = refSeq, querySequences = genomes, type = "global-local", outfile = outfiles[paste0(featurePrefix, "_matches")]) allFeatureAlignments[[featureName]] <- featureAlignmentsNto1 } #### Translation-based alignments #### if (featureTable[featureName, "isCDS"]) { message("\tTranslation-based multiple alignment of feature ", featureName, "; DNA output") #### Align translated sequenecs and return correponding nucleic sequences #### featureAlignmentsNto1$tralignedDNA <- AlignTranslation( DNAStringSet(featureAlignmentsNto1$sequences), type = "DNAStringSet", verbose = FALSE) ## Save multiple-aligned DNA sequences outfiles[paste0(featurePrefix, "_tralignedDNA")] <- file.path( dir[[featureName]], paste0(featurePrefix, "_tralignedDNA.fasta")) message("\tTranslation-based alignment saved in file ", outfiles[paste0(featurePrefix, "_tralignedDNA")]) writeXStringSet( x = featureAlignmentsNto1$tralignedDNA, filepath = outfiles[paste0(featurePrefix, "_tralignedDNA")] ) ## Convert alignment to position-weight matrix featureAlignmentsNto1$tralignedDNAConsensus <- consensusMatrix( x = featureAlignmentsNto1$tralignedDNA) #### Align translated sequenecs and return correponding AA sequences #### message("\tTranslation-based multiple alignment of feature ", featureName, "; AA output") featureAlignmentsNto1$tralignedAA <- AlignTranslation( DNAStringSet(featureAlignmentsNto1$sequences), type = "AAStringSet", verbose = FALSE) ## Convert alignment to position-weight matrix featureAlignmentsNto1$tralignedAAConsensus <- consensusMatrix( x = featureAlignmentsNto1$tralignedAA) ## Save multiple-aligned AA sequences outfiles[paste0(featurePrefix, "_tralignedAA")] <- file.path( dir[[featureName]], paste0(featurePrefix, "_tralignedAA.fasta")) message("\tTranslation-based alignment saved in file ", outfiles[paste0(featurePrefix, "_tralignedAA")]) writeXStringSet( x = featureAlignmentsNto1$tralignedAA, filepath = outfiles[paste0(featurePrefix, "_tralignedAA")] ) } ## Compute sequence order by decreasing PIP score seqOrder <- order(featureAlignmentsNto1$stat$score, decreasing = TRUE) ## Print matching statistics kable(featureAlignmentsNto1$stats[seqOrder, ], caption = paste0( "One-to-N alignment of feature ", featureName)) #### PIP profiles #### ## Choose window size if (featureLength > 20000) { windowSize <- 800 } else if (featureLength > 5000) { windowSize <- 500 } else if (featureLength > 3000) { windowSize <- 200 } else if (featureLength > 400) { windowSize <- 100 } else { windowSize <- max(50, 10 * round(featureLength / 100)) } ## DNA PIP profile of one-to-N alignment PlotPIPprofilesFromList( # alignments = featureAlignmentsNto1$alignments[seqOrder], alignments = featureAlignmentsNto1$alignments, reversePlot = TRUE, windowSize = windowSize, main = paste0("PIP profiles: collection ", collection, "\nGenomic feature ", featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"), # colors = NULL, colors = strainColors, legendMargin = 0.25, legendCorner = "topright", lwd = 1, legendCex = 0.5, ylim = c(30, 100)) ## Identify the correspodences between matches and sequences seqNames <- names(featureAlignmentsNto1$alignments) refSeqName <- refStrainName matchNames <- names(featureAlignmentsNto1$sequences) refMatchName <- grep(pattern = paste0("^", refStrainName), matchNames, perl = TRUE, value = TRUE) ## Define colors matchColors <- strainColors[seqNames] names(matchColors) <- matchNames if (featureTable[featureName, "isCDS"]) { #### DNA PIP profile of translation-based multiple alignment #### PlotPIPprofiles( alignment = featureAlignmentsNto1$tralignedDNA, refSeqName = refMatchName, plotRefGaps = FALSE, reversePlot = TRUE, windowSize = windowSize, main = paste0("DNA PIP profiles – collection: ", collection, "\nGenomic feature ", featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"), colors = matchColors, legendMargin = 0.3, legendCorner = "topright", lwd = 1, legendCex = 0.5, ylim = c(30, 100)) ## AA PIP profile of translation-based multiple alignment PlotPIPprofiles( # alignments = featureAlignmentsNto1$alignments[seqOrder], alignment = featureAlignmentsNto1$tralignedAA, refSeqName = refMatchName, plotRefGaps = FALSE, reversePlot = TRUE, windowSize = windowSize/3, main = paste0("AA PIP profiles: collection ", collection, "\nGenomic feature ", featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"), colors = matchColors, legendMargin = 0.3, legendCorner = "topright", lwd = 1, legendCex = 0.5, ylim = c(30, 100)) } } # } ``` ## Output files ```{r output_files} kable(t(as.data.frame(dir)), col.names = "Dir", caption = "Directories") outfileTable <- data.frame(path = as.vector(outfiles)) outfileTable$basename <- basename(as.vector(outfileTable$path)) outfileTable$dir <- dirname(as.vector(outfileTable$path)) outfileTable$link <- paste0( "[", outfileTable$basename, "](", outfileTable$path, ")" ) kable(outfileTable[, c("dir", "link")], col.names = c("dir", "file"), caption = "Output files") ``` ## Memory image We store the result ina memory image, in oder to be able reloading it to plot PIP profiles with different parameters. ```{r} save.image(file = outfiles["Memory image"]) ``` ## Session info ```{r session_info} sessionInfo() ```