--- title: "Analysis of coronavirus sequences" author: "Jacques van Helden" date: "`r Sys.Date()`" output: html_document: df_print: paged code_folding: hide fig_caption: yes highlight: zenburn self_contained: no theme: cerulean toc: yes toc_depth: 3 toc_float: yes ioslides_presentation: colortheme: dolphin fig_caption: yes fig_height: 6 fig_width: 7 fonttheme: structurebold highlight: tango smaller: yes toc: yes widescreen: yes beamer_presentation: colortheme: dolphin fig_caption: yes fig_height: 6 fig_width: 7 fonttheme: structurebold highlight: tango incremental: no keep_tex: no slide_level: 2 theme: Montpellier toc: yes word_document: toc: yes toc_depth: '3' slidy_presentation: fig_caption: yes fig_height: 6 fig_width: 7 highlight: tango incremental: no keep_md: yes smaller: yes theme: cerulean toc: yes widescreen: yes pdf_document: fig_caption: yes highlight: zenburn toc: yes toc_depth: 3 font-import: http://fonts.googleapis.com/css?family=Risque font-family: Garamond transition: linear --- ```{r libraries, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE} #### Install required packages #### required.packages <- c("knitr") for (pkg in required.packages) { if (!require(pkg, character.only = TRUE)) { message("Installing package ", pkg) install.packages(pkg, dependencies = TRUE) } require(pkg, character.only = TRUE) } #### Load libraries #### if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!require("Biostrings", quietly = TRUE)) { BiocManager::install("Biostrings") require("Biostrings") } ``` ```{r knitr_settings, include=FALSE, echo=FALSE, eval=TRUE} library(knitr) options(width = 300) knitr::opts_chunk$set( fig.width = 7, fig.height = 5, fig.path = 'figures/pip_profiles/RNA_', fig.align = "center", size = "tiny", echo = TRUE, eval = TRUE, warning = FALSE, message = FALSE, results = TRUE, comment = "") # knitr::asis_output("\\footnotesize") ## Store original graphic parameters to restore them after chunks par.ori <- par(no.readonly = TRUE) ``` ```{r parameters} #### General parameters for the analysis #### ## Use (or not) GIDAID sequences ## ## A few genoes were not available in NCBI Genbank at the time of ## this analysis, and had to be downloaded from GISAID. These sequences ## can however not be redistributed, they should thus be downloaded ## manually to reproduce the full trees. Alternatively, useGISAID ## can be set to FALSE, whcih will reproduce the analysis with almost ## all the sequences of the paper. useGISAID <- TRUE ## Sequence collection ## Supported: ## collection <- "around-CoV-2" # ~20 genomes ## collection <- "selected" # ~60 genomes ## collection <- "all" # ~60 genomes collections <- c("around-CoV-2", "selected", "all") collection <- "around-CoV-2" # ~20 genomes #### Define directories and files #### dir <- list(main = '..') dir$R <- file.path(dir$main, "scripts/R") #### Create output directory for sequences #### dir$outseq <- file.path( dir$main, "results", "S-gene", "Nto1_alignments") dir.create(dir$outseq, showWarnings = FALSE, recursive = TRUE) ## Instantiate a list for output files outfiles <- vector() ## Input files infiles <- list() ## Genome dir and files if (useGISAID) { dir$genomes <- file.path(dir$main, "data", "GISAID_genomes") infiles$genomes <- file.path( dir$genomes, paste0("genomes_", collection, "-plus-GISAID.fasta")) } else { dir$genomes <- file.path(dir$main, "data", "virus_genomes") infiles$genomes <- file.path( dir$genomes, paste0("genomes_", collection,".fasta")) } ## Genome sequences if (!file.exists(infiles$genomes)) { stop("Genome sequence file is missing", "\n", infiles$genomes) } ## Output tables # di$output <- file.path(dir.main, "") # dir$tables <- ## Load custom functions source(file.path(dir$R, "align_n_to_one.R")) source(file.path(dir$R, "plot_pip_profiles.R")) source(file.path(dir$R, "plot_pip_profiles_from_list.R")) ## Reference genome refPattern <- "HuCoV2_WH01_2019" ## Features : coordinates of features of interest in the reference genome features <- list() ## Spike gene (coding for the spike protein) features[['CDS-ORF1ab']] <- c(start = 266, end = 21555) features[['CDS-S']] <- c(start = 21563, end = 25384) features[['CDS-ORF3a']] <- c(start = 25393, end = 26220) features[['CDS-E']] <- c(start = 26245, end = 26472) features[['CDS-M']] <- c(start = 26523, end = 27191) features[['CDS-ORF6']] <- c(start = 27202, end = 27387) features[['CDS-ORF7a']] <- c(start = 27394, end = 27759) features[['CDS-ORF8']] <- c(start = 27894, end = 28259) features[['CDS-N']] <- c(start = 28274, end = 29533) features[['CDS-ORF10']] <- c(start = 29558, end = 29674) ## Query genomes queryPatterns <- c( "BtRaTG13_2013_Yunnan", "BtZC45", "BtZXC21", "PnMP789", "PnGX-P1E_2017", "HuSARS-Frankfurt-1_2003", "BtRc-o319 LC556375.1_ref_genome", "BtRacCS203 MW251308_ref_genome", "BtRacCS264 MW251311_ref_genome", "BtRacCS253 MW251310.1_ref_genome", "BtRacCS224 MW251309.1_ref_genome", "BtRacCS271 MW251312_genome" ) #### Add GISAID IDs to the query pattern #### ## Note that GISAI genomes are be submitted to the github repo because they cannot be redistributed if (useGISAID) { queryPatterns <- append(queryPatterns, c("BtYu-RmYN02_2019", "PnGu1_2019", "BtCambodia/RShSTT200/2010", "BtCambodia/RShSTT182/2010" )) } message("\tReference genomes: ", refPattern) message("\tNumber of query patterns: ", length(queryPatterns)) message("\tQuery patterns: ", paste(collapse = ", ", queryPatterns)) ``` ```{r load_sequences} #### Load genome sequences #### genomes <- readDNAStringSet(filepath = infiles$genome, format = "fasta") ## Shorten sequence names by suppressing the fasta comment (after the space) names(genomes) <- sub(pattern = " .*", replacement = "", x = names(genomes), perl = TRUE) genomeNames <- names(genomes) nbGenomes <- length(genomeNames) message("Loaded ", nbGenomes, " genomes from file ", infiles$genomes) # View(genomes) #### Define reference and query genomes #### refGenomeName <- grep(pattern = refPattern, x = names(genomes), ignore.case = TRUE, value = TRUE) if (is.null(refGenomeName)) { stop("Could not identify reference genome with pattern ", refPattern) } message("Reference genome name: ", refGenomeName) ## Query genomes queryRegExp <- paste0("(", paste(collapse = ")|(", queryPatterns), ")") queryGenomeNames <- grep(pattern = queryRegExp, x = genomeNames, ignore.case = TRUE, value = TRUE) nbQueryGenomes <- length(queryGenomeNames) if (nbQueryGenomes == 0) { stop("Could not identify any query genome with query pattern\n", queryRegExp) } if (length(queryPatterns) != length(queryGenomeNames)) { foundPatterns <- grep(pattern = queryRegExp, x = queryGenomeNames, value = TRUE) missingPatterns <- setdiff(queryPatterns, queryGenomeNames) message("\t", length(missingPatterns), " Missing genomes: ", paste(collapse = ", ", missingPatterns)) } ## Compute some statistics about genome sizes genomeStat <- data.frame( row.names = c(refGenomeName, queryGenomeNames), status = c("Reference", rep("Query", length.out = length(queryGenomeNames))) ) g <- 1 for (g in c(refGenomeName, queryGenomeNames)) { genomeStat[g, "length"] <- length(genomes[[g]]) } kable(genomeStat, caption = "Reference and query genomes") ``` The collection `r collection` contains `r length(genomes)` virus genome sequences. ## Strain statistics ```{r strain_stat} ## Report the number of genoomes strainNames <- names(genomes) nbStrains <- length(strainNames) message("\tLoaded ", nbStrains, " genomes from file ", infiles$sequences) # View(genomes) #### Define reference and query genomes #### refStrainName <- grep(pattern = refPattern, x = names(genomes), ignore.case = TRUE, value = TRUE) if (is.null(refStrainName)) { stop("Could not identify reference genome with pattern ", refPattern) } message("\tReference genome name: ", refStrainName) #### Compute statistics about sequence sizes ### strainStats <- data.frame( n = 1:length(genomes), row.names = names(genomes), status = rep("Query", length.out = length(strainNames)) ) strainStats[,"status"] <- as.vector(strainStats[,"status"]) strainStats[refStrainName,"status"] <- "Reference" g <- 1 for (g in strainNames) { strainStats[g, "length"] <- length(genomes[[g]]) } ``` ```{r strain_colors} #### Define the color associated to each sequence #### ## Color palette per species speciesPalette <- list( Human = "#880000", Bat = "#888888", Pangolin = "#448800", Camel = "#BB8800", Pig = "#FFBBBB", Civet = "#00BBFF" ) ## Species prefix in the tip labels speciesPrefix <- c("Hu" = "Human", "Bt" = "Bat", "Pn" = "Pangolin", "Cm" = "Camel", "Pi" = "Pig", "Cv" = "Civet") ## Strain-specific colors strainColor <- c( "HuCoV2_WH01_2019" = "red", "HuSARS-Frankfurt-1_2003" = "#0044BB", "PnGu1_2019" = "#00BB00", "PnMP789" = "#00FF88", "BtRaTG13_" = "#FF6600", "BtYu-RmYN" = "#FFBB22", "BtZXC21" = "black", "BtZC45" = "black") ## Identify species per tip for (prefix in names(speciesPrefix)) { strainStats[grep(pattern = paste0("^", prefix), x = row.names(strainStats), perl = TRUE), "species"] <- speciesPrefix[prefix] } ## Assign acolor to each species strainStats$color <- "grey" # default strainStats$color <- speciesPalette[as.vector(strainStats$species)] for (strain in names(strainColor)) { strainStats[grep(pattern = paste0("^", strain), x = row.names(strainStats), perl = TRUE), "color"] <- strainColor[strain] } ## Assign specific color to some nodes ## Define a color for each strain strainColors <- (unlist(strainStats$color)) names(strainColors) <- row.names(strainStats) ``` ```{r print_strain_stats} kable(strainStats, caption = "Reference and query genomes") ``` ## N-to-1 full genome alignments We perform a pairwise lignment between each genome query and the reference genome (`r refGenomeName`). ```{r full-genomes_align} #### N-to-1 genome alignments #### ## Define output file for genome alignments outfiles["Genome alignments"] <- file.path( dir$outseq, paste0("genome_alignments_ref_", refGenomeName)) ## Get sequences for reference and query genomes refGenome <- genomes[refGenomeName] queryGenomes <- genomes[queryGenomeNames] genomesNto1 <- alignNtoOne( refSequence = refGenome, querySequences = queryGenomes, outfile = outfiles[["Genome alignments"]] ) kable(genomesNto1$stats[order(genomesNto1$stats$score, decreasing = TRUE), ], caption = "N-to-one alignment of full genomes") ``` ## Full genome PIP plot ```{r genome_pip, fig.width=10, fig.height=6, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS-CoV-2. "} ## PIP profile of full genome N-to-1 alignments PlotPIPprofilesFromList(alignments = genomesNto1$alignments, windowSize = 500, vgrid1 = 5000, vgrid2 = 1000, colors = strainColors, main = paste0("Full genome PIP profile", "\nRef: ", refGenomeName), legendMargin = 0, legendCorner = "bottom", legendCex = 0.8, ylim = c(40,100)) ``` ## SARS vs Civet ```{r SARS_vs_civet_genome} ## Define output file for genome alignments outfiles["Genome alignments - SARS"] <- file.path( dir$outseq, paste0("genome_alignments_ref_", "HuSARS-Frankfurt-1_2003")) #### Compare SARS (2002) wih the related Civet genome #### SARSvsCivetGenome <- alignNtoOne( refSequence = genomes["HuSARS-Frankfurt-1_2003"], querySequences = genomes[c("Cv007-2004", "HuCoV2_WH01_2019")], outfile = outfiles["Genome alignments - SARS"] ) kable(SARSvsCivetGenome$stats[order(SARSvsCivetGenome$stats$score, decreasing = TRUE), ], caption = "SARS (2003). N-to-one alignment of full genomes of the closest animal virus (Civet) and of Human SARS-CoV-2. ") ``` ```{r PIP_SARS_vs_civet_genome, fig.width=10, fig.height=6, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS (2002-2003). "} ## PIP profile of full genome N-to-1 alignments PlotPIPprofilesFromList(alignments = SARSvsCivetGenome$alignments, windowSize = 500, colors = strainColors, legend = paste0(names(SARSvsCivetGenome$alignments), " (", round(digits = 2, SARSvsCivetGenome$stats$pid), "%)"), main = paste0("Percent Identical Positions - Full genome", "\nRef: ", "Human_SARS-CoV_Frankfurt_1"), legendMargin = 0, legendCorner = "bottom", legendCex = 0.7, ylim = c(40,100)) ``` ## N-to-1 alignemnts of spike genes ```{r S-gene_align_queries} featureName <- "CDS-S" featureLimits <- features[[featureName]] #### N-to-1 alignments of spike-coding sequences #### dir[[featureName]] <- file.path(dir$main, "results", featureName) dir.create(dir[[featureName]], showWarnings = FALSE, recursive = TRUE) outfiles[featureName] <- file.path( dir[[featureName]], paste0(featureName, "_", collection, "_matches.fasta")) ## Get sequences for reference and query genomes refSeq <- subseq(genomes[refGenomeName], start = featureLimits[1], end = featureLimits[2]) featureAlignmentsNto1 <- alignNtoOne( refSequence = refSeq, querySequences = queryGenomes, type = "global-local", outfile = outfiles[featureName]) kable(featureAlignmentsNto1$stats[order(featureAlignmentsNto1$stat$score, decreasing = TRUE),], caption = "N-to-one alignment of S genes") ``` ### Spike gene PIP plot ```{r Sgene_pip, fig.width=10, fig.height=4.5, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS-CoV-2. "} ## PIP profile of spike N-to-1 alignments PlotPIPprofilesFromList(alignments = featureAlignmentsNto1$alignments, windowSize = 250, colors = strainColors, vgrid1 = 1000, vgrid2 = 200, # legend = paste0(names(featureAlignmentsNto1$alignments), " (", round(digits = 2, featureAlignmentsNto1$stats$pid), "%)"), main = paste0(featureName, " - PIP profile", "\nRef: ", refGenomeName), legendMargin = 0, legendCorner = "bottomright", legendCex = 0.8, ylim = c(30, 100)) ``` ## Output files ```{r output_files} kable(t(as.data.frame(dir)), col.names = "Dir", caption = "Directories") kable(outfiles, col.names = "File", caption = "Output files") ``` ## Session info ```{r session_info} sessionInfo() ```