--- knit: (function(inputFile, encoding) { rmarkdown::render(inputFile, encoding = encoding, output_file ='PIP_spike-proteins_selected-plus-GISAID.html') }) title: "PIP profiles of coronavirus spike proteins - protein and RNA sequences" author: "Jacques van Helden" date: "`r Sys.Date()`" output: html_document: df_print: paged code_folding: hide fig_caption: yes highlight: zenburn self_contained: no theme: cerulean toc: yes toc_depth: 3 toc_float: yes ioslides_presentation: colortheme: dolphin fig_caption: yes fig_height: 6 fig_width: 7 fonttheme: structurebold highlight: tango smaller: yes toc: yes widescreen: yes beamer_presentation: colortheme: dolphin fig_caption: yes fig_height: 6 fig_width: 7 fonttheme: structurebold highlight: tango incremental: no keep_tex: no slide_level: 2 theme: Montpellier toc: yes word_document: toc: yes toc_depth: '3' slidy_presentation: fig_caption: yes fig_height: 6 fig_width: 7 highlight: tango incremental: no keep_md: yes smaller: yes theme: cerulean toc: yes widescreen: yes pdf_document: fig_caption: yes highlight: zenburn toc: yes toc_depth: 3 font-import: http://fonts.googleapis.com/css?family=Risque font-family: Garamond transition: linear --- ```{r libraries, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE} #### Install required packages #### required.packages <- c("knitr") for (pkg in required.packages) { if (!require(pkg, character.only = TRUE)) { message("Installing package ", pkg) install.packages(pkg, dependencies = TRUE) } require(pkg, character.only = TRUE) } #### Load libraries #### if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager") } if (!require("Biostrings", quietly = TRUE)) { BiocManager::install("Biostrings") require("Biostrings") } ``` ```{r knitr_settings, include=FALSE, echo=FALSE, eval=TRUE} library(knitr) options(width = 300) knitr::opts_chunk$set( fig.path = 'figures/spike-protein_PIP/', fig.width = 7, fig.height = 5, fig.align = "center", size = "tiny", echo = TRUE, eval = TRUE, warning = FALSE, message = FALSE, results = TRUE, comment = "") # knitr::asis_output("\\footnotesize") ## Store original graphic parameters to restore them after chunks par.ori <- par(no.readonly = TRUE) ``` ```{r parameters} #### General parameters for the analysis #### ## Use (or not) GIDAID sequences ## ## A few genoes were not available in NCBI Genbank at the time of ## this analysis, and had to be downloaded from GISAID. These sequences ## can however not be redistributed, they should thus be downloaded ## manually to reproduce the full trees. Alternatively, useGISAID ## can be set to FALSE, whcih will reproduce the analysis with almost ## all the sequences of the paper. useGISAID <- TRUE #### Define directories and files #### dir <- list(main = '..') dir$R <- file.path(dir$main, "scripts/R") #### Create output directory for sequences #### seqPrefix <- "spike_proteins" dir$outseq <- file.path( dir$main, "results", seqPrefix, "PIP_profiles") dir.create(dir$outseq, showWarnings = FALSE, recursive = TRUE) # list.files(dir$outseq) ## Instantiate a list for output files outfiles <- vector() ## Input files infiles <- list() #### Sequence collection #### ## Supported: collections <- c( "around-CoV-2", "selected", "around-CoV-2-plus-GISAID", "selected-plus-GISAID" ) ## Selected collection # collection <- "around-CoV-2" # 14 strains # collection <- "all" # 16 strains # collection <- "selected" # ~60 strains # collection <- "around-CoV-2-plus-GISAID" # 16 strains collection <- "selected-plus-GISAID" # ~40 strains # collection <- "all-plus-GISAID" # ~60 strains ## Use a collection-specific path for the figures knitr::opts_chunk$set( fig.path = paste0('figures/spike-protein_PIP_', collection, '/', collection, "_")) ## Genome dir and files if (length(grep(pattern = "GISAID", x = collection)) > 0) { useGISAID <- TRUE dir$sequences <- file.path(dir$main, "data", "GISAID_genomes") } else { dir$sequences <- file.path(dir$main, "data", "genomes") } seqPrefix <- paste0("spike_proteins_", collection) infiles$sequences <- file.path(dir$sequences, paste0(seqPrefix,".fasta")) ## Sequence sequences if (!file.exists(infiles$sequences)) { stop("¨Protein sequence file is missing", "\n", infiles$sequences) } ## Load custom functions source(file.path(dir$R, "align_n_to_one.R")) source(file.path(dir$R, "plot_pip_profiles.R")) ## Query patterns for SARS-CoV-2 queryPatterns <- list() queryPatterns[["HuCoV2_WH01_2019"]] <- c( "PnGX-P1E_2017", "PnGu1_2019", "BtRaTG13_2013_Yunnan", "BtZC45", "BtZXC21", "HuSARS-Frankfurt-1_2003", "CmMERS", "HuMERS_172-06_2015" ) queryPatterns[["PnGu1_2019"]] <- c( "HuCoV2_WH01_2019", "PnGX-P1E_2017", "BtRaTG13_2013_Yunnan" ) queryPatterns[["BtRaTG13_2013_Yunnan"]] <- c( "HuCoV2_WH01_2019", "PnGX-P1E_2017", "PnGu1_2019" ) queryPatterns[["HuSARS-Frankfurt-1_2003"]] <- c( "CvSZ3", "BtRs4874", "BtWIV16_2013", "BtRs4231", "BtRs7327", "BtRsSHC014", "Btrec-SARSp_2008", "PnGX-P1E_2017", "HuCoV2_WH01_2019", "BtRaTG13_2013_Yunnan", "BtZC45", "BtZXC21", "CmMERS", "HuMERS_172-06_2015" ) #### Add GISAID IDs to the query pattern #### ## Note that GISAID sequences are be submitted to the github repo because they cannot be redistributed if (useGISAID) { for (ref in names(queryPatterns)) queryPatterns[[ref]] <- append(queryPatterns[[ref]], c("BtYu-RmYN02_2019", "PnGu1_2019" )) } # message("\tReference strain: ", refPattern) message("\tQuery patterns") for (ref in names(queryPatterns)) { queryPatterns[[ref]] <- unique(queryPatterns[[ref]]) message("\t", ref, "\t\t", length(queryPatterns[[ref]]), "\t", paste(collapse = ", ", queryPatterns[[ref]])) } ``` ```{r load_sequences} #### Load sequences #### sequences <- readAAStringSet(filepath = infiles$sequences, format = "fasta") ## Shorten sequence names by suppressing the fasta comment (after the space) names(sequences) <- sub(pattern = " .*", replacement = "", x = names(sequences), perl = TRUE) sequencesNames <- names(sequences) nbsequences <- length(sequencesNames) message("Loaded ", nbsequences, " sequences from file ", infiles$sequences) # View(sequences) #### Define reference and query sequences #### refSequenceNames <- vector() for (ref in names(queryPatterns)) { refSequenceNames[ref] <- unique( grep(pattern = ref, x = names(sequences), ignore.case = TRUE, value = TRUE)) if (is.null(refSequenceNames[ref])) { stop("Could not identify reference sequences with pattern ", ref) } } ## Query sequences # querySequenceNames <- list() # for (ref in names(queryPatterns)) { # # message("Identifying query sequences for reference: ", ref) # queryRegExp <- paste0("(", paste(collapse = ")|(", queryPatterns[[ref]]), ")") # querySequenceNames[[ref]] <- grep(pattern = queryRegExp, # x = sequencesNames, # ignore.case = TRUE, value = TRUE) # nbquerySeq <- length(querySequenceNames[[ref]]) # # if (nbquerySeq == 0) { # stop("Could not identify any query sequences with query pattern\n", queryRegExp) # } # # if (length(unlist(queryPatterns[ref[]])) != length(querySequenceNames[[ref]])) { # foundPatterns <- grep(pattern = queryRegExp, x = querySequenceNames[[ref]], value = TRUE) # missingPatterns <- setdiff(foundPatterns, queryPatterns[[ref]]) # message("\t", # length(missingPatterns), " Missing sequences: ", # paste(collapse = ", ", missingPatterns)) # } # # message("\tFound ", # "\t", length(querySequenceNames[[ref]]), # " queries for\t", ref) # # ## Compute some statistics about sequences sizes # sequencestat <- data.frame( # row.names = c(ref, querySequenceNames[[ref]]), # status = c("Reference", # rep("Query", length.out = length(querySequenceNames[[ref]]))) # ) # # g <- 1 # for (g in c(ref, querySequenceNames[[ref]])) { # sequencestat[g, "length"] <- length(sequences[[g]]) # } # kable(sequencestat, caption = paste0("Query sequences for reference ", ref)) # } ``` ## Strain collection: `r collection` The collection `r collection` contains `r length(sequences)` virus sequences sequences. ```{r strain_colors} ## Report the number of strains strainNames <- names(sequences) nbStrains <- length(strainNames) message("\tLoaded ", nbStrains, " sequences from file ", infiles$sequences) # View(genomes) #### Compute sequence sizes #### strainStats <- data.frame( n = 1:length(sequences), row.names = names(sequences), status = rep("Query", length.out = length(strainNames)) ) strainStats[,"status"] <- as.vector(strainStats[,"status"]) strainStats[refSequenceNames,"status"] <- "Reference" g <- 1 for (g in strainNames) { strainStats[g, "length"] <- length(sequences[[g]]) } #### Define the color associated to each sequence #### ## Color palette per species speciesPalette <- list( Human = "#880000", Bat = "#888888", Pangolin = "#448800", Camel = "#BB8800", Pig = "#FFBBBB", Civet = "#00BBFF" ) ## Species prefix in the tip labels speciesPrefix <- c("Hu" = "Human", "Bt" = "Bat", "Pn" = "Pangolin", "Cm" = "Camel", "Pi" = "Pig", "Cv" = "Civet") ## Strain-specific colors strainColor <- c( "HuCoV2_WH01_2019" = "red", "HuSARS-Frankfurt-1_2003" = "#0044BB", "BtRs4874" = "#BB00BB", "PnGu1_2019" = "#00BB00", "BtRaTG13_" = "#FF6600", "BtYu-RmYN" = "#FFBB22", "BtZXC21" = "black", "BtZC45" = "black") ## Identify species per tip for (prefix in names(speciesPrefix)) { strainStats[grep(pattern = paste0("^", prefix), x = row.names(strainStats), perl = TRUE), "species"] <- speciesPrefix[prefix] } ## Assign acolor to each species strainStats$color <- "grey" # default strainStats$color <- speciesPalette[as.vector(strainStats$species)] for (strain in names(strainColor)) { strainStats[grep(pattern = paste0("^", strain), x = row.names(strainStats), perl = TRUE), "color"] <- strainColor[strain] } ## Assign specific color to some nodes ## Define a color for each strain strainColors <- (unlist(strainStats$color)) names(strainColors) <- row.names(strainStats) ``` ## N-to-1 full sequence alignments We perform a pairwise lignment between each sequences query and the reference sequences (`r refSequenceNames[[1]]`). ```{r full-sequences_align} sequencesNto1 <- list() refSequenceName <- "HuCoV2_WH01_2019" for (refSequenceName in refSequenceNames) { ## Define output file for sequences alignments outfile <- file.path( dir$outseq, paste0("one-to-n_alignments_ref_", refSequenceName)) outfiles[paste0(refSequenceName, " alignments")] <- outfile ## Get sequences for reference and query sequences refSequence <- sequences[refSequenceName] # querySeq <- sequences[querySequenceNames[[refSequenceName]]] message("\tAligning ", length(sequences), " sequences", " to reference\t", refSequenceName) sequencesNto1[[refSequenceName]] <- alignNtoOne( refSequence = refSequence, querySequences = sequences, # querySequences = querySeq, sortByPIP = TRUE, # querySequences = sequences, outfile = outfile) } ``` ```{r spike-protein_PIP, results="asis", fig.width=12, fig.height=7, out.width="100%", fig.cap="Percent Identical Positions (PIP) profiles of spike protein sequences. "} refSequenceName <- "HuCoV2_WH01_2019" for (refSequenceName in refSequenceNames) { cat(sep = "", "\n## Spike proteins: ", collection, " vs reference ", refSequenceName) knitr::opts_chunk$set( fig.path = paste0('figures/spike-protein_PIP/', collection, '_vs_', refSequenceName)) kable(sequencesNto1[[refSequenceName]]$stats[order(sequencesNto1[[refSequenceName]]$stats$score, decreasing = TRUE), ], caption = "N-to-one alignment of full sequences") ## PIP profile of full sequences N-to-1 alignments if (length(grep(pattern = "around-CoV-2", x = collection))) { minPIP <- 30 legendCex <- 0.7 legendCorner <- "bottomright" legendMargin <- 0 } else { minPIP <- 0 legendCex <- 0.5 legendCorner <- "topright" legendMargin <- 0.25 } plotPIPprofiles( alignments = sequencesNto1[[refSequenceName]]$alignments, windowSize = 100, main = paste0("Spike protein: collection ", collection, "\nvs strain ", refSequenceName), colors = strainColors, legendMargin = legendMargin, legendCorner = legendCorner, lwd = 1, legendCex = legendCex, ylim = c(minPIP,100)) # legendMargin = 0.25, # legendCorner = "topright", lwd = 1, # legendCex = 0.5, ylim = c(30, 100)) } ``` ## Output files ```{r output_files} kable(t(as.data.frame(dir)), col.names = "Dir", caption = "Directories") kable(outfiles, col.names = "File", caption = "Output files") ``` ## Session info ```{r session_info} sessionInfo() ```