---
title: "Analysis of coronavirus sequences"
author: "Jacques van Helden"
date: "`r Sys.Date()`"
output:
  html_document:
    df_print: paged
    code_folding: hide
    fig_caption: yes
    highlight: zenburn
    self_contained: no
    theme: cerulean
    toc: yes
    toc_depth: 3
    toc_float: yes
  ioslides_presentation:
    colortheme: dolphin
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    fonttheme: structurebold
    highlight: tango
    smaller: yes
    toc: yes
    widescreen: yes
  beamer_presentation:
    colortheme: dolphin
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    fonttheme: structurebold
    highlight: tango
    incremental: no
    keep_tex: no
    slide_level: 2
    theme: Montpellier
    toc: yes
  word_document:
    toc: yes
    toc_depth: '3'
  slidy_presentation:
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    highlight: tango
    incremental: no
    keep_md: yes
    smaller: yes
    theme: cerulean
    toc: yes
    widescreen: yes
  pdf_document:
    fig_caption: yes
    highlight: zenburn
    toc: yes
    toc_depth: 3
font-import: http://fonts.googleapis.com/css?family=Risque
font-family: Garamond
transition: linear
---

```{r libraries, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE}
#### Install required packages ####
required.packages <- c("knitr")

for (pkg in required.packages) {
  if (!require(pkg, character.only = TRUE)) {
    message("Installing package ", pkg)
    install.packages(pkg, dependencies = TRUE)
  }
  require(pkg, character.only = TRUE)
}

#### Load libraries ####
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!require("Biostrings", quietly = TRUE)) {
  BiocManager::install("Biostrings")
  require("Biostrings")
}


```


```{r knitr_settings, include=FALSE, echo=FALSE, eval=TRUE}
library(knitr)
options(width = 300)
knitr::opts_chunk$set(
  fig.width = 7, fig.height = 5, 
  fig.path = 'figures/pip_profiles/RNA_',
  fig.align = "center", 
  size = "tiny", 
  echo = TRUE, eval = TRUE, 
  warning = FALSE, message = FALSE, 
  results = TRUE, comment = "")
# knitr::asis_output("\\footnotesize")


## Store original graphic parameters to restore them after chunks
par.ori <- par(no.readonly = TRUE)


```


```{r parameters}

#### General parameters for the analysis ####

## Use (or not) GIDAID sequences
## 
## A few genoes were not available in NCBI Genbank at the time of 
## this analysis, and had to be downloaded from GISAID. These sequences
##  can however not be redistributed, they should thus be downloaded 
##  manually to reproduce the full trees. Alternatively, useGISAID
##  can be set to FALSE, whcih will reproduce the analysis with almost 
##  all the sequences of the paper.
useGISAID <- TRUE

## Sequence collection
## Supported: 
## collection <- "around-CoV-2" # ~20 genomes
## collection <- "selected" # ~60 genomes
## collection <- "all" # ~60 genomes
collections <- c("around-CoV-2", "selected", "all")
collection <- "around-CoV-2" # ~20 genomes


#### Define directories and files ####
dir <- list(main = '..')
dir$R <- file.path(dir$main, "scripts/R")

#### Create output directory for sequences ####
dir$outseq <- file.path(
  dir$main, "results", "S-gene", "Nto1_alignments")
dir.create(dir$outseq, showWarnings = FALSE, recursive = TRUE)

## Instantiate a list for output files
outfiles <- vector()

## Input files
infiles <- list()


## Genome dir and files
if (useGISAID) {
  dir$genomes <- file.path(dir$main, "data", "GISAID_genomes")
  infiles$genomes <- file.path(
    dir$genomes, 
    paste0("genomes_", collection, "-plus-GISAID.fasta"))
} else {
  dir$genomes <- file.path(dir$main, "data", "virus_genomes")
  infiles$genomes <- file.path(
    dir$genomes, 
    paste0("genomes_", collection,".fasta"))
}

## Genome sequences
if (!file.exists(infiles$genomes)) {
  stop("Genome sequence file is missing", "\n", infiles$genomes)
}


## Output tables
# di$output <- file.path(dir.main, "")
# dir$tables <- 

## Load custom functions
source(file.path(dir$R, "align_n_to_one.R"))
source(file.path(dir$R, "plot_pip_profiles.R"))
source(file.path(dir$R, "plot_pip_profiles_from_list.R"))

## Reference genome
refPattern <- "HuCoV2_WH01_2019"


## Features : coordinates of features of interest in the reference genome
features <- list()
## Spike gene  (coding for the spike protein)
features[['CDS-ORF1ab']] <- c(start = 266, end = 21555)
features[['CDS-S']] <- c(start = 21563, end = 25384)
features[['CDS-ORF3a']] <- c(start = 25393, end = 26220)
features[['CDS-E']] <- c(start = 26245, end = 26472)
features[['CDS-M']] <- c(start = 26523, end = 27191)
features[['CDS-ORF6']] <- c(start = 27202, end = 27387)
features[['CDS-ORF7a']] <- c(start = 27394, end = 27759)
features[['CDS-ORF8']] <- c(start = 27894, end = 28259)
features[['CDS-N']] <- c(start = 28274, end = 29533)
features[['CDS-ORF10']] <- c(start = 29558, end = 29674)                             

## Query genomes
queryPatterns <- c(
  "BtRaTG13_2013_Yunnan", 
  "BtZC45",
  "BtZXC21",
  "PnMP789",
  "PnGX-P1E_2017", 
  "HuSARS-Frankfurt-1_2003",
  "BtRc-o319 LC556375.1_ref_genome",
  "BtRacCS203 MW251308_ref_genome",
  "BtRacCS264 MW251311_ref_genome",
  "BtRacCS253 MW251310.1_ref_genome",
  "BtRacCS224 MW251309.1_ref_genome",
  "BtRacCS271 MW251312_genome"
)

#### Add GISAID IDs to the query pattern ####
## Note that GISAI genomes are be submitted to the github repo because they cannot be redistributed
if (useGISAID) {
  queryPatterns <- append(queryPatterns, 
                          c("BtYu-RmYN02_2019",
                            "PnGu1_2019",
                            "BtCambodia/RShSTT200/2010",
                            "BtCambodia/RShSTT182/2010"
                          ))
}

message("\tReference genomes: ", refPattern)
message("\tNumber of query patterns: ", length(queryPatterns))
message("\tQuery patterns: ", paste(collapse = ", ", queryPatterns))

```


```{r load_sequences}

#### Load genome sequences ####
genomes <- readDNAStringSet(filepath = infiles$genome, format = "fasta")

## Shorten sequence names by suppressing the fasta comment (after the space)
names(genomes) <- sub(pattern = " .*", replacement = "", x = names(genomes), perl = TRUE)

genomeNames <- names(genomes)
nbGenomes <- length(genomeNames)
message("Loaded ", nbGenomes, " genomes from file ", infiles$genomes)
# View(genomes)

#### Define reference and query genomes ####
refGenomeName <- grep(pattern = refPattern, x = names(genomes), 
                      ignore.case = TRUE, value = TRUE)
if (is.null(refGenomeName)) {
  stop("Could not identify reference genome with pattern ", refPattern)
}
message("Reference genome name: ", refGenomeName)

## Query genomes
queryRegExp <- paste0("(", paste(collapse = ")|(", queryPatterns), ")")
queryGenomeNames <- grep(pattern = queryRegExp, 
                         x = genomeNames, 
                         ignore.case = TRUE, value = TRUE)
nbQueryGenomes <- length(queryGenomeNames)

if (nbQueryGenomes == 0) {
  stop("Could not identify any query genome with query pattern\n", queryRegExp)
}

if (length(queryPatterns) != length(queryGenomeNames)) {
  foundPatterns <- grep(pattern = queryRegExp, x = queryGenomeNames, value = TRUE)
  missingPatterns <- setdiff(queryPatterns, queryGenomeNames)
  message("\t", 
          length(missingPatterns), " Missing genomes: ", 
          paste(collapse = ", ", missingPatterns))
}


## Compute some statistics about genome sizes
genomeStat <- data.frame(
  row.names = c(refGenomeName, queryGenomeNames),
  status = c("Reference", rep("Query", length.out = length(queryGenomeNames)))
)

g <- 1
for (g in c(refGenomeName, queryGenomeNames)) {
  genomeStat[g, "length"] <- length(genomes[[g]])
}

kable(genomeStat, caption = "Reference and query genomes")

```

The collection `r collection` contains `r length(genomes)` virus genome sequences.


## Strain statistics


```{r strain_stat}

## Report the number of genoomes
strainNames <- names(genomes)
nbStrains <- length(strainNames)
message("\tLoaded ", nbStrains, " genomes from file ", infiles$sequences)
# View(genomes)


#### Define reference and query genomes ####
refStrainName <- grep(pattern = refPattern, x = names(genomes), 
                      ignore.case = TRUE, value = TRUE)
if (is.null(refStrainName)) {
  stop("Could not identify reference genome with pattern ", refPattern)
}
message("\tReference genome name: ", refStrainName)

#### Compute statistics about sequence sizes ###
strainStats <- data.frame(
  n = 1:length(genomes),
  row.names = names(genomes),
  status = rep("Query", length.out = length(strainNames))
)
strainStats[,"status"] <- as.vector(strainStats[,"status"])
strainStats[refStrainName,"status"] <- "Reference"
g <- 1
for (g in strainNames) {
  strainStats[g, "length"] <- length(genomes[[g]])
}

```


```{r strain_colors}
#### Define the color associated to each sequence ####


## Color palette per species
speciesPalette <- list(
  Human = "#880000",
  Bat = "#888888",
  Pangolin = "#448800",
  Camel = "#BB8800",
  Pig = "#FFBBBB",
  Civet = "#00BBFF"
)

## Species prefix in the tip labels
speciesPrefix <- c("Hu" = "Human",
                   "Bt" = "Bat",
                   "Pn" = "Pangolin",
                   "Cm" = "Camel",
                   "Pi" = "Pig",
                   "Cv" = "Civet")

## Strain-specific colors
strainColor <- c(
  "HuCoV2_WH01_2019" = "red",
  "HuSARS-Frankfurt-1_2003" = "#0044BB",
  "PnGu1_2019" = "#00BB00",
  "PnMP789" = "#00FF88",
  "BtRaTG13_" = "#FF6600",
  "BtYu-RmYN" = "#FFBB22",
  "BtZXC21" = "black", 
  "BtZC45" = "black")

## Identify species per tip
for (prefix in names(speciesPrefix)) {
  strainStats[grep(pattern = paste0("^", prefix), x = row.names(strainStats), perl = TRUE), "species"] <- speciesPrefix[prefix]
  
}

## Assign acolor to each species
strainStats$color <- "grey" # default
strainStats$color <- speciesPalette[as.vector(strainStats$species)]

for (strain in names(strainColor)) {
  strainStats[grep(pattern = paste0("^", strain), 
                  x = row.names(strainStats), perl = TRUE), "color"] <- strainColor[strain]
  
}


## Assign specific color to some nodes

## Define a color for each strain
strainColors <- (unlist(strainStats$color))
names(strainColors) <- row.names(strainStats)

```


```{r print_strain_stats}
kable(strainStats, caption = "Reference and query genomes")

```


## N-to-1 full genome alignments

We perform a pairwise lignment between each genome query and the reference genome (`r refGenomeName`).

```{r full-genomes_align}
#### N-to-1 genome alignments ####

## Define output file for genome alignments
outfiles["Genome alignments"] <- file.path(
  dir$outseq, paste0("genome_alignments_ref_", refGenomeName))

## Get sequences for reference and query genomes
refGenome <- genomes[refGenomeName]
queryGenomes <- genomes[queryGenomeNames]
genomesNto1 <- alignNtoOne(
  refSequence = refGenome, 
  querySequences = queryGenomes,
  outfile = outfiles[["Genome alignments"]] )

kable(genomesNto1$stats[order(genomesNto1$stats$score, decreasing = TRUE), ], 
      caption = "N-to-one alignment of full genomes")
```


## Full genome PIP plot


```{r genome_pip, fig.width=10, fig.height=6, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS-CoV-2. "}

## PIP profile of full genome N-to-1 alignments
PlotPIPprofilesFromList(alignments = genomesNto1$alignments, 
                windowSize = 500, 
                vgrid1 = 5000, 
                vgrid2 = 1000, colors = strainColors,
                main = paste0("Full genome PIP profile", "\nRef: ", refGenomeName),
                legendMargin = 0, 
                legendCorner = "bottom", 
                legendCex = 0.8, 
                ylim = c(40,100))

```


## SARS vs Civet

```{r SARS_vs_civet_genome}

## Define output file for genome alignments
outfiles["Genome alignments - SARS"] <- file.path(
  dir$outseq, paste0("genome_alignments_ref_",
                     "HuSARS-Frankfurt-1_2003"))

#### Compare SARS (2002) wih the related Civet genome ####
SARSvsCivetGenome <- alignNtoOne(
  refSequence = genomes["HuSARS-Frankfurt-1_2003"], 
  querySequences = genomes[c("Cv007-2004", 
                             "HuCoV2_WH01_2019")],
  outfile = outfiles["Genome alignments - SARS"] )

kable(SARSvsCivetGenome$stats[order(SARSvsCivetGenome$stats$score, decreasing = TRUE), ], 
      caption = "SARS (2003). N-to-one alignment of full genomes of the closest animal virus (Civet) and of Human SARS-CoV-2. ")

```

```{r PIP_SARS_vs_civet_genome, fig.width=10, fig.height=6, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS (2002-2003). "}

## PIP profile of full genome N-to-1 alignments
PlotPIPprofilesFromList(alignments = SARSvsCivetGenome$alignments, 
                windowSize = 500, colors = strainColors, 
                legend = paste0(names(SARSvsCivetGenome$alignments), " (", round(digits = 2, SARSvsCivetGenome$stats$pid), "%)"),
                main = paste0("Percent Identical Positions - Full genome", "\nRef: ", "Human_SARS-CoV_Frankfurt_1"),
                legendMargin = 0, legendCorner = "bottom", legendCex = 0.7, ylim = c(40,100))


```


## N-to-1 alignemnts of spike genes

```{r S-gene_align_queries}

featureName <- "CDS-S"
featureLimits <- features[[featureName]]


#### N-to-1 alignments of spike-coding sequences ####
dir[[featureName]] <- file.path(dir$main, "results", featureName)
dir.create(dir[[featureName]], showWarnings = FALSE, recursive = TRUE)
outfiles[featureName]  <- file.path(
  dir[[featureName]], paste0(featureName, "_", collection, "_matches.fasta"))
  
## Get sequences for reference and query genomes
refSeq <- subseq(genomes[refGenomeName], start = featureLimits[1], end = featureLimits[2])

featureAlignmentsNto1 <- alignNtoOne(
  refSequence = refSeq, 
  querySequences = queryGenomes, 
  type = "global-local",
  outfile = outfiles[featureName])

kable(featureAlignmentsNto1$stats[order(featureAlignmentsNto1$stat$score, decreasing = TRUE),], 
      caption = "N-to-one alignment of S genes")

```


### Spike gene PIP plot

```{r Sgene_pip, fig.width=10, fig.height=4.5, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS-CoV-2. "}

## PIP profile of spike N-to-1 alignments
PlotPIPprofilesFromList(alignments = featureAlignmentsNto1$alignments, 
                windowSize = 250,  
                colors = strainColors,
                vgrid1 = 1000, 
                vgrid2 = 200,
#                legend = paste0(names(featureAlignmentsNto1$alignments), " (", round(digits = 2, featureAlignmentsNto1$stats$pid), "%)"),
                main = paste0(featureName, " - PIP profile", "\nRef: ", refGenomeName),
                legendMargin = 0, legendCorner = "bottomright", legendCex = 0.8, ylim = c(30, 100))

```


<!-- ### Getting sequences of spike gene in all the selected coronavirus genomes -->


<!-- ```{r  match sequence fragment against all genomes and export the matches to a fasta file} -->


<!-- #' @title Export sequencesquery + subject sequences from N-to-1 alignments -->
<!-- #' @author Jacques.van-Helden@france)bioinformatique.fr -->
<!-- #' @param refsequence  sequence used as reference (DB) for the N-to-1 alignment -->
<!-- #' @param genomes genome sequences -->
<!-- #' @param IDsuffix=NULL if not null, append the suffix to the sequence name for the fasta header -->
<!-- #' @param outfile=NULL output file. If specified, the matches will be exported as fasta-formatted sequences. -->
<!-- #' @export -->
<!-- ExportSegmentMatches <- function(refsequence, -->
<!--                                  genomes, -->
<!--                                  IDsuffix = NULL, -->
<!--                                  outfile=NULL) { -->

<!--   #### N-to-1 alignments of all spike-coding sequences #### -->

<!--   ## Get sequences for reference and query genomes -->
<!--   alignmentsNto1 <- alignNtoOne( -->
<!--     refSequence = refsequence,  -->
<!--     querySequences = genomes,  -->
<!--     type = "global-local") -->

<!--   if (!is.null(outfile)) { -->

<!--     ## Write the reference sequence in the output fle -->
<!--     writeXStringSet(refsequence, filepath = outfile, format = "fasta") -->

<!--     i <- 1 -->
<!--     nbAlignments <- length(alignmentsNto1$alignments) -->
<!--     for (i in 1:nbAlignments) { -->
<!--       genomeName <- names(genomes[i]) -->
<!--       alignment <- alignmentsNto1$alignments[[i]] -->
<!--       subject <- subject(alignment) -->

<!--       ## Suppress the dashes from the alignment to get the raw sequence -->
<!--       sequence <- as.character(subject) -->
<!--       sequenceDesaligned <- gsub(pattern = "-", replacement = "", x = sequence) -->
<!--       seqStringSet <- DNAStringSet(x = sequenceDesaligned) #, start = start(subject), end=end(subject)) -->


<!--       ## Define a sequence ID for the fasta header -->
<!--       sequenceID <- genomeName -->
<!--       if (!is.null(IDsuffix)) { -->
<!--         sequenceID <- paste0(sequenceID, IDsuffix) -->
<!--       }  -->
<!--       sequenceID <- paste0(sequenceID, "_", start(subject), "-", end(subject)) -->
<!--       names(seqStringSet) <- sequenceID -->
<!--       #  -->
<!--       ## Write pairwise alignment (temporarily disaactivated) -->
<!--       # alignmentFile <- paste0("pairwise-alignment_",  -->
<!--       #                         # gsub(pattern = "/", replacement = "-", x = genomeName),  -->
<!--       #                         ".txt") -->
<!--       # writePairwiseAlignments(x = alignment, file = outfile) -->

<!--       ## Append the sequence to the file -->
<!--       message("\tAppending sequence ", i, "/", nbAlignments, "\t", sequenceID) -->
<!--       writeXStringSet(seqStringSet, -->
<!--                       filepath = outfile, format = "fasta", append = TRUE) -->

<!--     } -->
<!--     message("\tExported alignments to\t", outfile) -->

<!--   }  -->
<!--   return(alignmentsNto1) -->

<!-- } -->
<!-- ``` -->

<!-- ```{r Sgene_exportseq} -->


<!-- #### Export S gene sequences found in the reference genomes #### -->
<!-- # toString(refSeq) -->
<!-- writeXStringSet( -->
<!--   refSeq,  -->
<!--   filepath = file.path( -->
<!--     dir$outseq,  -->
<!--     paste0("S-gene_", refGenomeName, ".fasta")),  -->
<!--   format = "fasta") -->


<!-- ## Export matches to S-gene -->
<!-- outfiles["S gene matches"] <- file.path( -->
<!--   dir$outseq,  -->
<!--   paste0("S-gene_", refGenomeName, "_matches.fasta")) -->

<!-- featureAlignmentsNto1All <- ExportSegmentMatches( -->
<!--   refsequence = refSeq, -->
<!--   genomes = genomes,  -->
<!--   IDsuffix = "_S-gene", -->
<!--   outfile = outfiles["S gene matches"]) -->


<!-- kable(featureAlignmentsNto1All$stats[order(featureAlignmentsNto1All$stat$score, decreasing = TRUE),],  -->
<!--         caption = "N-to-one alignment of S genes") -->

<!-- ``` -->

<!-- ## Export matches to the coding sequence of the S1 cleavage product -->


<!-- ## N-to-1 alignemnts of S1 coding sequences -->

<!-- ```{r S1_align_queries} -->

<!-- refS1Limits <- refSpikeLimits[1] + 3*c(13,	685) -->

<!-- #### N-to-1 alignments of S1-coding sequences #### -->

<!-- ## Get sequences for reference and query genomes -->
<!-- refS1 <- subseq(genomes[refGenomeName], start = refS1Limits[1], end = refS1Limits[2]) -->

<!-- S1Nto1 <- alignNtoOne(refSequence = refS1, querySequences = queryGenomes, type = "global-local") -->

<!-- kable(S1Nto1$stats[order(S1Nto1$stat$score, decreasing = TRUE),],  -->
<!--       caption = "N-to-one alignment of S1 coding sequence") -->

<!-- ``` -->

<!-- ### S1 gene PIP plot -->

<!-- ```{r S1_pip, fig.width=10, fig.height=5, out.width="100%", fig.cap="Percent Identical Positions profile over the whole genome of SARS-CoV-2. "} -->

<!-- ## PIP profile of S1 N-to-1 alignments -->
<!-- plotPIPprofiles(alignments = S1Nto1$alignments,  -->
<!--                 windowSize = 200, colors = strainColors,  -->
<!--                 legend = paste0(names(S1Nto1$alignments), " (", round(digits = 2, S1Nto1$stats$pid), "%)"), -->
<!--                 main = paste0("S1 coding sequence - PIP profile", "\nRef: ", refGenomeName), -->
<!--                 legendMargin = 0, legendCorner = "bottomleft", legendCex = 0.8, ylim = c(30, 100)) -->

<!-- ``` -->


## Output files

```{r output_files}
kable(t(as.data.frame(dir)), col.names = "Dir", caption = "Directories")

kable(outfiles, col.names = "File", caption = "Output files")
```

## Session info

```{r session_info}
sessionInfo()
```