---
knit: (function(inputFile, encoding) { rmarkdown::render(inputFile, encoding = encoding, output_file ='one-to-n-matches_around-CoV-2-plus-GISAID.html') })
title: "One-to-N sequence matches in coronavirus genomes"
author: "Jacques van Helden"
date: "`r Sys.Date()`"
output:
  html_document:
    df_print: paged
    code_folding: hide
    fig_caption: yes
    highlight: zenburn
    self_contained: no
    theme: cerulean
    toc: yes
    toc_depth: 3
    toc_float: yes
  ioslides_presentation:
    colortheme: dolphin
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    fonttheme: structurebold
    highlight: tango
    smaller: yes
    toc: yes
    widescreen: yes
  beamer_presentation:
    colortheme: dolphin
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    fonttheme: structurebold
    highlight: tango
    incremental: no
    keep_tex: no
    slide_level: 2
    theme: Montpellier
    toc: yes
  word_document:
    toc: yes
    toc_depth: '3'
  slidy_presentation:
    fig_caption: yes
    fig_width: 7
    highlight: tango
    incremental: no
    keep_md: yes
    smaller: yes
    theme: cerulean
    toc: yes
    widescreen: yes
  pdf_document:
    fig_caption: yes
    highlight: zenburn
    toc: yes
    toc_depth: 3
font-import: http://fonts.googleapis.com/css?family=Risque
font-family: Garamond
transition: linear
---

```{r libraries, echo=FALSE, results=FALSE, warning=FALSE, message=FALSE}

#### Install required packages ####
required.packages <- c("knitr")

for (pkg in required.packages) {
  if (!require(pkg, character.only = TRUE)) {
    message("Installing package ", pkg)
    install.packages(pkg, dependencies = TRUE)
  }
  require(pkg, character.only = TRUE)
}

#### Load libraries ####
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
requiredBioconductor <- c("Biostrings", "DECIPHER")
for (lib in requiredBioconductor) {
  if (!require(lib, character.only = TRUE, quietly = TRUE)) {
    message("Installing bioconductor package ", lib)
    BiocManager::install(lib)
    require(lib, character.only = TRUE)
  }
  
}


```


```{r knitr_settings, include=FALSE, echo=FALSE, eval=TRUE}
library(knitr)
options(width = 300)
knitr::opts_chunk$set(
  fig.path = 'figures/one-to-n-matches/',
  fig.width = 7, fig.height = 5, 
  fig.align = "center", 
  size = "tiny", 
  echo = TRUE, 
  eval = TRUE, 
  warning = FALSE, 
  message = FALSE, 
  results = TRUE, 
  comment = "")
# knitr::asis_output("\\footnotesize")


## Store original graphic parameters to restore them after chunks
par.ori <- par(no.readonly = TRUE)


```


```{r parameters}

#### General parameters for the analysis ####
reloadImage <- FALSE
quickTest <- FALSE # if True, only analyse the 2 first features


#### Sequence collection ####

## Supported: 
collections <- c(
  "around-CoV-2", 
  "selected",
  "around-CoV-2-plus-GISAID", 
  "selected-plus-GISAID"
  )


# collection <- "around-CoV-2" # ~20 strains
# collection <- "all" # ~60 strains
# collection <- "selected" # ~50 strains
collection <- "around-CoV-2-plus-GISAID" # 16 strains
# collection <- "selected-plus-GISAID" # ~40 strains
# collection <- "all-plus-GISAID" # ~60 strains


## Choose a collection-specific path for the figures
knitr::opts_chunk$set(fig.path = paste0('figures/one-to-n-matches_', collection, '/', collection, "_"))

## Note about GIDAID sequences.
## 
## A few genomes were not available in NCBI Genbank at the time of 
## this analysis, and had to be downloaded from GISAID. These sequences
##  can however not be redistributed, they should thus be downloaded 
##  manually to reproduce the full trees. 


## Exclude incomplete genomes (i.e. those containing a lof of Ns) to avoid biases in the distance computations
excludeIncomplete <- TRUE


#### Define directories and files ####
dir <- list(main = '..')
dir$R <- file.path(dir$main, "scripts/R")

## Input directory for sequences
dir$seqdata <- file.path(dir$main, "data")
dir.create(dir$seqdata, showWarnings = FALSE, recursive = TRUE)


## Create output directory for aligned sequences
dir$results <- file.path(dir$main, "results")
dir.create(dir$results, showWarnings = FALSE, recursive = TRUE)

## Instantiate a list for output files
outfiles <- vector()

## Memory image
dir$images <- file.path(dir$main, "memory_images")
dir.create(dir$images, recursive = TRUE, showWarnings = FALSE)
outfiles["Memory image"] <- file.path(
  dir$images, 
  paste0(collection, "_one-to-n_matches.Rdata"))

## Input files
infiles <- list()

## Output tables
# di$output <- file.path(dir.main, "")
# dir$tables <- 

## Load custom functions
source(file.path(dir$R, "align_n_to_one.R"))
source(file.path(dir$R, "plot_pip_profiles.R"))
source(file.path(dir$R, "plot_pip_profiles_from_list.R"))

## A unequivocal pattern to identify the reference genome in the sequence names of the input file
refPattern <- "HuCoV2_WH01_2019"

## Exclude some genomes with a lot of Ns, because they bias the PIP profiles and alignments and trees
excludePatterns <- c("PnGu-P2S_2019")


#### Features of interest in the reference genome ####

features <- list()


#### Specific features  ####

## Cited from Sun et al (https://doi.org/10.1016/j.molmed.2020.02.008)in Lin et al. https://doi.org/10.20944/preprints202006.0044.v1

features[['Sun_2020']]  <- c(start = 13522, end =  23686, isCDS = FALSE)

## S1, first cleavage product of the spike protein
features[['S1']] <- c(start = 21599, end = 23617, isCDS = TRUE)

## Receptor binding domain
features[['RBD']] <- c(start = 22517, end = 23185, isCDS = TRUE)

## S2, second cleavage product of the spike protein
features[['S2']] <- c(start = 23618, end = 25381, isCDS = TRUE)


## Regions around the insertions
features[['Ins1-pm120']] <- c(start = 21647, end = 21907, isCDS = FALSE)
features[['Ins2-pm120']] <- c(start = 21899, end = 22156, isCDS = FALSE)
features[['Ins3-pm120']] <- c(start = 21899, end = 23152, isCDS = FALSE)
features[['Ins4-pm120']] <- c(start = 23483, end = 23734, isCDS = FALSE)
features[['Ins4-m240']] <- c(start = 23363, end = 23614, isCDS = FALSE)

## Receptor-binding domain

## Potential Pangolin origin after Xiao (https://doi.org/10.1101/2020.02.17.951335)
features[['Recomb-Xiao']] <- c(start = 22871, end = 23092, isCDS = FALSE)

## Recombinant region 1 seen on the PIP profiles
features[['Recomb-reg-1']] <- c(start = 2900, end = 3800, isCDS = FALSE)

## Recombinant region 2 seen on the PIP profiles
features[['Recomb-reg-2']] <- c(start = 21000, end = 24000, isCDS = FALSE)


## Recombinant region 3 seen on the PIP profiles
features[['Recomb-reg-3']] <- c(start = 27500, end = 28550, isCDS = FALSE)

## Recombinant region 4 in the RBD
## Slighty wider than Xiao's limits in order to see the context
features[['Recomb-RBD']] <- c(start = 22700, end = 23200, isCDS = FALSE)
# features[['Recomb-RBD']] <- c(start = 22760, end = 23160, isCDS = FALSE)

## Annotated coding sequences
features[['CDS-S']] <- c(start = 21563, end = 25384, isCDS = TRUE) ## Spike gene
features[['CDS-ORF3a']] <- c(start = 25393, end = 26220, isCDS = TRUE)
features[['CDS-E']] <- c(start = 26245, end = 26472, isCDS = TRUE)
features[['CDS-M']] <- c(start = 26523, end = 27191, isCDS = TRUE)
features[['CDS-ORF6']] <- c(start = 27202, end = 27387, isCDS = TRUE)
features[['CDS-ORF7a']] <- c(start = 27394, end = 27759, isCDS = TRUE)
features[['CDS-ORF8']] <- c(start = 27894, end = 28259, isCDS = TRUE)
features[['CDS-N']] <- c(start = 28274, end = 29533, isCDS = TRUE)
features[['CDS-ORF10']] <- c(start = 29558, end = 29674, isCDS = TRUE)

## Large ORF covering 2 thirds of the genome, coding for polyprotein
features[['CDS-ORF1ab']] <- c(start = 266, end = 21555, isCDS = TRUE)

## All the sequences after the big polyprotein-coding ORF1ab
features[['After-ORF1ab']] <- c(start = 21556, end = 29899, isCDS = FALSE)

## Only keep the two first features if quick test has been specified
if (quickTest) {
  features <- features[1:2]
}

## Convert feature list into a feature tbale
featureTable <- as.data.frame(t(as.data.frame.list(
  features, stringsAsFactors = FALSE)),
  row.names = names(features)  # to avoid replacement of "-" by "."
  )

# class(featureTable)
featureTable$length <- featureTable$end - featureTable$start + 1
# row.names(featureTable)
# featureTable["CDS.S", ]

featureTable <- featureTable[order(featureTable$start), ]


message("Analysing ", length(features), " features")
message("\t", paste(collapse = ", ", names(features)))

## Report the parameters
message("\tReference pattern: ", refPattern)

```

## Strain collection: `r collection`


```{r load_sequences}


## Genome dir and files
if (length(grep(pattern = "GISAID", x = collection)) > 0) {
  useGISAID <- TRUE
  dir$sequences <- file.path(dir$main, "data", "GISAID_genomes")
  # collections <-  paste0(collections, "-plus-GISAID")
  # collection <-  paste0(collection, "-plus-GISAID")
} else {
  dir$sequences <- file.path(dir$main, "data", "genomes")
}


## Define the input sequences
infiles$sequences <- file.path(
  dir$sequences, 
  paste0("genomes_", collection, ".fasta"))

## Check if the input  sequence file exists
if (!file.exists(infiles$sequences)) {
  stop("Genome sequence file is missing", "\n", infiles$sequences)
} else {
  message("Genome sequence file", "\n", infiles$sequences)
}

#### Load genome sequences ####
genomes <- readDNAStringSet(filepath = infiles$sequences, format = "fasta")

## Shorten sequence names by suppressing the fasta comment (after the space)
names(genomes) <- sub(pattern = " .*", replacement = "", x = names(genomes), perl = TRUE)

## Exclude genomes
if (excludeIncomplete) {
  excludePattern = paste0("(", paste(collapse = ")|(", excludePatterns), ")")
  excludedstrainNames <- grep(pattern = excludePattern, x = names(genomes), 
                              value = TRUE, invert = FALSE)
  filteredGenomeIndices <- grep(pattern = excludePattern, x = names(genomes), 
                                value = FALSE, invert = TRUE)
  message("\tExcluded ", length(excludedstrainNames)," genomes: ", paste(collapse = ", ", excludedstrainNames)) 
  message("\tRemaining genomes: ", length(filteredGenomeIndices))
  genomes <- genomes[filteredGenomeIndices]
# names(genomes)
}

## Report the number of genoomes
strainNames <- names(genomes)
nbStrains <- length(strainNames)
message("\tLoaded ", nbStrains, " genomes from file ", infiles$sequences)
# View(genomes)


#### Define reference and query genomes ####
refStrainName <- grep(pattern = refPattern, x = names(genomes), 
                      ignore.case = TRUE, value = TRUE)
if (is.null(refStrainName)) {
  stop("Could not identify reference genome with pattern ", refPattern)
}
message("\tReference genome name: ", refStrainName)


```


```{r reload_image}
allFeatureAlignments <- list()

## Reload memory image if specified in parameters
if (reloadImage) {
  load(outfiles["Memory image"])
  reloadImage <- TRUE
}

```

## Parameters

```{r print_parameters}
#### Define a list of parameters ####
parameters <- list()
parameters$collection <- collection
parameters$refStrainName <- refStrainName
parameters$nbStrains <- nbStrains
parameters$sequenceDir <- dir$sequences
parameters$sequenceFile <- infiles$sequences
parameters$memoryImage <- outfiles["Memory image"]


## Exclude metagenomic assemblies
parameters$pasDeTambouille <- FALSE
if (parameters$pasDeTambouille) {
  excludePatterns <- append(excludePatterns, "BtYu-RmYN02_2019")
}

kable(t(as.data.frame.list(parameters)), 
      caption = "Parameters of the analysis",
      col.names = "Parameter")


```


## Strain statistics


```{r strain_stat}
#### Compute statistics about sequence sizes ###
strainStats <- data.frame(
  n = 1:length(genomes),
  row.names = names(genomes),
  status = rep("Query", length.out = length(strainNames))
)
strainStats[,"status"] <- as.vector(strainStats[,"status"])
strainStats[refStrainName,"status"] <- "Reference"
g <- 1
for (g in strainNames) {
  strainStats[g, "length"] <- length(genomes[[g]])
}

```


```{r strain_colors}
#### Define the color associated to each sequence ####


## Color palette per species
speciesPalette <- list(
  Human = "#880000",
  Bat = "#888888",
  Pangolin = "#447700",
  Camel = "#BB8800",
  Pig = "#FFBBBB",
  Civet = "#00BBFF"
)

## Species prefix in the tip labels
speciesPrefix <- c("Hu" = "Human",
                   "Bt" = "Bat",
                   "Pn" = "Pangolin",
                   "Cm" = "Camel",
                   "Pi" = "Pig",
                   "Cv" = "Civet")

## Strain-specific colors
strainColor <- c(
  "HuCoV2_WH01_2019" = "red",
  "HuSARS-Frankfurt-1_2003" = "#0044BB",
  "PnGu1_2019" = "#00CC00",
  "PnMP789" = "#00FF88",
  "BtRaTG13_" = "#FF6600",
  "BtYu-RmYN" = "#FFBB22",
  "BtZXC21" = "black", 
  "BtZC45" = "black")

## Identify species per tip
for (prefix in names(speciesPrefix)) {
  strainStats[grep(pattern = paste0("^", prefix), x = row.names(strainStats), perl = TRUE), "species"] <- speciesPrefix[prefix]
  
}

## Assign acolor to each species
strainStats$color <- "grey" # default
strainStats$color <- speciesPalette[as.vector(strainStats$species)]

for (strain in names(strainColor)) {
  strainStats[grep(pattern = paste0("^", strain), 
                  x = row.names(strainStats), perl = TRUE), "color"] <- strainColor[strain]
  
}


## Assign specific color to some nodes

## Define a color for each strain
strainColors <- (unlist(strainStats$color))
names(strainColors) <- row.names(strainStats)

```


```{r print_strain_stats}
kable(strainStats, caption = "Reference and query genomes")

```


The collection `r collection` contains `r length(genomes)` virus genome sequences.

## Features

```{r feature_table}
## Print feature coordinates
kable(featureTable, caption = "Feature coordinates. ")

```


## Feature matches and alignments

1. We perform a global pairwise alignment (Needle-Waterman algorithm) between each feature of the reference strain (`r refStrainName`) and each one of the query genomes.

2. The collected nucleic sequences are then aligned based on their translated sequences. This ensures a more relevant placing of the gaps. These translation-based alignments are exported in aminoacids and DNA formats (the DNA sequences are obtained by reverse-mapping the aligned aminoacid sequences onto the input DNA sequences).

3. For each alignment (AA and DNA) wee compute a consensus matrix, indicating the frequency of each residue (row of the consensus matrix) in each position of the alignment (column of the matrix). 

```{r one2N_feature_matches, results="asis", fig.width=10, fig.height=6, out.width="100%", fig.cap="Feature-specific Percent Identical Positions (PIP) profiles. "}
#### One-to-N alignmemnt of user-speficied genomic features ####


i <- 1
testFeature <- "Recomb-RBD"
i <- which(names(features) == testFeature)

## Rune one-ton matches and translation-based mumti^ple alignment + PIP profiles
for (i in 1:length(features)) {
  featureName <- names(features)[i]
  
  ## Compute feature length
  featureLimits <- features[[featureName]]
  featureStart <- featureLimits[["start"]]
  featureEnd <- featureLimits[["end"]]
  featureLength <- 
    featureEnd - featureStart  + 1
  
  message("Searching matches for feature ", 
          i, "/", length(features), "\t",
          featureName, 
          " (", featureLimits[1], "-", featureLimits[2], ")", 
          " in collection ", collection)
  
  ## Print a section title with the feature name and limits
  cat(sep = "", "\n### ",  featureName, 
      " (", featureLimits[1], "-", featureLimits[2], 
      "; ", featureLength,"bp)", "\n")


  ## Choose a collection- and feature-specific path for the figures
  knitr::opts_chunk$set(fig.path = paste0(
    'figures/one-to-n-matches_', collection, '/',
    collection, '_vs_', featureName))

  
  #### Define feature-specific output directory ####
  featurePrefix <- paste0(
    featureName,
    "_", collection)
  
  dir[[featureName]] <- file.path(dir$results, featurePrefix)
  message("\tOutput directory: ", dir[[featureName]] )
  dir.create(dir[[featureName]], showWarnings = FALSE, recursive = TRUE)
  
  #### Find feature matches by 1-to-1 alignment with each genome ####

  ## Define output file for unaligned matching sequences
  outfiles[paste0(featurePrefix, "_matches")]  <- file.path(
    dir[[featureName]], paste0(featurePrefix, "_matches.fasta"))
  message("\tMatching sequences: ", 
          outfiles[paste0(featurePrefix, "_matches")])
  
  ## Get sequence of the feature from the reference genome
  refSeq <- subseq(genomes[refStrainName], 
                   start = featureLimits[1], 
                   end = featureLimits[2])

  ## Match the reference feature with all the genomes
  if (reloadImage) {
    featureAlignmentsNto1 <- allFeatureAlignments[[featureName]]
  } else {  
    featureAlignmentsNto1 <- alignNtoOne(
      refSequence = refSeq, 
      querySequences = genomes, 
      type = "global-local",
      outfile = outfiles[paste0(featurePrefix, "_matches")])
    allFeatureAlignments[[featureName]] <- featureAlignmentsNto1
    
  }
  
  
  #### Translation-based alignments ####
  if (featureTable[featureName, "isCDS"]) {
    message("\tTranslation-based multiple alignment of feature ", featureName, "; DNA output")
    
    #### Align translated sequenecs and return correponding nucleic sequences ####
    featureAlignmentsNto1$tralignedDNA <- AlignTranslation(
      DNAStringSet(featureAlignmentsNto1$sequences), 
      type = "DNAStringSet",
      verbose = FALSE)
    
    ## Save multiple-aligned DNA sequences
    outfiles[paste0(featurePrefix, "_tralignedDNA")]  <- file.path(
      dir[[featureName]], paste0(featurePrefix, "_tralignedDNA.fasta"))
    message("\tTranslation-based alignment saved in file ", 
            outfiles[paste0(featurePrefix, "_tralignedDNA")])
    writeXStringSet(
      x = featureAlignmentsNto1$tralignedDNA, 
      filepath = outfiles[paste0(featurePrefix, "_tralignedDNA")] )

    ## Convert alignment to position-weight matrix
    featureAlignmentsNto1$tralignedDNAConsensus <- consensusMatrix(
      x = featureAlignmentsNto1$tralignedDNA)
    
    #### Align translated sequenecs and return correponding AA sequences ####
    message("\tTranslation-based multiple alignment of feature ", featureName, "; AA output")
    featureAlignmentsNto1$tralignedAA <- AlignTranslation(
      DNAStringSet(featureAlignmentsNto1$sequences), 
      type = "AAStringSet",
      verbose = FALSE)

    ## Convert alignment to position-weight matrix
    featureAlignmentsNto1$tralignedAAConsensus <- consensusMatrix(
      x = featureAlignmentsNto1$tralignedAA)
  
    ## Save multiple-aligned AA sequences
    outfiles[paste0(featurePrefix, "_tralignedAA")]  <- file.path(
      dir[[featureName]], paste0(featurePrefix, "_tralignedAA.fasta"))
    message("\tTranslation-based alignment saved in file ", 
            outfiles[paste0(featurePrefix, "_tralignedAA")])
    writeXStringSet(
      x = featureAlignmentsNto1$tralignedAA, 
      filepath = outfiles[paste0(featurePrefix, "_tralignedAA")] )
  }
  
  ## Compute sequence order by decreasing PIP score
  seqOrder <- order(featureAlignmentsNto1$stat$score, decreasing = TRUE)
  

  ## Print matching statistics
  kable(featureAlignmentsNto1$stats[seqOrder, ], 
        caption = paste0(
          "One-to-N alignment of feature ", 
          featureName))
  
  #### PIP profiles ####
  ## Choose window size  
  if (featureLength > 20000) {
    windowSize <- 800 
  } else if (featureLength > 5000) {
    windowSize <- 500 
  } else if (featureLength > 3000) {
    windowSize <- 200 
  } else if (featureLength > 400) {
    windowSize <- 100 
  } else {
    windowSize <- max(50, 10 * round(featureLength / 100))
  }

  
  ## DNA PIP profile of one-to-N alignment
  PlotPIPprofilesFromList(
#    alignments = featureAlignmentsNto1$alignments[seqOrder],
    alignments = featureAlignmentsNto1$alignments,
    reversePlot = TRUE,
    windowSize = windowSize,
    main = paste0("PIP profiles: collection ", collection,
                  "\nGenomic feature ", featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"),
    #    colors = NULL,
    colors = strainColors,
    legendMargin = 0.25,
    legendCorner = "topright", 
    lwd = 1,
    legendCex = 0.5, 
    ylim = c(30, 100))

  ## Identify the correspodences between matches and sequences
  seqNames <- names(featureAlignmentsNto1$alignments)
  refSeqName <- refStrainName
  matchNames <- names(featureAlignmentsNto1$sequences) 
  refMatchName <- grep(pattern = paste0("^", refStrainName), matchNames, 
                       perl = TRUE, value = TRUE)

  ## Define colors
  matchColors <- strainColors[seqNames]
  names(matchColors) <- matchNames
  
  if (featureTable[featureName, "isCDS"]) {
    
    #### DNA PIP profile of translation-based multiple alignment ####
    PlotPIPprofiles(
      alignment = featureAlignmentsNto1$tralignedDNA, 
      refSeqName = refMatchName,
      plotRefGaps = FALSE,
      reversePlot = TRUE,
      windowSize = windowSize,
      main = paste0("DNA PIP profiles – collection: ", collection,
                    "\nGenomic feature ", featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"),
      colors = matchColors,
      legendMargin = 0.3,
      legendCorner = "topright",
      lwd = 1,
      legendCex = 0.5,
      ylim = c(30, 100))
    
    ## AA PIP profile of translation-based multiple alignment
    PlotPIPprofiles(
      #    alignments = featureAlignmentsNto1$alignments[seqOrder],
      alignment = featureAlignmentsNto1$tralignedAA, 
      refSeqName = refMatchName,
      plotRefGaps = FALSE,
      reversePlot = TRUE,
      windowSize = windowSize/3,
      main = paste0("AA PIP profiles: collection ", collection,
                    "\nGenomic feature ", featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"),
      colors = matchColors,
      legendMargin = 0.3,
      legendCorner = "topright",
      lwd = 1,
      legendCex = 0.5,
      ylim = c(30, 100))
  }    
  
  
}
# }
```


<!-- ## Translation-based multiple alignments -->

<!-- For the features corresponding to coding sequences (and in frame), it is more accurate to run aminoacid-based alignment, and to align in a second time the RNA codons onto the aligned aminoacids.  -->

<!-- ```{r align_translation} -->
<!-- # featureName <- "CDS-S" -->
<!-- featureName <- "S1" -->
<!-- for (featureName in row.names(featureTable)) { -->
<!--   if (featureTable[featureName, "isCDS"]) { -->
<!--     message("Translation-based multiple alignment of feature ", featureName) -->
<!--     featureAlignmentsNto1 <- allFeatureAlignments[[featureName]] -->
<!--     rawSequences <- DNAStringSet(featureAlignmentsNto1$sequences) -->
<!--     class(rawSequences) -->

<!--     traligned <- AlignTranslation( -->
<!--       rawSequences,  -->
<!--       type = "DNAStringSet", -->
<!--       verbose = FALSE) -->

<!--     allFeatureAlignments[[featureName]]$trAlignment <- traligned -->
<!--     # names(allFeatureAlignments[[featureName]]) -->


<!--     ## Save alignment -->
<!--     featurePrefix <- paste0( -->
<!--       featureName, -->
<!--       "_", collection) -->

<!--     ## Define output file for unaligned matching sequences -->
<!--     outfiles[paste0(featurePrefix, "_traligned")]  <- file.path( -->
<!--       dir[[featureName]], paste0(featurePrefix, "_traligned.fasta")) -->
<!--     message("\tTranslation-based alignment saved in file ",  -->
<!--             outfiles[paste0(featurePrefix, "_traligned")]) -->

<!--     ## Save aligned sequences in fasta format -->
<!--     writeXStringSet( -->
<!--       x = traligned,  -->
<!--       filepath = outfiles[paste0(featurePrefix, "_traligned")]) -->
<!--   } -->

<!--   ## Convert alignment to position-weight matrix -->
<!--   consensusM <- consensusMatrix(x = traligned) -->

<!--   allFeatureAlignments[[featureName]]$consensusM <- consensusM -->
<!-- } -->


<!-- ``` -->


<!-- ```{r one2N_feature_matches_PIP, fig.width=10, fig.height=6, out.width="100%", fig.cap="Feature-specific Percent Identical Positions (PIP) profiles. "} -->
<!-- #### One-to-N alignmemnt of user-speficied genomic features #### -->
<!-- allFeatureAlignments <- list() -->
<!-- # featureName <- "CDS-S" # for the test -->
<!-- featureName <- "S1" # for the test -->
<!-- # featureName <- "Recomb-reg-3" # for the test -->
<!-- # featureName <- "RBD" # for the test -->
<!-- # for (collection in (collections)) { -->
<!-- for (featureName in names(features)) { -->
<!--   featureLimits <- features[[featureName]] -->
<!--   featureStart <- featureLimits[["start"]] -->
<!--   featureEnd <- featureLimits[["end"]] -->
<!--   featureLength <-  -->
<!--      featureEnd - featureStart  + 1 -->

<!--   message("Searching matches for feature ", featureName,  -->
<!--           " (", featureLimits[1], "-", featureLimits[2], ")",  -->
<!--           " in collection ", collection) -->

<!--   #### N-to-1 alignments of spike-coding sequences #### -->
<!--   featurePrefix <- paste0( -->
<!--     featureName, -->
<!--     "_", collection) -->

<!--   dir[[featureName]] <- file.path(dir$results, featureName) -->
<!--   dir.create(dir[[featureName]], showWarnings = FALSE, recursive = TRUE) -->
<!--   outfiles[featureName]  <- file.path( -->
<!--     dir[[featureName]], paste0(featurePrefix, ".fasta")) -->
<!--   message("\tOutput directory: ", dir[[featureName]] ) -->

<!--   ## Get sequence of the feature from the reference genome -->
<!--   refSeq <- subseq(genomes[refStrainName],  -->
<!--                    start = featureLimits[1],  -->
<!--                    end = featureLimits[2]) -->

<!--   featureAlignmentsNto1 <- allFeatureAlignments[[featureName]] -->

<!--   seqOrder <- order(featureAlignmentsNto1$stat$score, decreasing = TRUE) -->

<!--   if (featureLength > 20000) { -->
<!--     windowSize <- 800  -->
<!--   } else if (featureLength > 5000) { -->
<!--     windowSize <- 500  -->
<!--   } else if (featureLength > 3000) { -->
<!--     windowSize <- 200  -->
<!--   } else { -->
<!--     windowSize <- max(100, 10 * round(featureLength / 100)) -->
<!--   } -->

<!--   ## PIP profile of one-to-N alignment -->
<!--   plotPIPprofiles( -->
<!--     alignments = featureAlignmentsNto1$alignments[seqOrder], -->
<!--     reversePlot = TRUE, -->
<!--     windowSize = windowSize,  -->
<!--     main = paste0(featureName, " (", featureStart, ":", featureEnd ,", ", featureLength, " bp)"),  -->
<!-- #    colors = NULL, -->
<!--     colors = strainColors, -->
<!--     legendMargin = 0.25,  -->
<!--     legendCorner = "topright", lwd = 1, -->
<!--     legendCex = 0.5, ylim = c(30, 100)) -->


<!--   kable(featureAlignmentsNto1$stats[seqOrder,],  -->
<!--         caption = paste0( -->
<!--           "One-to-N alignment of feature ",  -->
<!--           featureName)) -->

<!-- } -->
<!-- # } -->
<!-- ``` -->


## Output files

```{r output_files}
kable(t(as.data.frame(dir)), col.names = "Dir", caption = "Directories")

outfileTable <- data.frame(path = as.vector(outfiles))
outfileTable$basename <- basename(as.vector(outfileTable$path))
outfileTable$dir <- dirname(as.vector(outfileTable$path))
outfileTable$link <- paste0(
  "[", outfileTable$basename, "](", outfileTable$path, ")"
)

kable(outfileTable[, c("dir", "link")], 
      col.names = c("dir", "file"), 
      caption = "Output files")

```

## Memory image

We store the result ina memory image, in oder to be able reloading it to plot PIP profiles with different parameters. 

```{r}
save.image(file = outfiles["Memory image"])

```


## Session info

```{r session_info}
sessionInfo()
```