---
title: "Markov Models part 1: nothing to hide so far. Solutions"
subtitle: "Computational biology"
author: "Jacques van Helden"
date: '`r Sys.Date()`'
output:
  pdf_document:
    fig_caption: yes
    highlight: zenburn
    toc: yes
    toc_depth: 3
  beamer_presentation:
    colortheme: dolphin
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    fonttheme: structurebold
    highlight: tango
    incremental: no
    keep_tex: no
    slide_level: 2
    theme: Montpellier
    toc: yes
  slidy_presentation:
    self_contained: false
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    highlight: tango
    incremental: no
    keep_md: yes
    smaller: yes
    theme: cerulean
    toc: yes
    widescreen: yes
  html_document:
    self_contained: false
    code_folding: hide
    fig_caption: yes
    highlight: zenburn
    theme: cerulean
    toc: yes
    toc_depth: 3
    toc_float: yes
  ioslides_presentation:
    self_contained: false
    css: slides.css
    fig_caption: yes
    fig_height: 6
    fig_width: 7
    highlight: tango
    smaller: yes
    toc: yes
    widescreen: yes
  word_document:
    toc: yes
    toc_depth: 3
# font-import: http://fonts.googleapis.com/css?family=Risque
font-family: Garamond
transition: linear
---

```{r setup, include=FALSE, echo=FALSE, eval=TRUE}
library(knitr)

options(width = 300)
knitr::opts_chunk$set(
  fig.width = 7, fig.height = 5, 
  fig.align = "center", 
  fig.path = "figures/markov-models_",
  size = "tiny", 
  # echo = FALSE, 
  eval = TRUE, 
  warning = FALSE, 
  message = FALSE, 
  results = TRUE, 
  comment = "")
# knitr::asis_output("\\footnotesize")

# dir.slides <- "~/IFB/NNCR/using_IFB_NNCR/slides/"
# setwd(dir.slides)

```

```{r libraries}
if (!require("gplots")) {
   install.packages("gplots", dependencies = TRUE)
   library(gplots)
   }
if (!require("RColorBrewer")) {
   install.packages("RColorBrewer", dependencies = TRUE)
   library(RColorBrewer)
   }

```

## CpG island length distribution

```{r load_CpG_coordinates}
## Load CpG island coordinates
CpG.coord <- read.delim(file = "results/hg38/hg38_CpG_islands.bed.gz", header = FALSE)

names(CpG.coord) <- c("chr", "start", "end", "id")
## Note: bed convention : the end marks the first position
## **after** the feature, so the length is simply the difference
CpG.coord$length <- CpG.coord$end - CpG.coord$start


```


```{r CpG_len_stat}

genome.size <-  3e+9

## Compute statistics about CpG island lengths
CpG.length.stat <- data.frame(
  nb = nrow(CpG.coord),
  min = min(CpG.coord$length),
  Q1 = quantile(CpG.coord$length, probs = 0.25),
  median = median(CpG.coord$length),
  mean = mean(CpG.coord$length),
  Q3 = quantile(CpG.coord$length, probs = 0.75),
  pc95 = quantile(CpG.coord$length, probs = 0.95),
  max = max(CpG.coord$length),
  MB =  sum(CpG.coord$length) / 1e6,
  cov.percent = sum(CpG.coord$length) / genome.size * 100
)


## Print the stats
kable(CpG.length.stat, caption = "Statistics about CpG island lengths", digits = 3, row.names = FALSE)

```

The currently annotated CpG islands  cover `r round(digits=1, CpG.length.stat$MB)` Mb, which represent `r round(digits=1, CpG.length.stat$cov.percent)`% of the genome. 


```{r CpG_lengths, fig.width=7, fig.height=7, fig.cap="Distribution of lengths for all the CpG islands annotated in the Human genome.", out.width="80%"}
par(mfrow = c(2,1))


par(mar = c(4.1, 5.1, 1, 1))

hist(CpG.coord$length, 
     breaks = seq(from = 0, 
                  to = CpG.length.stat$max + 50, 
                  by = 50),
     main = NA, las = 1,
     xlab = "CpG length", ylab = "Number of CpGs")

## Draw the histogram with a limited X axis to see the relevant part of the distribution
hist(CpG.coord$length, 
     breaks = seq(from = 0, 
                  to = CpG.length.stat$max + 50, 
                  by = 50),
     xlim = c(0,3000),
     main = NA, las = 1,
     xlab = "CpG length (truncated X scale)", ylab = "Number of CpGs", col = "gray")

par(mar = c(4.1, 5.1, 4.1, 2.1))
par(mfrow = c(1,1))
```


## Loading the Markov models

We start by loading the two Markov models previously computed from the [RSAT](http://metazoa.rsat.eu/) tool [create background model](http://rsat.sb-roscoff.fr/create-background-model_form.cgi).


```{r load_markov_models}
## Load a transition matrix from the RSAT result
readMarkovModel <- function(file) {
  model <- read.delim(
    file = file, 
    row.names = 1)
  names(model) <- toupper(names(model))
  rownames(model) <- toupper(rownames(model))
  prior <- model[nrow(model),1:4] ## Last row contains priors as comments
  names(prior) <- names(model)[1:4]
  transitions <- rbind(model[1:4, 1:4], "B" = prior)
  return(transitions)
}

## Load the CpG transition table from the RSAT result
transitions.CpG <- readMarkovModel(file = "results/hg38/hg38_CpG_transitions_m1.tsv")

## Load the genomic background transition table from the RSAT result
transitions.Bg <- readMarkovModel(file = "results/hg38/hg38_genomic-bg_transitions_m1.tsv")

## Check that the rows of the transition matrix sum to 1
## (given some rounding errors)
# apply(transitions.CpG[2:5], 1, sum)
# apply(transitions.Bg[2:5], 1, sum)

```

```{r transition_heatmap_function}
## Define a function that draws a heatmap from a transition matrix
transition.heatmap <- function(
  transitions, 
  main = "Transitions",
  col.palette = gray.colors(n = 100, start = 1, end = 0),
  breaks = (0:length(col.palette))/(n.colors*2)) {
  

  n.colors <- length(col.palette)  
  h <- heatmap.2(
    as.matrix(transitions), 
    main = main,
    cellnote = round(digits = 3, as.matrix(transitions)),
    trace = "none", 
    margins = c(4, 4),
    #    offsetRow = -10,
    breaks = breaks, 
    key = FALSE,
    srtCol = 0,
    notecol = "black", 
    notecex = 1.2,
    col = col.palette,
    scale = "none", 
    las = 1, 
    Rowv = FALSE, Colv = FALSE, dendrogram = "none")
  return(h)
  #  grab_grob()
}

```


```{r transition_heatmap_CpG, fig.width=6, fig.height=6, out.width="100%", fig.cap="Heatmap of the transition matrices from 1st order Markov models trained on CpG islands."}

## Draw transition heatmap
heatmap.CpG <- transition.heatmap(transitions.CpG, "CpG islands")

```


```{r transition_heatmap_Bg, fig.width=6, fig.height=6, out.width="100%", fig.cap="Heatmap of the transition matrices from 1st order Markov models trained on genomic background."}

## Draw transition heatmap
heatmap.Bg <- transition.heatmap(transitions.Bg, "Genomic background")

```


## Discrimination between two models

Problem: for a given sequence of events (e.g. a nucleotidic sequence), identify the most likely Markov model.

Approach: compute the log-likelihood ratios (log-odd ratios) of the sequence probabilities computed using respectively the CpG island and genomic background models.

$$P_{\text{CpG}}(S) = P_{\text{CpG}}(S_1) \cdot \prod_{i=1}^{n-1} P_{\text{CpG}}(S_{i+1}|S_i)$$
$$P_{\text{Bg}}(S) = P_{\text{Bg}}(S_1) \cdot \prod_{i=1}^{n-1} P_{\text{Bg}}(S_{i+1}|S_i)$$


$$L(S) = log \left( \frac{ P_{\text{CpG}}(S)}{P_{\text{Bg}}(S)} \right)$$

where 

- $L(S)$ is the log-likelihood of the sequence $S$, 
- $P_{\text{CpG}}(S)$ the probability for this sequence to be generated by the CpG island model, and
- $P_{\text{Bg}}(S)$ its probability to be generated by the background model. 

A more efficient approach : rather than computing the two sequence probabilities (as the product of transition probabilities), we can compute once and forever a matrix with tbe log-odds of residue transitions.

$$L(r_j|r_i) = log \left( \frac{P_{\text{CpG}}(r_j|r_i)}{P_{\text{Bg}}(r_j|r_i)} \right)$$


```{r transition_log_odds, fig.width=6, fig.height=6, out.width="100%", fig.cap="Heatmap of the log-odds between CpG and genomic background."}

my.palette <- colorRampPalette(c("blue", "white", "red"))(n = 100)

## Compute log-odds of transition frequencies
transition.log.odds <- log2(transitions.CpG / transitions.Bg)

max.lor <- ceiling(max(abs(range(transition.log.odds))))

heatmap.logodds <- transition.heatmap(
  transition.log.odds, "CpG / Bg log-odds",
  col.palette = my.palette,
#  breaks = seq(from = -max.lor, to = +max.lor, length.out = length(my.palette) + 1)
  breaks = length(my.palette) + 1
)

```


```{r transition_heatmaps, fig.width=6, fig.height=6, out.width="100%", fig.cap="Heatmaps of the transition matrices from 1st order Markov models trained on CpG islands (left) or genomic background (right), respectively."}
# library(gridGraphics)
# library(grid)
# library(gplots)
# library(gridExtra)
# 
# grab_grob <- function(){
#   grid.echo()
#   grid.grab()
# }
# heatmaps <- list(
#   CpG = one.heatmap(transitions.CpG, "CpG islands"),
#   Bg = one.heatmap(transitions.Bg, "Genomic background")
# )
# grid.newpage()
# grid.arrange(grobs = heatmaps, ncol = 2, clip = TRUE)

```

We can then use this log-odds matrix to compute the log-odds of a sequence as the sum of the transition log-odds. 

$$L(S) = L_{\text{B}}(S_1) \cdot \sum_{i=1}^{n-1} L(S_{i+1}|S_i)$$

## Hidden Markov Model

We already dispose of the emission probabilities of the residues in the two respective states (CpG islands and genomic background). We now need to compute the transition probabilities between the states. 

We can estimate these probabilities based on the following parameters

- genome size: $S_{Bg} = `r genome.size`$ 
- total size of the annotated CpG islands: $S_{CpG} = `r format(digits=12, big.mark=",", sum(CpG.coord$length))`$
- number of CpG islands: $N_{CpG} = `r format(digits=12, big.mark=",", CpG.length.stat$nb)`$

```{r CpG_parameters}

## Estimate transition matrix for CpG islands

stats <- list(L.G = 3e+9, ## Genome size
                L.CpG = 24200434, # Total size of CpG islands (label: +)
                nb.CpG = 31144 # Number of CpG islands
)
stats$L.non.CpG <- stats$L.G - stats$L.CpG # Total size of non-CpG Islands (label: -)
stats$CpG.mean.length <- stats$L.CpG / stats$nb.CpG

library(knitr)
library(pander)
pander(as.data.frame(stats), 
       caption = "Parameters used to compute the CpG island transition matrix.", 
       big.mark = ",")

```


```{r CpG_transition_matrix}

## Estimate transition matrix for CpG islands

stats <- list(L.G = 3e+9, ## Genome size
                L.CpG = 24200434, # Total size of CpG islands (label: +)
                nb.CpG = 31144 # Number of CpG islands
)
stats$L.non.CpG <- stats$L.G - stats$L.CpG # Total size of non-CpG Islands (label: -)
stats$CpG.mean.length <- stats$L.CpG / stats$nb.CpG


## Transition matrix
transitions <- data.frame(matrix(nrow = 2, ncol = 2))
names(transitions) <- c("+", "-")
rownames(transitions) <- c("+", "-")

## Transitions from non-CpG to CpG.
## This is the number of switches from non-CpG to CpG island (the number of nucleotide preceding a CpG islands)
## divided by the total length of non-CPG islands
transitions["-", "+"] <- stats$nb.CpG / stats$L.non.CpG
## The only other possible transition from "-" is to "-" so the sum is 1 (complementary events)
transitions["-", "-"] <- 1 - transitions["-", "+"]

## Transitions from CpG to non-CpG
## Same reasoning: the number of ending CpG positions divided by the total length of CpGs
transitions["+", "-"] <- stats$nb.CpG / stats$L.CpG
## The only other possible transition from "+" is to "+" so the sum is 1 (complementary events)
transitions["+", "+"] <- 1 - transitions["+", "-"]


```


For convenience, we will use the labels $+$ and $-$ to denote the CpG islands and non-CpG island regions, respectively.


We can compute the probability of switching from a non-CpG islands to a CpG island with the following reasoning. 

- The total size of non-CpG islands is the difference between genome size (`r format(digits=12, big.mark=",", stats$L.G)`) and the total size of the CpG islands (`r format(digits=12, big.mark=",", stats$L.CpG)`).

$$S_{-}  = S_{bg} - S_{+} = `r format(digits=12, big.mark=",", stats$L.G)` - `r format(digits=12, big.mark=",", stats$L.CpG)` = `r format(digits=12, big.mark=",", stats$L.non.CpG)`$$

- In total, there are $`r format(digits=12, big.mark=",", stats$nb.CpG)`$ CpG islands in the genome, and each of them is preceded by a non-CpG island region. There are thus  $`r format(digits=12, big.mark=",", stats$nb.CpG)`$ transitions from non-CpG to CpG. The probability of transition from non-CpG to CpG  is thus the number of non-CpG positions preceding a CpG divided by the total number of non-CpG positions.

$$P(+|-) = N_+ / S_- = \frac{`r format(digits=12, big.mark=",", stats$nb.CpG)`}{`r format(digits=12, big.mark=",", stats$L.non.CpG)`} = `r format(digits=5, transitions["-", "+"])`$$

- Since there are only two possible states, the transition probability from non-CpG to non-CpG is the complement of the transition probability from non-CpG to CpG. 

$$P(-|-) = 1 - P(+|-) = `r format(digits=5, transitions["-", "-"])`$$

The same reasoning applies to compute the transition probabilities from CpG islands. 

- Each CpG island has an exactly one ending nucleotide, which precedes a non-CpG island nucleotide. The number of nucleotides marking a transition from CpG to non-CpG is thus  $`r format(digits=12, big.mark=",", stats$nb.CpG)`$. The probability of transition from CpG to non-CpG is this number divided by the total size of all CpGs.

$$P(-|+) = N_+ / S_+ = \frac{`r format(digits=12, big.mark=",", stats$nb.CpG)`}{`r format(digits=12, big.mark=",", stats$L.CpG)`} = `r format(digits=3, transitions["+", "-"])`$$

- The probability of transition from CpG to CpG is the complement.

$$P(+|+) = 1 - P(-|+) = `r format(digits=5, transitions["+", "+"])`$$

The results are summarised in the transition matrix below. 

```{r CpG_transition_display}

kable(transitions, digits  = 5, caption = "Transition matrix between CpG islands and non-CpG island regions in the human genome. ", align = "c")

```