## Introduction to Likelihood 2: Fitting an SI model to an HIV epidemic with Maximum Likelihood Estimation
## Clinic on the Meaningful Modeling of Epidemiological Data
## International Clinics on Infectious Disease Dynamics and Data (ICI3D) Program
## African Institute for Mathematical Sciences, Muizenberg, RSA
## (C) Steve Bellan, 2015

######################################################################
## By the end of this tutorial you should…
## * Understand how to simulate cross-sectional prevalence data around a simulated epidemic.
## * Calculate a binomial likelihood from these prevalence data and a fully specified epidemic model.
## * Use R's optim() function to do multivariate optimization over transformed parameter space.
## * Understand the difference between the SANN and Nelder-Mead algorithms.
## * Create 95% CIs and contours from the Hessian matrix.
## * Create 95% CIs and contours based on the likelihood ratio.

require(boot); require(deSolve); require(ellipse)

## Function that makes a list of disease parameters with default values
disease_params <- function(Beta = 0.9        ## transmission coefficient when prevalence is 0
                         , alpha = 8         ## for transmission coefficient: decline with prevalence
                         , progRt = (1/10)*4 ## rate of progression through each of the I classes, for 10 years total
                         , birthRt = .03     ## birth rate, 3% of people give birth per year
                         , deathRt = 1/60    ## 60 year natural life expectancy
                           ){
    return(as.list(environment()))
}
disease_params()
disease_params(Beta = .2)

initPrev <- exp(-7) ## proportion infected at start
tseqMonth <- seq(1976, 2015, by = 1/12)
init <- c(S = 1-initPrev, I1 = initPrev, I2 = 0, I3 = 0, I4 = 0, CI = 0, CD = 0) ## modeling proportions of the population
Is <- paste0('I', 1:4) ## for easy indexing

## Define the SI ODE model. This model is equivalent to the third model in the HIV in Harare tutorial
## (though some parameters may have different names).
SImod <- function(tt, yy, parms) with(c(parms, as.list(yy)), {
    ## State variables are: S, I1, I2, I3, I4
    ## Derived quantities
    I <- I1 + I2 + I3 + I4                       ## total infected
    N <- I + S                                   ## total population
    transmissionCoef <- Beta * exp(-alpha * I/N) ## infectious contact rate
    ## State variable derivatives (ODE system)
    deriv <- rep(NA, 7)
    deriv[1] <- birthRt*N - deathRt*S - transmissionCoef*S*I/N  ## Instantaneous rate of change: susceptibles
    deriv[2] <- transmissionCoef*S*I/N - progRt*I1 - deathRt*I1 ## Instantaneous rate of change: infection class I1
    deriv[3] <- progRt*I1 - progRt*I2 - deathRt*I2              ## Instantaneous rate of change: infection class I2
    deriv[4] <- progRt*I2 - progRt*I3 - deathRt*I3              ## Instantaneous rate of change: infection class I3
    deriv[5] <- progRt*I3 - progRt*I4 - deathRt*I4              ## Instantaneous rate of change: infection class I4
    deriv[6] <- transmissionCoef*S*I/N                          ## Instantaneous rate of change: cumulative incidence
    deriv[7] <- progRt*I4                                       ## Instantaneous rate of change: cumulative mortality
    return(list(deriv))
})
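## Aside (a quick sketch, not part of the original tutorial): visualize how the
## prevalence-dependent transmission coefficient Beta * exp(-alpha * I/N) used in
## SImod() declines with prevalence under the default parameter values, to build
## intuition about what alpha does before we try to fit it. The prevalence grid
## below is an arbitrary illustrative choice.
with(disease_params(), {
    prevSeq <- seq(0, 0.4, by = 0.01) ## hypothetical range of prevalences, for illustration only
    plot(prevSeq, Beta * exp(-alpha * prevSeq), type = 'l', las = 1,
         xlab = 'prevalence (I/N)', ylab = 'transmission coefficient')
})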
## Function to run the deterministic model simulation, based on the ODE system defined in SImod()
simEpidemic <- function(init, tseq = tseqMonth, modFunction = SImod, parms = disease_params()) {
    simDat <- as.data.frame(lsoda(init, tseq, modFunction, parms = parms))
    simDat$I <- rowSums(simDat[, Is])
    simDat$N <- rowSums(simDat[, c('S', Is)])
    simDat$P <- with(simDat, I/N)
    return(simDat)
}

## Function to 'sample' the population:
## From a simulated epidemic, measure prevalence at several time points by drawing
## cross-sectional samples of individuals at each time, testing them, and then calculating sample
## prevalence and associated binomial confidence intervals.
sampleEpidemic <- function(simDat ## simulated "data" which we treat as real
                         , sampleDates = seq(1980, 2010, by = 3)       ## sample every 3 years
                         , numSamp = rep(80, length(sampleDates))      ## number of individuals sampled at each time point
                           ){
    prev_at_sample_times <- simDat[simDat$time %in% sampleDates, 'P']
    numPos <- rbinom(length(numSamp), numSamp, prev_at_sample_times)
    lci <- mapply(function(x, n) binom.test(x, n)$conf.int[1], x = numPos, n = numSamp)
    uci <- mapply(function(x, n) binom.test(x, n)$conf.int[2], x = numPos, n = numSamp)
    return(data.frame(time = sampleDates, numPos, numSamp,
                      sampPrev = numPos/numSamp, lci = lci, uci = uci))
}

## Run the system of ODEs for the "true" parameter values
trueParms <- disease_params()                 ## default model parameters, as defined in disease_params() above
simDat <- simEpidemic(init, parms = trueParms) ## simulated epidemic (the underlying process)

par(bty = 'n', lwd = 2)
## Plot simulated prevalence through time:
with(simDat, plot(time, P, xlab = '', ylab = 'prevalence', type = 'l', ylim = c(0, .4), col = 'red', las = 1))
## Take a cross-sectional sample of individuals to get prevalence estimates at multiple time points:
set.seed(1)                     ## initiate the random number generator (so everyone's simulation looks the same)
myDat <- sampleEpidemic(simDat) ## simulate data from the sampling process (function defined above)
points(myDat$time, myDat$sampPrev, col = 'red', pch = 16, cex = 2) ## plot sample prevalence at each time point
arrows(myDat$time, myDat$uci, myDat$time, myDat$lci, col = 'red', len = .025, angle = 90, code = 3) ## plot 95% CIs around the sample prevalences

## To start, we need to write a likelihood function that gives the probability that a given parameter set
## (Beta, alpha values) would generate the observed data. Remember that we are assuming that there is some
## true underlying epidemic curve that is deterministic, and that the data we observe are noisy only
## because of sampling/observation error (not because the underlying curve is itself
## noisy--i.e. process error--which would be particularly likely for epidemics in small populations).
## We assume binomial sampling error, so we can write the likelihood as the probability of
## observing the observed number positive out of each sample if the prevalence is the value generated by
## a model parameterized by a given set of parameters.
## By convention we return the negative log of the likelihood.
nllikelihood <- function(parms = disease_params(), obsDat = myDat) {
    simDat <- simEpidemic(init, parms = parms)
    ## Which rows of our simulation correspond to times at which we have observed data?
    matchedTimes <- simDat$time %in% obsDat$time
    nlls <- -dbinom(obsDat$numPos, obsDat$numSamp, prob = simDat$P[matchedTimes], log = TRUE)
    return(sum(nlls))
}
nllikelihood(trueParms) ## negative log-likelihood of the true parameters (which we usually never know)
nllikelihood(disease_params(Beta = 3, alpha = 1)) ## vs some randomly guessed parameters
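## Aside (a sketch, not in the original script): before optimizing, it can help to
## evaluate the negative log-likelihood along a 1-D slice of Beta (with alpha and all
## other parameters held at their default/true values) to see that it is smallest near
## the true Beta. The grid of Beta values below is an arbitrary illustrative choice.
betaSlice <- seq(0.5, 1.5, by = 0.1)
nllSlice <- sapply(betaSlice, function(bb) nllikelihood(disease_params(Beta = bb), obsDat = myDat))
plot(betaSlice, nllSlice, type = 'b', las = 1, xlab = expression(beta), ylab = '-log(likelihood)')
abline(v = trueParms$Beta, lty = 2, col = 'red') ## true value, for reference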
## First look up how optim() works. The more you read through the help
## file the easier this will be!!! In particular, make sure you understand that
## the first argument of optim() must be the initial values of the parameters to
## be fitted (i.e. Beta & alpha) and that any other parameters to be fixed are
## given as additional arguments (in the help file under "...").
## ?optim

## Since we need to be able to easily separate fitted and fixed parameters,
## let's make a function that takes fitted and fixed parameters together and
## puts them back into a parameter list (similar to the output of
## disease_params()). We also want it to be able to take logged parameter values
## and unlog them automatically back into the parameter list, so that we can fit
## on a log scale but run our model with the unlogged values.
subsParms <- function(fit.params, fixed.params = disease_params())
    within(fixed.params, {
        loggedParms <- names(fit.params)[grepl('log_', names(fit.params))]
        unloggedParms <- names(fit.params)[!grepl('log_', names(fit.params))]
        for(nm in unloggedParms) assign(nm, as.numeric(fit.params[nm]))
        for(nm in loggedParms) assign(gsub('log_', '', nm), exp(as.numeric(fit.params[nm])))
        rm(nm, loggedParms, unloggedParms)
    })

guess.params <- c(log_Beta = log(5), log_alpha = log(8))
subsParms(guess.params, disease_params())

## Make the likelihood a function of the fixed and fitted parameters.
objFXN <- function(fit.params                       ## parameters to fit
                 , fixed.params = disease_params()  ## fixed parameters
                 , obsDat = myDat) {
    parms <- subsParms(fit.params, fixed.params)
    nllikelihood(parms, obsDat = obsDat) ## then call the likelihood function
}
objFXN(guess.params, disease_params())

## Select initial values for the fitted parameters from which the optimization routine
## will start. If you select bad initial values the algorithm can get stuck on a
## bad set of parameters. You can always try the true values as a starting point
## for this problem, although that's rarely possible in real problems.
init.pars <- c(log_alpha = log(30), log_Beta = log(.1))

## We will start with SANN optimization since it is stochastic and therefore
## less likely to get stuck in a local minimum, and then finish with Nelder-Mead
## optimization, which is much faster.
### NOTE: for trace > 0 you see more progress reporting; bigger numbers show more updates.
trace <- 3

## SANN: This is stochastic, so be CAREFUL -- sometimes it gets stuck at a local minimum
## with unreasonable parameter values. If you see this happen, run it again!
optim.vals <- optim(par = init.pars
                  , objFXN
                  , fixed.params = disease_params()
                  , obsDat = myDat
                  , control = list(trace = trace, maxit = 150)
                  , method = "SANN")
exp(unname(optim.vals$par))
trueParms[c('alpha', 'Beta')]

## We feed the last parameters from SANN in as the initial values for Nelder-Mead.
optim.vals <- optim(par = optim.vals$par
                  , objFXN
                  , fixed.params = disease_params()
                  , obsDat = myDat
                  , control = list(trace = trace, maxit = 800, reltol = 10^-7)
                  , method = "Nelder-Mead"
                  , hessian = TRUE)
optim.vals
MLEfits <- optim.vals$par
trueParms[c('alpha', 'Beta')]
exp(unname(MLEfits))

log_alpha.fit <- MLEfits["log_alpha"]
log_Beta.fit <- MLEfits["log_Beta"]

## Look at the output of optim(). Understand what it means. Did the algorithm
## converge? Look at ?optim to understand it.
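## Quick check (a sketch, not in the original script): optim() returns a convergence
## code in $convergence, where 0 indicates successful convergence and 1 means the
## iteration limit was reached. We can also pull rough Wald-type 95% CIs for the
## fitted parameters from the Hessian (computed on the log scale, then back-transformed);
## the contour-based intervals constructed below are generally preferable.
optim.vals$convergence == 0                  ## did Nelder-Mead report convergence?
ses <- sqrt(diag(solve(optim.vals$hessian))) ## approximate SEs of log_alpha and log_Beta
rbind(lower = exp(MLEfits - 1.96*ses), MLE = exp(MLEfits), upper = exp(MLEfits + 1.96*ses))
## (columns keep the log_ names, but the values are back on the natural scale)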
## Plot the MLE fit time series
par(bty = 'n', lwd = 2, las = 1)
with(simDat, plot(time, P, xlab = '', ylab = 'prevalence', type = 'l', ylim = c(0, .4), col = 'red'))
fitDat <- simEpidemic(init, parms = subsParms(optim.vals$par, trueParms))
with(fitDat, lines(time, P, col = 'blue'))
points(myDat$time, myDat$sampPrev, col = 'red', pch = 16, cex = 2)
arrows(myDat$time, myDat$uci, myDat$time, myDat$lci, col = 'red', len = .025, angle = 90, code = 3)
legend("topleft", c('truth', 'observed', 'fitted'), lty = c(1, NA, 1), pch = c(NA, 16, NA),
       col = c('red', 'red', 'blue'), bty = 'n')

######################################################################
## Contour plots with the Hessian
######################################################################
## The Hessian matrix gives the curvature of the negative log-likelihood around the maximum likelihood
## estimate (MLE) of the fitted parameters, i.e. the matrix of second derivatives at the MLE. Evaluated
## at the MLE it is the observed Fisher information matrix, and inverting it gives an estimate of the
## variance-covariance matrix of the parameter estimates (here, of the logged parameters).
fisherInfMatrix <- solve(optim.vals$hessian) ## invert the Hessian to estimate the var-covar matrix
                                             ## (despite its name, this object is the *inverse* of the observed Fisher information)

## Initialize the plot of the parameters
plot(1, 1, type = 'n', log = 'xy',
     ## xlim = range(alpha.seq), ylim = range(Beta.seq),
     xlim = c(2, 15), ylim = c(.5, 2), las = 1,
     xlab = expression(alpha), ylab = expression(beta),
     main = "-log(likelihood) contours", bty = "n")
## Add the true parameter values to the plot
with(trueParms, points(alpha, Beta, pch = 16, cex = 2, col = 'red'))
## Add the MLE to the plot
points(exp(MLEfits['log_alpha']), exp(MLEfits['log_Beta']), pch = 16, cex = 2, col = 'black')
## Add the 95% contour ellipse from the Hessian
lines(exp(ellipse(fisherInfMatrix, centre = MLEfits, level = .95)))
legend("topleft", c('truth', 'MLE', '95% Confidence Region'), lty = c(NA, NA, 1), pch = c(16, 16, NA),
       col = c('red', 'black', 'black'), bg = 'white', bty = 'n')

######################################################################
## Contour plots with likelihood profiles
######################################################################
## With all other parameters fixed to their default values, let's look at a
## contour plot of the likelihood over Beta and alpha. To do this we write wrapper
## functions of log_Beta and log_alpha to feed to outer() and then
## contour(). This is confusing, so make sure you understand every function.

## This function simply takes values of alpha and Beta and feeds them into objFXN()
## above as a single named vector of logged parameters; you'll see why this is
## useful below.
objXalpha_Beta <- function(alpha, Beta, fixed.params = disease_params(), browse = FALSE)
    objFXN(fit.params = c(log_alpha = log(alpha), log_Beta = log(Beta))
         , fixed.params = fixed.params)

## Now instead of giving a single argument on the log scale we give two arguments
## on the untransformed scale.
objFXN(c(log_alpha = log(1/5), log_Beta = log(25)))
objXalpha_Beta(1/5, 25)

## If we try to give this function multiple values of alpha and Beta,
## however, it gets confused and only analyzes the first one.
objXalpha_Beta(c(1/5:8), 25:30)
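## Aside (a sketch, not in the original script): mapply() is one way to evaluate the
## function pairwise over vectors of alpha and Beta values; Vectorize(), used next,
## automates exactly this kind of pairwise evaluation for us.
mapply(objXalpha_Beta, alpha = c(1/5, 1/6), Beta = c(25, 26))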
## So we Vectorize() this function so that it can take in vectors of the parameters
## and return the output for each pair. objXalpha_BetaVEC() calls objXalpha_Beta() on
## its two arguments as pairwise values and returns objXalpha_Beta() for all pairs.
objXalpha_BetaVEC <- Vectorize(objXalpha_Beta, list("alpha", "Beta"))

## Task: Explain how the following lines are related.
objXalpha_Beta(25, 1/5)
objXalpha_Beta(26, 1/6)
objXalpha_Beta(8, .9)
objXalpha_BetaVEC(c(25:26), c(1/5, 1/6))

## Now we use the R function outer() to evaluate objXalpha_BetaVEC() over a grid
## of {alpha, Beta} combinations. This can take a long time because we have to do
## res^2 evaluations of nllikelihood(), and recall that each time we do this we are
## running lsoda() inside simEpidemic().

## Grid resolution is res x res; increasing it makes the contours smoother but takes a lot longer.
res <- 15
## Create a sequence of alpha values for the grid
alpha.seq <- exp(seq(log_alpha.fit - 1, log_alpha.fit + 1, l = res))
alpha.seq
## Create a sequence of Beta values for the grid
Beta.seq <- exp(seq(log_Beta.fit - 1, log_Beta.fit + 1, l = res))
Beta.seq

## The function outer() now evaluates objXalpha_BetaVEC() on this grid.
?outer
mat <- outer(alpha.seq, Beta.seq, objXalpha_BetaVEC) ## this can take a long time

## Make a contour plot that shows the 95% confidence region. Likelihood
## ratio test confidence regions use the chi-squared distribution cutoff with
## 2 degrees of freedom (2 fitted parameters).
ml.val <- optim.vals$value
conf.cutoff <- ml.val + qchisq(.95, 2)/2

## Show the likelihood contours
par(cex = 1.2)
plot(1, 1, type = 'n', log = 'xy',
     ## xlim = range(alpha.seq), ylim = range(Beta.seq),
     xlim = c(3, 15), ylim = c(.5, 2),
     xlab = expression(alpha), ylab = expression(beta),
     main = "-log(likelihood) contours", bty = "n")
.filled.contour(alpha.seq, Beta.seq, mat, levels = seq(min(mat), max(mat), l = 20), col = topo.colors(20))
## Add the contour for the 95% CI from the likelihood ratio
contour(alpha.seq, Beta.seq, mat, levels = c(conf.cutoff), col = "black", lwd = 2,
        labels = "", labcex = .2, add = TRUE)
## Add the contour for the 95% CI from the Hessian
lines(exp(ellipse(fisherInfMatrix, centre = MLEfits, level = .95)), lty = 2)
## Add the MLE to the plot
points(exp(log_alpha.fit), exp(log_Beta.fit), pch = 16, cex = 1, col = 'black')
## Add the true parameter values to the plot
with(trueParms, points(alpha, Beta, pch = 16, cex = 1, col = 'red'))
legend("topleft", c('truth', 'MLE', '95% contour (profile likelihood)', '95% contour (Fisher information matrix)')
     , lty = c(NA, NA, 1, 2), pch = c(16, 16, NA, NA), col = c('red', rep('black', 3)), bg = 'white', bty = 'n')
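## Optional extension (a sketch, assuming the objects created above are still in the
## workspace): a univariate 95% profile-likelihood CI for alpha. For each candidate
## alpha we re-optimize over log_Beta only, then keep the alpha values whose profile
## negative log-likelihood lies within qchisq(.95, 1)/2 of the minimum (1 degree of
## freedom because we profile one parameter at a time). This reruns lsoda() many
## times, so it can be slow.
profileNll <- sapply(alpha.seq, function(aa)
    optimize(function(lb) objFXN(c(log_alpha = log(aa), log_Beta = lb)),
             interval = log_Beta.fit + c(-2, 2))$objective)
cutoff1d <- ml.val + qchisq(.95, 1)/2
range(alpha.seq[profileNll <= cutoff1d]) ## approximate (grid-based) 95% CI for alpha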