library(lubridate)
library(dplyr)
library(wbstats)
library(deSolve)
library(GA)
library(parallel)

# The RHS function for KMK
RHS_KMK_SIR <- function(t, x, p) {
  with(as.list(c(x,p)), {
    dS <- -beta*S*I/pop
    dI <- beta*S*I/pop-gamma*I
    dR <- gamma*I
    return(list(c(dS, dI, dR)))
  })
}

# Compute the error based on one parameter value. Will also need a few more
# arguments
error_incidence <- function(p_vary, 
                            params, 
                            incidence_data,
                            method = "rk4",
                            which_incidence = "inc100") {
  # Anything that changes during optimisation needs to be set here
  params$beta = as.numeric(p_vary["beta"])
  params$gamma = as.numeric(p_vary["gamma"])
  # Check value of R0. If it is less than 1, no need to compute ODE solution,
  # just return Inf
  R0 = params$beta / params$gamma
  if (R0<1) {
    return(Inf)
  }
  # I0 is the I that gives us the incidence we are matching in the data. Since
  # incidence=beta*S*I/N, we have
  # I(0) = incidence*N/(beta*S(0)) ~= incidence/beta
  S0 = params$pop
  I0 = data_subset[[which_incidence]][1]/(params$beta)
  IC = c(S = S0, I = I0, R = 0)
  # The times at which we compute the solution to compare with data
  times = as.numeric(as.Date(incidence_data$date))
  sol = ode(IC, times, RHS_KMK_SIR, params, method = method)
  # Error checking
  if (sol[dim(sol)[1],"time"] < times[length(times)]) {
    return(Inf)
  }
  # Values required to compute the error
  incidence_from_run = params$beta * sol[,"S"] * sol[,"I"] / params$pop
  # Compute the error
  diff_values = incidence_data[[which_incidence]] - incidence_from_run
  diff_values_squared = diff_values^2
  error = sum(diff_values_squared)
  return(error)
}

# Read in the data for France
data = readr::read_csv("https://www.sentiweb.fr/datasets/incidence-PAY-3.csv",
                       skip = 1)
# Show the top of the table (always useful)
head(data)
# Week is by epi week, let's sort this out. Four first digits are year, last 
# two are week. Let us rename the first column as yyyyww, to avoid confusion
# with week
colnames(data)[1] = "yyyyww" 
data$year = substr(data$yyyyww, 1, 4)
data$week = substr(data$yyyyww, 5, 6)
# It will be easier to work with days as days, so we convert dates
data$date = lubridate::parse_date_time(paste(data$year, data$week, 1, sep="/"),
                                       'Y/W/w')
# There are some issues, let us fix without thinking: if something went wrong, it 
# is typically because of a week numbered 53. We make that day 31 December of 
# that year
idx = which(is.na(data$date))
data$date[idx] = ymd(paste0(data$year[idx],"-12-31"))

# Ah, yes, don't forget: let's flip the order of the table, as it is sorted
# by decreasing dates
data = data %>%
  arrange(date)

# Type of incidence to use (column name in the data)
which_incidence = "inc"

# Let us now select one epidemic season (mid year to next). Use another variable
# so we don't have to reload the data if we do weird edits..
# We take a flu season prior to the pandemic (they are weird after that)
beg_year = 2018
end_year = 2019
data_subset = data %>%
  filter(date > paste0(beg_year,"-12-15")) %>%
  filter(date <= ymd(paste0(end_year,"-04-30")))

plot(data_subset$date, data_subset[[which_incidence]],
     type = "b",
     xlab = "Month", ylab = "Incidence",
     main = paste0(beg_year, "-", end_year, " flu season"))

# To do work, we will need the total population of France
pop_data_FRA <- wb_data(country = "FRA", indicator = "SP.POP.TOTL",
                        mrv = 100, return_wide = FALSE) %>%
  filter(date == beg_year)
# For numerical reasons, we might work per 100K people.. Detect this.
if (grepl("100", which_incidence)) {
  pop_data_FRA = pop_data_FRA$value / 100000
} else {
  pop_data_FRA = pop_data_FRA$value
}

params = list()
params$gamma = 1/3.5   # Let's see if we can fit with this simple value
# R0=beta/gamma, so beta=R0*gamma
params$beta = 1.5*params$gamma
# Add population data to this, for convenience
params$pop = pop_data_FRA

## Fit using a genetic algorithm
GA = ga(
  type = "real-valued",
  fitness = function(x) -error_incidence(p_vary = c(beta = x[1], gamma = x[2]),
                                         params = params,
                                         incidence_data = data_subset,
                                         which_incidence = which_incidence,
                                         method = "rk4"),
  parallel = 120,
  lower = c(0.1, 1/20),
  upper = c(1, 1/2),
  pcrossover = 0.7,
  pmutation = 0.2,
  optim = TRUE,
  optimArgs = list(method = "CG"),
  suggestions = c(params$beta, params$gamma),
  popSize = 500,
  maxiter = 100
)

params$beta = GA@solution[1]
params$gamma = GA@solution[2]
S0 = pop_data_FRA
I0 = data_subset[[which_incidence]][1]/(params$beta)
IC = c(S = S0, I = I0, R = 0)
dates_num = as.numeric(as.Date(data_subset$date))
times = seq(dates_num[1], dates_num[length(dates_num)], 0.1)
sol <- ode(IC, times, RHS_KMK_SIR, params)
sol_incidence = as.numeric(params$beta) * sol[,"S"] * sol[,"I"]/params$pop
y_max = max(max(sol_incidence), max(data_subset[[which_incidence]]))

plot(sol[,"time"], sol_incidence, type = "l",
     xlab = "Date", ylab = "Incidence",
     xaxt = "n", lwd = 2, col = "red",
     ylim = c(0, y_max))
lines(as.numeric(as.Date(data_subset$date)), data_subset[[which_incidence]],
      type = "b")
dates_pretty = pretty(dates_num)
axis(1, at = dates_pretty, labels = as.Date(dates_pretty, origin="1970-01-01"))