# Copyright (C) 2015  Julián Urbano
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.

source("src/gt4ireval.R")
source("src/measures/tauAP.R")

library(ks) # To estimate pdfs
library(Matrix) # For nearPD, to ensure a positive definite covariance matrix before the Cholesky decomposition

# Logits ===============================================================================================================

logit <- function(p, a = .05)
# a : parameter to smooth at 0 and 1
{
  return(log((p+a) / (1-p+a)))
}
invlogit <- function(p, a = .05)
# a : parameter to smooth at 0 and 1
{
  ep <- exp(p)
  ep <- (ep+a*ep-a) / (1+ep)
  ep[ep > 1] <- 1
  ep[ep < 0] <- 0
  return(ep)
}

# Effect decomposition and estimation of distribution functions ========================================================

## Decompose a topic-by-system effectiveness matrix according to the model
## x_ts = mu + nu_s + nu_t + e_ts
decompose.effects <- function(X)
{
  if(is.data.frame(X))
    X <- as.matrix(X)

  mu <- mean(X) # grand average score
  NU_s <- colMeans(X) - mu # true system effects (system mean scores minus the grand average)
  NU_t <- rowMeans(X) - mu # topic effects
  E <- X - matrix(nrow = nrow(X), mu + rep(NU_s, each = nrow(X)) + NU_t) # residuals matrix

  return(list(mu = mu, NU_s = NU_s, NU_t = NU_t, E = E))
}

## Kernel estimation of a CDF with support [xmin,xmax]
estimate.cdf <- function(x, xmin = min(x), xmax = max(x))
{
  # Estimate the CDF and approximate it with a function
  cdf <- kcde(x, xmin = xmin, xmax = xmax)
  cdf <- approxfun(cdf$eval.points, cdf$estimate, yleft = 0, yright = 1)
  return(cdf)
}
## Kernel estimation of the inverse CDF with support [xmin,xmax]
estimate.icdf <- function(x, xmin = min(x), xmax = max(x))
{
  # Estimate the CDF and approximate it with a function
  cdf <- kcde(x, xmin = xmin, xmax = xmax)
  cdf <- approxfun(cdf$eval.points, cdf$estimate, yleft = 0, yright = 1)
  # Wrap the inverse CDF
  icdf <- function(p) {
    sapply(p, function(p2) {
      # Prevent uniroot from raising an error at extreme values
      p2 <- round(p2, 5)
      if(p2 <= 0)
        return(xmin)
      if(p2 >= 1)
        return(xmax)
      # Solve the CDF for the given probability
      uniroot(function(t) { cdf(t)-p2 }, interval = c(xmin, xmax))$root
    })
  }
  return(icdf)
}
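## Example (illustrative only, not run when sourcing): decompose a small synthetic topic-by-system matrix and
## check that mu + nu_s + nu_t + e_ts reconstructs the original scores, then round-trip the topic effects
## through the estimated CDF and inverse CDF. The 20x5 matrix of uniform scores and all object names below
## are made up for this sketch; any real matrix of effectiveness scores would do.
# X.example <- matrix(runif(20 * 5), nrow = 20, ncol = 5,
#                     dimnames = list(paste0("t", 1:20), paste0("s", 1:5)))
# dec.example <- decompose.effects(X.example)
# all.equal(X.example, # should be TRUE: the decomposition reconstructs the scores exactly
#           dec.example$mu
#           + rep(dec.example$NU_s, each = nrow(X.example))
#           + dec.example$NU_t
#           + dec.example$E,
#           check.attributes = FALSE)
# cdf.example <- estimate.cdf(dec.example$NU_t, xmin = -1, xmax = 1)
# icdf.example <- estimate.icdf(dec.example$NU_t, xmin = -1, xmax = 1)
# icdf.example(cdf.example(0)) # approximately 0, up to interpolation and uniroot tolerance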
# Configuration and Simulation =========================================================================================

## Analyze an existing collection and return an object to configure the simulation.
##
## The returned object contains the effect decomposition of the original data, the kernel-density estimated
## distribution functions of the residuals and topic effects, and the covariance matrix and its Cholesky
## decomposition. Appropriate transformations are applied according to the assumptions.
configure.simulation <- function(X, normal = FALSE, homoscedastic = FALSE, uncorrelated = FALSE, random = TRUE)
{
  if(is.data.frame(X))
    X <- as.matrix(X)

  # min and max effect score, to use later on when estimating distribution functions
  xmin <- -1
  xmax <- 1
  # Without the Normal assumption, transform to logits and update xmin and xmax
  if(normal == FALSE) {
    X <- logit(X)
    xmin <- -2*logit(1)
    xmax <- 2*logit(1)
  }

  # Decompose effects
  dec <- decompose.effects(X)

  # Under the Homoscedastic assumption, standardize residuals with the pooled variance
  if(homoscedastic == TRUE) {
    var_p <- mean(apply(dec$E, 2, var))
    dec$E <- apply(dec$E, 2, function(e) { e / sd(e) * sqrt(var_p) })
  }

  # Estimate the CDF and iCDF of topic effects and residuals
  dec$cdf_t <- estimate.cdf(dec$NU_t, xmin = xmin, xmax = xmax)
  dec$icdf_t <- estimate.icdf(dec$NU_t, xmin = xmin, xmax = xmax)
  dec$CDF_e <- apply(dec$E, 2, function(e) estimate.cdf(e, xmin = min(xmin, min(e)), xmax = max(xmax, max(e))))
  dec$ICDF_e <- apply(dec$E, 2, function(e) estimate.icdf(e, xmin = min(xmin, min(e)), xmax = max(xmax, max(e))))

  # Append NU_t to E, and compute the covariance matrix to generate new variables with a similar dependence structure
  dec$Sigma <- cov(cbind(dec$E, dec$NU_t))
  dec$Sigma <- as.matrix(nearPD(dec$Sigma)$mat) # compute the nearest positive definite matrix
  # Under the Uncorrelated assumption, set all off-diagonal covariances to 0
  if(uncorrelated == TRUE) {
    dec$Sigma <- diag(diag(dec$Sigma), ncol = ncol(dec$Sigma))
  }
  # Compute the Cholesky decomposition
  dec$C <- chol(dec$Sigma)

  dec$assumptions <- list(normal = normal, homoscedastic = homoscedastic, uncorrelated = uncorrelated, random = random)
  return(dec)
}
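## Example usage (illustrative only, not run when sourcing): configure the simulation from an existing
## topic-by-system matrix of effectiveness scores in [0,1]. 'existing.scores' is a placeholder name for such a
## matrix (topics in rows, systems in columns), not an object defined in this file.
# config.default <- configure.simulation(existing.scores) # logit margins, heteroscedastic, correlated, random topics
# config.strict <- configure.simulation(existing.scores, normal = TRUE, homoscedastic = TRUE, uncorrelated = TRUE)
# config.default$assumptions # list of the four assumptions used to build the configuration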
## Simulate a new collection with n_t_ topics using the given configuration.
simulate.collection <- function(config, n_t_, min_n_t__ = 200, n_t__factor = 4)
## min_n_t__ : minimum number of topics to sample from when not assuming random sampling
## n_t__factor : factor of n_t_ to select a minimum number of topics to sample from when not assuming random sampling
{
  n_s <- length(config$NU_s) # Number of systems

  # Generate a matrix of normal random vectors with the given covariance matrix.
  # Without the Random sampling assumption, generate many more random vectors, to select from later on
  if(config$assumptions$random == FALSE) {
    R <- matrix(rnorm(ncol(config$Sigma) * max(min_n_t__, n_t_ * n_t__factor)), ncol = ncol(config$Sigma))
  } else {
    R <- matrix(rnorm(ncol(config$Sigma) * n_t_), ncol = ncol(config$Sigma))
  }
  R <- R %*% config$C
  # and transform their marginals to uniform with the normal CDF
  U <- matrix(0, ncol = ncol(R), nrow = nrow(R))
  for(i in 1:ncol(R))
    U[,i] <- pnorm(R[,i], sd = sqrt(config$Sigma[i,i]))

  # Use these uniforms and the estimated iCDFs to get our final random vectors
  Z <- matrix(NA, ncol = ncol(U), nrow = nrow(U))
  for(i in 1:(ncol(Z)-1)) { # skip the last vector, which corresponds to NU_t
    # Under the Normal assumption, use the normal marginal; otherwise use the estimated one
    if(config$assumptions$normal == TRUE) {
      Z[,i] <- qnorm(U[,i], mean = 0, sd = sqrt(config$Sigma[i,i]))
    } else {
      Z[,i] <- config$ICDF_e[[i]](U[,i])
    }
  }
  Z[,ncol(Z)] <- config$icdf_t(U[,ncol(Z)])

  # Without the Random sampling assumption, select n_t_ topics with non-uniform sampling probabilities
  if(config$assumptions$random == FALSE) {
    # First sort Z by topic effect
    Z <- Z[order(Z[,ncol(Z)]),]
    # The probability of sampling each topic is given by the pdf of a Beta distribution with random parameters
    shape1 <- runif(1, .01, 2)
    shape2 <- runif(1, 2, 8)
    if(rbinom(1, 1, .5)) {
      shape <- shape1
      shape1 <- shape2
      shape2 <- shape
    }
    P <- dbeta((0:(nrow(Z)+1)) / (nrow(Z)+2), shape1, shape2) # add one extra point at both sides to avoid infinities
    P <- P[-c(1, length(P))] # remove the two extra points here
    P <- P / sum(P) # normalize to sum 1
    # Now randomly select n_t_ topics without replacement, using the sampling probabilities just generated
    Z <- Z[sample(1:nrow(Z), size = n_t_, replace = FALSE, prob = P),]
  }

  # Generate the simulated matrix
  Y <- matrix(nrow = n_t_,
              config$mu # maintain the same grand average
              + rep(config$NU_s, each = n_t_) # maintain the same system effects
              + Z[,ncol(Z)]) + Z[,-ncol(Z)] # and add the new NU_t and residuals
  colnames(Y) <- names(config$NU_s)

  # Without the Normal assumption, transform back from logits
  if(config$assumptions$normal == FALSE) {
    Y <- invlogit(Y)
  }

  return(Y)
}

# Diagnosis ============================================================================================================

diagnose.simulation <- function(X, Y,
                                decX = decompose.effects(X),
                                F_tX = estimate.cdf(decX$NU_t, xmin = -1, xmax = 1),
                                F_EX = apply(decX$E, 2, function(e) estimate.cdf(e, xmin = min(e), xmax = max(e))),
                                gX = g.study(X))
{
  decY <- decompose.effects(Y)
  gY <- g.study(Y)
  gXsum <- sum(c(gX$var.s, gX$var.q, gX$var.e))
  gYsum <- sum(c(gY$var.s, gY$var.q, gY$var.e))

  # Coefficients of the correlation matrices
  corX <- cor(decX$E)
  corY <- cor(decY$E)
  # don't use the diagonals (they're always 1)
  diag(corX) <- NA
  diag(corY) <- NA
  corX <- as.vector(corX)
  corY <- as.vector(corY)

  # Residual variances
  varX <- apply(decX$E, 2, var)
  varY <- apply(decY$E, 2, var)

  # Cramer-von Mises (topic effects)
  if(nrow(X) > nrow(Y)) {
    bigger <- decX
  } else {
    bigger <- decY
  }
  F_tY <- ecdf(decY$NU_t)
  w2_t <- mean( (F_tX(bigger$NU_t) - F_tY(bigger$NU_t))^2 )
  # Cramer-von Mises (residuals)
  F_EY <- apply(decY$E, 2, ecdf)
  w2_E <- 0
  for(f in 1:ncol(X)) {
    w2_E <- w2_E + mean( (F_EX[[f]](bigger$E[,f]) - F_EY[[f]](bigger$E[,f]))^2 )
  }
  w2_E <- w2_E / length(F_EX)

  return(c(#mu = decY$mu - decX$mu,
           mu_s = mean( (decY$mu+decY$NU_s) - (decX$mu+decX$NU_s) ),
           #rmse_s = sqrt(mean( ((decY$mu+decY$NU_s) - (decX$mu+decX$NU_s))^2 )),
           #tau = cor(decX$NU_s, decY$NU_s, method = "k"),
           tauAP = tauAP.actual(Y, X),
           #r = cor(decX$NU_s, decY$NU_s, method = "p"),
           w2_t = w2_t,
           w2_E = w2_E,
           var_s = gY$var.s/gYsum - gX$var.s/gXsum,
           var_t = gY$var.q/gYsum - gX$var.q/gXsum,
           var_st = gY$var.e/gYsum - gX$var.e/gXsum,
           sd_var = sd(varY) - sd(varX),
           cor = cor(corX, corY, use = "complete")
  ))
}
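## End-to-end sketch (illustrative only, not run when sourcing): build a configuration from an existing
## collection, simulate a new collection with 50 topics, and diagnose how similar the two are.
## 'existing.scores' is again a placeholder for a real topic-by-system matrix of effectiveness scores,
## and the topic set size is arbitrary.
# config <- configure.simulation(existing.scores)
# simulated.scores <- simulate.collection(config, n_t_ = 50)
# diagnose.simulation(existing.scores, simulated.scores) # mu_s, tauAP, w2_t, w2_E, variance ratios, etc.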