library("TCA")
library("ggplot2")
library("ggpubr")
library("pracma")
library("matrixStats")

prep_data <- function(data_path){

  file_name1 <- paste(data_path,.Platform$file.sep,"hannum.chr22.RData",sep="")
  file_name2 <- paste(data_path,.Platform$file.sep,"liu.cd4.chr22.RData",sep="")
  file_name3 <- paste(data_path,.Platform$file.sep,"paquette.chr22.RData",sep="")

  file_names <- c(file_name1, file_name2, file_name3)

  if(any(!file.exists(file_name1, file_name2, file_name3))){
    library("GEOquery")
    library("data.table")
    library("EpiDISH")
  }

  if (!file.exists(file_name1)){

    # Download the Hannum et al. data
    gse <- GEOquery::getGEO("GSE40279", destdir = data_path, GSEMatrix = TRUE)

    # Extract methylation data
    X.hannum <- Biobase::exprs(gse[[1]])

    # covariates; note that there's also 'ethnicity' covariate in the data, however, it is perfectly captured by the plateinformation
    plate.hannum <- as.numeric(as.factor((Biobase::pData(gse[[1]])[["plate:ch1"]])))
    cov.hannum <- cbind(as.numeric((Biobase::pData(gse[[1]])[["age (y):ch1"]])), as.numeric(as.factor((Biobase::pData(gse[[1]])[["gender:ch1"]]))), indicator_vars(plate.hannum))
    rownames(cov.hannum) <- colnames(X.hannum)
    colnames(cov.hannum) <- c("age", "gender", "plate1", "plate2", "plate3", "plate4", "plate5", "plate6", "plate7", "plate8")

    # Calculate principal components from control probes that reflect technical variability; since we are not working with IDAT files we don't have "real" control probes - instead, we use low variance sites that are not expected to capture any true biological variability.
    low_var_pcs.hannum <- low_var_pcs(X.hannum, rank = 10, p = 1000)
    cov.hannum <- cbind(cov.hannum, low_var_pcs.hannum)

    # estimate cell-type proportions usign a reference-based approach
    ref <- as.matrix(EpiDISH::centDHSbloodDMC.m[,c("Neutro","Eosino","CD4T","CD8T","Mono","B","NK")])
    W.hannum <- EpiDISH::epidish(X.hannum, ref)$estF
    # merge Neutro and Eosino
    W.hannum.gran <- W.hannum[,"Neutro",drop=F] + W.hannum[,"Eosino",drop=F]
    colnames(W.hannum.gran) <- "Gran"
    W.hannum <- cbind(W.hannum.gran,W.hannum[,c("CD4T","CD8T","Mono","B","NK")])

    # remove polymorphic sites, cross-reactive sites, and non-autosomal sites according to Chen et al.; in addition, remove low variance sites, as those are unlikely to demonstrate biological signal.
    X.hannum.processed <- filter_data(X.hannum)

    # keep only sites on chromosome 22 (so that tutorial can run quickly)
    map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1)
    chrs <- map[rownames(X.hannum.processed),1]
    chr22sites <- which(chrs == 22)
    X.hannum.processed.22 <- X.hannum.processed[chr22sites,]

    hannum <- list(X = X.hannum.processed.22, cov = cov.hannum, W = W.hannum)
    save(hannum, file = file_name1, compress = "bzip2")

    rm(hannum, gse, X.hannum, X.hannum.processed)
    file.remove(paste(data_path,.Platform$file.sep,"GPL13534.soft",sep=""))
    file.remove(paste(data_path,.Platform$file.sep,"GSE40279_series_matrix.txt.gz",sep=""))

  }

  if (!file.exists(file_name2)){

    # Download the Liu et al. CD4  data

    # covariates
    gse <- GEOquery::getGEO("GSE56581", destdir = data_path, GSEMatrix = TRUE)
    liu.cd4.age <- as.matrix(as.numeric(Biobase::pData(gse[[1]])[["age (yrs):ch1"]]))
    colnames(liu.cd4.age) <- "age"

    # methylation data
    liu.cd4.methfile <- paste(data_path,.Platform$file.sep,"GSE56581_methylome_normalized.txt",sep="")
    download.file("ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE56nnn/GSE56581/suppl/GSE56581_methylome_normalized.txt.gz", paste(liu.cd4.methfile,".gz",sep=""))
    gunzip(paste(liu.cd4.methfile,".gz",sep=""))
    X.liu.cd4 <- fread(liu.cd4.methfile)

    X.liu.cd4.rownames <- X.liu.cd4$ID_REF
    X.liu.cd4 <- as.matrix(X.liu.cd4[,2:ncol(X.liu.cd4)])
    rownames(X.liu.cd4) <- X.liu.cd4.rownames
    X.liu.cd4 <- X.liu.cd4[,seq(1,ncol(X.liu.cd4),2)]

    # Calculate principal components from control probes that reflect technical variability; since we are not working with IDAT files we don't have "real" control probes - instead, we use low variance sites that are not expected to capture any true biological variability.
    low_var_pcs.liu.cd4 <- low_var_pcs(X.liu.cd4, rank = 10)
    cov.liu.cd4 <- cbind(liu.cd4.age, low_var_pcs.liu.cd4)

    # keep only sites on chromosome 22 (so that tutorial can run quickly)
    map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1)
    chrs <- map[rownames(X.liu.cd4),1]
    chr22sites <- which(chrs == 22)
    X.liu.cd4.22 <- X.liu.cd4[chr22sites,]

    liu.cd4 <- list(X = X.liu.cd4.22, cov = cov.liu.cd4)
    save(liu.cd4, file = file_name2)

    rm(liu.cd4, gse, X.liu.cd4.22, X.liu.cd4)
    file.remove(paste(data_path,.Platform$file.sep,"GSE56581_methylome_normalized.txt",sep=""))
    file.remove(paste(data_path,.Platform$file.sep,"GSE56581_series_matrix.txt.gz",sep=""))

  }

  if (!file.exists(file_name3)){
    # Paquette et al. (Epigenetics 2016)
    gse <- GEOquery::getGEO("GSE75248", destdir = data_path, GSEMatrix = TRUE)

    # methylation data
    X.paquette <- Biobase::exprs(gse[[1]])

    # Calculate principal components from control probes that reflect technical variability; since we are not working with IDAT files we don't have "real" control probes - instead, we use low variance sites that are not expected to capture any true biological variability.
    low_var_pcs.paquette <- low_var_pcs(X.paquette, rank = 10, p = 1000)

    # remove polymorphic sites, cross-reactive sites, and non-autosomal sites according to Chen et al.; in addition, remove low variance sites, as those are unlikely to demonstrate biological signal.
    X.paquette.processed <- filter_data(X.paquette)

    # keep only sites on chromosome 22 (so that tutorial can run quickly); Paquette et al. reported some signal in chromosome 16
    map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1)
    chrs <- as.character(map[rownames(X.paquette.processed),1])
    chr22sites <- which(chrs == "22")
    X.paquette.processed.22 <- X.paquette.processed[chr22sites,]

    # Extract covaraites
    gestational_age <-as.numeric(unlist(lapply(Biobase::pData(gse[[1]])[["gestational age:ch1"]], function(x) strsplit(x,";")[[1]][1])))
    arousal <- as.numeric(Biobase::pData(gse[[1]])[["arousal:ch1"]])
    batch <- as.numeric(as.factor((Biobase::pData(gse[[1]])[["batch:ch1"]])))
    batch[is.na(batch)] <- 3
    batch <- indicator_vars(batch)
    # since gender information is not available on the GEO record, get gender information that was inferred from the IDAR files based on X chromosome methylation
    gender <- read.table("https://raw.githubusercontent.com/cozygene/TCA/master/vignettes/gender.paquette.txt", row.names = 1)
    cov.paquette <- data.frame(gender,gestational_age,arousal,batch)
    colnames(cov.paquette) <- c("gender","gestational_age","arousal","batch1","batch2")
    rownames(cov.paquette) <- colnames(X.paquette.processed.22)
    # remove na values
    keep <- setdiff(1:nrow(cov.paquette) ,which(rowSums(is.na(cov.paquette)) > 0))
    cov.paquette <- cbind(cov.paquette[keep,], low_var_pcs.paquette[keep,])

    X.paquette.processed.22 <- X.paquette.processed.22[,keep]

    # Load cell-type proportions usign a reference-based approach; these were estimated using the raw IDAT files of the data (available on GEO) with the package ENmix.
    W.paquette <- read.table("https://raw.githubusercontent.com/cozygene/TCA/master/vignettes/W.paquette.txt", row.names = 1)
    W.paquette <- W.paquette[,2:ncol(W.paquette)]
    W.paquette[W.paquette < 0] <- 0
    # remove lowly abundant cell types
    W.paquette <- W.paquette[keep, colMeans(W.paquette)>=0.01]
    rownames(W.paquette) <- rownames(cov.paquette)
    # normalize cell type proportions (output from the ENmix package didn't require the proportions of an individual to sum up to 1)
    W.paquette <- W.paquette/t(repmat(rowSums(W.paquette),ncol(W.paquette),1))

    # get the set of cord blood reference CpGs that were used for estimating cell-type proportions; we will use these CpGs for re-estimating W
    library("FlowSorted.CordBlood.450k")
    ref.cord <- get("FlowSorted.CordBlood.450k")
    ref.cord <- preprocessRaw(ref.cord)
    ref.cord.cpgs <- rownames(FlowSorted.CordBlood.450k.ModelPars)
    ref.cord.cpgs.intersect <- intersect(ref.cord.cpgs, rownames(X.paquette))

    paquette <- list(X = X.paquette.processed.22, cov = cov.paquette, W = W.paquette, X.ref_cpgs = X.paquette[ref.cord.cpgs.intersect,keep])
    save(paquette, file = file_name3)

    rm(paquette, gse, X.paquette, X.paquette.processed, X.paquette.processed.22)
    file.remove(paste(data_path,.Platform$file.sep,"GSE75248_series_matrix.txt.gz",sep=""))

    ## note - the files W.paquette.txt and gender.paquette.txt were generated using the original IDAT files of the Paquette data.
    # Gender information was inferred from methylation levels frmo the X chromosome as it is not available on Tthe GEO record.
    # Code attched below; in order to run it, download the IDAT files from GEO accession number GSE75248 and set 'path' in 'readidat' below to the location of the files.
    # require("ENmix")
    # rgdat <- readidat(path = ".", manifestfile=NULL, recursive = FALSE, verbose = TRUE)
    # meth <- getmeth(rgdat); rm(rgdat)
    # W <- estimateCellProp(meth, refdata="FlowSorted.CordBlood.450k", cellTypes=NULL, nonnegative = TRUE, nProbes=50, normalize=TRUE, refplot=FALSE)
    # write.table(W, file = "W.paquette.txt", quote = FALSE, sep = " ")
    # beta <- getB(meth)
    # map <- read.table("https://github.com/cozygene/glint/blob/master/utils/assets/HumanMethylationSites?raw=true",sep=",",row.names = 1)
    # chrs <- as.character(map[rownames(beta),1])
    # sites.X <- which(chrs == "X")
    # sex_cpgs.pca.paquette <- prcomp(t(beta[sites.X,]), center=TRUE, scale=TRUE, rank = 2)
    # write.table(sex_cpgs.pca.paquette$x[,1:2], file = "sex_cpgs.pca.paquette.txt", quote = FALSE, sep = " ")
    # plot(sex_cpgs.pca.paquette$x[,1],sex_cpgs.pca.paquette$x[,2]) # shows a separation between males and females; note that the males/females ratio here perfectly match the one in the Paquette et al. paper.
    # df <-data.frame(as.numeric(x[,1] > 0))
    # colnames(df) <-"gender"; rownames(df) <- rownames(x)
    # write.table(df,file = "gender.paquette.txt", quote = FALSE, sep = " ")

  }

  return(file_names)

}


indicator_vars <- function(x){
  x.indicators <- matrix(0,length(x),length(unique(x))-1)
  counter <- 1
  u <- unique(x)
  for (i in setdiff(u,u[length(u)])){
    x.indicators[,counter] <- as.numeric(x == i)
    counter <- counter + 1
  }
  return(x.indicators)
}

low_var_pcs <- function(X, rank = 10, p = 1000){
  site.variances <- matrixStats::rowVars(X)
  names(site.variances) <- rownames(X)
  low.var.sites <- names(head(sort(site.variances), p))
  low.var.pca <- prcomp(t(X[low.var.sites,]), center=TRUE, scale=TRUE, rank = rank)
  return(low.var.pca$x)
}

filter_data <- function(X, var_th = 0.0001){
  nonspecific_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/nonspecific_probes.txt")[,1]
  XY_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/HumanMethylationSites_X_Y.txt")[,1]
  polymorphic_probes <- read.table("https://raw.githubusercontent.com/cozygene/glint/master/parsers/assets/polymorphic_cpgs.txt")[,1]
  # remove sites with very low variance that are unlikely to exhibit biological signal
  site.variances <- matrixStats::rowVars(X)
  names(site.variances) <- rownames(X)
  low_var_sites <- names(which(site.variances < var_th))
  exclude <- union(nonspecific_probes,union(XY_probes,union(low_var_sites,polymorphic_probes)))
  return(X[setdiff(rownames(X),exclude),])
}

plot_qq <- function(pvals, labels, ggarrange.nrow = 1, ggarrange.ncol = 1, alpha = 0.05, experiment_wide_line = TRUE){
  significance_th <- list(alpha/length(pvals[[1]]))
  if(length(pvals)-1) significance_th[[2]] <- alpha/(length(pvals)*length(pvals[[1]]))
  qqplots <- lapply(1:length(pvals), function(p){
    df <- data.frame(pvals.obs = -log10(sort(pvals[[p]])), pvals.exp = -log10(sort((1:length(pvals[[1]]))/length(pvals[[1]]))));
    qqplot <- ggplot(df, aes(x = pvals.exp, y = pvals.obs)) +
      stat_binhex(geom = "point", bins=1000, size=1) +
      geom_abline() +
      ggtitle(labels[p]) +
      xlab(expression(Expected~-log[10](P))) + ylab(expression(Observed~-log[10](P))) +
      theme_bw() +
      guides(fill="none") +
      geom_hline(yintercept=-log10(significance_th[[1]]), linetype="dashed", color = "red", size=1)
    if (length(significance_th)-1 & experiment_wide_line) qqplot <- qqplot + geom_hline(yintercept=-log10(significance_th[[2]]), linetype="dashed", color = "red", size=0.5)
    return(qqplot)
  })
  ggarrange(plotlist = qqplots, ncol = ggarrange.ncol, nrow = ggarrange.nrow)
}

plot_scatter <- function(dfs, ggarrange.ncol, ggarrange.nrow, xlab, ylab, titles){
  plots <- vector("list", length = length(dfs))
  for (i in 1:length(dfs)){
    df <- data.frame(y = dfs[[i]]$y, x = dfs[[i]]$x)
    plots[[i]] <- ggplot(df, aes(x = x, y = y)) +
      geom_point(alpha = 0.6) +
      geom_smooth(method=lm) +
      stat_cor(method = "pearson", colour = "blue") +
      xlab(xlab) +
      ylab(ylab) +
      ggtitle(titles[i])
  }
  ggarrange(plotlist = plots, ncol = ggarrange.ncol, nrow = ggarrange.nrow)
}


if(FALSE){

  # Set a path for storing data
  data_path <- "./"

  # Load the data
  filenames <- prep_data(data_path)
  for (filename in filenames) load(filename)

  ## Experiment #1: detecting CD4 differential methylation with age; working under the assumption X|Y (i.e. age affects methylation levels)

  # Apply the TCA model to the Hannum whole-blood data
  tca.mdl.hannum <- tca(X = hannum$X,
                        W = hannum$W,
                        C1 = hannum$cov[,c("gender","age")],
                        C2 = hannum$cov[,3:ncol(hannum$cov)])

  # Extract p-values of a joint test
  tca.mdl.hannum.pvals.joint <- tca.mdl.hannum$gammas_hat_pvals.joint[,"age"]

  # Extract p-values for each cell type, under a marginal conditional test
  tca.mdl.hannum.pvals.marg_cond <- tca.mdl.hannum$gammas_hat_pvals[,paste(colnames(hannum$W),".age",sep="")]

  # qq-plots - for the p-values of the joint test, and for the p-values in CD4, under a marginal conditional test
  plot_qq(list(tca.mdl.hannum.pvals.joint, tca.mdl.hannum.pvals.marg_cond[,"CD4T.age"]),
          labels = c("Joint test with age", "CD4 marginal conditional test with age"),
          ggarrange.nrow = 1,
          ggarrange.ncol = 2,
          experiment_wide_line = FALSE)

  # Run ReFACTor to capture more of the variation of cell-type composition
  refactor.mdl.hannum <- refactor(X = hannum$X,
                                  k = 6,
                                  C = hannum$cov[,3:ncol(hannum$cov)])

  # Rerun TCA, this time include the ReFACTor components as additional covariates
  tca.mdl.hannum.2 <- tca(X = hannum$X,
                          W = hannum$W,
                          C1 = hannum$cov[,c("gender","age")],
                          C2 = cbind(hannum$cov[,3:ncol(hannum$cov)],refactor.mdl.hannum$scores))

  # Extract the updated p-values of a joint test
  tca.mdl.hannum.2.pvals.joint <- tca.mdl.hannum.2$gammas_hat_pvals.joint[,"age"]

  # Extract the updated marginal conditional p-values
  tca.mdl.hannum.2.pvals.marg_cond <- tca.mdl.hannum.2$gammas_hat_pvals[,paste(colnames(hannum$W),".age",sep="")]

  # qq-plots - for the p-values of the joint test, and for the p-values in CD4, under a marginal conditional test
  plot_qq(list(tca.mdl.hannum.2.pvals.joint, tca.mdl.hannum.2.pvals.marg_cond[,"CD4T.age"]),
          labels = c("Joint test with age", "CD4 marginal conditional test with age"),
          ggarrange.nrow = 1,
          ggarrange.ncol = 2,
          experiment_wide_line = FALSE)

  # Extract the hits based on the joint test
  hits.joint <- names(which(tca.mdl.hannum.2.pvals.joint < 0.05/nrow(hannum$X)))

  # Extract the hits from hits.joint where CD4 cells demonstrate the lowest p-value across all cell types
  cd4.hits <- names(which(tca.mdl.hannum.2.pvals.marg_cond[hits.joint,"CD4T.age"] == rowMins(tca.mdl.hannum.2.pvals.marg_cond[hits.joint,])))

  sprintf("Detected %d associations using a joint test, %d associations using a marginal conditional test, and %d associations in CD4 cells using a marginal conditional test.",
          sum(tca.mdl.hannum.2.pvals.joint <= 0.05/nrow(hannum$X)),
          sum(tca.mdl.hannum.2.pvals.marg_cond <= 0.05/(nrow(hannum$X)*nrow(hannum$W))),
          sum(tca.mdl.hannum.2.pvals.marg_cond[,"CD4T.age"] <= 0.05/(nrow(hannum$X)*nrow(hannum$W))))

  # Replicate the CD4 hits in the Liu purified CD4 data
  cd4.hits.liu.pvals <- unlist(lapply(1:length(cd4.hits),
                                      function(x) summary(lm(y~., data.frame(y=liu.cd4$X[cd4.hits[x],], liu.cd4$cov)))$coefficients["age","Pr(>|t|)"]))

  # Plot adjusted methylation (i.e. adjusted for covariates) as a function of age in all four replicated CD4 associations
  dfs <-  vector("list", length = 4)
  for (i in 1:4){
    r <- scale(residuals(lm(y~., data.frame(y=liu.cd4$X[cd4.hits[i],], liu.cd4$cov[,2:ncol(liu.cd4$cov)]))))
    dfs[[i]] <- data.frame(x = liu.cd4$cov[,"age"], y = r)
  }
  plot_scatter(dfs = dfs,
               ggarrange.ncol = 2,
               ggarrange.nrow = 2,
               xlab = "Age",
               ylab = "Adjusted methylation level",
               titles = paste("CD4 methylation in ",cd4.hits,sep=""))

  # Verify that p-values are calibrated in the Liu data
  liu.cd4.regression.pvals <- unlist(lapply(1:nrow(liu.cd4$X),
                                            function(x) summary(lm(y~.,data.frame(y = liu.cd4$X[x,],liu.cd4$cov)))$coefficients["age","Pr(>|t|)"]))
  plot_qq(list(liu.cd4.regression.pvals), labels = "Linear regression (sorted CD4)")


  ## Experiment #2: detecting differential methylation with infant arousal in granulocytes; working under the assumption Y|X (i.e. methylation levels affect or mediate components affecting infant arousal).

  # Apply the TCA model to the Paquette data
  tca.mdl.paquette <- tca(X = paquette$X,
                          W = paquette$W,
                          C1 = paquette$cov[,c("gender","gestational_age")],
                          C2 = paquette$cov[,4:ncol(paquette$cov)],
                          constrain_mu = TRUE)

  # Run tcareg with a joint test and extract p-values; generate a qq-plot
  C3_names <- c("gender","gestational_age","batch1","batch2")
  tcareg.mdl.paquette.joint <- tcareg(X = paquette$X,
                             tca.mdl = tca.mdl.paquette,
                             y = paquette$cov[,"arousal",drop=F],
                             C3 = paquette$cov[,C3_names],
                             test = "joint")
  plot_qq(list(tcareg.mdl.paquette.joint$pvals), labels = "Joint test with infant arousal")

  # Since there is an inflation in the qq-plot we try to re-estimate the cell-type proportions
  # First, run TCA using only the reference sites for getting a new estimate of W
  tca.mdl.paquette.refit_W <- tca(X = paquette$X.ref_cpgs,
                                  W = paquette$W,
                                  C1 = paquette$cov[,c("gender","gestational_age")],
                                  C2 = paquette$cov[,4:ncol(paquette$cov)],
                                  constrain_mu = TRUE,
                                  refit_W = TRUE,
                                  refit_W.features = rownames(paquette$X.ref_cpgs))
  # Use the updated estimate of W in a new execution of TCA on the data
  tca.mdl.paquette.2 <- tca(X = paquette$X,
                            W = tca.mdl.paquette.refit_W$W,
                            C1 = paquette$cov[,c("gender","gestational_age")],
                            C2 = paquette$cov[,4:ncol(paquette$cov)],
                            constrain_mu = TRUE)

  # Run tcareg with a joint test and extract p-values; generate a qq-plot
  tcareg.mdl.paquette.joint.2 <- tcareg(X = paquette$X,
                               tca.mdl = tca.mdl.paquette.2,
                               y = paquette$cov[,"arousal",drop=F],
                               C3 = paquette$cov[,C3_names],
                               test = "joint")
  plot_qq(list(tcareg.mdl.paquette.joint.2$pvals), labels = "Joint test with infant arousal")

  # Run tcareg with a marginal conditional test; generate qq plots
  tcareg.mdl.paquette.2.marg_cond <- tcareg(X = paquette$X,
                                         tca.mdl = tca.mdl.paquette.2,
                                         y = paquette$cov[,"arousal",drop=F],
                                         C3 = paquette$cov[,C3_names],
                                         test = "marginal_conditional")
  plot_qq(split(tcareg.mdl.paquette.2.marg_cond$pvals,rep(1:ncol(paquette$W), each = nrow(paquette$X))),
          labels = paste(colnames(paquette$W)," marginal conditional with arousal", sep=""),
          ggarrange.nrow = 2,
          ggarrange.ncol = 2)

  # Exctract the hit we got from the joint test
  hit.joint <- rownames(paquette$X)[which(tcareg.mdl.paquette.joint.2$pvals < 0.05/nrow(paquette$X))]

  # The p-values of the marginal conditional test in the hit we found with the joint test suggest that the association is in granulocytes
  tcareg.mdl.paquette.2.marg_cond$pvals[hit.joint,]

  # Extract from the tca model the part that is relevant for the hit found
  tcasub.mdl.paquette.2 <- tcasub(tca.mdl = tca.mdl.paquette.2,
                                  features = hit.joint)

  # Calculate cell-type-specific methylation for the samples in the detected CpG
  tensor.mdl.hit <- tensor(tca.mdl = tcasub.mdl.paquette.2,
                           X = paquette$X[hit.joint,,drop=F])

  # Plot the estimated granulocyte-specific methylation with arasual; adjust methylation levels to covariates
  # Compare with using the bulk data
  r.gran <- scale(residuals(lm(y~., data.frame(y=tensor.mdl.hit[[2]][1,], paquette$cov[,setdiff(1:ncol(paquette$cov),3)]))))
  df.gran <- data.frame(y = r.gran, x = paquette$cov[,"arousal"])
  # The bulk data should be further adjusted for cell-type composition (i.e. tca.mdl.paquette.2$W)
  r.bulk <- scale(residuals(lm(y~., data.frame(y=paquette$X[hit.joint,], tca.mdl.paquette.2$W, paquette$cov[,setdiff(1:ncol(paquette$cov),3)]))))
  df.bulk <- data.frame(y = r.bulk, x = paquette$cov[,"arousal"])
  plot_scatter(dfs = list(df.bulk, df.gran),
               ggarrange.ncol = 2,
               ggarrange.nrow = 1,
               xlab = "Infant arousal",
               ylab = "Adjusted methylation level",
               titles = paste(c("Whole-blood methylation in ", "Granulocyte methylation in "),hit.joint,sep=""))

}