## Study design: Randomised Controlled Trials
## Clinic on the Meaningful Modeling of Epidemiological Data
## International Clinics on Infectious Disease Dynamics and Data (ICI3D) Program
## Travis Porco
## Reshma Kassanjee 2023 & 2025
## Some Rights Reserved
## CC BY-NC 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)

# The setting for this tutorial:

# Suppose you are building a SARS-CoV-2 model, and are
# trying to decide on your inputs and assumptions related to
# transmission rates.
# As part of this, you may want to understand the effect of
# antiviral treatment on changes in viral load in the
# infected person, as infectiousness may be higher for
# people with higher levels of the virus.

# Here we are considering a study on the outcome: decrease
# in log viral load over 7 days from accessing care.
# The exposure of interest is receiving antiviral
# treatment (versus not).
# A possible confounder is whether a person has comorbidities
# (other conditions, such as diabetes, which are known
# to be associated with poorer COVID-19 outcomes).

# In this tutorial, you will:
# Simulate data with randomisation of treatment, and estimate the 'effect'
# Simulate data without randomisation of treatment, and estimate the 'effect'
# Repeat the above multiple times to understand the effect of
# randomisation on estimation.

library(tidyverse)

#### Warm up 1: Randomisation

## How can we randomly assign subjects to the treatment and control groups?

# There are different ways of doing this random selection, but for one,
# we can use the 'sample' command to sample subjects for one of the groups.

# Here, we divide 20 subjects into a treatment group and a control group.
# We select without replacement half of the numbers from
# 1 to 20 (representing the people) to create the treatment group.

ids <- 1:20
treatment.group <- sample(x = ids, size = 10, replace = FALSE)
treatment.group

# Repeat this exercise a few times.

# Another way to randomise is to shuffle a vector of ten 0s (representing,
# say, the controls) and ten 1s (the treated)

assignments <- sample(x = c(rep(1, 10), rep(0, 10)), size = 20, replace = FALSE)
assignments

# The assignment of the first person is found by looking at assignments[1],
# of the second person, by looking at assignments[2], and so on.
# We can print the list of treated subjects like this:

which(assignments == 1)

#### Warm up 2: Sample values from a normal distribution

# To simulate a given number of normal random variates, use the 'rnorm' command.
# You were introduced to sampling from distributions in Tutorial 3.

# Here, we simulate 100 normal variates with a mean of 1 and a standard
# deviation of 0.3, and plot the data in a histogram

v1 <- rnorm(100, mean = 1, sd = 0.3)

(ggplot(data = data.frame(outcomevar = v1), mapping = aes(x = outcomevar))
  + labs(x = 'Outcome', y = 'Frequency'
         , title = 'Histogram of generated outcomes')
  + geom_histogram(bins = 20, fill = 'red', color = 'black')
  + theme_minimal()
  + theme(text = element_text(size = 18))
)

#### Warm up 3: Simulate a dataset

# The outcome is a continuous measure (decrease in log viral load),
# and the study aims to compare the average outcome between the
# two study groups/arms (antiviral treatment versus not - i.e.
# versus the standard control treatment).

# We will now generate a simulated (or fake) dataset (so something
# like what you may have obtained in a real-world study),
# and save it as a data frame.
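# A quick note before we simulate: because we are drawing random numbers,
# each run of the code below will give a slightly different dataset. If you
# ever want your draws to be exactly repeatable, you can fix R's random seed
# first - a minimal sketch, where the particular seed value is arbitrary:

# set.seed(2025)  # uncomment to make subsequent random draws repeatable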
# We will make a data frame which contains the following columns:
# study.id (1,2,..), treatment (0/1), and outcome (a continuous number),
# where each row provides the data for one subject.

# The average or mean outcome for the treatment group can be different from that
# in the control group -- this difference is the 'treatment effect',
# which we want to estimate.
# To begin, we will assume the treatment effect is zero.

## What parameters do we need?

n.subjects <- 501
control.mean <- 2       # mean outcome for the control group
treatment.effect <- 0   # difference in mean for the treatment group (vs control)
outcome.sd <- 0.5       # spread (standard deviation) of the outcome in each arm

## Make the data frame

## Carefully split the group in two (we need to be careful in case we have an
## odd number of subjects)
pos.subj <- round(n.subjects/2)
neg.subj <- n.subjects - pos.subj

dset <- data.frame(study.id = 1:n.subjects)
dset$treatment <- sample(c(rep(1, pos.subj), rep(0, neg.subj)),
                         size = n.subjects, replace = FALSE)
dset.mean <- control.mean + treatment.effect*dset$treatment
dset$outcome <- rnorm(n.subjects, mean = dset.mean, sd = outcome.sd)

# Take a look at the dataset:
# View(dset)

### Take some time to ensure you understand each step above. ###

# We can plot histograms of the outcomes, by arm:

(ggplot(data = dset, mapping = aes(x = outcome))
  + labs(x = 'Outcome (change in log viral load)', y = 'Frequency'
         , title = 'Outcome by treatment group')
  + geom_histogram(bins = 20, fill = 'red', color = 'black')
  + facet_grid(treatment ~ .)
  + theme_bw()
  + theme(text = element_text(size = 18))
)

# We can also calculate the mean outcome in each arm

dset |> group_by(treatment) |> summarise(mean = mean(outcome))

# and the difference in means (estimated treatment effect):

means <- dset |> group_by(treatment) |> summarise(mean = mean(outcome))
with(means, mean[treatment == 1] - mean[treatment == 0])

# Now change the treatment effect to be non-zero. For example,
# suppose that the average outcome in the treatment group is 0.7
# units larger than in the control group - generate dset and the plot.

# Each time you run the code above, do you create the same dataset?

# Now that we have warmed up, let's dive into our simulation.
# Let us clear up our environment before moving on.

rm(list = ls())

#### (1) A function to generate data

# Our goal is to explore effect estimation under two different scenarios -
# (I) A study where treatment is not randomly assigned.
# (II) A study where treatment is randomly assigned.

# Each person in our dataset either receives antiviral treatment or not,
# and can have a comorbidity or not.
# The presence of comorbidities is a confounder, because a doctor
# may decide on the treatment plan according to this, and
# comorbidities can also directly impact our outcome
# (e.g. perhaps people with other illnesses have a slower drop
# in viral load).

# Since we will want to generate a dataset multiple times below,
# we write a function to make the data.
# We have also specified default values for all inputs to the
# function.

# Work through the function below and make sure you understand each line.
# Also think about how the inputs need to be set for scenarios (I) and (II).
# Ask the teaching team if you are confused or would like to confirm your
# understanding. Or talk it through with your neighbour!
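# The function below uses rbinom() (which we have not used yet in this
# tutorial) to draw 0/1 values such as 'has a comorbidity' or 'was treated'.
# A minimal sketch of how it behaves, with arbitrary illustrative numbers:

rbinom(n = 10, size = 1, prob = 0.2)         # ten 0/1 draws, each equal to 1 with probability 0.2
mean(rbinom(n = 1e5, size = 1, prob = 0.2))  # with many draws, the mean should be close to 0.2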
gen.data <- function(n.subjects = 100           # number of subjects
                     , ref.mean = 2             # mean outcome in control group without comorbidities
                     , treatment.effect = 1.5   # effect of treatment on outcome
                     , comorb.effect = -1       # effect of comorbidity on outcome
                     , prev.comorb = 0.2        # prevalence of comorbidities
                     , prob.treat.nocomorb = 0.5  # probability of treatment in people without comorbidities
                     , prob.treat.comorb = 0.5    # probability of treatment in people with comorbidities
                     , outcome.sd = 0.75        # standard deviation of outcome
){

  dset <- data.frame(study.id = 1:n.subjects)
  dset$comorbidity <- rbinom(n.subjects, 1, prev.comorb)
  prob.treatment <- prob.treat.comorb*dset$comorbidity +
    prob.treat.nocomorb*(1 - dset$comorbidity)
  dset$treatment <- rbinom(n.subjects, 1, prob.treatment)
  mean.vector <- ref.mean + treatment.effect*dset$treatment +
    comorb.effect*dset$comorbidity
  dset$outcome <- rnorm(n.subjects, mean = mean.vector, sd = outcome.sd)
  dset

}

# Let us try out the function

gen.data() # when you do not specify an input, the default value is used.

# Choose a couple of inputs to change (one at a time), and try to check
# whether the dataset changes in the expected way - using some of what
# you have learnt about exploring data.

# For example, compare

d1 <- gen.data(treatment.effect = 0)
d2 <- gen.data(treatment.effect = 1)
d3 <- gen.data(treatment.effect = 2)

# using

d1 |> group_by(treatment) |> summarise(mean = mean(outcome))
d2 |> group_by(treatment) |> summarise(mean = mean(outcome))
d3 |> group_by(treatment) |> summarise(mean = mean(outcome))

#### (2) Understanding how randomisation balances confounders

# Let's return to our two study designs:
# (I) A study where treatment is not randomly assigned.
# (II) A study where treatment is randomly assigned.

# In (I), the doctor can decide who gets treatment, and may decide to
# more frequently provide antivirals to patients with comorbidities.
# Which inputs above would change?

# An example of data that may be generated in scenario (I) is:

d1.norandom <- gen.data(prob.treat.nocomorb = 0.5
                        , prob.treat.comorb = 0.8)

d1.norandom |> group_by(comorbidity) |> summarise(mean = mean(treatment))
d1.norandom |> group_by(treatment) |> summarise(mean = mean(comorbidity))

# We can see that the confounder is not balanced between the two
# treatment arms.

# In (II), treatment is randomly assigned, and therefore cannot
# be related to whether the person has a comorbidity.

# An example of data that may be generated in scenario (II) is:

d1.random <- gen.data(prob.treat.nocomorb = 0.70
                      , prob.treat.comorb = 0.70)

d1.random |> group_by(comorbidity) |> summarise(mean = mean(treatment))
d1.random |> group_by(treatment) |> summarise(mean = mean(comorbidity))

# We can see that the confounder is balanced between the two treatment arms
# by study design.

# It is sometimes difficult to see the above because of noise in our dataset.
# If needed, increase the study size and rerun the lines above (one such check
# is sketched just below) to convince yourself of the imbalance / balance of
# comorbidity by treatment arm.
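# For example, here is one such check with a much larger study size (the value
# 10000 and the object names are arbitrary, for illustration only). With this
# many subjects there is little sampling noise, so the proportion treated in
# each comorbidity group should sit close to the input probabilities:

d.big.norandom <- gen.data(n.subjects = 10000
                           , prob.treat.nocomorb = 0.5
                           , prob.treat.comorb = 0.8)
d.big.norandom |> group_by(comorbidity) |> summarise(mean = mean(treatment))

d.big.random <- gen.data(n.subjects = 10000
                         , prob.treat.nocomorb = 0.7
                         , prob.treat.comorb = 0.7)
d.big.random |> group_by(comorbidity) |> summarise(mean = mean(treatment))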
#### (3) Estimating the difference

# We are going to need a way to analyse the data. Below is a function for
# doing this. Don't worry about or spend time on the details!
# Just know that if you provide it a dataset with columns
# 'treatment' (0 for control and 1 for treatment) and 'outcome' (numerical),
# it will provide you an estimate of the difference in the average outcome
# for your treatment and control arms, with a 95% confidence interval.
# It obtains this estimate by comparing the recorded outcomes for the two
# groups.

analyse.data <- function(data.in){

  # difference in means, using the central limit theorem
  mean.diff <- mean(data.in$outcome[data.in$treatment == 1]) -
    mean(data.in$outcome[data.in$treatment == 0])
  est.sd <- sqrt(var(data.in$outcome[data.in$treatment == 1])/length(data.in$outcome[data.in$treatment == 1]) +
                 var(data.in$outcome[data.in$treatment == 0])/length(data.in$outcome[data.in$treatment == 0]))

  est.point <- mean.diff
  est.ci.lower <- mean.diff - 1.96*est.sd
  est.ci.upper <- mean.diff + 1.96*est.sd

  return(round(c(est = est.point, lower = est.ci.lower, upper = est.ci.upper), 3))

}

# For example, here we generate two large datasets, and analyse them:

data.temp <- gen.data(n.subjects = 1000, treatment.effect = 0.2)
analyse.data(data.temp)

data.temp <- gen.data(n.subjects = 1000, treatment.effect = 0.2)
analyse.data(data.temp)

# Very importantly - what is the TRUE value that we are trying to estimate?
# It is the 'treatment.effect'.
# You specified this value to generate the data, so we know the correct/ideal
# 'answer' to our estimation - this is what makes simulation so useful!

#### (4) Multiple studies - without and with randomisation

## Let us repeat this process of generating and analysing the data multiple
# times, and explore our results.

# We will 'repeat' our study using a 'for' loop, gathering our estimates
# in a data frame. We will do 500 simulations.
# We will set the treatment.effect to 1 - this is the number we are trying
# to estimate. We will mostly keep the other inputs at the default values.

in.treatment.effect <- 1

# In scenario (I), doctors get to decide on treatment.
# Let's suppose they provide antivirals to 50% of those without comorbidities
# and 80% of those with comorbidities.

n.sims <- 500

haphazardEstimates <- data.frame(study.number = 1:n.sims,
                                 est = NA, lower = NA, upper = NA)

for (ii in 1:n.sims) {
  data.ii <- gen.data(treatment.effect = in.treatment.effect
                      , prob.treat.nocomorb = 0.5
                      , prob.treat.comorb = 0.8)
  ests.ii <- analyse.data(data.ii)
  haphazardEstimates$est[ii] <- ests.ii['est']
  haphazardEstimates$lower[ii] <- ests.ii['lower']
  haphazardEstimates$upper[ii] <- ests.ii['upper']
}

# Let us plot the estimates and CIs for the studies, and add a
# horizontal line showing the true treatment effect.

hplot <- ggplot(data = haphazardEstimates, aes(x = study.number, y = est)) +
  geom_errorbar(aes(ymin = lower, ymax = upper), colour = "grey") +
  geom_point(color = "red", size = 1.5) +
  geom_hline(yintercept = in.treatment.effect, colour = "blue") +
  labs(
    title = "Estimated treatment effect over multiple studies\n(without randomisation)",
    x = "Study number",
    y = "Estimated treatment effect (and 95% CI)"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 14))

print(hplot)

# Does it look like we are estimating the treatment effect well?
# What do you think is the reason for this?
# What else do we learn from the plot?

# Let us calculate the bias, i.e., the average estimate minus the true
# treatment effect

mean(haphazardEstimates$est) - in.treatment.effect
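# We can also check the 'coverage' of the confidence intervals - the
# proportion of the 500 intervals that contain the true treatment effect.
# For a well-behaved 95% CI this proportion should be close to 0.95.
# A minimal sketch of this check:

mean(haphazardEstimates$lower <= in.treatment.effect &
       haphazardEstimates$upper >= in.treatment.effect)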
# In scenario (II), doctors do not get to decide on treatment - we set it randomly.
# Let's suppose we provide antivirals to 70% of people, randomly.
# We repeat the simulation exercise above.

randomEstimates <- data.frame(study.number = 1:n.sims,
                              est = NA, lower = NA, upper = NA)

for (ii in 1:n.sims) {
  data.ii <- gen.data(treatment.effect = in.treatment.effect
                      , prob.treat.nocomorb = 0.7
                      , prob.treat.comorb = 0.7)
  ests.ii <- analyse.data(data.ii)
  randomEstimates$est[ii] <- ests.ii['est']
  randomEstimates$lower[ii] <- ests.ii['lower']
  randomEstimates$upper[ii] <- ests.ii['upper']
}

# ggplot lets us _redraw_ our named plot with the new data:
# %+% replaces the data
# + adds or replaces ggplot elements

print(hplot %+% randomEstimates +
        labs(title = "Estimated treatment effect over multiple studies\n(with randomisation)"))

# Let's also calculate the bias again

mean(randomEstimates$est) - in.treatment.effect

# Again, are we estimating the value well? Why? What else do we learn
# from the plot?

# Once you understand and reflect on the above - try playing around.
# For example:
# To look at the results in a different way, you could order the studies
# in the plot by the size of the estimates, or draw histograms of your
# 500 estimates for the two scenarios (a sketch of the histogram comparison
# is given at the end of this script).
# You can try different study sizes, different prevalence values for
# comorbidities, and more or less extreme confounder imbalances between
# the two arms (by changing inputs).

# Circling back to where we started:
# When you are looking for data and results to inform inputs and assumptions
# related to transmission rates for your SARS-CoV-2 model, think carefully
# about the studies you use.
# In the estimates that you find, think carefully about potential bias and
# variability.
# Incorrect model inputs/assumptions --> incorrect outputs!
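# A sketch of the histogram comparison mentioned above (the object name
# 'allEstimates' and the column name 'design' are arbitrary choices, used
# only for illustration):

allEstimates <- bind_rows(
  haphazardEstimates |> mutate(design = 'Without randomisation'),
  randomEstimates |> mutate(design = 'With randomisation')
)

(ggplot(data = allEstimates, mapping = aes(x = est))
  + labs(x = 'Estimated treatment effect', y = 'Frequency'
         , title = 'Estimates over 500 simulated studies')
  + geom_histogram(bins = 30, fill = 'red', color = 'black')
  + geom_vline(xintercept = in.treatment.effect, colour = 'blue')
  + facet_grid(design ~ .)
  + theme_bw()
  + theme(text = element_text(size = 14))
)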