## Originally developed by Travis Porco
## Last updated by Reshma Kassanjee, June 2023
## Some Rights Reserved
## CC BY-NC 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
# The setting for this tutorial exercise is loosely inspired by the RCT
# described by Prajna et al 2010 (https://europepmc.org/article/PMC/3774126).
# In this clinical trial of patients presenting to the clinic with corneal
# ulcers, the primary outcome was spectacle-corrected
# visual acuity (BSCVA) at 3 months, which is a continuous measure.
# Study participants were randomized to
# (1) receive either topical natamycin or topical voriconazole, and
# (2) either have repeated scraping of the epithelium or not.
# Try to work through Part One during the tutorial time
# Part Two is 'extra' and optional, to do in your own time
########## PART ONE ##########
# To begin, let us focus on a simpler version of this trial:
# study subjects were randomized to receive either natamycin
# (we will call these subjects the control group) or voriconazole
# (forming the treatment group)
#### RANDOMISATION ####
## How can we randomly assign subjects to the treatment and control groups?
# There are different ways of doing this random selection, but for one,
# we can use the 'sample' command to sample subjects for one of the groups.
# Here, we divide 20 subjects into a treatment group and a control group.
# We select without replacement half of the numbers from
# 1 to 20 (representing the people) to create the treatment group.
ids <- 1:20
# Draw 10 of the 20 ids; 'sample' draws without replacement by default
treatment.group <- sample(ids, 10)
treatment.group
# Repeat this exercise a few times.
# Another way to randomise is to shuffle a vector of ten 0s (representing,
# say, the controls) and ten 1s (the treated)
# ('sample' with a single argument returns a random permutation)
assignments <- sample(rep(c(1, 0), each = 10))
assignments
# The assignment of the first person is found by looking at assignments[1],
# of the second person, by looking at assignments[2], and so on.
# We can print the list of treated subjects like this:
which(assignments == 1)
#### GENERATING A DATASET ####
# We will now generate a simulated (or fake) dataset, and save it as a data frame.
# Recall that the outcome is a continuous measure, and that the study aims to
# compare the average outcome (i.e., acuity) between the two study groups/arms.
## Warm up exercise: Normal random variables
# To simulate a given number of normal random variates, use the 'rnorm' command.
# Here, we simulate 100 normal variates with a mean of 1 and a standard
# deviation of 0.3, and plot the data in a histogram
v1 <- rnorm(n = 100, mean = 1, sd = 0.3)
# Use a single plotting panel with no outer margins.
# Do not worry about these sorts of details for now.
par(mfrow = c(1, 1), oma = c(0, 0, 0, 0))
hist(v1,
     col = 'red',
     main = 'Histogram of generated outcomes',
     xlab = 'Acuity (logMAR)',
     ylab = 'Frequency')
## Make a dataset
# We will make a data frame which contains the following columns:
# study.id (1,2,..), treatment (0/1), and outcome (a continuous number),
# and where each row provides the data for one subject.
# The average or mean outcome for the treatment group can be different from
# that in the control group -- this difference is the 'treatment effect',
# which we want to estimate.
# To begin, we will assume the treatment effect is zero.
n.subjects       <- 500  # number of subjects in study (even number)
control.mean     <- 0.5  # mean acuity for the control group
treatment.effect <- 0    # difference in mean for the treatment group (vs control)
outcome.sd       <- 0.3  # spread (standard deviation) of the outcome in each arm
# Start the data frame with the subject ids, then add columns one at a time
dset <- data.frame(study.id = seq_len(n.subjects))
# Shuffle a vector of n/2 ones (treated) and n/2 zeros (controls)
dset$treatment <- sample(c(rep(1, n.subjects/2), rep(0, n.subjects/2)),
                         size = n.subjects, replace = FALSE)
# Each subject's mean outcome depends on their arm
dset.mean <- control.mean + treatment.effect * dset$treatment
dset$outcome <- rnorm(n.subjects, mean = dset.mean, sd = outcome.sd)
# Let us look at the first (up to) 20 rows of the dataset:
head(dset, n = 20)
# Let us plot histograms of the outcomes, by arm:
par(mfrow = c(2, 1), oma = c(0, 0, 0, 0))
# Use common axis limits and bin edges for the two arms so that the plots
# are directly comparable, extending slightly beyond the observed range
x.lo <- min(dset$outcome) - outcome.sd/10
x.hi <- max(dset$outcome) + outcome.sd/10
bin.edges <- seq(from = x.lo, to = x.hi, length.out = 20)
hist(dset$outcome[dset$treatment == 0],
     xlab = 'Acuity (logMAR)',
     ylab = 'Frequency',
     main = 'Control group',
     col = 'blue',
     xlim = c(x.lo, x.hi),
     breaks = bin.edges)
hist(dset$outcome[dset$treatment == 1],
     xlab = 'Acuity (logMAR)',
     ylab = 'Frequency',
     main = 'Treatment group',
     col = 'red',
     xlim = c(x.lo, x.hi),
     breaks = bin.edges)
# We can also calculate the mean outcome in each arm
tapply(dset$outcome, dset[,'treatment'], mean)
# and the difference in means (estimated treatment effect):
mean(dset$outcome[dset$treatment == 1]) - mean(dset$outcome[dset$treatment == 0])
# Now change the treatment effect to be non-zero. For example,
# suppose that the average outcome in the treatment group is 0.7
# units larger than in the control group.
# i.e., set treatment.effect = 0.7 and regenerate and plot dset.
# Let us quickly clear up our environment before moving on.
rm(list=ls())
## A function to generate data
# Since we will want to generate a dataset multiple times below,
# we have written a function to make the data.
# We have also specified default values for the inputs.
#
# Generate a simulated two-arm trial dataset.
# Inputs:
#   n.subjects       - total number of subjects (must be even: half per arm)
#   control.mean     - mean outcome in the control arm
#   treatment.effect - difference in mean outcome (treatment minus control)
#   outcome.sd       - standard deviation of the outcome within each arm
# Returns a data frame with one row per subject and columns
# study.id, treatment (0/1) and outcome (continuous).
gen.data <- function(n.subjects = 100,
                     control.mean = 0.5, treatment.effect = 0.7, outcome.sd = 0.3) {
  # Guard: exactly half of the subjects go to each arm, so the total must
  # be even (otherwise the sample() call below fails with an obscure error)
  stopifnot(n.subjects %% 2 == 0)
  dset <- data.frame(study.id = 1:n.subjects)
  # Randomise: shuffle n/2 ones (treated) and n/2 zeros (controls)
  dset$treatment <- sample(c(rep(1, n.subjects/2), rep(0, n.subjects/2)),
                           size = n.subjects, replace = FALSE)
  # Each subject's mean outcome depends on their arm
  mean.vector <- control.mean + treatment.effect*dset$treatment
  dset$outcome <- rnorm(n.subjects, mean = mean.vector, sd = outcome.sd)
  dset
}
# Try out the function
gen.data() # when you do not specify any of the inputs, the default value is used for that input
# Override some inputs by name; unspecified inputs keep their default values
gen.data(control.mean = 1, treatment.effect = 0, outcome.sd = 0.3)
gen.data(control.mean = 1, treatment.effect = 1, outcome.sd = 0.1)
# Replace ?? with your chosen inputs
# NOTE: the next line is a fill-in-the-blank exercise -- it is not valid R
# and will not run until the ?? placeholders are replaced
gen.data(n.subjects = ??, control.mean = ??, treatment.effect = ??, outcome.sd = ??)
#### UNDERSTANDING HOW RANDOMISATION BALANCES CONFOUNDERS ####
# Suppose that how compromised acuity is at 0 months also affects the outcome.
# The baseline acuity is classified as either severely compromised / 'severe' (1)
# or not (0).
# Let us generate our dataset again, but this time subjects also have a
# (binary) baseline value (called x.severe, either 0 or 1).
# If a subject starts as 'severe' (1), their outcome is shifted by some
# amount (called severe.effect below, with a default value of -0.3).
# There is a probability severe.prob (default of 0.5) of starting in the
# 'severe' class.
# Because treatment is randomised, the proportion of subjects with 'severe'
# should be similar in each arm. By chance, they may be quite different for any
# given dataset, but overall, on average, the arms should be well balanced.
#
# Generate a simulated two-arm trial dataset with a binary baseline
# severity covariate that also shifts the mean outcome.
# Inputs:
#   n.subjects       - total number of subjects (must be even: half per arm)
#   control.mean     - mean outcome for a non-severe control subject
#   treatment.effect - shift in mean outcome for treated subjects
#   outcome.sd       - standard deviation of the outcome around its mean
#   severe.prob      - probability of being 'severe' (1) at baseline
#   severe.effect    - shift in mean outcome for 'severe' subjects
# Returns a data frame with columns study.id, x.severe, treatment, outcome.
gen.data.severe <- function(n.subjects = 100,
                            control.mean = 0.5, treatment.effect = 0.7, outcome.sd = 0.3,
                            severe.prob = 0.5, severe.effect = -0.3) {
  # Guard: exactly half of the subjects go to each arm, so the total must
  # be even (otherwise the sample() call below fails with an obscure error)
  stopifnot(n.subjects %% 2 == 0)
  dset <- data.frame(study.id = 1:n.subjects)
  # Baseline severity is drawn independently of treatment,
  # because treatment is randomised below
  dset$x.severe <- rbinom(n.subjects, 1, severe.prob)
  dset$treatment <- sample(c(rep(1, n.subjects/2), rep(0, n.subjects/2)),
                           size = n.subjects, replace = FALSE)
  # Mean outcome combines the arm effect and the severity effect
  mean.vector <- control.mean + treatment.effect*dset$treatment + severe.effect*dset$x.severe
  dset$outcome <- rnorm(n.subjects, mean = mean.vector, sd = outcome.sd)
  dset
}
# Let us generate a dataset using our default inputs,
# and calculate the proportion severe in each arm,
# and the mean outcome for each group of subjects.
# Generate data:
data.temp <- gen.data.severe()
# Proportion with severe infection in each arm
# (the mean of a 0/1 variable is a proportion):
with(data.temp, mean(x.severe[treatment == 0]))
with(data.temp, mean(x.severe[treatment == 1]))
# Mean outcome by whether 'severe' and treatment:
tapply(data.temp$outcome, data.temp[, c('x.severe', 'treatment')], mean)
# Highlight the four lines of code above that generate and explore data.temp,
# and run them a few times, and reflect on the outputs.
#### BIAS IN ESTIMATION?? ####
# Let us now analyse the data. For each dataset that we simulate, we will estimate
# the treatment effect, with a 95% confidence interval (CI).
# *NB*. You do not need to worry about how we analyse the data,
# please rather spend this time understanding the other concepts.
# You can just apply the function we provide below to a dataset.
# This is the function we will use to analyse the data:
#
# Estimate the treatment effect (difference in mean outcome, treatment
# minus control) with a 95% CI based on a normal approximation.
# Input:  data.in - data frame with columns 'treatment' (0/1) and 'outcome'
# Returns a named numeric vector c(est, lower, upper), rounded to 3 dp.
analyse.data <- function(data.in){
  # using central limit theorem
  # Extract each arm's outcomes once, instead of re-subsetting repeatedly
  y.treat   <- data.in$outcome[data.in$treatment == 1]
  y.control <- data.in$outcome[data.in$treatment == 0]
  mean.diff <- mean(y.treat) - mean(y.control)
  # Standard error of the difference in means
  est.sd <- sqrt(var(y.treat)/length(y.treat) + var(y.control)/length(y.control))
  est.point <- mean.diff
  # 95% CI: point estimate +/- 1.96 standard errors
  est.ci.lower <- mean.diff - 1.96*est.sd
  est.ci.upper <- mean.diff + 1.96*est.sd
  return(round(c(est = est.point, lower = est.ci.lower, upper = est.ci.upper), 3))
}
# Importantly, we assume that we do not record the variable x.severe (baseline severity)
# and thus do not include it in the analysis, even though it affects our outcome.
# For example, here we generate two datasets, and analyse them:
data.temp <- gen.data.severe(treatment.effect = 0.2)
analyse.data(data.temp)
data.temp <- gen.data.severe(treatment.effect = 0.2)
analyse.data(data.temp)
## Let us repeat this process of generating and analysing the data multiple times,
# and explore our results:
# First we specify our inputs
n.sims <- 500              # Number of times to repeat the study
treatment.effect.in <- 0.8 # Input treatment effect
# Then we generate and analyse the datasets, saving the results in df.ests
df.ests <- data.frame(study.number = 1:n.sims, est = NA, lower = NA, upper = NA)
for (ii in seq_len(n.sims)) {
  data.ii <- gen.data.severe(treatment.effect = treatment.effect.in)
  ests.ii <- analyse.data(data.ii)
  # Store this study's point estimate and CI limits in row ii
  df.ests[ii, c('est', 'lower', 'upper')] <- ests.ii[c('est', 'lower', 'upper')]
}
# Let us plot the estimates and CIs for the studies, and add a
# horizontal line showing the true treatment effect.
par(mfrow = c(1, 1), oma = c(0, 0, 0, 0))
# Vertical plotting range: cover every CI, with a little headroom
y.min.temp <- min(df.ests$lower) - 0.1
y.max.temp <- max(df.ests$upper) + 0.1
# Set up empty axes first (type = 'n'), then layer on the CIs,
# the point estimates, and the true-effect reference line
plot(df.ests$study.number, df.ests$est,
     xlab = 'Study number',
     ylab = 'Estimated treatment effect (and 95% CI)',
     main = 'Estimated treatment effect over multiple studies \n (with randomisation)',
     ylim = c(y.min.temp, y.max.temp),
     type = 'n')
arrows(x0 = df.ests$study.number, y0 = df.ests$lower, y1 = df.ests$upper,
       code = 0, col = 'darkgrey')
points(df.ests$study.number, df.ests$est, pch = 20, col = 'red')
abline(h = treatment.effect.in, col = 'blue', lwd = 2)
# Let us calculate the bias, i.e., the average estimate minus the true treatment effect
mean(df.ests$est) - treatment.effect.in
# Think about what all of these outputs show us.
#### BIAS WITHOUT RANDOMISATION ####
# Suppose now that randomisation was not applied.
# Instead doctors could choose treatment assignments, and they based their decisions
# on how compromised baseline acuity was.
# Here we extend our data generation function to take this into account:
# people who were 'severe' were provided the treatment with
# probability treatment.prob.severe (default of 0.8), and people not 'severe'
# received the treatment with probability treatment.prob.notsevere
# (default of 0.4)
#
# Generate a simulated dataset in which treatment assignment depends on
# baseline severity (i.e., severity confounds the treatment-outcome link).
# Returns a data frame with columns study.id, x.severe, treatment, outcome.
gen.data.conf <- function(n.subjects = 100
                          , control.mean = 0.5, treatment.effect = 0.7, outcome.sd = 0.3
                          , severe.prob = 0.5, severe.effect = -0.3
                          , treatment.prob.severe = 0.8, treatment.prob.notsevere = 0.4) {
  out <- data.frame(study.id = 1:n.subjects)
  # Draw baseline severity first ...
  out$x.severe <- rbinom(n.subjects, 1, severe.prob)
  # ... then assign treatment with a probability that depends on severity
  # (no randomisation: the doctor's choice is driven by x.severe)
  prob.treatment <- ifelse(out$x.severe == 1,
                           treatment.prob.severe, treatment.prob.notsevere)
  out$treatment <- rbinom(n.subjects, 1, prob.treatment)
  # Mean outcome combines the arm effect and the severity effect
  out$outcome <- rnorm(n.subjects,
                       mean = control.mean + treatment.effect*out$treatment
                              + severe.effect*out$x.severe,
                       sd = outcome.sd)
  out
}
## Let us repeat the process of generating and analysing the data multiple times,
# as above, and compare our results.
# Let us use the same inputs as above
n.sims <- 500              # Number of times to repeat the study
treatment.effect.in <- 0.8 # Input treatment effect
# And generate and analyse the data, saving the estimates in
# df.ests.conf (conf is for confounding)
df.ests.conf <- data.frame(study.number = 1:n.sims, est = NA, lower = NA, upper = NA)
for (ii in seq_len(n.sims)) {
  data.ii <- gen.data.conf(treatment.effect = treatment.effect.in)
  ests.ii <- analyse.data(data.ii)
  # Store this study's point estimate and CI limits in row ii
  df.ests.conf[ii, c('est', 'lower', 'upper')] <- ests.ii[c('est', 'lower', 'upper')]
}
# Let us explore the results as before.
par(mfrow = c(1, 1), oma = c(0, 0, 0, 0))
# Vertical plotting range: cover every CI, with a little headroom
y.min.temp <- min(df.ests.conf$lower) - 0.1
y.max.temp <- max(df.ests.conf$upper) + 0.1
# Empty axes first (type = 'n'), then the CIs, the point estimates,
# and the true-effect reference line
plot(df.ests.conf$study.number, df.ests.conf$est,
     xlab = 'Study number',
     ylab = 'Estimated treatment effect (and 95% CI)',
     main = 'Estimated treatment effect over multiple studies \n (without randomisation)',
     ylim = c(y.min.temp, y.max.temp),
     type = 'n')
arrows(x0 = df.ests.conf$study.number, y0 = df.ests.conf$lower,
       y1 = df.ests.conf$upper, code = 0, col = 'darkgrey')
points(df.ests.conf$study.number, df.ests.conf$est, col = 'red', pch = 20)
abline(h = treatment.effect.in, col = 'blue', lwd = 2)
# And calculate the bias again
mean(df.ests.conf$est) - treatment.effect.in
# Think about what all of these outputs show us.
# Remember, in each of the two sets of simulations, both severity and treatment
# affect the outcome. In both, we assume that we cannot measure severity and
# cannot include it in our analysis. Our goal is to understand the effect
# of treatment on the outcome.
# What is the interpretation of the estimates in the first simulation
# (with randomisation) and in the second (without randomisation)?
# If you are done, you can play around with different values for some of the inputs.
########## PART TWO ##########
# Here you will get to explore some clinical trial data. These data (from the
# Mycotic Ulcer Therapeutic Exploratory Trial) can only be used for this lab.
# To protect confidentiality, we have added a tiny random value to some of
# the scar values from the actual trial.
# The (secondary) outcome you will focus on here is the scar size at 3 weeks.
# To load the data, use:
# NOTE(review): this assumes 'MuTxT.Rdata' sits in the current working
# directory, and that it contains an object named 'mutxt' (as used in the
# lm() call below) -- confirm with ls() after loading
load('MuTxT.Rdata')
# The variables are:
# patid -- patient ID
# drug -- drug assignment: 0 is natamycin, 1 is voriconazole
# scrape -- scraping: 0 is no, 1 is yes
# age -- age
# sex -- sex
# if_perf -- 1 if a perforation happened
# scar_baseline -- scar size at baseline
# scar_3W -- scar size at 3 weeks
# Try to estimate the treatment effect on scar size at 3 weeks.
# Consider controlling for baseline scar size.
# Is it necessary to control for a covariate? Why might one do so?
# An example of an analysis is
# A linear model of 3-week scar size on the two randomised assignments
# (no baseline adjustment here -- consider also adding scar_baseline)
summary(lm(scar_3W ~ drug + scrape, data=mutxt))