## Visualizing infectious disease data in R
## Clinic on the Meaningful Modeling of Epidemiological Data
## International Clinics on Infectious Disease Dynamics and Data (ICI3D) Program
## African Institute for Mathematical Sciences, Muizenberg, RSA
## (C) Steve Bellan, 2010
## Updated by Juliet Pulliam, 2018
## NOTE: The comments will guide you through the tutorial but you
## should make sure you understand what the code is doing. Many
## plotting parameters are assigned to ?. This will give you an
## error, you should try out different values for these parameters as
## suggested in the comments or find them yourselves in a helpfil.
######################################################################
## Section 1: Plotting prevalence
######################################################################
######################################################################
## 1A - First, you need to import the data you want to plot! To do
## this you need to make sure you are in the same working directory as
## your data.
getwd() # shows you what directory you are currently in
## Next, you should change the working directory to where the data is kept.
## Define a variable "path" that gives the file path to the directory
## where the data are stored.
path <- ??? # Replace the question marks with a character string telling R
# where to look for the data
## and you can replace it with where you have saved the data set.
setwd(path)
######################################################################
## 2A - Loading and exploring the data.
bots.dat <- read.csv('HIV_Botswana.csv')
head(bots.dat, 5)
## Let's plot a pie chart of the HIV prevalence in 1994.
pie(c(bots.dat$prevHIV[bots.dat$year == 1994], # Proportion HIV+
1 - bots.dat$prevHI[bots.dat$year == 1994]), # Proportion HIV-
labels = c("HIV +", "HIV -"),
col = c("red","blue"),
main = "HIV prevalence in Botswana, 1994")
## Change the above code to plot the prevalence of HIV in 2001.
## While pie charts are good for showing % breakdown between
## categories, bar plots can do the same with another variable
## added.
barplot(bots.dat$prevHIV,
col = "red")
## This is a good start but we should ALWAYS label axes and have a
## title.
barplot(bots.dat$prevHIV * 100, # *100 yields %
names.arg = bots.dat$year, # labels bars by year
col = "red",
xlab = "year",
ylab = "% HIV+",
main = "HIV Prevalence in Botswana, 1990-2007")
## This is much better, but it doesn't really show the HIV- population
## like the pie chart did. If we want this we can make what's called
## a "stacked bar plot". To do this we need a matrix with two rows,
## the first that specifies the proportion HIV+ and the second the
## proportion HIV-. We'll have to bind two rows using rbind():
?rbind
prev.frame <- 100*rbind(bots.dat$prevHIV,1 - bots.dat$prevHIV)
prev.frame
barplot(prev.frame,
names.arg = bots.dat$year, # labels of the bars
xlab = "year",
ylab = "% of population",
main = "HIV Prevalence in Botswana, 1990-2007",
col = c("red","blue"),
beside = FALSE, # Stacks bars
space = 0, # TRY 1, .2, 0
border = F) # TRY TRUE & FALSE
## We still need to show our audience that red means HIV+ and blue
## means HIV- so they don't think its the other way around. To do this
## we can add a legend.
?legend
legend("top", # Location of the legend
c("HIV-", "HIV+"),
col = c("blue", "red"),
pch = ???, # TRY 15 or 19 or other integers
bg = ???, # TRY various colors
cex = ???) # TRY .5, 1, 2
## Alternatively we can just plot prevalence as a series of points.
plot(bots.dat$year,
bots.dat$prevHIV *100,
xlab = "year",
ylab = "% of population",
main = "HIV Prevalence in Botswana, 1990-2007",
ylim = c(0,30)) # Sets y axis limits.
######################################################################
## Section 2: Plotting incidence.
######################################################################
## We'll be plotting measles data in this example. The data was
## provided by Benjamin Bolker and is available
## online from the International Infectious Disease Data Archive
## (IIDDA) at http://iidda.mcmaster.ca.
measles.Lon <- read.csv("measlesCleanLon.csv")
######################################################################
## 2A - Now let's explore the data we've imported.
head(measles.Lon) # Shows first 5 rows
tail(measles.Lon, 20) # Shows last 20 rows
names(measles.Lon) # Shows variable names
dim(measles.Lon) # (rows, columns) of data
nrow(measles.Lon) # rows of data
ncol(measles.Lon) # columns of data
######################################################################
## 2B - Now we should check to see what types (classes) of variables
## are in this data set. Let's ask R what class it thinks our
## variables are:
class(measles.Lon$cases)
## Great! It recognized that the number of cases is an integer. Let's
## check the date variable.
class(measles.Lon$date)
## Uh oh, it thinks that date is a factor. Factors are categorical
## variables, but time is continuous. To deal with this, you will
## want to convert the dates to a format that R understands as a date.
## One way to do this is using the build-in as.Date() function.
measles.Lon$date <- as.Date(as.character(measles.Lon$date))
head(measles.Lon)
class(measles.Lon$date)
range(measles.Lon$date)
## Now R turned the date into strings of characters, but recognizes
## them as real dates which is why it gave the correct range.
######################################################################
## 2C - Now we're ready to make our first plot.
plot(measles.Lon$date, # X variable
measles.Lon$cases) # Y variable
## Not bad, right? But those circles are a little bit big and make
## the pattern more difficult to follow. Let's try some other
## options.
plot(measles.Lon$date,
measles.Lon$cases,
type = ii, # TRY "p","l","b","s","h"
xlab = "Time", # x axis label
ylab = "# cases (weekly)", # y axis label
main = "London Measles Incidence, 1944-1994" # plot title
)
## Much better!
######################################################################
## 2D - What other information can we add to make this
## plot more insightful? Well, measles vaccine was introduced
## publically in 1968. Let's see if this helps us understand
## the trends better.
## First let's define the year in which vaccination began in R's time
## format:n
vaccine.year <- as.Date("1968-01-01")
arrows(vaccine.year, 5000, # 1st (x,y) coordinate of arrow
vaccine.year, 4500, # 4500 and 2000
length = .1, # TRY .1 and 1
lwd = 2, # TRY 2 and 4
lty = 1, # TRY 1 and 2
col = "red")
## Great! But now we should label this arrow, so that readers know
## what it means.
text(vaccine.year, 5000, # (x,y) coordinate of text
"beginning of vaccination", # text to plot
pos = 2, # TRY 1,2,3 and 4
col = "red")
######################################################################
## PROBLEM 1A
######################################################################
## The MMR vaccine (measles, mumps, rubella) vaccine was
## introduced to London in 1988 and administered more widely than the
## measles vaccine itself. WRITE CODE to add a blue, labeled arrow
## that shows when this occurred.
######################################################################
######################################################################
## PROBLEM 1B
######################################################################
## Let's compare the measles data from London with that from
## Liverpool. We'll add the Liverpool data to the plot with the
## lines() function. You should modify the below code to use different
## colors, or line types and a legend to distinguish between the two
## incidence data sets. Choose your graphical parameters so you can
## see both the data sets as clearly as possible.
## NOTE: Many of the graphical parameters you might be interested in
## changing are described in:
?par
measles.LP <- read.csv("measlesCleanLP.csv")
measles.LP$date <- as.Date(measles.LP$date)
## lines() & points() add more lines/points to an already open plot
?lines()
######################################################################
######################################################################
######################################################################
## 2E - Does it look like there is seasonality in measles incidence?
## Let's see if there might be. To do this we're going to sum up all
## the cases for each month of the year and then plot by month using a
## boxplot.
month.char <- format(measles.Lon$date,
format = ???) # TRY "%B", "%m", "%b" and
# "%b-%Y". Pick the value
# that gives you months as
# three letters.
head(month.char,50)
## Now we should really tell R that month is a factor with levels
## equal to month.abb, a default vector in R that gives:
print(month.abb)
month.char <- factor(month.char,levels=month.abb)
head(month.char,40)
## We're going to try to look at measles incidence seasonality with
## several different plots. Often its useful to have multiple panels
## in a plot window if you want to see many plots side by side:
par(mfrow = c(1,2)) # This says that the next two
# plots will be plotted in a
# window with 1 row of
# panels and 2 columns.
## some other common graphics tweaks you may find handy
par('ps'=17, ## set font size to 18
bty = 'n', ## turn off box around plot
lwd = 2, mar = c(6,7,6,.5),
las = 2) ## set all line widths to twice as big
## First lets do a simple scatterplot.
plot(as.numeric(month.char),
measles.Lon$cases,
xaxt = "n", # Tells R not to plot an x-axis so
# we can do it manually
bty = "n", # Doesn't plot a box around the plot
xlab = "",
ylab = ???, # WHAT IS AN APPROPRIATE LABEL?
main = "Weekly Measles Incidence\n in London by Month",
pch = 16, # TRY 5, 19, 20, 21
cex = 1) # TRY 3, 1, .4
## In the above example, we did not plot an x-axis so that we could do
## it manually. We'll label x axis with the vector:
axis(1,
at = 1:12,
labels = month.abb)
## This gives us some idea about the distribution of weekly incidence
## in months, but there are so many points on top of each other we
## can't really see the mean. Let's use tapply() to get mean weekly
## incidence by month.
?tapply # WHAT DOES tapply() do?
mean.by.months <- tapply(measles.Lon$cases, month.char, mean)
print(mean.by.months)
## Now let's add a line to the plot using lines(), which takes x & y
## inputs like plot() but adds data to a plot that's already open.
lines(1:12,
mean.by.months)
## WOW! The means are quite low on this plot. This means there are a
## lot of points clustered at the bottom of each month. Let's plot a
## boxplot. Boxplots are good for showing the distribution of samples
## by the level of another variable. First, read about the function.
?boxplot
boxplot(measles.Lon$cases ~ month.char,
range = 1, # TRY 1 and 3
ylab ='', # ADD AN APPROPORIATE LABEL
bty = "n",
main ="Seasonality in \nmeasles incidence")
## Both these curves focus closely on the distribution of weekly
## incidence but don't show trends in the mean very well. This is
## because the y axis is scaled so large to include really big
## values. If we make a barplot of just the monthly means we can focus
## more on them.
barplot(mean.by.months,
names.arg = names(mean.by.months),
ylab = "mean weekly incidence",
main = "Mean Weekly Incidence Aggregated \nby Month in London, 1944-1994")
## Say you have decided that there is not enough space between the
## three plots. If you'd like to change the margins you can use the
## following code:
par(mfrow=c(1,3), mar=c(4,4,4,6)) # Each of the elements of the
# "mar" vector specify the
# margins for the bottom,
# left, top & right of the
# plot, respectively. Change
# the third number to a 2 and
# see what happens.
## some other common graphics tweaks you may find handy
par('ps'=18, ## set font size to 18
bty = 'n', ## turn off box around plot
lwd = 2) ## set all line widths to twice as big
######################################################################
## PROBLEM 2
######################################################################
## Compare the seasonality of London and Liverpool's measles incidence
## in a single plot where the mean weekly incidence of measles is
## displayed with a line for each city. Remember to use a legend!.
######################################################################
######################################################################
## SECTION 3 - Histograms
######################################################################
## In this section you'll learn how to plot and manipulate
## histograms. The data we'll play with is the number of hookworm eggs
## per gram of feces counted from a large population of people and
## whether or not they wear shoes.
par(mfrow=c(1,1), # Let's reset our paneling and
mar=c(4,4,4,4)) # margin parameters back to
# their original values.
hookworm <- read.csv("hookworms.csv")
head(hookworm)
nrow(hookworm)
## Check out some of the parameters of the histogram.
?hist
hist(hookworm$epg)
## That's rather ugly and doesn't show us much about the data. This
## is because the bins are extremeley big so we don't see much
## variability. If we make the bins smaller we'll see more
## information. The way to do this is to set the breaks between
## bins. To divide the data into "pretty" bins we can use the pretty
## function.
?pretty
my.breaks <- pretty(range(hookworm$epg),100)
hist(hookworm$epg,
breaks = my.breaks,
xlab = "eggs per gram",
main = "Egg Per Gram Distribution")
## This looks much better, but we still cannot see the data clearly
## because most of the EPG values were < 10,000 and the xlimit goes
## out to 40,000 because the distribution is very skewed. To get a
## better look at the majority of the data we can restrict ourselves
## to a smaller x axis limit. Let's add even more breaks to improve
## our resolution.
my.breaks <- pretty(range(hookworm$epg),1000)
hist(hookworm$epg,
breaks = my.breaks,
xlim = c(0, 10000),
xlab = "eggs per gram",
main = "Egg Per Gram Distribution")
## Notice how many of the EPG counts were near 0! We couldn't see this
## when the bins were much larger. If we make the bins even smaller
## what happens?
######################################################################
## PROBLEM 3
######################################################################
## Create breaks that are spaced at intervals of 10, and 1 on two
## panels side by side and compare how the data is presented.
######################################################################
######################################################################
## PROBLEM 4
######################################################################
## Create two histograms side by side on panels that show the
## distribution of EPG for people who do and do not where shoes. Be
## very careful with your axes limits! If two plots have different
## scaled axes then visual comparisons may lead to incorrect
## impressions.
######################################################################
######################################################################
## PROBLEM 5
######################################################################
## The function pdf() can be used to create pdf files of your
## graphics. Similarly, the function jpeg() can be used to create
## *.jpg graphics files. Use pdf() to create one pdf file with the
## plots from PROBLEMS 1,2,3 & 4.
## To initialize a pdf you use the line:
## pdf("myfilename.pdf")
## Then any graphics you create in R afterwards go into that pdf file
## until you turn off the graphics device by running the line of code:
## dev.off()
## By default R will add each plot window as a different page to your
## pdf file. You can fiddle with this option by trying
## pdf("myfilename.pdf", onefile = FALSE)
######################################################################