## This tutorial has been REPLACED by another Tutorial 4 ☺

## Visualizing infectious disease data in R
## Clinic on the Meaningful Modeling of Epidemiological Data
## International Clinics on Infectious Disease Dynamics and Data (ICI3D) Program
## African Institute for Mathematical Sciences, Muizenberg, RSA
## (C) Steve Bellan, 2010
## Updated by Juliet Pulliam, 2018

## NOTE: The comments will guide you through the tutorial but you
## should make sure you understand what the code is doing.  Many
## plotting parameters are assigned to ?.  This will give you an
## error, you should try out different values for these parameters as
## suggested in the comments or find them yourselves in a helpfil.

######################################################################
## Section 1: Plotting prevalence
######################################################################

######################################################################
## 1A - First, you need to import the data you want to plot!  To do
## this you need to make sure you are in the same working directory as
## your data.

getwd() # shows you what directory you are currently in

## Next, you should change the working directory to where the data is kept.
## Define a variable "path" that gives the file path to the directory 
## where the data are stored.

path <- ??? # Replace the question marks with a character string telling R
						# where to look for the data

## and you can replace it with where you have saved the data set.
setwd(path)

######################################################################
## 2A - Loading and exploring the data.  

bots.dat <- read.csv('HIV_Botswana.csv')
head(bots.dat, 5)

## Let's plot a pie chart of the HIV prevalence in 1994. 

pie(c(bots.dat$prevHIV[bots.dat$year == 1994], # Proportion HIV+
      1 - bots.dat$prevHI[bots.dat$year == 1994]), # Proportion HIV-
    labels = c("HIV +", "HIV -"),
    col = c("red","blue"),
    main = "HIV prevalence in Botswana, 1994")

## Change the above code to plot the prevalence of HIV in 2001.


## While pie charts are good for showing % breakdown between
## categories, bar plots can do the same with another variable
## added.

barplot(bots.dat$prevHIV,
        col = "red")

## This is a good start but we should ALWAYS label axes and have a
## title.

barplot(bots.dat$prevHIV * 100,         # *100 yields % 
        names.arg = bots.dat$year,      # labels bars by year
        col = "red",
        xlab = "year",
        ylab = "% HIV+",
        main = "HIV Prevalence in Botswana, 1990-2007")

## This is much better, but it doesn't really show the HIV- population
## like the pie chart did.  If we want this we can make what's called
## a "stacked bar plot".  To do this we need a matrix with two rows,
## the first that specifies the proportion HIV+ and the second the
## proportion HIV-. We'll have to bind two rows using rbind():

?rbind

prev.frame <- 100*rbind(bots.dat$prevHIV,1 - bots.dat$prevHIV)
prev.frame
barplot(prev.frame,
        names.arg = bots.dat$year,       # labels of the bars
        xlab = "year",
        ylab = "% of population",
        main = "HIV Prevalence in Botswana, 1990-2007",
        col = c("red","blue"),
        beside = FALSE,                 # Stacks bars
        space = 0,                    # TRY 1, .2, 0
        border = F)                   # TRY TRUE & FALSE

## We still need to show our audience that red means HIV+ and blue
## means HIV- so they don't think its the other way around. To do this
## we can add a legend.
?legend

legend("top",                      # Location of the legend
       c("HIV-", "HIV+"),
       col = c("blue", "red"),
       pch = ???,                     # TRY 15 or 19 or other integers
       bg = ???,                      # TRY various colors
       cex = ???)                     # TRY .5, 1, 2       

## Alternatively we can just plot prevalence as a series of points.
plot(bots.dat$year,
     bots.dat$prevHIV *100,
     xlab = "year",
     ylab = "% of population",
     main = "HIV Prevalence in Botswana, 1990-2007",
     ylim = c(0,30))                    # Sets y axis limits.

######################################################################
## Section 2: Plotting incidence.
######################################################################

## We'll be plotting measles data in this example. The data was
## provided by Benjamin Bolker and is available
## online from the International Infectious Disease Data Archive
## (IIDDA) at http://iidda.mcmaster.ca.


measles.Lon <- read.csv("measlesCleanLon.csv")

######################################################################
## 2A - Now let's explore the data we've imported.
head(measles.Lon)                       # Shows first 5 rows
tail(measles.Lon, 20)                   # Shows last 20 rows

names(measles.Lon)                     # Shows variable names

dim(measles.Lon)                       # (rows, columns) of data
nrow(measles.Lon)                      # rows of data
ncol(measles.Lon)                      # columns of data

######################################################################
## 2B - Now we should check to see what types (classes) of variables
## are in this data set.  Let's ask R what class it thinks our
## variables are:

class(measles.Lon$cases)
## Great! It recognized that the number of cases is an integer. Let's
## check the date variable.

class(measles.Lon$date)
## Uh oh, it thinks that date is a factor.  Factors are categorical
## variables, but time is continuous.  To deal with this, you will
## want to convert the dates to a format that R understands as a date.
## One way to do this is using the build-in as.Date() function.

measles.Lon$date <- as.Date(as.character(measles.Lon$date))
head(measles.Lon)
class(measles.Lon$date)
range(measles.Lon$date)

## Now R turned the date into strings of characters, but recognizes
## them as real dates which is why it gave the correct range.

######################################################################
## 2C - Now we're ready to make our first plot.

plot(measles.Lon$date,                  # X variable
     measles.Lon$cases)                 # Y variable

## Not bad, right?  But those circles are a little bit big and make
## the pattern more difficult to follow.  Let's try some other
## options.
plot(measles.Lon$date,
		 measles.Lon$cases,
		 type = ii,                      # TRY "p","l","b","s","h"
		 xlab = "Time",                     # x axis label
		 ylab = "# cases (weekly)",         # y axis label
		 main = "London Measles Incidence, 1944-1994" # plot title
)
## Much better!

######################################################################
## 2D - What other information can we add to make this
## plot more insightful?  Well, measles vaccine was introduced
## publically in 1968. Let's see if this helps us understand
## the trends better.

## First let's define the year in which vaccination began in R's time
## format:n
vaccine.year <- as.Date("1968-01-01")

arrows(vaccine.year, 5000,             # 1st (x,y) coordinate of arrow
       vaccine.year, 4500,              # 4500 and 2000
       length = .1,                   # TRY .1 and 1
       lwd = 2,                      # TRY 2 and 4
       lty = 1,                      # TRY 1 and 2
       col = "red")

## Great! But now we should label this arrow, so that readers know
## what it means.

text(vaccine.year, 5000,                # (x,y) coordinate of text
     "beginning of vaccination",        # text to plot
     pos = 2,                         # TRY 1,2,3 and 4
     col = "red")
######################################################################
## PROBLEM 1A
######################################################################
## The MMR vaccine (measles, mumps, rubella) vaccine was
## introduced to London in 1988 and administered more widely than the
## measles vaccine itself.  WRITE CODE to add a blue, labeled arrow
## that shows when this occurred.
######################################################################

######################################################################
## PROBLEM 1B
######################################################################
## Let's compare the measles data from London with that from
## Liverpool. We'll add the Liverpool data to the plot with the
## lines() function. You should modify the below code to use different
## colors, or line types and a legend to distinguish between the two
## incidence data sets. Choose your graphical parameters so you can
## see both the data sets as clearly as possible.

## NOTE: Many of the graphical parameters you might be interested in
## changing are described in:

?par

measles.LP <- read.csv("measlesCleanLP.csv")
measles.LP$date <- as.Date(measles.LP$date)

## lines() & points() add more lines/points to an already open plot

?lines()

######################################################################
######################################################################


######################################################################
## 2E - Does it look like there is seasonality in measles incidence?
## Let's see if there might be. To do this we're going to sum up all
## the cases for each month of the year and then plot by month using a
## boxplot.

month.char <- format(measles.Lon$date,
                     format = ???)    # TRY "%B", "%m", "%b" and
                                        # "%b-%Y".  Pick the value
                                        # that gives you months as
                                        # three letters.
head(month.char,50)
## Now we should really tell R that month is a factor with levels
## equal to month.abb, a default vector in R that gives:
print(month.abb)

month.char <- factor(month.char,levels=month.abb)
head(month.char,40)

## We're going to try to look at measles incidence seasonality with
## several different plots. Often its useful to have multiple panels
## in a plot window if you want to see many plots side by side:

par(mfrow = c(1,2))                     # This says that the next two
                                        # plots will be plotted in a
                                        # window with 1 row of
                                        # panels and 2 columns.
## some other common graphics tweaks you may find handy
par('ps'=17,  ## set font size to 18
    bty = 'n', ## turn off box around plot
    lwd = 2, mar = c(6,7,6,.5),
las = 2) ## set all line widths to twice as big

## First lets do a simple scatterplot.
plot(as.numeric(month.char),
     measles.Lon$cases,
     xaxt = "n",                  # Tells R not to plot an x-axis so
                                  # we can do it manually
     bty = "n",                   # Doesn't plot a box around the plot
     xlab = "",                   
     ylab = ???,                  # WHAT IS AN APPROPRIATE LABEL?
     main = "Weekly Measles Incidence\n in London by Month",
     pch = 16,                         # TRY 5, 19, 20, 21
     cex = 1)                         # TRY 3, 1, .4

## In the above example, we did not plot an x-axis so that we could do
## it manually. We'll label x axis with the vector:


axis(1,
     at = 1:12,
     labels = month.abb)

## This gives us some idea about the distribution of weekly incidence
## in months, but there are so many points on top of each other we
## can't really see the mean.  Let's use tapply() to get mean weekly
## incidence by month.

?tapply                                 # WHAT DOES tapply() do?

mean.by.months <- tapply(measles.Lon$cases, month.char, mean)

print(mean.by.months)

## Now let's add a line to the plot using lines(), which takes x & y
## inputs like plot() but adds data to a plot that's already open.

lines(1:12,
      mean.by.months)

## WOW! The means are quite low on this plot.  This means there are a
## lot of points clustered at the bottom of each month. Let's plot a
## boxplot. Boxplots are good for showing the distribution of samples
## by the level of another variable. First, read about the function.

?boxplot

boxplot(measles.Lon$cases ~ month.char,
        range = 1,                    # TRY 1 and 3
        ylab ='',                     # ADD AN APPROPORIATE LABEL
        bty = "n",
        main ="Seasonality in \nmeasles incidence")

## Both these curves focus closely on the distribution of weekly
## incidence but don't show trends in the mean very well.  This is
## because the y axis is scaled so large to include really big
## values. If we make a barplot of just the monthly means we can focus
## more on them.

barplot(mean.by.months,
        names.arg = names(mean.by.months),
        ylab = "mean weekly incidence",
        main = "Mean Weekly Incidence Aggregated \nby Month in London, 1944-1994")

## Say you have decided that there is not enough space between the
## three plots.  If you'd like to change the margins you can use the
## following code:

par(mfrow=c(1,3), mar=c(4,4,4,6))       # Each of the elements of the
                                        # "mar" vector specify the
                                        # margins for the bottom,
                                        # left, top & right of the
                                        # plot, respectively. Change
                                        # the third number to a 2 and
                                        # see what happens.

## some other common graphics tweaks you may find handy
par('ps'=18,  ## set font size to 18
    bty = 'n', ## turn off box around plot
    lwd = 2) ## set all line widths to twice as big
    
    
######################################################################
## PROBLEM 2
######################################################################
## Compare the seasonality of London and Liverpool's measles incidence
## in a single plot where the mean weekly incidence of measles is
## displayed with a line for each city.  Remember to use a legend!.
######################################################################


######################################################################
## SECTION 3 - Histograms
######################################################################

## In this section you'll learn how to plot and manipulate
## histograms. The data we'll play with is the number of hookworm eggs
## per gram of feces counted from a large population of people and
## whether or not they wear shoes.

par(mfrow=c(1,1),                       # Let's reset our paneling and
    mar=c(4,4,4,4))                     # margin parameters back to
                                        # their original values.

hookworm <- read.csv("hookworms.csv")

head(hookworm)
nrow(hookworm)

## Check out some of the parameters of the histogram.

?hist

hist(hookworm$epg)

## That's rather ugly and doesn't show us much about the data.  This
## is because the bins are extremeley big so we don't see much
## variability.  If we make the bins smaller we'll see more
## information. The way to do this is to set the breaks between
## bins. To divide the data into "pretty" bins we can use the pretty
## function.

?pretty

my.breaks <- pretty(range(hookworm$epg),100)

hist(hookworm$epg,
     breaks = my.breaks,
     xlab = "eggs per gram",
     main = "Egg Per Gram Distribution")

## This looks much better, but we still cannot see the data clearly
## because most of the EPG values were < 10,000 and the xlimit goes
## out to 40,000 because the distribution is very skewed.  To get a
## better look at the majority of the data we can restrict ourselves
## to a smaller x axis limit.  Let's add even more breaks to improve
## our resolution.

my.breaks <- pretty(range(hookworm$epg),1000)

hist(hookworm$epg,
     breaks = my.breaks,
     xlim = c(0, 10000),
     xlab = "eggs per gram",
     main = "Egg Per Gram Distribution")

## Notice how many of the EPG counts were near 0! We couldn't see this
## when the bins were much larger. If we make the bins even smaller
## what happens?

######################################################################
## PROBLEM 3
######################################################################
## Create breaks that are spaced at intervals of 10, and 1 on two
## panels side by side and compare how the data is presented.
######################################################################

######################################################################
## PROBLEM 4
######################################################################
## Create two histograms side by side on panels that show the
## distribution of EPG for people who do and do not where shoes.  Be
## very careful with your axes limits! If two plots have different
## scaled axes then visual comparisons may lead to incorrect
## impressions.
###################################################################### 


######################################################################
## PROBLEM 5
######################################################################
## The function pdf() can be used to create pdf files of your
## graphics.  Similarly, the function jpeg() can be used to create
## *.jpg graphics files. Use pdf() to create one pdf file with the
## plots from PROBLEMS 1,2,3 & 4.

## To initialize a pdf you use the line:

## pdf("myfilename.pdf")

## Then any graphics you create in R afterwards go into that pdf file
## until you turn off the graphics device by running the line of code:

## dev.off()

## By default R will add each plot window as a different page to your
## pdf file.  You can fiddle with this option by trying
## pdf("myfilename.pdf", onefile = FALSE)

######################################################################