####################################################################################################
## EPID 7500
## Steve Bellan
## Fall 2017
## UGA
## Licensed for reuse with attribution as CC-BY NC
## (https://creativecommons.org/licenses/by-nc/4.0/legalcode)
####################################################################################################
require(tidyverse)

####################################################################################################
## Simulating a study using a function
####################################################################################################
## Let's pick up where we left off last week. Here's the same code in
## a more compact form (and removing the extraneous pedagogical parts)

## **** ##
sampSize <- 1000
bnames <- read_csv('https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv')
myStudy <- select(bnames, name, sex) %>% 
  slice(sample(1:n(), sampSize, replace=F)) ## randomly select sampSize rows
myStudy <- myStudy %>% mutate(sex = factor(sex)) ## make sex a factor
levels(myStudy$sex) <- c('male','female') ## change the levels from boy/girl
myStudy <- mutate(myStudy, smoker = sample(c(T,F), n(), replace=T)) ## add smoker data
## set up lung cancer risk table for sex/smoker status
lifetimeLungCancerRisk <- crossing(sex = factor(c('male','female'), levels = c('male','female')),
 ## specified levels for sex in the same order as above in myStudy to avoid warnings during join
                                   smoker = c(T,F))
lifetimeLungCancerRisk <- lifetimeLungCancerRisk %>% 
  mutate(risk = c(14,116,13,172)/1000) ## from https://www.ncbi.nlm.nih.gov/pubmed/7895211

## Join tables to add risk to myStudy
myStudy <- left_join(myStudy, lifetimeLungCancerRisk, by=c('sex','smoker'))
myStudy <- myStudy %>% ## simulate cancer
  mutate(cancer = rbinom(n=n(), size=1, prob=risk))
## **** ##

myStudy

####################################################################################################
## 1A look at some cross tabulations of your simulation
xtabs(~cancer, myStudy)
xtabs(~smoker + cancer, myStudy)
xtabs(~smoker + cancer + sex, myStudy)

## 1B Guess: How big of a sample size do you think you would have
## needed for this study to be reasonably sure smoking really is
## correlated with cancer risk? Write your guess down:


####################################################################################################
## 2A Convert the code between the two (## **** ##) above into a
##    function with an argument for sampSize.  Try it out on a few
##    sampSizes.

## 2B Add a logical argument called printXtab to your function that,
##    if TRUE, prints the crosstabulation of smoker X cancer.

## 2C Run the function for sampSize of 10, 20, 30, 100, and 1000 using
## sapply(), printing the cross tabulation using the argument from 2B

####################################################################################################
## Debugging
####################################################################################################
## R's built in debugging mode allows you to walk around inside a function.
?browser
?debug

## Rstudio also has built in debugger capabilities
## https://support.rstudio.com/hc/en-us/articles/200713843?version=1.0.153&mode=desktop#stopping-on-a-line

## 3A Call debug() on the name of your function and then run your
## function again, using R's debugger to step through it and explore
## objects as you go.

## 3B Add the following line to the top of your function

## if(browse) browser()

## And add the argument browse with a default of FALSE to your
## function's arguments. Then run your function once with the default
## for browse, and once with browse=T.


####################################################################################################
## Risk Ratios
####################################################################################################
## As you have learned or will be learning in your introductor epi
## class, a common way to measure an effect of a risk ractor is with a
## risk ratio. In our simulated study, we don't have any real notion
## of time (just lifetimes). So our measure of cancer is prevalence.

mean(myStudy$cancer) ## prevalence of cancer

## We can measure prevalence in both the smoking & non-smoking groups
prevXsmoker <- group_by(myStudy, smoker) %>%
    summarize(prevalence = mean(cancer))
prevXsmoker

## And we can calculate a prevalence ratio of smoker vs non-smoker

prevRatioSmoking <- summarize(prevXsmoker, prevRatioSmoking = prevalence[smoker==T] / prevalence[smoker==F]) %>%
    .$prevRatioSmoking
prevRatioSmoking

## 4A Create a function that takes the output of your study simulator
## and calculates this prevalence ratio


####################################################################################################
## Statistical inference with non-parametric resampling tests
####################################################################################################

## We're testing whether smoking is correlated with lung
## cancer. Statistical inference is based on the idea of generating a
## null hypothesis of no relationship and then seeing whether the
## available data can be used to reject the null hypothesis. Here the
## null hypothesis would be that smoking and cancer are NOT
## associated. If this was true then the fact that some individuals
## smoke and some do not is totally unrelated to their cancer outcome
## column, and any correlation we see between smoking and cancer in
## our data is just a product of random noise.

## We can resample our data in a way such that the null hypothesis IS
## TRUE in the resampled data. All we neeed to do, is to jumble up the
## smoker variable, while keeping the cancer column the same:

myStudyJumbled <- myStudy %>%
    mutate(smoker = sample(smoker, size = n(), replace=FALSE))
myStudyJumbled

## 5A Use your function from 4 above to calculate the prevalence ratio
## for data with smoker jumbled.

## 5B Write a for() loop that repeats this 10 times (jumble, calculate
## prevalence ratio, store prevalence ratio in a vector).

## *Hint*: Here's some example code that shows how you can use a for()
## loop to fill a vector

runs <- 10
vectorToFill <- as.numeric() ## initiate an empty vector that's of class numeric
for(i in 1:runs) {
## append something to the vector (starts out empty then gets filled up one at a time)
    vectorToFill <- c(vectorToFill, i*2 + 10) 
}
vectorToFill

## 5C Now do it 1000 times. Plot a histogram of these prevalence
## ratios from the jumbled data using ggplot() + geom_histogram()

## 5D add a vertical red line for where the prevalence ratio from the
## true data lies.

## 5E Interpret this plot, what does it tell you?

## 5F Repeat 5C-D above for a simulated study sample size of 100. What is the difference?