##################################################
## Introduction to Sampling & Variability
## Steve Bellan
## Meaningful Modeling of Epidemiological Data 2016
## African Institute of Mathematical Sciences, Muizenberg, South Africa

##################################################
## Participatory Coding
##################################################

## How does travel amongst Haitian immigrants in Florida impact active TB prevalence?

## Case-control: compare TB cases & matched controls in terms of whether they've traveled?

## Alternative question phrasing: How would the prevalence of active TB be different if none of this population had traveled? i.e. indirect effects

## consider other indirect relationships between travel and having active TB (trt disruption)

##############################
## We'll go with this question to begin
## 
## Cross-sectional: screen for active TB amongst a population and also ask people if they've traveled in the past 2 years?

## think about selection biases?

npop <- 1000
travelProb <- 0.63
baselineTB <- .01
truePrevR <- 2.1

tbPrev <- c(notravel = baselineTB,
            travel = baselineTB*truePrevR)

set.seed(1)

dat <- data.frame(id = 1:npop)
dat$travel <- rbinom(npop, 1, prob = travelProb)
table(dat$travel)
dat$travel <- factor(dat$travel, labels = names(tbPrev))
dat$tb <- rbinom(npop, 1, prob = tbPrev[dat$travel])
head(dat); tail(dat)
xtabs(~ tb + travel, data = dat)

prevR <- function(dat) {
    prevTrav <- with(dat, mean(tb[travel=='travel']))
    prevNoTrav <- with(dat, mean(tb[travel=='notravel']))
    return(prevTrav/prevNoTrav)
}
realPR <- prevR(dat)

permTest <- function(dat, nperms = 999, browse=F) {
    if(browse) browser()
    permPrevR <- rep(NA, nperms)
    for(ii in 1:nperms) {
        permDat <- dat
        permDat$tb <- sample(permDat$tb, size = npop,
                             replace=F)
        permPrevR[ii] <- prevR(permDat)
    }
    return(permPrevR)
}

nullPrevR <- permTest(dat, nperms =9999, browse=F)
par('ps'=25, mar = rep(6,4))
hist(log(nullPrevR), xlab = 'prevalence ratio (trav/notrav)',
     col = 'black', breaks = 100
     ) ## make sure PR is shown on a scale so that 2 and 0.5 look reasonably similar
abline(v = log(1), lty = 2, col = 'red', lwd = 3) ## null hypothesis
abline(v = log(realPR), lty = 1, col = 'orange', lwd = 5) ## real PR

pValFxn <- function(FullVector, ObsVal)
    2*min(mean(FullVector >= ObsVal), mean(FullVector <= ObsVal))

pValFxn(c(nullPrevR,realPR), realPR)

powCalc <- function(nruns = 10, npop = 1000, travelProb = 0.63,
                    ## tbPrev = c(notravel = 0.01, travel = 0.021),
                    baselineTB = .01,
                    truePrevR = 2.1,
                    nperms = 999, browse=F) {
    pvalV <- rep(NA, nruns)
    if(browse) browser()
    tbPrev <- c(notravel = baselineTB,
                travel = baselineTB*truePrevR)
    for(jj in 1:nruns) {
        print(jj)
        ## simulate "real data"
        dat <- data.frame(id = 1:npop)
        dat$travel <- rbinom(npop, 1, prob = travelProb)
        dat$travel <- factor(dat$travel, labels = names(tbPrev))
        dat$tb <- rbinom(npop, 1, prob = tbPrev[dat$travel])
        realPR <- prevR(dat)
        if(browse) {
            print(xtabs(~ tb + travel, data = dat))
            print(realPR)
        }
        ## calculate p value from real data
        nullPrevR <- permTest(dat, nperms=nperms, browse=F)
        ## hist(log(nullPrevR),
        ##      xlab = 'prevalence ratio (trav/notrav)',
        ##      col = 'black', breaks = 100
        ##      )
        pvalV[jj] <- pValFxn(c(nullPrevR,realPR), realPR)
    }
    return(mean(pvalV <= .05))
}

## default parameters
pow <- powCalc(nruns = 100, nperms = 199, browse=F)


pow <- powCalc(nruns = 100, nperms = 199, browse=F, baselineTB = .1)

baselineTBprevs <- seq(0.001, .1, length = 10)
numPows <- length(baselineTBprevs)
powV <- rep(NA, numPows)
for(bb in 1:numPows) {
    powV[bb] <- powCalc(baselineTB = baselineTBprevs[bb],
                        truePrevR = 2.1)
}

par('ps'=25, mar = rep(6,4), lwd = 2)
plot(baselineTBprevs, powV, xlab = 'baseline TB prev',
     type = 'l', lwd = 3, ylim = c(0,1),
     ylab = 'statistical power', bty = 'n')