#################################################################################################### ## EPID 7500 ## Steve Bellan ## Fall 2017 ## UGA ## Licensed for reuse with attribution as CC-BY NC ## (https://creativecommons.org/licenses/by-nc/4.0/legalcode) #################################################################################################### require(tidyverse) #################################################################################################### ## Simulating a study using a function #################################################################################################### ## Load names from the internet bnames <- read_csv('https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv') bnames <- bnames %>% mutate(sex=factor(sex)) levels(bnames$sex) <- c('male','female') ## change the levels from boy/girl bnames ## Create table of cancer risk by sex & smoking from Canadian study lifetimeLungCancerRisk <- crossing(sex = factor(c('male','female'), levels = c('male','female')), smoker = c(T,F)) lifetimeLungCancerRisk <- lifetimeLungCancerRisk %>% ## from https://www.ncbi.nlm.nih.gov/pubmed/7895211 mutate(risk = c(14,116,13,172)/1000) createMyStudy <- function(sampSize=1000, printXtabs=F, groupRisks = lifetimeLungCancerRisk, browse=F) { if(browse) browser() myStudy <- select(bnames, name, sex) %>% slice(sample(1:n(), sampSize, replace=F)) ## randomly select sampSize rows myStudy <- mutate(myStudy, smoker = sample(c(T,F), n(), replace=T)) ## add smoker data ## Join tables to add risk to myStudy myStudy <- left_join(myStudy, groupRisks, by=c('sex','smoker')) myStudy <- myStudy %>% ## simulate cancer mutate(cancer = rbinom(n=n(), size=1, prob=risk)) if(printXtabs) print(xtabs(~smoker + cancer, myStudy)) return(myStudy) } createMyStudy(sampSize=100, printXtabs = T, browse=F) ## Calculate a risk ratio CalcPrevRatio <- function(data, browse=F) { if(browse) browser() prevRatioSmoking <- group_by(data, smoker) %>% summarize(prevalence = mean(cancer)) %>% summarize(prevRatioSmoking = prevalence[smoker==T] / prevalence[smoker==F]) %>% .$prevRatioSmoking return(prevRatioSmoking) } myStudy <- createMyStudy(sampSize=100) ## simulate CalcPrevRatio(data=myStudy, browse=F) ## analyze #################################################################################################### ## Statistical inference with non-parametric resampling tests #################################################################################################### ## We're testing whether smoking is correlated with lung cancer. Statistical inference is based on ## the idea of generating a null hypothesis of no relationship and then seeing whether the available ## data can be used to reject the null hypothesis. Here the null hypothesis would be that smoking ## and cancer are NOT associated. If this was true then the fact that some individuals smoke and ## some do not is totally unrelated to their cancer outcome column, and any correlation we see ## between smoking and cancer in our data is just a product of random noise. ## We can resample our data in a way such that the null hypothesis IS TRUE in the resampled ## data. All we neeed to do, is to jumble up the smoker variable, while keeping the cancer column ## the same: permuteStat <- function(data, statFun=CalcPrevRatio, permutations = 999, showHist=T, updateFreq = 200, browse=F) { if(browse) browser() actualPrevRatio <- CalcPrevRatio(data) message('study statistic is ', signif(actualPrevRatio,3)) output <- vector('double', permutations) for(i in seq_along(output)) { if(i %% updateFreq ==0) print(i) jumbledSim <- data %>% mutate(smoker = sample(smoker, size = n(), replace=FALSE)) output[i] <- statFun(jumbledSim) } prevRatiosAll <- tibble(prevRatio = c(actualPrevRatio, output), randomized = c(F, rep(T, length(output))) ) if(showHist) { g <- ggplot(prevRatiosAll, aes(x=prevRatio, fill=randomized)) + scale_x_log10() + geom_histogram(bins = 30) + geom_vline(xintercept = actualPrevRatio, col = 'red') print(g) } prevRatiosAll } myStudy <- createMyStudy(sampSize=300, printXtabs=T) args(permuteStat) permutationTestResults <- permuteStat(myStudy, permutations=1, browse=T) permutationTestResults <- permuteStat(myStudy, permutations=999, browse=F) calcPvalue <- function(allVals, browse=F) { if(browse) browser() upperTail <- allVals %>% summarize(mean(prevRatio >= prevRatio[!randomized])) lowerTail <- allVals %>% summarize(mean(prevRatio <= prevRatio[!randomized])) min(upperTail, lowerTail) } calcPvalue(permutationTestResults, browse=T) calcPvalue(permutationTestResults) #################################################################################################### ## 1. What is the true prevalence ratio amongst men and women? lifetimeLungCancerRisk %>% summarise(truePrevRatio = mean(risk[smoker==T]) / mean(risk[smoker==F])) mean(lifetimeLungCancerRisk$risk[lifetimeLungCancerRisk$smoker==T]) / mean(lifetimeLungCancerRisk$risk[lifetimeLungCancerRisk$smoker==F]) ## with(lifetimeLungCancerRisk, mean(risk[smoker==T)]) / mean(risk[smoker==F])) #################################################################################################### ## 2. Create a new table where the risk ratio is only 1.5 lifetimeLungCancerRisk lifetimeLungCancerRisk1.5 <- lifetimeLungCancerRisk %>% group_by(sex) %>% mutate(risk = replace(risk, smoker==F, risk[smoker!=F]/1.5 )) lifetimeLungCancerRisk lifetimeLungCancerRisk1.5 lifetimeLungCancerRisk1.5 %>% summarise(truePrevRatio = mean(risk[smoker==T]) / mean(risk[smoker==F])) ## 3. Run the study simulator with this risk myStudy1.5 <- createMyStudy(sampSize = 1000, printXtabs = T, groupRisks = lifetimeLungCancerRisk1.5) CalcPrevRatio(myStudy1.5) permutationTestResults1.5 <- permuteStat(myStudy1.5) calcPvalue(permutationTestResults1.5) #################################################################################################### ## 4. Power calculation: If your sample size is 200, what is the probability that your study would be ## able to detect that smoking had a statistically significant effect on cancer (P > .05)? lifetimeLungCancerRisk1.5 runs <- 1000 pValues <- rep(NA, runs) for(i in seq_along(pValues)) { message(i) myStudy1.5 <- createMyStudy(sampSize = 200, printXtabs = F, groupRisks = lifetimeLungCancerRisk1.5) ## permutationTestResults1.5 <- permuteStat(myStudy1.5, updateFreq = 500, permutations = 99) ## pValues[i] <- calcPvalue(permutationTestResults1.5) myModel1.5 <- glm(cancer ~ smoker, data = myStudy1.5, family = binomial('logit')) pValues[i] <- coef(summary(myModel1.5))['smokerTRUE',4] ## p value on smoking } hist(pValues, breaks=100) mean(pValues <= .05) ## 5. In your own words define a p Value ## Read more about permutation tests: https://en.wikipedia.org/wiki/Resampling_(statistics)#Permutation_tests ## 6. Logistic regression is another common way to evaluate binary outcomes myModel1.5 <- glm(cancer ~ smoker, family=binomial(link='logit'), data=myStudy1.5) summary(myModel1.5) ## summary of results coef(myModel1.5) ## just coefficients confint(myModel1.5) ## confidence intervals around them exp(coef(myModel1.5))['smokerTRUE'] ## the odds ratio between smokers and non-smokers exp(confint(myModel1.5))['smokerTRUE',] ## the confidence interval around that odds ratio coef(summary(myModel)) ## model coefficients and statistics coef(summary(myModel))['smokerTRUE',4] ## p value on smoking #################################################################################################### #################################################################################################### ## Probability distributions ## Above, we considered a binary outcome (cancer). To simulate a binary outcome as a function of ## explanatory variables, we use the binomial distribution. What other kinds of random variables are ## there? #################################################################################################### ## 1. Continuous random variables are very common (weight, height, hemoglobin level, etc). We often ## use the normal (AKA gaussian) distribution to model these because many continuous variables have ## this distribution stdNormalDensityFxn <- function(x) dnorm(x, mean = 0, sd = 1) curve(stdNormalDensityFxn, from = -3, to = 3) samp <- rnorm(1000, mean=0, sd=1) hist(samp) hist(samp, breaks = 100) ## #################################################################################################### ## Confidence Intervals #################################################################################################### ## 1. In the above example we considered three binary random variables (smoking, sex, cancer). Let's ## now consider a continuous variable, simulating the effect of vitamin A supplementation on changes ## in a child's height over time ## https://www.ncbi.nlm.nih.gov/pubmed/10648265 createHeightStudy <- function(sampSize=1000, printXtabs=F, meanHeights, browse=F) { if(browse) browser() myStudy <- select(bnames, name, sex) %>% slice(sample(1:n(), sampSize, replace=F)) ## randomly select sampSize rows myStudy <- myStudy %>% mutate(vitaminA = sample(c(T,F), n(), replace=T)) %>% mutate(height = NA) %>% mutate(height = replace(height, vitaminA==T, rnorm(sum(vitaminA), 100, 5))) %>% mutate(height = replace(height, vitaminA==F, rnorm(sum(!vitaminA), 95, 5))) return(myStudy) } htDat <- createHeightStudy() htDat ggplot(htDat, aes(height, fill=vitaminA)) + geom_histogram(position='dodge', bins=60) ## 1. Create a function that calculates a difference in heights from your data ## 2. Use the t.test function to compare heights between the two groups tresult <- t.test(filter(htDat, vitaminA==T) %>% .$height, filter(htDat, vitaminA==F) %>% .$height) tresult names(tresult) tresult$p.value tresult$conf.int