## EPID 7500 - Day 5 ## Fall 2017 ## Steve Bellan ## Start with binomial distribution tutorial (change to dplyr version). ## Simulating data & Visualizing it require(tidyverse) ## We'll use the dplyr & ggplot libraries to create simulated data and examine it. ## 1. Load in data on baby names with which to create some fake data bnames <- read_csv('https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv') names(bnames) ## read about the sample function ?sample ## 1A Use the sample function to pick 5 random names from the above data set while sampling WITH replacement sample(bnames$name, 5, replace=T) ## Edit the below code to select 1000 random names WITHOUT replacement, along with sex sampSize <- 1000 myStudy <- select(bnames, name, sex) %>% slice(sample(1:n(), sampSize, replace=F)) ## Fix the levels of sex so that they say 'male' & 'female' instead of 'boy' & girl' unique(myStudy$sex) myStudy <- myStudy %>% mutate(sex = factor(sex)) levels(myStudy$sex) levels(myStudy$sex) <- c('male','female') levels(myStudy$sex) myStudy myStudy <- myStudy %>% mutate(smoker = sample(c(T,F), n(), replace=T)) ## Read this abstract: https://www.ncbi.nlm.nih.gov/pubmed/7895211 ## Let's create a table of lifetime cancer risk. We need every combination of sex/smoker. crossing(1:2,8:9) crossing(1:2,head(letters,2), tail(letters,2)) ## What does crossing do? ## Use crossing to make a 4 row tibble that has every combination of sex and smoker status lifetimeLungCancerRisk <- crossing(sex = factor(c('male','female')), smoker = c(T,F)) ## Fill in the lung cancer risk for each of the 4 combinations of sex and smoker lifetimeLungCancerRisk <- lifetimeLungCancerRisk %>% mutate(risk = c(14,116,13,172)/1000) ## risk amongst Canadians from above reference lifetimeLungCancerRisk levels(lifetimeLungCancerRisk$sex) levels(myStudy$sex) ?left_join myStudy <- left_join(myStudy, lifetimeLungCancerRisk, by=c('sex','smoker')) ## You may have gotten a warning above. Figure out the source of the warning by ## examining the levels of sex in both data tables. Should you be worried? ## How can you make sure it hasnt caused a problem? myStudy ?rbinom args(rbinom) rbinom(n=1, size=10, prob=.5) rbinom(n=10, size=1, prob=.5) myStudy <- myStudy %>% mutate(cancer = rbinom(n=n(), size=1, prob=risk)) myStudy ## Cross tabulation can be done quikly in R with xtabs ?xtabs xtabs(~cancer, myStudy) xtabs(~smoker + cancer, myStudy) xtabs(~ smoker + cancer + sex, myStudy) ## there are Epidemiology packages in R that make contingency tables ## of the kind epidemiologists are more familiar with. install.packages('epitools') require(epitools) ?epitab epitab(x=myStudy$smoker, y=myStudy$cancer) ## Do your best to interpret this table with the help of ?epitab ## Let's plot some of the results p1 <- ggplot(myStudy, aes(x=smoker,y=cancer, fill=sex)) p1 + geom_bar(stat = 'identity') + ylab('number of cancer cases') p1 + geom_jitter(aes(color=sex)) ## Bonus Question: Change the proportion of the population who smokes to 10%. ## Add height to the data set ## https://tall.life/height-percentile-calculator-age-country/ myStudy <- myStudy %>% mutate(height = ifelse(sex=='male', rnorm(n(), 69.3, 2.94), rnorm(n(), 63.8, 2.8))) myStudy ggplot(myStudy, aes(height, cancer)) + geom_jitter() ggplot(myStudy, aes(height, cancer, color=smoker)) + geom_jitter() ## Bonus Question: Simulate cancer outcomes where above cancer rates are true for the average height, but where # risk increases 10% for every 1 inch increase in height above the mean (& decreases 10% for each 1" below the mean)