##################################################
# Chapter 3 examples
##################################################
library(tidyverse)
library(scales)
library(grid)
library(viridis)
library(quantreg)

set.seed(1233)

source('grab_nhanes.R')

##################################################
# Visualize variability around a regression line
##################################################
regression_variability_plot <- function(dset, type){
    dset <- dset %>% mutate(p=predict(lm(y~x)))
    panel_title <- switch(type, y='Variability around E(Y)', regression='Variability around E(Y|X)')
    gg <- ggplot(dset)
    gg <- gg+ggtitle(panel_title)
    if(type == 'y'){
        gg <- gg+geom_segment(aes(x=x, y=y, xend=x, yend=mean(y)),
                              colour='gray',
                              size=0.75)
        gg <- gg+geom_hline(aes(yintercept=mean(y)),
                            size=0.75)
    } else {
        gg <- gg+geom_segment(aes(x=x, y=y, xend=x, yend=p),
                              colour='gray',
                              size=0.75)
        gg <- gg+geom_line(aes(x=x, y=p),
                           size=0.75)
    }
    gg <- gg+geom_point(aes(x=x, y=y), size=2)
    gg <- gg+labs(x='\nx', y='y\n')
    print(gg)
}
dset <- tibble(x=seq(10), y=2*x+rnorm(10))
regression_variability_plot(dset, 'y')
regression_variability_plot(dset, 'regression')

##################################################
# Initial exploration: age 20-59 years
##################################################
iset <- bind_rows(grab_and_curate_nhanes(1999, minage=20, maxage=59),
                  grab_and_curate_nhanes(2015, minage=20, maxage=59))

##############################################
# A single binary covariate
##############################################
bmi_density_plot <- function(dset){
    gg <- ggplot(dset)
    gg <- gg+geom_histogram(aes(BMI,
                                y=..density..,
                                colour=YEAR,
                                fill=YEAR),
                            binwidth=1,
                            alpha=0.3,
                            position='identity')
    gg <- gg+theme(legend.position=c(0.95, 0.95),
                   legend.justification=c('right', 'top'))
    gg <- gg+scale_x_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=10),
                                expand=c(0, 0))
    gg <- gg+scale_y_continuous(name='Density',
                                limits=c(0, 0.08),
                                breaks=seq(0, 0.08, by=0.02),
                                expand=c(0, 0))
    gg <- gg+scale_colour_viridis(name='', discrete=TRUE, begin=0.75, end=0.25)
    gg <- gg+scale_fill_viridis(name='', discrete=TRUE, begin=0.75, end=0.25)
    print(gg)
    # Overall summary
    oset <- dset %>% summarize(N=n(),
                               Mean=mean(BMI),
                               Var=var(BMI),
                               SD=sd(BMI))
    # Year-specific summary
    yset <- dset %>% group_by(YEAR)
    yset <- yset %>% summarize(N=n(),
                               Mean=mean(BMI),
                               Var=var(BMI),
                               SD=sd(BMI))
    # Calculate within-group variance
    wset <- yset %>% summarize(WGVar=sum(Var*N)/sum(N))
    # Alternative calculation
    yset <- dset %>% group_by(YEAR)
    yset <- yset %>% mutate(BMISCALED=scale(BMI, scale=FALSE))
    yset <- yset %>% ungroup()
    wset <- yset %>% summarize(WGVar=var(BMISCALED))
    # Proportion of variance explained
    with(oset, (Var-wset$WGVar)/Var)
}
bmi_density_plot(iset)

##################################################
# Age scatterplot
##################################################
age_scatterplot <- function(dset, correction=0.5){
    gset <- dset %>% group_by(AGE)
    gset <- gset %>% summarize(BMI=mean(BMI))
    gg <- ggplot(dset, aes(x=AGE, y=BMI))
    gg <- gg+geom_point(color='gray', size=1)
    gg <- gg+geom_segment(data=gset,
                          aes(x=AGE-correction,
                              y=BMI,
                              xend=AGE+correction,
                              yend=BMI),
                          size=1)
    gg <- gg+geom_smooth(method='lm', color='black', se=FALSE, size=1)
    gg <- gg+scale_x_continuous(name='Age (years)',
                                limits=c(19, 61),
                                breaks=seq(20, 60, by=10),
                                expand=c(0, 0))
    gg <- gg+scale_y_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=10),
                                expand=c(0, 0))
    print(gg)
    # Compare simple averages and fitted regression values
    fit <- lm(BMI~AGE, data=dset)
    fset <- dset %>% distinct(AGE)
    fset <- fset %>% arrange(AGE)
    fset <- fset %>% mutate(REG=predict(fit, newdata=tibble(AGE)))
    fset <- full_join(gset, fset, by='AGE')
    fset <- bind_cols(fset %>% filter(between(AGE, 20, 29)),
                      fset %>% filter(between(AGE, 30, 39)),
                      fset %>% filter(between(AGE, 40, 49)),
                      fset %>% filter(between(AGE, 50, 59)))
    names(fset) <- gsub('[.0-9]*$', '', names(fset))
    return(fset)
}
age_scatterplot(iset)

##################################################
# Visualize BMI trends by age, sex, and race/ethnicity
##################################################
bmi_trends_plot <- function(dset){
    dset <- dset %>% mutate(AGEGROUP=sub('≤', '""<=', AGEGROUP),
                            AGEGROUP=sub('<=', '""<=', AGEGROUP),
                            AGEGROUP=sub('>', '"">', AGEGROUP),
                            RACE=factor(RACE, labels=sub(' ', '~', levels(RACE))),
                            YEAR=sub('-', '-\n', YEAR))
    mset <- dset %>% group_by(SEX, AGEGROUP, RACE, YEAR)
    mset <- mset %>% summarize(BMI=mean(BMI))
    gg <- ggplot(dset)
    gg <- gg+geom_dotplot(aes(x=YEAR,
                              y=BMI,
                              colour=interaction(SEX, AGEGROUP),
                              fill=interaction(SEX, AGEGROUP)),
                          binaxis='y',
                          binwidth=1.5,
                          stackdir='centerwhole',
                          alpha=0.4)
    gg <- gg+geom_line(data=mset,
                       aes(x=YEAR,
                           y=BMI,
                           group=interaction(SEX, RACE)),
                       size=0.75)
    gg <- gg+facet_grid(RACE~SEX+AGEGROUP, labeller=label_parsed)
    gg <- gg+scale_x_discrete(name='')
    gg <- gg+scale_y_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=20),
                                expand=c(0, 0))
    gg <- gg+scale_colour_viridis(name='', discrete=TRUE, begin=0.75, end=0.25)
    gg <- gg+scale_fill_viridis(name='', discrete=TRUE, begin=0.75, end=0.25)
    print(gg)
}
bmi_trends_plot(iset)

##################################################
# QQ normal plot to assess normality
##################################################
qqnormal_plot <- function(dset){
    gg <- ggplot(dset, aes(sample=Residuals))
    gg <- gg+ggtitle('QQ-normal plot')
    gg <- gg+stat_qq(geom='point',
                     position='jitter',
                     alpha=0.1,
                     size=2)
    gg <- gg+geom_hline(yintercept=0, colour='darkgray', linetype='dashed')
    gg <- gg+scale_x_continuous(name='Theoretical quantiles')
    gg <- gg+scale_y_continuous(name='Residual quantiles',
                                limits=c(-20, 40),
                                expand=c(0, 0))
    print(gg)
}

##################################################
# Residuals vs fitted plot to assess linearity
# and constant variance
##################################################
resvsfit_plot <- function(dset){
    gg <- ggplot(dset)
    gg <- gg+ggtitle('Residuals-versus-fitted-values plot')
    gg <- gg+geom_point(aes_string(x='Fitted',
                                   y='Residuals'),
                        position='jitter',
                        alpha=0.1,
                        size=2)
    gg <- gg+geom_hline(yintercept=0, colour='darkgray', linetype='dashed')
    gg <- gg+scale_x_continuous(name='Fitted')
    gg <- gg+scale_y_continuous(name='Residuals',
                                limits=c(-20, 40),
                                expand=c(0, 0))
    print(gg)
}

##################################################
# Fit linear regression with interaction
##################################################
linear_model <- function(dset,
                         alpha=0.05,
                         include.interaction=FALSE,
                         continuous.age=FALSE,
                         scaled.age=FALSE){
    if(scaled.age)
        dset <- dset %>% mutate(AGE=AGE/10)
    if(include.interaction){
        if(continuous.age){
            equation <- 'BMI~SEX+RACE+AGE*YEAR'
            Label <- 'bmi_trends_continuous_age'
        } else {
            equation <- 'BMI~SEX+RACE+AGEGROUP*YEAR'
            Label <- 'bmi_trends_categorical_age'
        }
    } else {
        if(continuous.age){
            equation <- 'BMI~SEX+RACE+AGE+YEAR'
            Label <- 'bmi_trends_continuous_age_no_interaction'
        } else {
            equation <- 'BMI~SEX+RACE+AGEGROUP+YEAR'
            Label <- 'bmi_trends_categorical_age_no_interaction'
        }
    }
    fit <- lm(as.formula(equation), data=dset)
    cset <- coef(summary(fit))
    cset <- data.frame(Variable=rownames(cset), cset, check.names=FALSE)
    cset <- cset %>% transmute(Variable=sub('(Intercept)', 'Intercept', Variable, fixed=TRUE),
                               Variable=gsub(paste0('(', paste(names(dset), collapse='|'), ')'), '\\1 = ', Variable),
                               Variable=sub('GROUP', '', Variable),
                               Variable=sub('.*:.*', 'AGE-YEAR interaction', Variable),
                               Coefficient=Estimate,
                               SE=`Std. Error`,
                               # '95% lower'=Estimate-qnorm(1-alpha/2)*`Std. Error`,
                               # '95% upper'=Estimate+qnorm(1-alpha/2)*`Std. Error`,
                               'P-value'=`Pr(>|t|)`)
    if(continuous.age)
        cset <- cset %>% mutate(Variable=sub('AGE =', 'AGE', Variable))
    if(!include.interaction & !continuous.age){
        rset <- dset %>% mutate(Fitted=fitted(fit),
                                Residuals=resid(fit))
        qqnormal_plot(rset)
        resvsfit_plot(rset)
        # white women young early
        wwye <- matrix(c(1, 1, 0, 0, 0, 0, 0, 0), 1)
        wwye.ht <- multcomp::glht(fit, linfct=wwye)
        confint(wwye.ht)
        # Mexican women old early
        mwoe <- matrix(c(1, 1, 0, 1, 0, 0, 1, 0), 1)
        mwoe.ht <- multcomp::glht(fit, linfct=mwoe)
        confint(mwoe.ht)
        # race using F-test and LRT
        fit.norace <- update(fit, .~.-RACE)
        ftest.race <- anova(fit, fit.norace, test='F')
        lrt.race <- anova(fit, fit.norace, test='LRT')
        cat('Unadjusted R-squared:', summary(fit)$r.squared, '\n')
        cat('Adjusted R-squared:  ', summary(fit)$adj.r.squared, '\n')
    }
    lstats <- tibble(Age=ifelse(continuous.age, 'Continuous', 'Categorical'),
                     Interaction=ifelse(include.interaction, 'Yes', 'No'),
                     Parameters=length(coef(fit))+1,
                     'Log likelihood'=as.numeric(logLik(fit)),
                     AIC=AIC(fit),
                     BIC=BIC(fit))
    return(lstats)
}
cat_ni <- linear_model(iset, include.interaction=FALSE, continuous.age=FALSE)
con_ni <- linear_model(iset, include.interaction=FALSE, continuous.age=TRUE)
cat_wi <- linear_model(iset, include.interaction=TRUE, continuous.age=FALSE)
con_wi <- linear_model(iset, include.interaction=TRUE, continuous.age=TRUE)
lset <- bind_rows(cat_ni, con_ni, cat_wi, con_wi)

##################################################
# Load and filter NHANES demographic, BMI, PE data
##################################################
grab_and_curate_nhanes2 <- function(year, minage=0, maxage=84, minbmi=0, maxbmi=Inf){
    bset <- grab_nhanes('bmx', year)
    dset <- grab_nhanes('demo', year)
    paset <- grab_nhanes('paq',year)
    dset <- dset %>% select(SEQN, RIAGENDR, RIDAGEYR, RIDRETH1)
    bset <- bset %>% select(SEQN, BMXBMI)
    paset <- paset %>% select(SEQN,PAQ650)
    rset <- right_join(dset, bset, by='SEQN')
    mset <- right_join(rset, paset, by='SEQN')
    mset <- mset %>% rename(ID='SEQN',
                            SEX='RIAGENDR',
                            AGE='RIDAGEYR',
                            RACE='RIDRETH1',
                            BMI='BMXBMI',
                            PE='PAQ650')
    mset <- mset %>% mutate(SEX=factor(SEX,
                                       levels=seq(2),
                                       labels=c('Male', 'Female')),
                            AGEGROUP=factor(AGE <= 50,
                                            levels=c('TRUE', 'FALSE'),
                                            labels=c('<=50', '>50')),
                            AGEDECADE=cut(AGE,
                                          breaks=seq(0, 90, by=10),
                                          labels=paste(seq(0, 80, by=10),
                                                       seq(10, 90, by=10),
                                                       sep='-'),
                                          right=FALSE),
                            RACE=factor(RACE,
                                        levels=c(3, 4, 1, 2, 5),
                                        labels=c('Non-Hispanic White',
                                                 'Non-Hispanic Black',
                                                 'Mexican American',
                                                 'Other Hispanic',
                                                 'Other/Mixed')),
                            PE=ifelse(PE != 1 & PE != 2, NA, PE),
                            PE=factor(PE,
                                      levels=c(2, 1),
                                      labels=c('No', 'Yes')),
                            YEAR=paste(year, year+1, sep='-'))
    mset <- mset %>% filter(between(AGE, minage, maxage),
                            between(BMI, minbmi, maxbmi))
    return(mset)
}

##################################################
# Example of confounding
##################################################
pe_analysis <- function(dset){
    fit0 <- lm(AGE~PE, data=dset)
    cat('AGE~PE:', coef(fit0)['PEYes'], '\n')
    fit1 <- lm(BMI~AGE, data=dset)
    cat('BMI~AGE:', coef(fit1)['AGE'], '\n')
    fit2 <- lm(BMI~PE, data=dset)
    cat('BMI~PE:', coef(fit2)['PEYes'], '\n')
    fit3 <- lm(BMI~PE+AGE, data=dset)
    cat('BMI~PE+AGE:', coef(fit3)['PEYes'], '\n')
}
peset <- grab_and_curate_nhanes2(2015, minage=20, maxage=59)
pe_analysis(peset)

############################################
# Quantile regression
############################################
simple_quantile_regression_plot <- function(dset, correction=0.5){
    q25 <- rq(BMI~AGE, data=dset, tau=0.25)
    q75 <- rq(BMI~AGE, data=dset, tau=0.75)
    pset <- tibble(AGE=seq(20, 59))
    pset <- pset %>% mutate(Predict25=predict(q25, newdata=pset),
                            Predict75=predict(q75, newdata=pset))
    gg <- ggplot(dset)
    gg <- gg+geom_rect(data=pset,
                       aes(xmin=AGE-correction,
                           ymin=Predict25,
                           xmax=AGE+correction,
                           ymax=Predict75),
                       fill='yellow',
                       alpha=0.3)
    gg <- gg+geom_point(aes(x=AGE, y=BMI),
                        color='gray')
    gg <- gg+geom_abline(intercept=coef(q25)[1],
                         slope=coef(q25)[2],
                         size=1)
    gg <- gg+geom_abline(intercept=coef(q75)[1],
                         slope=coef(q75)[2],
                         size=1)
    gg <- gg+scale_x_continuous(name='Age (years)',
                                limits=c(19, 61),
                                breaks=seq(20, 60, by=10),
                                expand=c(0, 0))
    gg <- gg+scale_y_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=10),
                                expand=c(0, 0))
    print(gg)
}
simple_quantile_regression_plot(iset)

multivariate_quantile_regression <- function(dset, tau=0.5){
    lab <- switch(as.character(tau), '0.25'=' (1st)', '0.75'=' (3rd)')
    fit <- rq(BMI~SEX+RACE+AGE+YEAR, data=dset, tau=tau)
    cset <- coef(summary(fit))
    cset <- data.frame(Variable=rownames(cset), cset, check.names=FALSE)
    cset <- cset %>% transmute(Variable=sub('(Intercept)', 'Intercept', Variable, fixed=TRUE),
                               Variable=gsub(paste0('(', paste(names(dset), collapse='|'), ')'), '\\1 = ', Variable),
                               Variable=sub('AGE =', 'AGE', Variable),
                               Variable=sub('GROUP', '', Variable),
                               Variable=sub('.*:.*', 'AGE-YEAR interaction', Variable),
                               Coef.=Value,
                               SE=`Std. Error`,
                               P=`Pr(>|t|)`)
    names(cset)[-1] <- sub('$', lab, names(cset)[-1])
    return(cset)
}

quartile_regression_table <- function(dset){
    q25 <- multivariate_quantile_regression(dset, tau=0.25)
    q75 <- multivariate_quantile_regression(dset, tau=0.75)
    fset <- bind_cols(q25, q75 %>% select(-Variable))
    return(fset)
}
quartile_regression_table(iset)

############################################
# Nonparametric regression - kernel methods
# sample observations for age 2-84 years for
# pedagogical and numerical reasons
############################################
dset <- bind_rows(grab_and_curate_nhanes(1999),
                  grab_and_curate_nhanes(2015))
sset <- dset %>% sample_n(size=1000, replace=FALSE)

regression_subgroup_plot <- function(dset, group){
    group_sym <- sym(group)
    rset <- dset %>% group_by(!!group_sym)
    rset <- rset %>% summarize(Xmin=min(AGE),
                               Xmax=max(AGE),
                               Mean=mean(BMI))
    panel_title <- switch(group,
                          AGEGROUP='Age ≤50 vs >50 years',
                          AGEDECADE='Age 0-10, ..., 80-90 years')
    gg <- ggplot(dset)
    gg <- gg+ggtitle(panel_title)
    gg <- gg+geom_point(aes(x=AGE, y=BMI),
                        color='gray')
    gg <- gg+geom_segment(data=rset,
                          aes(x=Xmin,
                              y=Mean,
                              xend=Xmax,
                              yend=Mean),
                          size=1)
    gg <- gg+scale_x_continuous(name='Age (years)',
                                limits=c(0, 86),
                                breaks=seq(0, 80, by=10),
                                expand=c(0, 0))
    gg <- gg+scale_y_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=10),
                                expand=c(0, 0))
    print(gg)
}
regression_subgroup_plot(sset, 'AGEGROUP')
regression_subgroup_plot(sset, 'AGEDECADE')

############################################
# Manually calculate uniform kernel estimate
# across ages using bandwith 10
############################################
halfwidth <- 5
uset <- sset %>% mutate(Lower=AGE-halfwidth, Upper=AGE+halfwidth)
uset <- uset %>% group_by(Lower, Upper)
uset <- uset %>% mutate(BMIUNIFORM=mean(BMI))

uniform_kernel_plot <- function(dset,
                                example_age=42,
                                halfwidth=5,
                                correction=0.5){
    gg <- ggplot(dset)
    gg <- gg+annotate(geom='rect',
                      xmin=example_age-halfwidth-correction,
                      xmax=example_age+halfwidth+correction,
                      ymin=-Inf,
                      ymax=Inf,
                      fill='red',
                      alpha=0.2)
    gg <- gg+geom_point(aes(x=AGE, y=BMI),
                        color='gray')
    gg <- gg+geom_step(aes(x=AGE, y=BMIUNIFORM),
                       size=1)
    gg <- gg+geom_point(aes(x=example_age, y=unique(BMIUNIFORM[AGE == example_age])),
                        colour='darkred',
                        size=4)
    gg <- gg+scale_x_continuous(name='Age (years)',
                                limits=c(0, 86),
                                breaks=seq(0, 80, by=10),
                                expand=c(0, 0))
    gg <- gg+scale_y_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=10),
                                expand=c(0, 0))
    print(gg)
}
uniform_kernel_plot(uset)

############################################
# Manually calculate normal kernel estimate
############################################
bandwidth <- 3
nset <- sset %>% within({WEIGHTS <- outer(AGE, AGE, function(x, y) dnorm(x-y, sd=bandwidth))
                         BMINORMAL <- apply(WEIGHTS, 1, function(x) sum(BMI*x)/sum(x))
                         rm(WEIGHTS)})

normal_kernel_plot <- function(dset,
                               example_age=42,
                               bandwidth=3,
                               correction=0.5){
    aset <- dset %>% distinct(AGE)
    aset <- aset %>% filter(AGE <= example_age)
    ages <- with(aset, unique(AGE))
    gg <- ggplot(dset)
    for(age in ages){
        example_diff <- abs(example_age-age)
        if(example_diff %in% c(0, 1, 2))
            cat('Diff:', example_diff,
                'Weight:', dnorm(example_diff, sd=bandwidth), '\n')
        gg <- gg+annotate(geom='rect',
                          xmin=example_age-example_diff-correction,
                          xmax=example_age+example_diff+correction,
                          ymin=-Inf,
                          ymax=Inf,
                          fill='red',
                          alpha=dnorm(example_diff, sd=bandwidth))
    }
    gg <- gg+geom_point(aes(x=AGE, y=BMI),
                        color='gray')
    gg <- gg+geom_line(aes(x=AGE, y=BMINORMAL),
                       size=1)
    gg <- gg+geom_point(aes(x=example_age, y=unique(BMINORMAL[AGE == example_age])),
                        colour='darkred',
                        size=4)
    gg <- gg+scale_x_continuous(name='Age (years)',
                                limits=c(0, 86),
                                breaks=seq(0, 80, by=10),
                                expand=c(0, 0))
    gg <- gg+scale_y_continuous(name=parse(text='Body~mass~index~(kg/m^2)'),
                                limits=c(9, 71),
                                breaks=seq(10, 70, by=10),
                                expand=c(0, 0))
    print(gg)
}
normal_kernel_plot(nset)