# This is the R code for the R Markdown file twopart.Rmd

## ---- HEALTH EXPENDITURE DATA FROM THE MEPS ----------------------------------
## @knitr meps_load
meps <- readRDS("data/meps-fyc-2012.rds")

## @knitr twopart_hist
library(data.table)
library(ggplot2)
theme_set(theme_bw())
qplot(totexp12, data = meps[totexp12 < quantile(meps$totexp12, probs = .95)])

## @knitr twopart_loghist
library(MASS)
meps <- meps[, logtotexp12 := ifelse(totexp12 > 0, log(totexp12), NA)]
ggplot(meps, aes(x = logtotexp12)) + 
  geom_histogram(aes(y = ..density..), colour = "black", fill = "White") +
  stat_function(fun = dnorm, args = fitdistr(meps[totexp12 > 0 , logtotexp12],
                                             "normal")$estimate)

## ---- PREDICTING MEAN EXPENDITURES -------------------------------------------
## @knitr meps_vars
meps[, age := ifelse(age12x < 0, NA, age12x)]
meps[, age2 := age^2]
meps[, srh := as.factor(ifelse(rthlth53 < 0, NA, rthlth53))] # health status
meps[, hisp := ifelse(racethx == 1, 1, 0)] # hispanic race
meps[, black := ifelse(racebx == 1 | racebx == 2, 1, 0)]
meps[, prvins := ifelse(inscov12 == 1, 1, 0)] # private insurance
meps[, pubins := ifelse(inscov12 == 2, 1, 0)] # public insurance
meps[, d_totexp12 := ifelse(totexp12 == 0, 0, 1)] # indicator for positive spending

## @knitr meps_subset
meps[, id := seq(1, nrow(meps))]
xvars <- c("age", "age2", "srh", "hisp", "black", "prvins", "pubins")
meps <- meps[, c("id", "totexp12", "d_totexp12", "logtotexp12", xvars), with = FALSE]
meps <- meps[complete.cases(meps[, xvars, with = FALSE])]

## @knitr meps_sample
set.seed(100)
meps[, sample := sample(c("train", "test"), nrow(meps), replace = TRUE)]

## @knitr fit
Fm <- function(y, xvars){
  return(as.formula(paste(y, "~", paste(xvars, collapse = "+"))))
}
# part 1
logistic.fit <- glm(Fm("d_totexp12", xvars), meps[sample == "train"], family = binomial)

# part 2
ols.fit <- lm(Fm("totexp12", xvars), meps[totexp12 > 0 & sample == "train"])
logols.fit <- lm(Fm("logtotexp12", xvars), meps[totexp12 > 0 & sample == "train"])
gamma.fit <- glm(Fm("totexp12", xvars), meps[totexp12 > 0 & sample == "train"], 
                 family = Gamma(link = log))

## @knitr pred
phat <- predict(logistic.fit, meps[sample == "test"], type = "response")
pred <- data.table(totexp12 = meps[sample == "test", totexp12])
pred$ols <- phat * predict(ols.fit, meps[sample == "test"])
pred$logols <- phat * exp(predict(logols.fit, meps[sample == "test"]) + summary(logols.fit)$sigma^2/2)
pred$gamma <- phat * predict(gamma.fit, meps[sample == "test"], type = "response")

## @knitr rmse
RMSE <- function(x, y)  sqrt(mean((y - x)^2, na.rm = TRUE))
rmse <- c(RMSE(pred$totexp12, pred$ols),
          RMSE(pred$totexp12, pred$logols),
          RMSE(pred$totexp12, pred$gamma))
names(rmse) <- c("OLS", "Log OLS", "Gamma")
print(rmse)

## @knitr smearing
meps[, agecat := cut(age, breaks = c(0, 1, seq(5, 90, 5)), 
                             right = FALSE)]
epsilon <- data.table(age = logols.fit$mode$age, res = logols.fit$res)
epsilon[, agecat := cut(age, breaks = c(0, 1, seq(5, 90, 5)), 
                                   right = FALSE)]
epsilon <- epsilon[, .(phihat = mean(exp(res))), by = "agecat"]
meps <- merge(meps, epsilon, by = "agecat", all.x = TRUE)
meps <- meps[order(id)]
pred$logols_smear <- phat * exp(predict(logols.fit, meps[sample == "test"])) * mean(exp(logols.fit$res))
pred$logols_hetsmear <- phat * exp(predict(logols.fit, meps[sample == "test"])) * meps[sample == "test", phihat]
rmse <- c(RMSE(pred$totexp12, pred$logols_smear),
          RMSE(pred$totexp12, pred$logols_hetsmear))
names(rmse) <- c("Log OLS Homoskedastic Smearing", "Log OLS Heteroskedastic Smearing")
print(rmse)

## ---- PREDICTIVE SIMULATION --------------------------------------------------
## @knitr logistic_normal
n <- nrow(meps[sample == "test"])
d <- rbinom(n, 1, phat)
y.norm <- d * rnorm(n, pred$ols, summary(ols.fit)$sigma)

## @knitr logistic_lognormal
y.lognorm <- d * rlnorm(n, predict(logols.fit, meps[sample == "test"]) , 
                        summary(logols.fit)$sigma)

## @knitr mom_dispersion
res <- (gamma.fit$model$totexp12 - gamma.fit$fit)/gamma.fit$fit # this is equivalent to gamma.fit$res
c(sum(res^2)/gamma.fit$df.res, summary(gamma.fit)$dispersion)

## @knitr logistic_gamma
a <- gamma.shape(gamma.fit)$alpha
b <- a/pred$gamma
y.gamma <- d * rgamma(n, shape = a , rate = b)

## @knitr twopart_yrepden
y <- meps[sample == "test", totexp12]
p.dat <- data.table(y = c(y, y.norm, y.lognorm, y.gamma),
                    lab = c(rep("Observed", n), rep("Normal", n), 
                            rep("Lognormal", n), rep("Gamma", n)))
p <- ggplot(p.dat[y > 0 & y < 10000], aes(x = y, col = lab)) + 
  geom_density(kernel = "gaussian") +
  xlab("Expenditures") + ylab("Density") +
  theme(legend.position="bottom") + labs(col = "") +
  scale_color_manual(values=c(Observed = "black", Normal = "red", 
                              Lognormal = "blue", Gamma = "green")) 
print(p)

## @knitr yrep_quantiles
MySum <- function(x){
  q <- c(0.30, 0.5, 0.75, .9, .95, .98)
  dat <- c(100 * mean(x == 0, na.rm = TRUE),
           min(x, na.rm = TRUE), quantile(x, probs = q, na.rm = TRUE), 
           max(x, na.rm = TRUE))
  names(dat) <- c("PercentZero", "Min", paste0("Q", 100 * q), "Max")
  return(round(dat, 0))
} 
sumstats <- rbind(MySum(y), MySum(y.norm), 
                  MySum(y.lognorm), MySum(y.gamma))
rownames(sumstats) <- c("Observed", "Normal", "Lognormal", "Gamma")
print(sumstats)