## ---- DATA ------------------------------------------------------------------- ## @knitr data library("data.table") meps <- fread("data/mepsdiab.csv") summary(meps$totexp) ## @knitr lorenz_curve library("ineq") library("ggplot2") theme_set(theme_bw()) lorenz <- Lc(meps$totexp) lorenz <- data.table(p = lorenz$p, L = lorenz$L) frac <- c(seq(.3, .9, .1), .95) quants <- data.table(quant = quantile(lorenz$L, probs = frac), frac) ggplot(lorenz, aes(x = p, y = L)) + geom_line() + geom_abline() + coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + xlab("Fraction of individuals ordered by expenditures") + ylab("Cumulative share of expenditures") + theme(legend.title=element_blank()) + scale_x_continuous(breaks = seq(0, 1, by = .1)) + scale_y_continuous(breaks = seq(0, 1, by = .1)) + theme(legend.position = "bottom") + geom_text(data = quants, aes(x = frac - .02, y = quant + .02, label = round(quant, 2), size = 8), show.legend = F) ## ---- TRAINING AND TEST DATA ------------------------------------------------- ## @knitr variables ccsnames <- names(meps)[grep("^ccs\\d\\d\\d", names(meps))] ds <- c("dsdiet", "dsmed", "dsinsu", "dseypr", "dskidn") xvars <- c("l_topexp", "l_totexp", "age", "srh", ds, ccsnames) meps <- meps[, c("dupersid", "panel", "year", "topexp", "totexp", xvars), with = FALSE] ## @knitr datasets train <- meps[panel <= 15] train <- train[complete.cases(train)] test <- meps[panel > 15] test <- test[complete.cases(test)] ## ---- LOGISTIC REGRESSION ---------------------------------------------------- ## @knitr logistic_regression Fm <- function(y, x){ return(as.formula(paste(y, "~", paste(xvars, collapse = "+")))) } logistic <- glm(Fm("topexp", xvars), family = binomial, train) p.logistic <- predict.glm(logistic, test, type = "response") ## ---- REGULARIZED LOGISTIC REGRESSION ---------------------------------------- ## @knitr logistic_ridge_regression library("glmnet") x.train <- model.matrix(Fm("topexp", xvars), train)[, -1] y.train <- train$topexp x.test <- model.matrix(Fm("topexp", xvars), test)[, -1] y.test <- test$topexp cv.ridge <- cv.glmnet(x.train, y.train, family = "binomial", alpha = 0) p.ridge <- predict(cv.ridge, x.test, s = "lambda.min", type = "response") ## ---- RANDOM FORESTS --------------------------------------------------------- ## @knitr rf library("randomForest") set.seed(101) train$srh <- as.numeric(train$srh) x.train <- model.matrix(Fm("topexp", xvars), train)[, -1] y.train <- as.factor(y.train) rf <- randomForest(x = x.train, y = y.train, ntree = 501, importance = TRUE, do.trace= FALSE) test$srh <- as.numeric(test$srh) p.rf <- predict(rf, test, type = "prob") ## ---- CLASSIFIER PERFORMANCE ------------------------------------------------- ## @knitr brier Brier <- function(p, y) mean((p - y)^2) c(Brier(p.logistic, y.test), Brier(p.ridge, y.test), Brier(p.rf[, 2], y.test)) ## @knitr roc library("ROCR") # predictions p.mat <- cbind(p1 = p.logistic, p2 = p.ridge[, 1], p3 = p.rf[, 2]) labels.mat <- matrix(y.test, nrow = length(y.test), ncol = ncol(p.mat)) pred <- prediction(p.mat, labels.mat) # roc curve perf <- performance(pred, "tpr", "fpr") roc.dat <- data.frame(fpr = do.call("c", perf@x.values), tpr = do.call("c", perf@y.values)) len <- do.call("c", lapply(perf@x.values, length)) roc.dat$lab <- rep(c("Logistic regression", "Logistic ridge regression", "Random forest"), times = len) ggplot(roc.dat, aes(x = fpr, y = tpr, col = lab)) + geom_line() + xlab("False positive rate") + ylab("True positive rate") + theme(legend.title=element_blank()) + theme(legend.position = "bottom") ## @knitr auc auc <- as.numeric(performance(pred, "auc")@y.values) print(auc) ## @knitr ppv ppv.dat <- data.frame(pred = p.ridge[, 1], y = y.test, exp = test$totexp) ppv.dat$ecdf.p <- ecdf(ppv.dat$pred)(ppv.dat$pred) pctile <- seq(.7, 1, .01) tp <- rep(NA, length(pctile)) for (i in 1:length(tp)){ yhat <- 1 * (ppv.dat$ecdf.p > pctile[i]) tp[i] <- mean(ppv.dat$y[which(yhat == 1)]) } ggplot(data.frame(x = pctile, y = tp), aes(x = x, y = y)) + geom_line() + xlab("Percentile cutoff") + ylab("Positive predicted value") + geom_vline(xintercept = 0.9, lty = "dotted", col = "red") ## @knitr spending_ratio fsaf <- sum(ppv.dat$exp[ppv.dat$ecdf.p >= .9])/sum(ppv.dat$exp[ppv.dat$y == 1]) print(fsaf)