--- title: "Tabloid data set" author: "" date: '' output: pdf_document: number_sections: true includes: in_header: mystyles.sty --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(cache = TRUE) options(digits=3) options(width = 48) ``` # Description A large retailer wants to explore the predictability of response to a tabloid mailing. \sk If they mail a tabloid to a customer in their data-base, can they predict whether or not the customer will respond by making a purchase. \sk The dependent variable is 1 if they buy something, 0 if they do not. \sk They tried to come up with x’s based on past purchasing behavior. \sk The Predictive Analytics team builds a model for \bl the probability the customer responds \bk given \rd information about the customer\bk. \sk What information about a customer do they use? - `nTab`: number of past orders. - `moCbook`: months since last order. - `iRecMer1`: 1/months since last order in merchandise category 1. - `llDol`: log of the dollar value of past purchases. The data for these variables is obtained from the companies operational data base. # Preprocessing We download the data and preprocess it first ```{r} download.file( 'https://github.com/ChicagoBoothML/MLClassData/raw/master/Tabloid/Tabloid_test.csv', 'Tabloid_test.csv') download.file( 'https://github.com/ChicagoBoothML/MLClassData/raw/master/Tabloid/Tabloid_train.csv', 'Tabloid_train.csv') td = read.csv("Tabloid_train.csv") td_test = read.csv("Tabloid_test.csv") td$purchase = as.factor(td$purchase) td_test$purchase = as.factor(td_test$purchase) ``` \newpage # Summary statistics ```{r} summary(td) ``` Notice that the percentage of households that make a purchase is pretty small! \sk $258/10000 = 0.0258$ \sk Illustration of how `nTab` is related to `responders`. ```{r fig.width=10, fig.height=5} par(mfrow=c(1,2)) hist(td[td$purchase==0, "nTab"], breaks=40, col="red", main="nonresponders", xlab="nTab", xlim=c(0,85)) hist(td[td$purchase==1, "nTab"], breaks=40, col="blue", main="responders", xlab="nTab", xlim=c(0,85)) ``` \newpage Here is `Y` plotted vs. each of the four `X`'s \sk ```{r fig.width=8, fig.height=8} par(mfrow=c(2,2), mar=c(3,3,3,1), mgp=c(2,1,0)) plot(nTab~purchase,td,col=c("red", "blue")) plot(moCbook~purchase,td,col=c("red", "blue")) plot(iRecMer1~purchase,td,col=c("red", "blue")) plot(llDol~purchase,td,col=c("red", "blue")) ``` \newpage # Fit models We fit * logistic regression * random forest model * boosting ```{r} library(tree) library(randomForest) library(gbm) ``` Create some helper function used for evaluation. The following function is used to compute the deviance of a model ```{r} # deviance loss function # y should be 0/1 # phat are probabilities obtain by our algorithm # wht shrinks probs in phat towards .5 --- this helps avoid numerical problems don't use log(0)! 
```{r}
# deviance loss function
# y should be 0/1
# phat are probabilities obtained by our algorithm
# wht shrinks the probs in phat towards .5 --- this helps avoid numerical
# problems: we never take log(0)
lossf = function(y, phat, wht=0.0000001) {
   if(is.factor(y)) y = as.numeric(y)-1
   phat = (1-wht)*phat + wht*.5
   py = ifelse(y==1, phat, 1-phat)
   return(-2*sum(log(py)))
}
```

The following function computes the confusion matrix:

```{r}
# confusion matrix
# y should be 0/1
# phat are probabilities obtained by our algorithm
# thr is the cutoff value --- everything above thr is classified as 1
getConfusionMatrix = function(y, phat, thr=0.5) {
   if(is.factor(y)) y = as.numeric(y)-1
   yhat = ifelse(phat > thr, 1, 0)
   tb = table(predictions = yhat, actual = y)
   rownames(tb) = c("predict_0", "predict_1")
   return(tb)
}
```

And finally, this function gives the misclassification rate:

```{r}
# misclassification rate
# y should be 0/1
# phat are probabilities obtained by our algorithm
# thr is the cutoff value --- everything above thr is classified as 1
lossMR = function(y, phat, thr=0.5) {
   if(is.factor(y)) y = as.numeric(y)-1
   yhat = ifelse(phat > thr, 1, 0)
   return(1 - mean(yhat == y))
}
```

We need a place to store results.

```{r}
phatL = list()  # store the test phat for the different methods here
```

## Logistic regression

We fit a logistic regression model using all variables.

```{r}
lgfit = glm(purchase~., td, family=binomial)
print(summary(lgfit))
```

Predictions are stored for later analysis.

```{r}
phat = predict(lgfit, td_test, type="response")
phatL$logit = matrix(phat, ncol=1)
```

\newpage

## Random Forest

We fit random forest models for a few different settings.

```{r}
set.seed(99)

## settings for randomForest
p = ncol(td)-1
mtryv = c(p, sqrt(p))
ntreev = c(500, 1000)
setrf = expand.grid(mtryv, ntreev)  # this contains all settings to try
colnames(setrf) = c("mtry", "ntree")
phatL$rf = matrix(0.0, nrow(td_test), nrow(setrf))  # we will store results here

## fit rf
for(i in 1:nrow(setrf)) {
   # fit and predict
   frf = randomForest(purchase~., data=td,
                      mtry=setrf[i,1],
                      ntree=setrf[i,2],
                      nodesize=10)
   phat = predict(frf, newdata=td_test, type="prob")[,2]
   phatL$rf[,i] = phat
}
```

\newpage

## Boosting

We fit boosting models for a few different settings.

```{r}
## settings for boosting
idv = c(2, 4)
ntv = c(1000, 5000)
shv = c(.1, .01)
setboost = expand.grid(idv, ntv, shv)
colnames(setboost) = c("tdepth", "ntree", "shrink")
phatL$boost = matrix(0.0, nrow(td_test), nrow(setboost))
```

Remember to convert the response to numeric 0/1 values for boosting.

```{r}
tdB = td; tdB$purchase = as.numeric(tdB$purchase)-1
td_testB = td_test; td_testB$purchase = as.numeric(td_testB$purchase)-1
```

Fitting:

```{r}
for(i in 1:nrow(setboost)) {
   # fit and predict
   fboost = gbm(purchase~., data=tdB, distribution="bernoulli",
                n.trees=setboost[i,2],
                interaction.depth=setboost[i,1],
                shrinkage=setboost[i,3])
   phat = predict(fboost, newdata=td_testB,
                  n.trees=setboost[i,2],
                  type="response")
   phatL$boost[,i] = phat
}
```

# Analysis of results

## Misclassification rate

Let us first look at the misclassification rate.
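Before comparing the models, it helps to keep the base rate in mind: since only about 2.6% of households purchase (in the training data), a trivial classifier that never predicts a purchase already has a very low misclassification rate. A minimal sketch using the `lossMR` helper defined above:

```{r}
# misclassification rate of the trivial "never mail" classifier
# (phat = 0 for every household): it simply equals the purchase rate
lossMR(td_test$purchase, rep(0, nrow(td_test)), 0.5)
```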
For **logistic regression** we have:

```{r}
getConfusionMatrix(td_test$purchase, phatL[[1]][,1], 0.5)
cat('Misclassification rate = ', lossMR(td_test$purchase, phatL[[1]][,1], 0.5), '\n')
```

\newpage

For **random forest** we have:

```{r}
nrun = nrow(setrf)
for(j in 1:nrun) {
   print(setrf[j,])
   print("Confusion Matrix:")
   print(getConfusionMatrix(td_test$purchase, phatL[[2]][,j], 0.5))
   cat('Misclassification rate = ', lossMR(td_test$purchase, phatL[[2]][,j], 0.5), '\n')
}
```

\newpage

For **boosting** we have:

```{r}
nrun = nrow(setboost)
for(j in 1:nrun) {
   print(setboost[j,])
   print("Confusion Matrix:")
   print(getConfusionMatrix(td_test$purchase, phatL[[3]][,j], 0.5))
   cat('Misclassification rate = ', lossMR(td_test$purchase, phatL[[3]][,j], 0.5), '\n')
}
```

\sk\sk\sk

This looks strange... But there does seem to be fit in the model: the fitted probabilities are clearly higher for the buyers, they just rarely exceed 0.5.

```{r fig.width=4, fig.height=4}
par(mar=c(3,3,3,1), mgp=c(2,1,0))
phat = predict(lgfit, newdata=td, type="response")
plot(phat~td$purchase, col=c("red","blue"),
     xlab="purchase", ylab="phat", ylim=c(0,1.05), cex.axis=0.7)
```

\newpage

## Deviance

We plot the test set loss (deviance):

```{r fig.width=8, fig.height=8}
lossL = list()
nmethod = length(phatL)
for(i in 1:nmethod) {
   nrun = ncol(phatL[[i]])
   lvec = rep(0, nrun)
   for(j in 1:nrun) lvec[j] = lossf(td_test$purchase, phatL[[i]][,j])
   lossL[[i]] = lvec
   names(lossL)[i] = names(phatL)[i]
}
lossv = unlist(lossL)

plot(lossv, ylab="loss on Test", type="n")
nloss = 0
for(i in 1:nmethod) {
   ii = nloss + 1:ncol(phatL[[i]])
   points(ii, lossv[ii], col=i, pch=17)
   nloss = nloss + ncol(phatL[[i]])
}
legend("topright", legend=names(phatL), col=1:nmethod, pch=rep(17,nmethod))
```

From each method class, we choose the model with the lowest deviance on the test set.

```{r}
nmethod = length(phatL)
phatBest = matrix(0.0, nrow(td_test), nmethod)  # pick off the best from each method
colnames(phatBest) = names(phatL)
for(i in 1:nmethod) {
   nrun = ncol(phatL[[i]])
   lvec = rep(0, nrun)
   for(j in 1:nrun) lvec[j] = lossf(td_test$purchase, phatL[[i]][,j])
   imin = which.min(lvec)
   phatBest[,i] = phatL[[i]][,imin]
}
```

\newpage

We can plot $\hat p$ for the best models on the test set.

```{r fig.width=8, fig.height=8}
pairs(phatBest)
```

\newpage

The idea behind the tabloid example is that if we can predict who will buy, we can target those customers and send them the tabloid.
\sk
To get an idea of how well our model is working, we can imagine choosing a customer from the data set to mail to first: did they buy?
\sk
We can look at the y value to see if they bought.
\sk
Whom would you mail to first?
\sk
You could mail to the first 40 people in your database.

\scriptsize
```{r}
td$phat = phat
td[1:40, c("purchase", "phat")]
```
\normalsize
\sk\sk

Out of the first 40, there is only one purchase.

\newpage

If you believe your model, you might mail to the household with the largest $\hat p$ (estimated probability of buying) first. Then you would mail to the household with the second largest $\hat p$, and so on.

\scriptsize
```{r}
td$phat = phat
sorted_phat = order(-phat)
td[sorted_phat[1:40], c("purchase", "phat")]
```
\normalsize
\sk\sk

You got 16 purchases out of the first 40 customers you targeted. Using only 40/10000 = 0.004 of the data, we got 16/258 = 0.062 of the purchases!
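We can push this ranking idea a bit further and ask what fraction of all purchases would be captured after mailing to the $k$ households with the highest $\hat p$. A minimal sketch on the same training data, reusing `phat` and `sorted_phat` from above (this previews the lift and cumulative response curves shown later):

```{r}
# cumulative fraction of all purchases captured when mailing
# in decreasing order of phat (training data, logistic fit)
captured = cumsum(td$purchase[sorted_phat] == "1") / sum(td$purchase == "1")
round(captured[c(40, 100, 500, 1000)], 3)
```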
\newpage

## Expected value of a classifier

Let us target everyone with $\hat p > 0.02$.

Our **cost/benefit matrix** looks like this. Rows correspond to not targeting / targeting a household and columns to whether the household actually purchases; the entries are consistent with a mailing cost of about \$0.80 and roughly \$40 of profit per purchase, so a targeted responder nets \$39.20.

```{r}
cost_benefit = matrix(c(0,-0.8,0,39.20), nrow=2)
print(cost_benefit)
```

The expected value of targeting is shown below:

```{r}
confMat = getConfusionMatrix(td_test$purchase, phatBest[,1], 0.02)
print(confMat)
cat("Expected value of targeting using logistic regression = ",
    sum(confMat * cost_benefit), "\n")
```

```{r}
confMat = getConfusionMatrix(td_test$purchase, phatBest[,2], 0.02)
print(confMat)
cat("Expected value of targeting using random forests = ",
    sum(confMat * cost_benefit), "\n")
```

```{r}
confMat = getConfusionMatrix(td_test$purchase, phatBest[,3], 0.02)
print(confMat)
cat("Expected value of targeting using boosting = ",
    sum(confMat * cost_benefit), "\n")
```

\newpage

## ROC curves

We use the `ROCR` library for plotting various summary curves.

```{r}
library(ROCR)
```

```{r fig.width=6, fig.height=6}
plot(c(0,1), c(0,1), xlab='FPR', ylab='TPR', main="ROC curve", cex.lab=1, type="n")
for(i in 1:ncol(phatBest)) {
   pred = prediction(phatBest[,i], td_test$purchase)
   perf = performance(pred, measure = "tpr", x.measure = "fpr")
   lines(perf@x.values[[1]], perf@y.values[[1]], col=i)
}
abline(0, 1, lty=2)
legend("topleft", legend=names(phatL), col=1:nmethod, lty=rep(1,nmethod))
```

\newpage

## Lift curves

```{r fig.width=6, fig.height=6}
pred = prediction(phatBest[,1], td_test$purchase)
perf = performance(pred, measure = "lift", x.measure = "rpp")
plot(perf, col=1, ylim=c(0,5))
abline(h=1, lty=2)
for(i in 2:ncol(phatBest)) {
   pred = prediction(phatBest[,i], td_test$purchase)
   perf = performance(pred, measure = "lift", x.measure = "rpp")
   lines(perf@x.values[[1]], perf@y.values[[1]], col=i)
}
legend("topright", legend=names(phatL), col=1:nmethod, lty=rep(1,nmethod))
```

\newpage

## Cumulative response

```{r fig.width=6, fig.height=6}
pred = prediction(phatBest[,1], td_test$purchase)
perf = performance(pred, measure = "tpr", x.measure = "rpp")
plot(perf, col=1, ylim=c(0,1))
abline(h=1, lty=2)
abline(0, 1, lty=2)
for(i in 2:ncol(phatBest)) {
   pred = prediction(phatBest[,i], td_test$purchase)
   perf = performance(pred, measure = "tpr", x.measure = "rpp")
   lines(perf@x.values[[1]], perf@y.values[[1]], col=i)
}
legend("bottomright", legend=names(phatL), col=1:nmethod, lty=rep(1,nmethod))
```
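As a single-number summary of the ROC curves above, we can also compute the area under the curve (AUC) for each of the best models. A minimal sketch using `ROCR`'s built-in `auc` measure:

```{r}
# area under the ROC curve on the test set for each best model
for(i in 1:ncol(phatBest)) {
   pred = prediction(phatBest[,i], td_test$purchase)
   auc = performance(pred, measure = "auc")@y.values[[1]]
   cat(colnames(phatBest)[i], "AUC =", round(auc, 3), "\n")
}
```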