--- title: "Give Me Some Credit Data Set from a Kaggle competition" author: "" date: '' output: pdf_document: number_sections: true includes: in_header: mystyles.sty --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) knitr::opts_chunk$set(cache = TRUE) options(digits=3) options(width = 48) ``` # Description This is a kaggle competition data set. \sk There are 150,000 observations in the kaggle training data. \sk The Y is: "Person experienced 90 days past due delinquency or worse: Y/N" \sk Can you predict when an account is going to be delinquent! Data can be obtained from here: [https://www.kaggle.com/c/GiveMeSomeCredit](https://www.kaggle.com/c/GiveMeSomeCredit) # Preprocessing We download the data and preprocess it first. We split the kaggle training data into a 50% train and 50% test. The kaggle test does not come with y! We made y=1 if delinquent and 0 else. ```{r} download.file( 'https://github.com/ChicagoBoothML/MLClassData/raw/master/GiveMeSomeCredit/CreditScoring.csv', 'CreditScoring.csv') trainDf = read.csv("CreditScoring.csv") ##remove X (is 1:n) trainDf = trainDf[,-1] ##add y as factor trainDf$y = as.factor(trainDf$SeriousDlqin2yrs) trainDf = trainDf[,-1] # get rid of old y = SeriousDlqin2yrs ##get rid of NumberOfDependents, don't want to deal with NA's trainDf=trainDf[,-10] ##get rid of MonthlyIncome, don't want to deal with NA's trainDf=trainDf[,-5] ##split train in train, test n=nrow(trainDf) set.seed(99) ii = sample(1:n,n) ntest = floor(n/2) testDf = trainDf[ii[1:ntest],] trainDf = trainDf[ii[(ntest+1):n],] ``` \newpage # Summary statistics ```{r} table(trainDf$y) ``` 6 to 7 % of accounts are delinquent. For example, it looks like older people are less likely to be delinquent. ```{r fig.height=8, fig.width=14} plot(age~y,trainDf,col=c("red","blue"),cex.lab=1.4) ``` \newpage # Fit models We fit * logistic regression * random forest model * boosting ```{r message=F} library(ROCR) library(tree) library(randomForest) library(gbm) ``` Create some helper function used for evaluation. The following function is used to compute the deviance of a model ```{r} # deviance loss function # y should be 0/1 # phat are probabilities obtain by our algorithm # wht shrinks probs in phat towards .5 --- this helps avoid numerical problems don't use log(0)! lossf = function(y,phat,wht=0.0000001) { if(is.factor(y)) y = as.numeric(y)-1 phat = (1-wht)*phat + wht*.5 py = ifelse(y==1, phat, 1-phat) return(-2*sum(log(py))) } ``` The following will get confucion matrix: ```{r} # deviance loss function # y should be 0/1 # phat are probabilities obtain by our algorithm # thr is the cut off value - everything above thr is classified as 1 getConfusionMatrix = function(y,phat,thr=0.5) { if(is.factor(y)) y = as.numeric(y)-1 yhat = ifelse(phat > thr, 1, 0) tb = table(predictions = yhat, actual = y) rownames(tb) = c("predict_0", "predict_1") return(tb) } ``` And finally, this function gives miss-classification rate: ```{r} # deviance loss function # y should be 0/1 # phat are probabilities obtain by our algorithm # thr is the cut off value - everything above thr is classified as 1 lossMR = function(y,phat,thr=0.5) { if(is.factor(y)) y = as.numeric(y)-1 yhat = ifelse(phat > thr, 1, 0) return(1 - mean(yhat == y)) } ``` We need a place to store results ```{r} phatL = list() #store the test phat for the different methods here ``` ## Logistic regression We fit a logistic regression model using all variables ```{r} lgfit = glm(y~., trainDf, family=binomial) print(summary(lgfit)) ``` Predictions are stored for later analysis ```{r} phat = predict(lgfit, testDf, type="response") phatL$logit = matrix(phat,ncol=1) ``` \newpage ## Random Forest We fit random forest models for a few different settings. ```{r} set.seed(99) ##settings for randomForest p=ncol(trainDf)-1 mtryv = c(p, sqrt(p)) ntreev = c(500,1000) (setrf = expand.grid(mtryv,ntreev)) # this contains all settings to try colnames(setrf)=c("mtry","ntree") phatL$rf = matrix(0.0,nrow(testDf),nrow(setrf)) # we will store results here ###fit rf for(i in 1:nrow(setrf)) { #fit and predict frf = randomForest(y~., data=trainDf, mtry=setrf[i,1], ntree=setrf[i,2], nodesize=10) phat = predict(frf, newdata=testDf, type="prob")[,2] phatL$rf[,i]=phat } ``` \newpage ## Boosting We fit boosting models for a few different settings. ```{r} ##settings for boosting idv = c(2,4) ntv = c(1000,5000) shv = c(.1,.01) (setboost = expand.grid(idv,ntv,shv)) colnames(setboost) = c("tdepth","ntree","shrink") phatL$boost = matrix(0.0,nrow(testDf),nrow(setboost)) ``` Remember to convert to numeric 0,1 values for boosting. ```{r} trainDfB = trainDf; trainDfB$y = as.numeric(trainDfB$y)-1 testDfB = testDf; testDfB$y = as.numeric(testDfB$y)-1 ``` Fitting ```{r} for(i in 1:nrow(setboost)) { ##fit and predict fboost = gbm(y~., data=trainDfB, distribution="bernoulli", n.trees=setboost[i,2], interaction.depth=setboost[i,1], shrinkage=setboost[i,3]) phat = predict(fboost, newdata=testDfB, n.trees=setboost[i,2], type="response") phatL$boost[,i] = phat } ``` \newpage # Analysis of results ## Miss-classification rate Let us first look at miss-classification rate. For **logistic regression** we have: ```{r} getConfusionMatrix(testDf$y, phatL[[1]][,1], 0.5) cat('Missclassification rate = ', lossMR(testDf$y, phatL[[1]][,1], 0.5), '\n') ``` \newpage For **random forest** we have: ```{r} nrun = nrow(setrf) for(j in 1:nrun) { print(setrf[j,]) print("Confusion Matrix:") print(getConfusionMatrix(testDf$y, phatL[[2]][,j], 0.5)) cat('Missclassification rate = ', lossMR(testDf$y, phatL[[2]][,j], 0.5), '\n') } ``` \newpage For **boosting** we have: ```{r} nrun = nrow(setboost) for(j in 1:nrun) { print(setboost[j,]) print("Confusion Matrix:") print(getConfusionMatrix(testDf$y, phatL[[3]][,j], 0.5)) cat('Missclassification rate = ', lossMR(testDf$y, phatL[[3]][,j], 0.5), '\n') } ``` \newpage ## Deviance Plot test set loss --- deviance: ```{r fig.width=8, fig.height=8} lossL = list() nmethod = length(phatL) for(i in 1:nmethod) { nrun = ncol(phatL[[i]]) lvec = rep(0,nrun) for(j in 1:nrun) lvec[j] = lossf(testDf$y, phatL[[i]][,j]) lossL[[i]]=lvec; names(lossL)[i] = names(phatL)[i] } lossv = unlist(lossL) plot(lossv, ylab="loss on Test", type="n") nloss=0 for(i in 1:nmethod) { ii = nloss + 1:ncol(phatL[[i]]) points(ii,lossv[ii],col=i,pch=17) nloss = nloss + ncol(phatL[[i]]) } legend("topright",legend=names(phatL),col=1:nmethod,pch=rep(17,nmethod)) ``` From each method class, we choose the one that has the lowest error on the validation set. ```{r} nmethod = length(phatL) phatBest = matrix(0.0,nrow(testDf),nmethod) #pick off best from each method colnames(phatBest) = names(phatL) for(i in 1:nmethod) { nrun = ncol(phatL[[i]]) lvec = rep(0,nrun) for(j in 1:nrun) lvec[j] = lossf(testDf$y,phatL[[i]][,j]) imin = which.min(lvec) phatBest[,i] = phatL[[i]][,imin] phatBest[,i] = phatL[[i]][,1] } ``` \newpage We can plot $\hat p$ for best models on the test set ```{r fig.width=8, fig.height=8} pairs(phatBest) ``` \newpage Each plot relates $\hat{p}$ to $y$.\sk Going from left to right, $\hat{p}$ is from logit, random forests, and boosting. ```{r} colnames(phatBest) = c("logit", "rf", "boost") tempdf = data.frame(phatBest,y = testDf$y) par(mfrow=c(1,3)) plot(logit~y,tempdf,ylim=c(0,1),cex.lab=1.4,col=c("red","blue")) plot(rf~y,tempdf,ylim=c(0,1),cex.lab=1.4,col=c("red","blue")) plot(boost~y,tempdf,ylim=c(0,1),cex.lab=1.4,col=c("red","blue")) ``` Boosting and random forests both look **pretty good**! Both are **dramatically better than logit**! \newpage ## Expected value of a classifier Our **cost/benefit matrix** looks like this ```{r} cost_benefit = matrix(c(0,-0.25,0,1), nrow=2) print(cost_benefit) ``` If $\hat p > 0.2$, we extend credit. Expected values of classifiers are below: ```{r} confMat = getConfusionMatrix(testDf$y, phatBest[,1], 0.2) print(confMat) cat("Expected value of logistic regression = ", sum(sum(confMat * cost_benefit)), "\n") ``` ```{r} confMat = getConfusionMatrix(testDf$y, phatBest[,2], 0.2) print(confMat) cat("Expected value of random forests = ", sum(sum(confMat * cost_benefit)), "\n") ``` ```{r} confMat = getConfusionMatrix(testDf$y, phatBest[,3], 0.2) print(confMat) cat("Expected value of boosting = ", sum(sum(confMat * cost_benefit)), "\n") ``` \newpage ## ROC curves ```{r fig.width=6, fig.height=6} plot(c(0,1),c(0,1),xlab='FPR',ylab='TPR',main="ROC curve",cex.lab=1,type="n") for(i in 1:ncol(phatBest)) { pred = prediction(phatBest[,i], testDf$y) perf = performance(pred, measure = "tpr", x.measure = "fpr") lines(perf@x.values[[1]], perf@y.values[[1]],col=i) } abline(0,1,lty=2) legend("topleft",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod)) ``` \newpage ## Lift curves ```{r fig.width=6, fig.height=6} pred = prediction(phatBest[,1], testDf$y) perf = performance(pred, measure = "lift", x.measure = "rpp") plot(perf, col=1, ylim=c(0,5)) abline(h=1, lty=2) for(i in 2:ncol(phatBest)) { pred = prediction(phatBest[,i], testDf$y) perf = performance(pred, measure = "lift", x.measure = "rpp") lines(perf@x.values[[1]], perf@y.values[[1]],col=i) } legend("topright",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod)) ``` \newpage ## Cummulative response ```{r fig.width=6, fig.height=6} pred = prediction(phatBest[,1], testDf$y) perf = performance(pred, measure = "tpr", x.measure = "rpp") plot(perf, col=1, ylim=c(0,1)) abline(h=1, lty=2) abline(0,1,lty=2) for(i in 2:ncol(phatBest)) { pred = prediction(phatBest[,i], testDf$y) perf = performance(pred, measure = "tpr", x.measure = "rpp") lines(perf@x.values[[1]], perf@y.values[[1]],col=i) } legend("bottomright",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod)) ```