---
title: "Give Me Some Credit Data Set from a Kaggle competition"
author: ""
date: ''
output: 
    pdf_document:
        number_sections: true
        includes:
            in_header: mystyles.sty
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = TRUE)
options(digits=3)
options(width = 48)
```

# Description


This is a kaggle competition data set. \sk

There are 150,000 observations in the kaggle training data. \sk

The Y is: "Person experienced 90 days past due delinquency or worse: Y/N" \sk

Can you predict when an account is going to be delinquent!

Data can be obtained from here:   [https://www.kaggle.com/c/GiveMeSomeCredit](https://www.kaggle.com/c/GiveMeSomeCredit)


# Preprocessing

We download the data and preprocess it first. 
We split the kaggle training data into a 50% train and 50% test.
The kaggle test does not come with y! 
We made y=1 if delinquent and 0 else.

```{r}
download.file(
    'https://github.com/ChicagoBoothML/MLClassData/raw/master/GiveMeSomeCredit/CreditScoring.csv',
    'CreditScoring.csv')

trainDf = read.csv("CreditScoring.csv")

##remove X (is 1:n)
trainDf = trainDf[,-1]

##add y as factor
trainDf$y = as.factor(trainDf$SeriousDlqin2yrs)
trainDf = trainDf[,-1] # get rid of old y = SeriousDlqin2yrs

##get rid of NumberOfDependents, don't want to deal with NA's 
trainDf=trainDf[,-10]
##get rid of MonthlyIncome, don't want to deal with NA's 
trainDf=trainDf[,-5]

##split train in train, test
n=nrow(trainDf)
set.seed(99)
ii = sample(1:n,n)
ntest = floor(n/2)
testDf = trainDf[ii[1:ntest],]
trainDf = trainDf[ii[(ntest+1):n],]
```

\newpage

# Summary statistics

```{r}
table(trainDf$y)
```

6 to 7 % of accounts are delinquent.

For example, it looks like older people are less likely to be
delinquent.

```{r fig.height=8, fig.width=14}
plot(age~y,trainDf,col=c("red","blue"),cex.lab=1.4)
```


\newpage

# Fit models

We fit

* logistic regression
* random forest model
* boosting

```{r message=F}
library(ROCR)
library(tree)
library(randomForest)
library(gbm)
```

Create some helper function used for evaluation.

The following function is used to compute the deviance of a model

```{r}
# deviance loss function
# y should be 0/1
# phat are probabilities obtain by our algorithm 
# wht shrinks probs in phat towards .5 --- this helps avoid numerical problems don't use log(0)!
lossf = function(y,phat,wht=0.0000001) {
   if(is.factor(y)) y = as.numeric(y)-1
   phat = (1-wht)*phat + wht*.5
   py = ifelse(y==1, phat, 1-phat)
   return(-2*sum(log(py)))
}
```

The following will get confucion matrix:

```{r}
# deviance loss function
# y should be 0/1
# phat are probabilities obtain by our algorithm 
# thr is the cut off value - everything above thr is classified as 1
getConfusionMatrix = function(y,phat,thr=0.5) {
   if(is.factor(y)) y = as.numeric(y)-1
   yhat = ifelse(phat > thr, 1, 0)
   tb = table(predictions = yhat, 
                   actual = y)  
   rownames(tb) = c("predict_0", "predict_1")
   return(tb)
}
```

And finally, this function gives miss-classification rate:

```{r}
# deviance loss function
# y should be 0/1
# phat are probabilities obtain by our algorithm 
# thr is the cut off value - everything above thr is classified as 1
lossMR = function(y,phat,thr=0.5) {
   if(is.factor(y)) y = as.numeric(y)-1
   yhat = ifelse(phat > thr, 1, 0)
   return(1 - mean(yhat == y))
}
```


We need a place to store results
```{r}
phatL = list() #store the test phat for the different methods here
```


## Logistic regression

We fit a logistic regression model using all variables
```{r}
lgfit = glm(y~., trainDf, family=binomial)
print(summary(lgfit))
```

Predictions are stored for later analysis
```{r}
phat = predict(lgfit, testDf, type="response")
phatL$logit = matrix(phat,ncol=1) 
```

\newpage

## Random Forest

We fit random forest models for a few different settings.

```{r}
set.seed(99)

##settings for randomForest
p=ncol(trainDf)-1
mtryv = c(p, sqrt(p))
ntreev = c(500,1000)
(setrf = expand.grid(mtryv,ntreev))  # this contains all settings to try
colnames(setrf)=c("mtry","ntree")
phatL$rf = matrix(0.0,nrow(testDf),nrow(setrf))  # we will store results here

###fit rf
for(i in 1:nrow(setrf)) {
   #fit and predict
   frf = randomForest(y~., data=trainDf, 
                      mtry=setrf[i,1],
                      ntree=setrf[i,2],
                      nodesize=10)
   phat = predict(frf, newdata=testDf, type="prob")[,2]
   phatL$rf[,i]=phat
}
```

\newpage

## Boosting

We fit boosting models for a few different settings.

```{r}
##settings for boosting
idv = c(2,4)
ntv = c(1000,5000)
shv = c(.1,.01)
(setboost = expand.grid(idv,ntv,shv))
colnames(setboost) = c("tdepth","ntree","shrink")
phatL$boost = matrix(0.0,nrow(testDf),nrow(setboost))
```

Remember to convert to numeric 0,1 values for boosting.
```{r}
trainDfB = trainDf; trainDfB$y = as.numeric(trainDfB$y)-1
testDfB = testDf; testDfB$y = as.numeric(testDfB$y)-1
```

Fitting
```{r}
for(i in 1:nrow(setboost)) {
   ##fit and predict
   fboost = gbm(y~., data=trainDfB, distribution="bernoulli",
              n.trees=setboost[i,2],
              interaction.depth=setboost[i,1],
              shrinkage=setboost[i,3])
   
   phat = predict(fboost,
                  newdata=testDfB,
                  n.trees=setboost[i,2],
                  type="response")

   phatL$boost[,i] = phat
}
```

\newpage
# Analysis of results


## Miss-classification rate

Let us first look at miss-classification rate.

For **logistic regression** we have:
```{r}
getConfusionMatrix(testDf$y, phatL[[1]][,1], 0.5)
cat('Missclassification rate = ', lossMR(testDf$y, phatL[[1]][,1], 0.5), '\n')
```

\newpage

For **random forest** we have:
```{r}
nrun = nrow(setrf)
for(j in 1:nrun) {
  print(setrf[j,])
  print("Confusion Matrix:")
  print(getConfusionMatrix(testDf$y, phatL[[2]][,j], 0.5))
  cat('Missclassification rate = ', lossMR(testDf$y, phatL[[2]][,j], 0.5), '\n')
}
```

\newpage

For **boosting** we have:
```{r}
nrun = nrow(setboost)
for(j in 1:nrun) {
  print(setboost[j,])
  print("Confusion Matrix:")
  print(getConfusionMatrix(testDf$y, phatL[[3]][,j], 0.5))
  cat('Missclassification rate = ', lossMR(testDf$y, phatL[[3]][,j], 0.5), '\n')
}
```


\newpage

## Deviance

Plot test set loss --- deviance:

```{r fig.width=8, fig.height=8}
lossL = list()
nmethod = length(phatL)
for(i in 1:nmethod) {
   nrun = ncol(phatL[[i]])
   lvec = rep(0,nrun)
   for(j in 1:nrun) lvec[j] = lossf(testDf$y, phatL[[i]][,j])
   lossL[[i]]=lvec; names(lossL)[i] = names(phatL)[i]
}
lossv = unlist(lossL)
plot(lossv, ylab="loss on Test", type="n")
nloss=0
for(i in 1:nmethod) {
   ii = nloss + 1:ncol(phatL[[i]])
   points(ii,lossv[ii],col=i,pch=17)
   nloss = nloss + ncol(phatL[[i]])
}
legend("topright",legend=names(phatL),col=1:nmethod,pch=rep(17,nmethod))
```

From each method class, we choose the one that has the lowest error on the validation set.

```{r}
nmethod = length(phatL)
phatBest = matrix(0.0,nrow(testDf),nmethod) #pick off best from each method
colnames(phatBest) = names(phatL)
for(i in 1:nmethod) {
   nrun = ncol(phatL[[i]])
   lvec = rep(0,nrun)
   for(j in 1:nrun) lvec[j] = lossf(testDf$y,phatL[[i]][,j])
   imin = which.min(lvec)
   phatBest[,i] = phatL[[i]][,imin]
   phatBest[,i] = phatL[[i]][,1]
}
```

\newpage

We can plot $\hat p$ for best models on the test set

```{r fig.width=8, fig.height=8}
pairs(phatBest)
```

\newpage


Each plot relates $\hat{p}$ to $y$.\sk

Going from left to right, $\hat{p}$ is from logit, random forests, and boosting.

```{r}
colnames(phatBest) = c("logit", "rf", "boost")
tempdf = data.frame(phatBest,y = testDf$y)

par(mfrow=c(1,3))
plot(logit~y,tempdf,ylim=c(0,1),cex.lab=1.4,col=c("red","blue"))
plot(rf~y,tempdf,ylim=c(0,1),cex.lab=1.4,col=c("red","blue"))
plot(boost~y,tempdf,ylim=c(0,1),cex.lab=1.4,col=c("red","blue"))
```

Boosting and random forests both look **pretty good**!   
Both are **dramatically better than logit**!


\newpage

## Expected value of a classifier


Our **cost/benefit matrix** looks like this
```{r}
cost_benefit = matrix(c(0,-0.25,0,1), nrow=2)
print(cost_benefit)
```

If $\hat p > 0.2$, we extend credit. Expected values of classifiers are below:

```{r}
confMat = getConfusionMatrix(testDf$y, phatBest[,1], 0.2)
print(confMat)
cat("Expected value of logistic regression = ", 
    sum(sum(confMat * cost_benefit)), "\n")
```

```{r}
confMat = getConfusionMatrix(testDf$y, phatBest[,2], 0.2)
print(confMat)
cat("Expected value of random forests = ", 
    sum(sum(confMat * cost_benefit)), "\n")
```

```{r}
confMat = getConfusionMatrix(testDf$y, phatBest[,3], 0.2)
print(confMat)
cat("Expected value of boosting = ", 
    sum(sum(confMat * cost_benefit)), "\n")
```

\newpage

## ROC curves


```{r fig.width=6, fig.height=6}
plot(c(0,1),c(0,1),xlab='FPR',ylab='TPR',main="ROC curve",cex.lab=1,type="n")
for(i in 1:ncol(phatBest)) {
   pred = prediction(phatBest[,i], testDf$y)
   perf = performance(pred, measure = "tpr", x.measure = "fpr")
   lines(perf@x.values[[1]], perf@y.values[[1]],col=i)
}
abline(0,1,lty=2)
legend("topleft",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod))
```


\newpage 

## Lift curves

```{r fig.width=6, fig.height=6}
pred = prediction(phatBest[,1], testDf$y)
perf = performance(pred, measure = "lift", x.measure = "rpp")
plot(perf, col=1, ylim=c(0,5))
abline(h=1, lty=2)

for(i in 2:ncol(phatBest)) {
   pred = prediction(phatBest[,i], testDf$y)
   perf = performance(pred, measure = "lift", x.measure = "rpp")
   lines(perf@x.values[[1]], perf@y.values[[1]],col=i)
}
legend("topright",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod))
```

\newpage

## Cummulative response 


```{r fig.width=6, fig.height=6}
pred = prediction(phatBest[,1], testDf$y)
perf = performance(pred, measure = "tpr", x.measure = "rpp")
plot(perf, col=1, ylim=c(0,1))
abline(h=1, lty=2)
abline(0,1,lty=2)
for(i in 2:ncol(phatBest)) {
   pred = prediction(phatBest[,i], testDf$y)
   perf = performance(pred, measure = "tpr", x.measure = "rpp")
   lines(perf@x.values[[1]], perf@y.values[[1]],col=i)
}
legend("bottomright",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod))
```