######################################################################
# data can be downloaded from
# https://www.kaggle.com/c/GiveMeSomeCredit/data
# make sure that the file "cs-training.csv" is in the current directory
trainDf = read.csv("cs-training.csv")

##remove X (it is just the row index 1:n)
trainDf = trainDf[,-1]

##add y as factor
trainDf$y = as.factor(trainDf$SeriousDlqin2yrs)
trainDf = trainDf[,-1] #get rid of old y = SeriousDlqin2yrs

##get rid of NumberOfDependents, don't want to deal with NA's
trainDf = trainDf[,-10]

##get rid of MonthlyIncome, don't want to deal with NA's
trainDf = trainDf[,-5]

##split into train and test: a random half-split, so the shuffled
##index ii must actually be used when subsetting the rows
n = nrow(trainDf)
set.seed(99)
ii = sample(1:n,n)
ntest = floor(n/2)
testDf = trainDf[ii[1:ntest],]
trainDf = trainDf[ii[(ntest+1):n],]
pnm = "kaggle"

######################################################################
cat("### load libraries and make loss and lift functions\n")
library(tree)
library(randomForest)
library(gbm)

#--------------------------------------------------
#deviance loss function
lossf = function(y,phat,wht=0.0000001) {
   #y should be 0/1
   #wht shrinks probs in phat towards .5, don't log 0!
   if(is.factor(y)) y = as.numeric(y)-1
   phat = (1-wht)*phat + wht*.5
   py = ifelse(y==1,phat,1-phat)
   return(-2*sum(log(py)))
}

#--------------------------------------------------
#lift function: fraction of the 1's captured as you work down
#the cases sorted by predicted probability
liftf = function(yl,phatl,dopl=TRUE) {
   if(is.factor(yl)) yl = as.numeric(yl)-1
   oo = order(-phatl)
   sy = cumsum(yl[oo])/sum(yl==1)
   if(dopl) {
      ii = (1:length(sy))/length(sy)
      plot(ii,sy,type='l',lwd=2,col='blue',xlab='% tried',ylab='% of successes',cex.lab=2)
      abline(0,1,lty=2)
   }
   return(sy)
}

######################################################################
cat("### setup storage for results\n")
phatL = list() #store the test phat for the different methods here

######################################################################
cat("### fit logit\n")

###settings for logit (just one of course)
phatL$logit = matrix(0.0,nrow(testDf),1) #only one logit fit

###fit logit
lgfit = glm(y~.,trainDf,family=binomial)
print(summary(lgfit))

###predict using logit
phat = predict(lgfit,testDf,type="response")
phatL$logit = matrix(phat,ncol=1) #logit phat

##do lift
liftf(testDf$y,phatL$logit[,1])

######################################################################
cat("### fit random forests\n")
set.seed(99)

##settings for randomForest: mtry must be an integer
p = ncol(trainDf)-1
mtryv = c(p,floor(sqrt(p)))
ntreev = c(500,1000)
setrf = expand.grid(mtryv,ntreev)
colnames(setrf) = c("mtry","ntree")
phatL$rf = matrix(0.0,nrow(testDf),nrow(setrf))

###fit rf
for(i in 1:nrow(setrf)) {
   cat("on randomForest fit ",i,"\n")
   print(setrf[i,])
   #fit and predict
   frf = randomForest(y~.,data=trainDf,mtry=setrf[i,1],ntree=setrf[i,2])
   phat = predict(frf,newdata=testDf,type="prob")[,2]
   phatL$rf[,i] = phat
}

######################################################################
cat("### fit boosting\n")

##settings for boosting: tree depth x number of trees x shrinkage
idv = c(2,4)
ntv = c(1000,5000)
shv = c(.1,.01)
setboost = expand.grid(idv,ntv,shv)
colnames(setboost) = c("tdepth","ntree","shrink")
phatL$boost = matrix(0.0,nrow(testDf),nrow(setboost))

##gbm with distribution="bernoulli" wants a numeric 0/1 response
trainDfB = trainDf; trainDfB$y = as.numeric(trainDfB$y)-1
testDfB = testDf; testDfB$y = as.numeric(testDfB$y)-1

##fit boosting
for(i in 1:nrow(setboost)) {
   cat("on boosting fit ",i,"\n")
   print(setboost[i,])
   ##fit and predict
   fboost = gbm(y~.,data=trainDfB,distribution="bernoulli",
      n.trees=setboost[i,2],interaction.depth=setboost[i,1],shrinkage=setboost[i,3])
   phat = predict(fboost,newdata=testDfB,n.trees=setboost[i,2],type="response")
   phatL$boost[,i] = phat
}
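######################################################################
cat("### variable importance (added sketch)\n")
# Not in the original script: a quick, hedged look at which predictors
# the ensembles lean on. frf and fboost hold the *last* fits from the
# loops above, not necessarily the best settings, so treat this as a
# sanity check only. varImpPlot() is from the randomForest package and
# summary.gbm() reports gbm's relative influence; both are already loaded.
varImpPlot(frf)                      #randomForest importance plot
print(summary(fboost,plotit=FALSE))  #gbm relative influence table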
######################################################################
cat("### plot loss\n")
lossL = list()
nmethod = length(phatL)
for(i in 1:nmethod) {
   nrun = ncol(phatL[[i]])
   lvec = rep(0,nrun)
   print(nrun)
   for(j in 1:nrun) lvec[j] = lossf(testDf$y,phatL[[i]][,j])
   lossL[[i]] = lvec; names(lossL)[i] = names(phatL)[i]
}
lossv = unlist(lossL)

plot(lossv,ylab="loss on Test",type="n")
nloss = 0
for(i in 1:nmethod) {
   ii = nloss + 1:ncol(phatL[[i]])
   points(ii,lossv[ii],col=i,pch=17)
   nloss = nloss + ncol(phatL[[i]])
}
legend("topright",legend=names(phatL),col=1:nmethod,pch=rep(17,nmethod))

######################################################################
cat("### lift\n")
nmethod = length(phatL)
phatBest = matrix(0.0,nrow(testDf),nmethod) #pick off best from each method
colnames(phatBest) = names(phatL)
for(i in 1:nmethod) {
   nrun = ncol(phatL[[i]])
   lvec = rep(0,nrun)
   print(nrun)
   for(j in 1:nrun) lvec[j] = lossf(testDf$y,phatL[[i]][,j])
   print(lvec)
   imin = which.min(lvec) #settings with the smallest test deviance
   cat("imin: ",imin,"\n")
   phatBest[,i] = phatL[[i]][,imin]
}
pairs(phatBest)

dfrac = (1:nrow(testDf))/nrow(testDf)
plot(c(0,1),c(0,1),xlab='% tried',ylab='% of successes',cex.lab=2,type="n")
for(i in 1:ncol(phatBest)) {
   temp = liftf(testDf$y,phatBest[,i],dopl=FALSE)
   lines(dfrac,temp,col=i)
}
abline(0,1,lty=2)
legend("topleft",legend=names(phatL),col=1:nmethod,lty=rep(1,nmethod))
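######################################################################
cat("### AUC (added sketch)\n")
# Not in the original script: a second comparison metric alongside the
# deviance loss and lift above. This computes test-set AUC for each
# column of phatBest via the rank (Mann-Whitney) formula in base R, so
# no extra package is needed; aucf is a helper name introduced here.
aucf = function(y,phat) {
   if(is.factor(y)) y = as.numeric(y)-1
   r = rank(phat)                          #average ranks handle ties
   n1 = sum(y==1); n0 = sum(y==0)
   (sum(r[y==1]) - n1*(n1+1)/2)/(n1*n0)    #P(phat for a 1 > phat for a 0)
}
print(round(apply(phatBest,2,function(p) aucf(testDf$y,p)),4))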