## ----load-libs, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE--------
library(ggplot2)
library(stringr)
library(tidyr)
library(dplyr)
library(gridExtra)
library(caret)
library(pROC)

## ----read-in-data, echo=FALSE, eval=TRUE---------------------------------
insurance <- read.csv("https://raw.githubusercontent.com/heathergeiger/Data621_hw4/master/insurance_training_data.csv",header=TRUE,stringsAsFactors=FALSE)

insurance <- insurance[,setdiff(colnames(insurance),"INDEX")]

for(column in c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"))
{
insurance[,column] <- str_replace_all(insurance[,column],pattern='\\$',replace='')
insurance[,column] <- str_replace_all(insurance[,column],pattern=',',replace='')
insurance[,column] <- as.numeric(as.vector(insurance[,column]))
}

## ----read-in-evaluation, echo=FALSE, eval=TRUE---------------------------
evaluation <- read.csv("https://raw.githubusercontent.com/heathergeiger/Data621_hw4/master/insurance-evaluation-data.csv",header=TRUE,stringsAsFactors=FALSE)

evaluation <- evaluation[,setdiff(colnames(evaluation),"INDEX")]

for(column in c("INCOME","HOME_VAL","BLUEBOOK","OLDCLAIM"))
{
evaluation[,column] <- str_replace_all(evaluation[,column],pattern='\\$',replace='')
evaluation[,column] <- str_replace_all(evaluation[,column],pattern=',',replace='')
evaluation[,column] <- as.numeric(as.vector(evaluation[,column]))
}

## ----num-non-NA-per-column, echo=FALSE, eval=TRUE------------------------
print("Total number of records in data:")
nrow(insurance)

print("Number of non-NA values per variable in data:")

non_NA_per_column <- insurance %>%
gather() %>%
na.omit(value) %>%
count(key)

non_NA_per_column <- data.frame(non_NA_per_column)

non_NA_per_column <- non_NA_per_column[order(non_NA_per_column[,2]),]

data.frame(Variable = non_NA_per_column[,1],n = non_NA_per_column[,2])

## ----num-unique-per-column, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE----
num_unique_values_per_variable <- insurance %>%
gather() %>%
group_by(key) %>%
summarize(uniques = n_distinct(value))

num_unique_values_per_variable <- data.frame(num_unique_values_per_variable,stringsAsFactors=FALSE)

num_unique_values_per_variable <- data.frame(Variable = num_unique_values_per_variable[,1],Num.uniques = num_unique_values_per_variable[,2],stringsAsFactors=FALSE)

num_unique_values_per_variable$Variable <- as.vector(num_unique_values_per_variable$Variable)

num_unique_values_per_variable[order(num_unique_values_per_variable[,2]),]

print("Levels of EDUCATION:")
unique(insurance$EDUCATION)
print("Levels of HOMEKIDS:")
unique(insurance$HOMEKIDS[order(insurance$HOMEKIDS)])

## ----format-binary-vars-training-data, echo=FALSE, eval=TRUE-------------
colnames(insurance) <- plyr::mapvalues(colnames(insurance),
	from=c("CAR_USE","MSTATUS","PARENT1","RED_CAR","REVOKED","SEX","URBANICITY"),
	to=c("Commercial_vehicle","Married","Single_parent","Red_car","Revoked","Sex_male","Urban_not_rural"))

binary_variable_translations <- data.frame(Variable = c("Commercial_vehicle","Married","Single_parent","Red_car","Revoked","Sex_male","Urban_not_rural"),
	True.value = c("Commercial","Yes","Yes","yes","Yes","M","Highly Urban/ Urban"),
	stringsAsFactors=FALSE)

for(i in 1:length(binary_variable_translations$Variable))
{
var <- binary_variable_translations$Variable[i]
true_value <- binary_variable_translations$True.value[i]
insurance[,var] <- ifelse(insurance[,var] == true_value,1,0)
}

## ----format-binary-vars-evaluation-data, echo=FALSE, eval=TRUE-----------
colnames(evaluation) <- plyr::mapvalues(colnames(evaluation),
	from=c("CAR_USE","MSTATUS","PARENT1","RED_CAR","REVOKED","SEX","URBANICITY"),
	to=c("Commercial_vehicle","Married","Single_parent","Red_car","Revoked","Sex_male","Urban_not_rural"))

for(i in 1:length(binary_variable_translations$Variable))
{
var <- binary_variable_translations$Variable[i]
true_value <- binary_variable_translations$True.value[i]
evaluation[,var] <- ifelse(evaluation[,var] == true_value,1,0)
}

## ----barplot-binary-vars, echo=FALSE, eval=TRUE--------------------------
insurance[,c("Commercial_vehicle","Married","Single_parent","Red_car","Revoked","Sex_male","Urban_not_rural","TARGET_FLAG")] %>%
gather("variable","value") %>%
group_by(variable) %>%
count(value) %>%
mutate(value = factor(value)) %>%
mutate(percent = n*100/8161) %>%
ggplot(.,
aes(variable,percent)) +
geom_bar(stat = "identity", aes(fill = value)) +
xlab("Variable") +
ylab("Percent of records") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----barplot-select-discrete-numeric-vars, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE----
insurance[,c("CLM_FREQ","HOMEKIDS","KIDSDRIV")] %>%
gather("variable","value") %>%
group_by(variable) %>%
count(value) %>%
mutate(value = factor(value,levels=5:0)) %>%
mutate(percent = n*100/8161) %>%
ggplot(.,
aes(variable,percent)) +
geom_bar(stat = "identity", aes(fill = value)) +
xlab("Variable") +
ylab("Percent of records") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(values = rev(c("#999999","#E69F00", "#56B4E9", "#009E73", "#F0E442","#CC79A7")))

## ----convert-blank-job-to-not-stated-training-and-evaluation, echo=FALSE, eval=TRUE----
insurance[insurance$JOB == "","JOB"] <- "Not stated"
evaluation[evaluation$JOB == "","JOB"] <- "Not stated"

## ----barplot-factor-vars, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE----
for(var in c("EDUCATION","CAR_TYPE","JOB"))
{
frequency_per_level <- data.frame(table(insurance[,var]))
colnames(frequency_per_level) <- c("value","n")

print(ggplot(frequency_per_level,
aes(value,n*100/8161)) +
geom_bar(stat = "identity",col="black",fill="darkgrey") +
xlab("") +
ylab("Percent of records") +
ggtitle(var) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
}

## ----convert-clmfreq-homekids-and-kidsdriv-to-binary-in-training-and-evaluation, echo=FALSE, eval=TRUE----
colnames(insurance) <- plyr::mapvalues(colnames(insurance),
	from=c("CLM_FREQ","HOMEKIDS","KIDSDRIV"),
	to=c("Past_claim","Kids","Driving_kids"))

colnames(evaluation) <- plyr::mapvalues(colnames(evaluation),
	from=c("CLM_FREQ","HOMEKIDS","KIDSDRIV"),
	to=c("Past_claim","Kids","Driving_kids"))

for(var in c("Past_claim","Kids","Driving_kids"))
{
insurance[,var] <- ifelse(insurance[,var] > 0,1,0)
evaluation[,var] <- ifelse(evaluation[,var] > 0,1,0)
}

## ----barplot-mvr-pts, echo=FALSE, eval=TRUE------------------------------
frequency_per_level <- data.frame(table(insurance[,"MVR_PTS"]))
colnames(frequency_per_level) <- c("value","n")
#frequency_per_level$value <- factor(frequency_per_level$value,levels=as.numeric(frequency_per_level$value)[order(as.numeric(frequency_per_level$value))])
frequency_per_level$value <- factor(frequency_per_level$value,levels=c(0:10,11,13))

ggplot(frequency_per_level,
aes(value,n*100/8161)) +
geom_bar(stat = "identity",col="black",fill="darkgrey") +
xlab("") +
ylab("Percent of records") +
ggtitle("MVR_PTS") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----mvr-pts-max-10-training-and-eval, echo=FALSE, eval=TRUE-------------
insurance$MVR_PTS <- ifelse(insurance$MVR_PTS > 10,10,insurance$MVR_PTS)
evaluation$MVR_PTS <- ifelse(evaluation$MVR_PTS > 10,10,evaluation$MVR_PTS)

## ----barplot-carage-tif-yoj, echo=FALSE, eval=TRUE-----------------------
for(var in c("CAR_AGE","TIF","YOJ"))
{
frequency_per_level <- data.frame(table(insurance[,var]))
colnames(frequency_per_level) <- c("value","n")
frequency_per_level$value <- as.vector(frequency_per_level$value)
frequency_per_level <- rbind(frequency_per_level,data.frame(value = "Not stated",n = length(which(is.na(insurance[,var]) == TRUE)),stringsAsFactors=FALSE))
frequency_per_level$value <- factor(frequency_per_level$value,levels=c(unique(insurance[,var])[order(unique(insurance[,var]))],"Not stated"))

#print(var)
#print(frequency_per_level)
#frequency_per_level$value

print(ggplot(frequency_per_level,
aes(value,n*100/8161)) +
geom_bar(stat = "identity",col="black",fill="darkgrey") +
xlab("") +
ylab("Percent of records") +
ggtitle(var) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
}

## ----convert-yoj-to-binary-training-and-eval, echo=FALSE, eval=TRUE------
colnames(insurance) <- plyr::mapvalues(colnames(insurance),from="YOJ",to="New_on_the_job")
colnames(evaluation) <- plyr::mapvalues(colnames(evaluation),from="YOJ",to="New_on_the_job")

insurance[,"New_on_the_job"] <- ifelse(insurance[,"New_on_the_job"] == 0,1,0)
evaluation[,"New_on_the_job"] <- ifelse(evaluation[,"New_on_the_job"] == 0,1,0)

insurance[which(is.na(insurance[,"New_on_the_job"]) == TRUE),"New_on_the_job"] <- 0
evaluation[which(is.na(evaluation[,"New_on_the_job"]) == TRUE),"New_on_the_job"] <- 0

## ----convert-homeval-NA-to-0-training-and-eval, echo=FALSE, eval=TRUE----
insurance[which(is.na(insurance$HOME_VAL) == TRUE),"HOME_VAL"] <- 0
evaluation[which(is.na(evaluation$HOME_VAL) == TRUE),"HOME_VAL"] <- 0

## ----count-zeroes-homeval-and-income, echo=FALSE, eval=TRUE--------------
print("Percent of records with HOME_VAL > 0:")
round((length(which(insurance$HOME_VAL > 0))*100)/nrow(insurance),digits=2)

print("Percent of records with INCOME not stated:")
round((length(which(is.na(insurance$INCOME) == TRUE))*100)/nrow(insurance),digits=2)
print("Percent of records with INCOME == 0:")
round((length(which(insurance$INCOME == 0))*100)/nrow(insurance),digits=2)
print("Percent of records with INCOME > 0:")
round((length(which(insurance$INCOME > 0))*100)/nrow(insurance),digits=2)

## ----hist-numeric-vars, echo=FALSE, eval=TRUE,fig.width=12,fig.height=8----
par(mfrow=c(2,4))

for(var in c("AGE","TRAVTIME","BLUEBOOK","HOME_VAL","INCOME","OLDCLAIM","TARGET_AMT"))
{
hist(insurance[insurance[,var] > 0,var],
xlab="Values",
ylab="Number of records",
main=var,
labels=TRUE)
}

hist(insurance$TARGET_AMT[insurance$TARGET_AMT >0 & insurance$TARGET_AMT < 10000],
xlab="Values",
ylab="Number of records",
main="TARGET AMT < $10,000",
labels=TRUE)

## ----add-homeowner-var, echo=FALSE, eval=TRUE----------------------------
insurance <- data.frame(insurance,
	Homeowner = ifelse(insurance$HOME_VAL > 0,1,0),
	stringsAsFactors=FALSE,check.names=FALSE)

evaluation <- data.frame(evaluation,
	Homeowner = ifelse(evaluation$HOME_VAL > 0,1,0),
	stringsAsFactors=FALSE,check.names=FALSE)

## ----recount-unique-values-per-var, echo=FALSE, eval=TRUE----------------
num_unique_values_per_variable <- insurance %>%
gather() %>%
group_by(key) %>%
summarize(uniques = n_distinct(value))

num_unique_values_per_variable <- data.frame(num_unique_values_per_variable,stringsAsFactors=FALSE)

num_unique_values_per_variable <- data.frame(Variable = num_unique_values_per_variable[,1],Num.uniques = num_unique_values_per_variable[,2],stringsAsFactors=FALSE)

num_unique_values_per_variable$Variable <- as.vector(num_unique_values_per_variable$Variable)

## ----crash-rate-binary-vars, echo=FALSE, eval=TRUE-----------------------
binary_variables <- num_unique_values_per_variable$Variable[num_unique_values_per_variable$Num.uniques == 2]

binary_variables_data <- insurance[,binary_variables]

binary_variables_data <- data.frame(TARGET_FLAG = rep(binary_variables_data$TARGET_FLAG,times=(ncol(binary_variables_data) - 1)),
	gather(binary_variables_data[,setdiff(colnames(binary_variables_data),"TARGET_FLAG")],"variable","value"),
	stringsAsFactors=FALSE)

binary_variables_data$value <- factor(binary_variables_data$value)

count_per_target_and_predictor <- binary_variables_data %>%
				group_by(TARGET_FLAG,variable) %>%
				count(value)

count_per_target_and_predictor <- data.frame(count_per_target_and_predictor,check.names=FALSE)

count_per_predictor_level <- count_per_target_and_predictor %>%
	group_by(variable,value) %>%
	summarize(total = sum(n))

count_per_predictor_level <- data.frame(count_per_predictor_level,check.names=FALSE)

count_per_target_and_predictor <- merge(count_per_target_and_predictor,count_per_predictor_level,by=c("variable","value"))

count_per_target_and_predictor <- count_per_target_and_predictor[count_per_target_and_predictor$TARGET_FLAG == 1,]

count_per_target_and_predictor <- data.frame(variable = count_per_target_and_predictor$variable,
	value = count_per_target_and_predictor$value,
	crash.rate = count_per_target_and_predictor$n*100/count_per_target_and_predictor$total)

ggplot(count_per_target_and_predictor,
aes(variable,crash.rate)) +
geom_bar(stat = "identity", aes(fill = value),position="dodge") +
xlab("Variable") +
ylab("Crash rate (%)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----single-parents-def, echo=FALSE, eval=TRUE---------------------------
print("Number of unmarried individuals with children:")
length(which(insurance[,"Married"] == 0 & insurance[,"Kids"] == 1))
print("Number of single parents:")
length(which(insurance[,"Single_parent"] == 1))

## ----crash-rate-partnered-parents-vs-no-kids, echo=FALSE, eval=TRUE------
print("Crash rate of single parents:")
round((length(which(insurance[,"Single_parent"] == 1 & insurance[,"TARGET_FLAG"] == 1))*100)/length(which(insurance[,"Single_parent"] == 1)),digits=2)

print("Crash rate of partnered parents:")
round((length(which(insurance[,"Kids"] == 1 & insurance[,"Married"] == 1 & insurance[,"TARGET_FLAG"] == 1))*100)/length(which(insurance[,"Kids"] == 1 & insurance[,"Married"] == 1)),digits=2)

print("Crash rate of individuals without children:")
round((length(which(insurance[,"Kids"] == 0 & insurance[,"TARGET_FLAG"] == 1))*100)/length(which(insurance[,"Kids"] == 0)),digits=2)

## ----crash-rate-factor-vars, echo=FALSE, eval=TRUE-----------------------
factor_variables <- num_unique_values_per_variable$Variable[num_unique_values_per_variable$Num.uniques > 2 & num_unique_values_per_variable$Num.uniques < 10]

factor_variables_data <- insurance[,factor_variables]

factor_variables_data <- data.frame(TARGET_FLAG = rep(insurance$TARGET_FLAG,times=ncol(factor_variables_data)),
	gather(factor_variables_data,"variable","value"),
	stringsAsFactors=FALSE,check.names=FALSE)

count_per_target_and_predictor <- factor_variables_data %>%
				group_by(TARGET_FLAG,variable) %>%
				count(value)

count_per_target_and_predictor <- data.frame(count_per_target_and_predictor,check.names=FALSE)

count_per_predictor_level <- count_per_target_and_predictor %>%
			group_by(variable,value) %>%
			summarize(total = sum(n))

count_per_predictor_level <- data.frame(count_per_predictor_level,check.names=FALSE)

count_per_target_and_predictor <- merge(count_per_target_and_predictor,count_per_predictor_level,by=c("variable","value"))
count_per_target_and_predictor <- count_per_target_and_predictor[count_per_target_and_predictor$TARGET_FLAG == 1,]

count_per_target_and_predictor <- data.frame(variable = count_per_target_and_predictor$variable,
		value = count_per_target_and_predictor$value,
		crash.rate = count_per_target_and_predictor$n*100/count_per_target_and_predictor$total)

for(var in unique(count_per_target_and_predictor$variable))
{
print(ggplot(count_per_target_and_predictor[count_per_target_and_predictor$variable == var,],
aes(value,crash.rate)) +
geom_bar(stat = "identity") +
xlab("") +
ylab("Crash rate (%)") +
ggtitle(var) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
}

## ----add-binary-education-minivan-job-to-training-and-eval, echo=FALSE, eval=TRUE----
insurance <- data.frame(insurance,
	College_education = ifelse(insurance$EDUCATION == "Bachelors" | insurance$EDUCATION == "Masters" | insurance$EDUCATION == "PhD",1,0),
	Minivan = ifelse(insurance$CAR_TYPE  == "Minivan",1,0),
	Student = ifelse(insurance$JOB == "Student",1,0),
	Blue_collar = ifelse(insurance$JOB == "z_Blue Collar",1,0),
	check.names=FALSE,stringsAsFactors=FALSE)

evaluation <- data.frame(evaluation,
	College_education = ifelse(evaluation$EDUCATION == "Bachelors" | evaluation$EDUCATION == "Masters" | evaluation$EDUCATION == "PhD",1,0),
	Minivan = ifelse(evaluation$CAR_TYPE == "Minivan",1,0),
	Student = ifelse(evaluation$JOB == "Student",1,0),
	Blue_collar = ifelse(evaluation$JOB == "z_Blue Collar",1,0),
	check.names=FALSE,stringsAsFactors=FALSE)

## ----car-age-vs-target-flag, echo=FALSE, eval=TRUE-----------------------
print("Crash rate when CAR_AGE = 1:")
round((length(which(insurance$CAR_AGE == 1 & insurance$TARGET_FLAG == 1))*100)/length(which(insurance$CAR_AGE == 1)),digits=2)
print("Crash rate when CAR_AGE >= 20:")
round((length(which(insurance$CAR_AGE >= 20 & insurance$TARGET_FLAG == 1))*100)/length(which(insurance$CAR_AGE >= 20)),digits=2)

count_per_target_and_predictor <- insurance[insurance$CAR_AGE >= 3 & insurance$CAR_AGE <= 19 & is.na(insurance$CAR_AGE) == FALSE,c("CAR_AGE","TARGET_FLAG")] %>%
			count(TARGET_FLAG,CAR_AGE)	

count_per_target_and_predictor <- data.frame(count_per_target_and_predictor,check.names=FALSE)

count_per_predictor_level <- insurance[insurance$CAR_AGE >= 3 & insurance$CAR_AGE <= 19 & is.na(insurance$CAR_AGE) == FALSE,c("CAR_AGE","TARGET_FLAG")] %>%
			count(CAR_AGE)

count_per_predictor_level <- data.frame(count_per_predictor_level,check.names=FALSE)

count_per_target_and_predictor <- merge(count_per_target_and_predictor,count_per_predictor_level,by="CAR_AGE")

count_per_target_and_predictor <- count_per_target_and_predictor[count_per_target_and_predictor$TARGET_FLAG == 1,]

count_per_target_and_predictor <- data.frame(value = count_per_target_and_predictor$CAR_AGE,
				crash.rate = count_per_target_and_predictor$n.x*100/count_per_target_and_predictor$n.y)

count_per_target_and_predictor$value <- factor(count_per_target_and_predictor$value,levels=3:19)

ggplot(count_per_target_and_predictor,
aes(value,crash.rate)) +
geom_bar(stat = "identity") +
xlab("CAR_AGE") +
ylab("Crash rate (%)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----tif-vs-target-flag, echo=FALSE, eval=TRUE---------------------------
tif_for_barplot <- insurance[insurance$TIF == 1 | insurance$TIF == 3 | insurance$TIF == 4 | insurance$TIF == 6 | insurance$TIF == 7 | insurance$TIF == 9 | insurance$TIF >= 10,c("TARGET_FLAG","TIF")]

tif_for_barplot$TIF <- ifelse(tif_for_barplot$TIF > 10,"11+",tif_for_barplot$TIF)

count_per_target_and_predictor <- tif_for_barplot %>% count(TARGET_FLAG,TIF)
count_per_target_and_predictor <- data.frame(count_per_target_and_predictor,check.names=FALSE)

count_per_predictor_level <- tif_for_barplot %>% count(TIF)
count_per_predictor_level <- data.frame(count_per_predictor_level,check.names=FALSE)

count_per_target_and_predictor <- merge(count_per_target_and_predictor,count_per_predictor_level,by="TIF")

count_per_target_and_predictor <- count_per_target_and_predictor[count_per_target_and_predictor$TARGET_FLAG == 1,]

count_per_target_and_predictor <- data.frame(value = count_per_target_and_predictor$TIF,
		crash.rate = count_per_target_and_predictor$n.x*100/count_per_target_and_predictor$n.y)

count_per_target_and_predictor$value <- factor(count_per_target_and_predictor$value,levels=c("1","3","4","6","7","9","10","11+"))

ggplot(count_per_target_and_predictor,
aes(value,crash.rate)) +
geom_bar(stat = "identity") +
xlab("TIF") +
ylab("Crash rate (%)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----mvr-pts-vs-target-flag, echo=FALSE, eval=TRUE-----------------------
count_per_target_and_predictor <- insurance[,c("TARGET_FLAG","MVR_PTS")] %>% count(TARGET_FLAG,MVR_PTS)
count_per_target_and_predictor <- data.frame(count_per_target_and_predictor,check.names=FALSE)

count_per_predictor_level <- insurance[,c("TARGET_FLAG","MVR_PTS")] %>% count(MVR_PTS)
count_per_predictor_level <- data.frame(count_per_predictor_level,check.names=FALSE)

count_per_target_and_predictor <- merge(count_per_target_and_predictor,count_per_predictor_level,by="MVR_PTS")

count_per_target_and_predictor <- count_per_target_and_predictor[count_per_target_and_predictor$TARGET_FLAG == 1,]

count_per_target_and_predictor <- data.frame(value = count_per_target_and_predictor$MVR_PTS,
	crash.rate = count_per_target_and_predictor$n.x*100/count_per_target_and_predictor$n.y)

count_per_target_and_predictor$value <- factor(count_per_target_and_predictor$value,levels=0:10)

ggplot(count_per_target_and_predictor,
aes(value,crash.rate)) +
geom_bar(stat = "identity") +
xlab("MVR_PTS") +
ylab("Crash rate (%)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----mvr-pts-add-constant, echo=FALSE, eval=TRUE-------------------------
par(mfrow=c(1,2))

plot(as.numeric(as.vector(count_per_target_and_predictor$value)),
count_per_target_and_predictor$crash.rate,
xlab="Motor vehicle record points (raw)",
ylab="Crash rate (%)",
type="o")

abline(lm(count_per_target_and_predictor$crash.rate ~ as.numeric(as.vector(count_per_target_and_predictor$value))),lty=2)

plot(ifelse(as.numeric(as.vector(count_per_target_and_predictor$value)) >= 7,as.numeric(as.vector(count_per_target_and_predictor$value)) + 3,as.numeric(as.vector(count_per_target_and_predictor$value))),
count_per_target_and_predictor$crash.rate,
xlab="Motor vehicle record points (modified)",
ylab="Crash rate (%)",
main="Add 3 when points >= 7",
type="o")

abline(lm(count_per_target_and_predictor$crash.rate ~ ifelse(as.numeric(as.vector(count_per_target_and_predictor$value)) >= 7,as.numeric(as.vector(count_per_target_and_predictor$value)) + 3,as.numeric(as.vector(count_per_target_and_predictor$value)))),lty=2)

## ----add-mvr-pts-modified, echo=FALSE, eval=TRUE-------------------------
insurance <- data.frame(insurance,
	MVR_PTS_MOD = ifelse(insurance$MVR_PTS >= 7,insurance$MVR_PTS + 3,insurance$MVR_PTS),
	check.names=FALSE,stringsAsFactors=FALSE)

evaluation <- data.frame(evaluation,
	MVR_PTS_MOD = ifelse(evaluation$MVR_PTS >= 7,evaluation$MVR_PTS + 3,evaluation$MVR_PTS),
	check.names=FALSE,stringsAsFactors=FALSE)

## ----age-groups-vs-target-flag, echo=FALSE, eval=TRUE--------------------
age_groups <- ifelse(insurance$AGE < 25,"Young (<25)","In-between (25-64)")
age_groups <- ifelse(insurance$AGE >= 65,"Senior (65+)",age_groups)

for(var in c("Young (<25)","In-between (25-64)","Senior (65+)"))
{
crash_rate <- round((length(which(age_groups == var & insurance$TARGET_FLAG == 1))*100)/length(which(age_groups == var)),digits=2)
print(paste0("Crash rate for age group ",var,": ",crash_rate))
}

## ----add-binary-var-young-age, echo=FALSE, eval=TRUE---------------------
insurance <- data.frame(insurance,
	Young_age = ifelse(insurance$AGE < 25,1,0),
	check.names=FALSE,stringsAsFactors=FALSE)

evaluation <- data.frame(evaluation,
	Young_age = ifelse(evaluation$AGE < 25,1,0),
	check.names=FALSE,stringsAsFactors=FALSE)

insurance[which(is.na(insurance$Young_age) == TRUE),"Young_age"] <- 0
evaluation[which(is.na(evaluation$Young_age) == TRUE),"Young_age"] <- 0

## ----cor-travtime-bluebook-income-target-flag, echo=FALSE, eval=TRUE-----
par(mfrow=c(1,2))
boxplot(TRAVTIME ~ factor(TARGET_FLAG),data=insurance,ylab="TRAVTIME")
boxplot(TRAVTIME ~ factor(TARGET_FLAG),data=insurance[insurance$TRAVTIME < 60,],ylab="TRAVTIME",main="TRAVTIME < 60 only")

par(mfrow=c(1,2))
boxplot(INCOME ~ factor(TARGET_FLAG),data=insurance,ylab="INCOME")
boxplot(BLUEBOOK ~ factor(TARGET_FLAG),data=insurance,ylab="BLUEBOOK")
boxplot(log10(INCOME + 1)  ~ factor(TARGET_FLAG),data=insurance,ylab="log10(INCOME + 1)")
boxplot(log10(BLUEBOOK) ~ factor(TARGET_FLAG),data=insurance,ylab="log10(BLUEBOOK)")

## ----obtain-latest-binary-vars, echo=FALSE, eval=TRUE--------------------
num_unique_values_per_variable <- insurance %>%
gather() %>%
group_by(key) %>%
summarize(uniques = n_distinct(value))

num_unique_values_per_variable <- data.frame(num_unique_values_per_variable,stringsAsFactors=FALSE)

num_unique_values_per_variable <- data.frame(Variable = num_unique_values_per_variable[,1],Num.uniques = num_unique_values_per_variable[,2],stringsAsFactors=FALSE)

num_unique_values_per_variable$Variable <- as.vector(num_unique_values_per_variable$Variable)

binary_variables <- num_unique_values_per_variable$Variable[num_unique_values_per_variable$Num.uniques == 2]

binary_variables <- setdiff(binary_variables,c("TARGET_FLAG","Red_car","Sex_male"))

## ----correlations-within-binary, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE----
binary_variables_data <- insurance[,binary_variables]

correlation_variables <- abs(cor(binary_variables_data,use="pairwise.complete.obs"))

for(i in 1:(nrow(correlation_variables) - 1))
{
correlation_variables[i,seq(from=i,to=ncol(correlation_variables),by=1)] <- NA
}

correlation_variables[ncol(binary_variables_data),ncol(binary_variables_data)] <- NA

correlation_variables <- gather(data.frame(correlation_variables),"y","correlation")

correlation_variables <- data.frame(x = rep(unique(correlation_variables$y),times=length(unique(correlation_variables$y))),correlation_variables)

correlation_variables$x <- factor(correlation_variables$x,levels=colnames(binary_variables_data))
correlation_variables$y <- factor(correlation_variables$y,levels=colnames(binary_variables_data))

correlation_variables %>%
ggplot(.,
aes(x = x,y = y)) +
geom_tile(aes(fill = correlation)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_gradient2(low = "blue",mid = "white",high = "red",na.value = "grey50") +
geom_text(aes(label = round(correlation, 2)),size=2) +
ggtitle("Magnitude only shown (abs value if negative)")

## ----replot-correlations-within-binary, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE----
correlation_variables$correlation[as.vector(correlation_variables$x) == "Kids" & as.vector(correlation_variables$y) == "Driving_kids"] <- NA
correlation_variables$correlation[as.vector(correlation_variables$x) == "Single_parent" & as.vector(correlation_variables$y) == "Married"] <- NA
correlation_variables$correlation[as.vector(correlation_variables$x) == "Single_parent" & as.vector(correlation_variables$y) == "Kids"] <- NA

correlation_variables %>%
ggplot(.,
aes(x = x,y = y)) +
geom_tile(aes(fill = correlation)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_gradient2(low = "blue",mid = "white",high = "red",na.value = "grey50") +
geom_text(aes(label = round(correlation, 2)),size=2) +
ggtitle("Magnitude only shown (abs value if negative)")

## ----correlations-within-numeric, echo=FALSE, eval=TRUE------------------
pairs(insurance[,c("INCOME","BLUEBOOK","TRAVTIME")],lower.panel=NULL,cex=0.5)

print("Correlation INCOME and BLUEBOOK:")
round(cor(insurance$INCOME,insurance$BLUEBOOK,use="pairwise.complete.obs"),digits=2)
print("Correlation INCOME and TRAVTIME:")
round(cor(insurance$INCOME,insurance$TRAVTIME,use="pairwise.complete.obs"),digits=2)
print("Correlation TRAVTIME and BLUEBOOK:")
round(cor(insurance$BLUEBOOK,insurance$TRAVTIME,use="pairwise.complete.obs"),digits=2)

## ----correlations-binary-vs-income, echo=FALSE, eval=TRUE----------------
correlations_income_vs_binary_var <- c()

for(var in binary_variables)
{
correlations_income_vs_binary_var <- c(correlations_income_vs_binary_var,cor(insurance[,var],insurance$INCOME,use="pairwise.complete.obs"))
}

correlations_income_vs_binary_var <- data.frame(Variable = binary_variables,
	Correlation = round(correlations_income_vs_binary_var,digits=2),
	stringsAsFactors=FALSE)

correlations_income_vs_binary_var[order(correlations_income_vs_binary_var$Correlation),]

## ----correlations-binary-vs-travtime, echo=FALSE, eval=TRUE--------------
correlations_travtime_vs_binary_var <- c()

for(var in binary_variables)
{
correlations_travtime_vs_binary_var <- c(correlations_travtime_vs_binary_var,cor(insurance[,var],insurance$TRAVTIME,use="pairwise.complete.obs"))
}

correlations_travtime_vs_binary_var <- data.frame(Variable = binary_variables,
	Correlation = round(correlations_travtime_vs_binary_var,digits=2),
	stringsAsFactors=FALSE)

correlations_travtime_vs_binary_var[order(correlations_travtime_vs_binary_var$Correlation),]

## ----correlation-numeric-vars-target-amt, echo=FALSE, eval=TRUE,message=FALSE,warning=FALSE----
numeric_variables <- c("TIF","CAR_AGE","AGE","INCOME","BLUEBOOK","TARGET_AMT","OLDCLAIM")

numeric_variables_data <- insurance[insurance$TARGET_FLAG == 1,numeric_variables]

numeric_variables_data <- data.frame(numeric_variables_data,
	log10.BLUEBOOK = log10(numeric_variables_data$BLUEBOOK),
	log10.income = log10(numeric_variables_data$INCOME + 1),
	log10.target = log10(numeric_variables_data$TARGET_AMT))

numeric_variables_data$OLDCLAIM <- ifelse(numeric_variables_data$OLDCLAIM > 0,numeric_variables_data$OLDCLAIM,NA)

numeric_variables_data <- data.frame(numeric_variables_data,
	log10.OLDCLAIM = log10(numeric_variables_data$OLDCLAIM))

correlation_variables <- abs(cor(numeric_variables_data,use="pairwise.complete.obs"))

for(i in 1:(nrow(correlation_variables) - 1))
{
correlation_variables[i,seq(from=i,to=ncol(correlation_variables),by=1)] <- NA
}

correlation_variables[ncol(numeric_variables_data),ncol(numeric_variables_data)] <- NA

correlation_variables <- gather(data.frame(correlation_variables),"y","correlation")

correlation_variables <- data.frame(x = rep(unique(correlation_variables$y),times=length(unique(correlation_variables$y))),correlation_variables)

correlation_variables$x <- factor(correlation_variables$x,levels=colnames(numeric_variables_data))
correlation_variables$y <- factor(correlation_variables$y,levels=colnames(numeric_variables_data))

correlation_variables$correlation[correlation_variables$x == "log10.OLDCLAIM" & correlation_variables$y == "OLDCLAIM"] <- NA
correlation_variables$correlation[correlation_variables$x == "BLUEBOOK" & correlation_variables$y == "log10.BLUEBOOK"] <- NA
correlation_variables$correlation[correlation_variables$x == "log10.target" & correlation_variables$y == "TARGET_AMT"] <- NA
correlation_variables$correlation[correlation_variables$x == "log10.income" & correlation_variables$y == "INCOME"] <- NA

correlation_variables %>%
ggplot(.,
aes(x = x,y = y)) +
geom_tile(aes(fill = correlation)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_gradient2(low = "blue",mid = "white",high = "red",na.value = "grey50") +
geom_text(aes(label = round(correlation, 2)),size=2) +
ggtitle("Magnitude only shown (abs value if negative)")

## ----boxplot-cartype-vs-target-amt, echo=FALSE, eval=TRUE----------------
boxplot(log10(TARGET_AMT) ~ factor(CAR_TYPE,levels=c("Minivan","Sports Car","z_SUV","Pickup","Van","Panel Truck")),
xlab="Car type",
ylab="log10(TARGET_AMT)",
data=insurance[insurance$TARGET_FLAG == 1,])

## ----boxplot-commercial-vs-target-amt, echo=FALSE, eval=TRUE-------------
par(mfrow=c(1,2))

boxplot(log10(TARGET_AMT) ~ factor(Commercial_vehicle),
xlab="Commercial vehicle",
ylab="log10(TARGET_AMT)",
data=insurance[insurance$TARGET_FLAG == 1,])

boxplot(log10(TARGET_AMT) ~ factor(Commercial_vehicle),
xlab="Commercial vehicle",
ylab="log10(TARGET_AMT)",
data=insurance[insurance$TARGET_FLAG == 1 & insurance$TARGET_AMT >= 1000,],
main="TARGET_AMT >= $1,000 only")

## ----correlations-binary-vars-with-target-amt, echo=FALSE, eval=TRUE-----
correlations_binary_vs_target_amt <- c()

for(var in binary_variables)
{
correlations_binary_vs_target_amt <- c(correlations_binary_vs_target_amt,cor(insurance[insurance$TARGET_FLAG == 1,var],log10(insurance[insurance$TARGET_FLAG == 1,"TARGET_AMT"]),use="pairwise.complete.obs"))
}

correlations_binary_vs_target_amt <- data.frame(Variable = binary_variables,
	Correlation = round(correlations_binary_vs_target_amt,digits=2),
	stringsAsFactors=FALSE)

correlations_binary_vs_target_amt[order(correlations_binary_vs_target_amt$Correlation),]

## ----add-dummy-var-for-car-type-to-training-and-eval, echo=FALSE, eval=TRUE----
insurance <- data.frame(insurance,
	Car_type_for_claim_amount = ifelse(insurance$CAR_TYPE != "Van" & insurance$CAR_TYPE != "Panel Truck",0,1),
	check.names=FALSE,stringsAsFactors=FALSE)

evaluation <- data.frame(evaluation,
	Car_type_for_claim_amount = ifelse(evaluation$CAR_TYPE != "Van" & evaluation$CAR_TYPE != "Panel Truck",0,1),
	check.names=FALSE,stringsAsFactors=FALSE)

insurance$Car_type_for_claim_amount <- ifelse(insurance$CAR_TYPE == "Panel Truck",2,insurance$Car_type_for_claim_amount)
evaluation$Car_type_for_claim_amount <- ifelse(evaluation$CAR_TYPE == "Panel Truck",2,evaluation$Car_type_for_claim_amount)

## ----add-marriage-home-kids-score, echo=FALSE, eval=TRUE-----------------
marriage_home_and_kids_score_insurance <- rep(8,times=nrow(insurance))
marriage_home_and_kids_score_evaluation <- rep(8,times=nrow(evaluation))

marriage_home_and_kids_score_insurance <- ifelse(insurance$Married == 0,marriage_home_and_kids_score_insurance - 1,marriage_home_and_kids_score_insurance)
marriage_home_and_kids_score_insurance <- ifelse(insurance$Homeowner == 0,marriage_home_and_kids_score_insurance - 2,marriage_home_and_kids_score_insurance)
marriage_home_and_kids_score_insurance <- ifelse(insurance$Single_parent == 1,marriage_home_and_kids_score_insurance - 3,marriage_home_and_kids_score_insurance)
marriage_home_and_kids_score_insurance <- ifelse(insurance$Single_parent == 1 & insurance$Homeowner == 0,marriage_home_and_kids_score_insurance - 2,marriage_home_and_kids_score_insurance)

marriage_home_and_kids_score_evaluation <- ifelse(evaluation$Married == 0,marriage_home_and_kids_score_evaluation - 1,marriage_home_and_kids_score_evaluation)
marriage_home_and_kids_score_evaluation <- ifelse(evaluation$Homeowner == 0,marriage_home_and_kids_score_evaluation - 2,marriage_home_and_kids_score_evaluation)
marriage_home_and_kids_score_evaluation <- ifelse(evaluation$Single_parent == 1,marriage_home_and_kids_score_evaluation - 3,marriage_home_and_kids_score_evaluation)
marriage_home_and_kids_score_evaluation <- ifelse(evaluation$Homeowner == 0 & evaluation$Single_parent == 1,marriage_home_and_kids_score_evaluation - 2,marriage_home_and_kids_score_evaluation)

insurance <- data.frame(insurance,
	marriage_home_and_kids_score = marriage_home_and_kids_score_insurance,
	check.names=FALSE,stringsAsFactors=FALSE)

evaluation <- data.frame(evaluation,
	marriage_home_and_kids_score = marriage_home_and_kids_score_evaluation,
	check.names=FALSE,stringsAsFactors=FALSE)

## ----fill-in-income-NAs, echo=FALSE, eval=TRUE---------------------------
print("Median income in training data:")
median(insurance$INCOME,na.rm=TRUE)
print("Median income in evaluation data:")
median(evaluation$INCOME,na.rm=TRUE)
print("Median income across both:")
median(c(insurance$INCOME,evaluation$INCOME),na.rm=TRUE)

insurance[which(is.na(insurance$INCOME) == TRUE),"INCOME"] <- median(c(insurance$INCOME,evaluation$INCOME),na.rm=TRUE)
evaluation[which(is.na(evaluation$INCOME) == TRUE),"INCOME"] <- median(c(insurance$INCOME,evaluation$INCOME),na.rm=TRUE)

## ----model-select-binary-vars-marriage-home-kids-score-travtime-mvr-pts, echo=FALSE, eval=TRUE----
model1 <- glm(TARGET_FLAG ~ Driving_kids + New_on_the_job + TRAVTIME + Commercial_vehicle + Past_claim + Revoked + MVR_PTS_MOD + Urban_not_rural + College_education + Minivan + Young_age + marriage_home_and_kids_score,data=insurance,family="binomial")

summary(model1)

## ----model2-add-income, echo=FALSE, eval=TRUE----------------------------
model2 <- glm(TARGET_FLAG ~ Driving_kids + New_on_the_job + TRAVTIME + Commercial_vehicle + Past_claim + Revoked + MVR_PTS_MOD + Urban_not_rural + College_education + Minivan + Young_age + marriage_home_and_kids_score + log10(INCOME + 1),data=insurance,family="binomial")

summary(model2)

## ----model2-add-income-remove-new-on-the-job, echo=FALSE, eval=TRUE------
model2 <- glm(TARGET_FLAG ~ Driving_kids + TRAVTIME + Commercial_vehicle + Past_claim + Revoked + MVR_PTS_MOD + Urban_not_rural + College_education + Minivan + Young_age + marriage_home_and_kids_score + log10(INCOME + 1),data=insurance,family="binomial")

summary(model2)

## ----model3-separate-marriage-singleparent-homeowner, echo=FALSE, eval=TRUE----
model3 <- glm(TARGET_FLAG ~ Driving_kids + TRAVTIME + Commercial_vehicle + Past_claim + Revoked + MVR_PTS_MOD + Urban_not_rural + College_education + Minivan + Young_age + New_on_the_job + Married + Homeowner + Single_parent,data=insurance,family="binomial")

summary(model3)

## ----model1-claim-amt, echo=FALSE, eval=TRUE-----------------------------
model1_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK) + Car_type_for_claim_amount,data=insurance[insurance$TARGET_FLAG == 1,])

summary(model1_linear_model)

## ----model1-claim-amt-factor-cartype, echo=FALSE, eval=TRUE--------------
model1_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK) + factor(Car_type_for_claim_amount),data=insurance[insurance$TARGET_FLAG == 1,])

summary(model1_linear_model)

## ----model1-claim-amt-bluebook-only, echo=FALSE, eval=TRUE---------------
model1_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK),data=insurance[insurance$TARGET_FLAG == 1,])

summary(model1_linear_model)

## ----model2-linear-model-backward-selection, echo=FALSE, eval=TRUE-------
insurance_for_model2_linear_model <- data.frame(insurance,
        log10.BLUEBOOK = log10(insurance$BLUEBOOK),
        check.names=FALSE,stringsAsFactors=FALSE)

model2_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK) + Driving_kids + New_on_the_job + TRAVTIME + Commercial_vehicle + Past_claim + Revoked + MVR_PTS_MOD + Urban_not_rural + College_education + Minivan + Young_age + marriage_home_and_kids_score,
data=insurance[insurance$TARGET_FLAG == 1,])

summary(model2_linear_model)

## ----model-linear-model-backward-selection-remove-very-high-pvalues, echo=FALSE, eval=TRUE----
model2_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK) + MVR_PTS_MOD + marriage_home_and_kids_score,
	data=insurance[insurance$TARGET_FLAG == 1,])

summary(model2_linear_model)

## ----model-linear-model-married-homeowner-singleparent-individually, echo=FALSE, eval=TRUE----
model2_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK) + Kids + Married + Homeowner + Single_parent + MVR_PTS_MOD,
	data=insurance[insurance$TARGET_FLAG == 1,])

summary(model2_linear_model)

## ----model-linear-model-bluebook-married-mvrpts, echo=FALSE, eval=TRUE----
model2_linear_model <- lm(log10(TARGET_AMT) ~ log10(BLUEBOOK) + Married + MVR_PTS_MOD,
	data=insurance[insurance$TARGET_FLAG == 1,])

summary(model2_linear_model)

## ----add-log10, echo=FALSE, eval=TRUE------------------------------------
insurance <- data.frame(insurance,
	log10.INCOME = log10(insurance$INCOME + 1),
	log10.BLUEBOOK = log10(insurance$BLUEBOOK),
	log10.TARGET = log10(insurance$TARGET_AMT + 1),
	check.names=FALSE,stringsAsFactors=FALSE)

evaluation <- data.frame(evaluation,
	log10.INCOME = log10(evaluation$INCOME + 1),
	log10.BLUEBOOK = log10(evaluation$BLUEBOOK),
	log10.TARGET = log10(evaluation$TARGET_AMT + 1),
	check.names=FALSE,stringsAsFactors=FALSE)

model2 <- glm(TARGET_FLAG ~ Driving_kids + TRAVTIME + Commercial_vehicle + Past_claim + Revoked + MVR_PTS_MOD + Urban_not_rural + College_education + Minivan + Young_age + marriage_home_and_kids_score + log10.INCOME,data=insurance,family="binomial")

model1_linear_model <- lm(log10.TARGET ~ log10.BLUEBOOK,data=insurance[insurance$TARGET_FLAG == 1,])
model2_linear_model <- lm(log10.TARGET ~ log10.BLUEBOOK + Married + MVR_PTS_MOD,data=insurance[insurance$TARGET_FLAG == 1,])

## ----confusmatrix, echo=FALSE, eval=TRUE---------------------------------
model1_numeric_predictions <- predict(object = model1,data=insurance,type="response")
model2_numeric_predictions <- predict(object = model2,data=insurance,type="response")
model3_numeric_predictions <- predict(object = model3,data=insurance,type="response")

caret::confusionMatrix(data=ifelse(model1_numeric_predictions > 0.5,1,0),
reference=insurance$TARGET_FLAG,
positive="1")

caret::confusionMatrix(data=ifelse(model2_numeric_predictions > 0.5,1,0),
reference=insurance$TARGET_FLAG,
positive="1")

caret::confusionMatrix(data=ifelse(model3_numeric_predictions > 0.5,1,0),
reference=insurance$TARGET_FLAG,
positive="1")

## ----roc-curves, echo=FALSE, eval=TRUE-----------------------------------
roc(predictor = model1_numeric_predictions,
response = insurance$TARGET_FLAG,
print.thres=c(0.25,0.30,0.35,0.40,0.45,0.5),
main="Model1",
plot=TRUE)

roc(predictor = model2_numeric_predictions,
response = insurance$TARGET_FLAG,
print.thres=c(0.25,0.30,0.35,0.40,0.45,0.5),
main="Model2",
plot=TRUE)

roc(predictor = model3_numeric_predictions,
response = insurance$TARGET_FLAG,
print.thres=c(0.25,0.30,0.35,0.40,0.45,0.5),
main="Model3",
plot=TRUE)

## ----confusmatrix-p-035, echo=FALSE, eval=TRUE---------------------------
caret::confusionMatrix(data=ifelse(model1_numeric_predictions > 0.35,1,0),
reference=insurance$TARGET_FLAG,
positive="1")

caret::confusionMatrix(data=ifelse(model2_numeric_predictions > 0.35,1,0),
reference=insurance$TARGET_FLAG,
positive="1")

caret::confusionMatrix(data=ifelse(model3_numeric_predictions > 0.35,1,0),
reference=insurance$TARGET_FLAG,
positive="1")

## ----precision-and-F1, echo=FALSE, eval=TRUE-----------------------------
true_and_false_positives_and_negatives <- function(dat,actual_column_name,predicted_column_name,positive_value){
        TN_number = length(which(dat[,actual_column_name] != positive_value & dat[,predicted_column_name] != positive_value))
        FP_number = length(which(dat[,actual_column_name] != positive_value & dat[,predicted_column_name] == positive_value))
        TP_number = length(which(dat[,actual_column_name] == positive_value & dat[,predicted_column_name] == positive_value))
        FN_number = length(which(dat[,actual_column_name] == positive_value & dat[,predicted_column_name] != positive_value))
        return(data.frame(TN = TN_number,FP = FP_number,TP = TP_number,FN = FN_number))         
}

precision_homebrew <- function(dat,actual_column_name,predicted_column_name,positive_value){
        true_and_false_for_this_dat <- true_and_false_positives_and_negatives(dat,actual_column_name,predicted_column_name,positive_value)
        return(true_and_false_for_this_dat$TP/(true_and_false_for_this_dat$TP + true_and_false_for_this_dat$FP))
}

sensitivity_homebrew <- function(dat,actual_column_name,predicted_column_name,positive_value){
        true_and_false_for_this_dat <- true_and_false_positives_and_negatives(dat,actual_column_name,predicted_column_name,positive_value)
        return(true_and_false_for_this_dat$TP/(true_and_false_for_this_dat$TP + true_and_false_for_this_dat$FN))
}

specificity_homebrew <- function(dat,actual_column_name,predicted_column_name,positive_value){
        true_and_false_for_this_dat <- true_and_false_positives_and_negatives(dat,actual_column_name,predicted_column_name,positive_value)
        return(true_and_false_for_this_dat$TN/(true_and_false_for_this_dat$TN + true_and_false_for_this_dat$FP))
}

F1_score_homebrew <- function(dat,actual_column_name,predicted_column_name,positive_value){
        precision_this_dat <- precision_homebrew(dat,actual_column_name,predicted_column_name,positive_value)
        sensitivity_this_dat <- sensitivity_homebrew(dat,actual_column_name,predicted_column_name,positive_value)
        return((2 * precision_this_dat * sensitivity_this_dat)/(precision_this_dat + sensitivity_this_dat))
}

precision_model3 <- precision_homebrew(data.frame(Actual = insurance$TARGET_FLAG,Predicted = ifelse(model3_numeric_predictions > 0.35,1,0)),"Actual","Predicted","1")

print(paste0("Precision = ",round(precision_model3,digits=4)))

F1_model3 <- F1_score_homebrew(data.frame(Actual = insurance$TARGET_FLAG,Predicted = ifelse(model3_numeric_predictions > 0.35,1,0)),"Actual","Predicted","1")

print(paste0("F1 = ",round(F1_model3,digits=4)))

## ----residual-plots, echo=FALSE, eval=TRUE,fig.width=12,fig.height=8-----
par(mfrow=c(2,4))

plot(model1_linear_model)

plot(model2_linear_model)

## ----residuals-histograms, echo=FALSE, eval=TRUE-------------------------
par(mfrow=c(1,2))

hist(model1_linear_model$residuals,xlab="Residuals",ylab="# records",main="Model1")
hist(model2_linear_model$residuals,xlab="Residuals",ylab="# records",main="Model2")

## ----check-evaluation-data, echo=TRUE, eval=TRUE-------------------------
apply(evaluation[,c("Driving_kids","Commercial_vehicle","Past_claim","Revoked","MVR_PTS_MOD","Urban_not_rural","College_education","Minivan","Young_age","New_on_the_job","Married","Homeowner","Single_parent")],2,function(x)table(x,useNA="ifany"))

summary(evaluation$TRAVTIME)

summary(evaluation$log10.BLUEBOOK)

## ----apply-models, echo=FALSE, eval=TRUE---------------------------------
numeric_predictions_target_flag <- predict(object = model3,newdata=evaluation,type="response")
factor_predictions_target_flag <- ifelse(numeric_predictions_target_flag > 0.35,1,0)

predictions_target_amt <- predict(object = model2_linear_model,newdata=evaluation[factor_predictions_target_flag == 1,])
predictions_target_amt <- 10^predictions_target_amt

predictions_target_amt_incl_zeroes <- rep(0,times=nrow(evaluation))
predictions_target_amt_incl_zeroes[factor_predictions_target_flag == 1] <- predictions_target_amt

evaluation_predictions <- data.frame(Crash.probability = round(numeric_predictions_target_flag,digits=4),
	TARGET_FLAG = factor_predictions_target_flag,
	TARGET_AMT = round(predictions_target_amt_incl_zeroes,digits=2),
	check.names=FALSE)

write.table(evaluation_predictions,
file="insurance_evaluation_predictions.csv",
row.names=FALSE,col.names=TRUE,quote=FALSE,sep=",")