######################################################################
### Title: "Week 3 - Outliers and Linear Probability Models"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
#######################################################################

# Clears memory
rm(list = ls())
# Clears console
cat("\014")
# scipen=999 removes scientific notation; scipen=0 turns it on.
options(scipen = 0)

### Load libraries
# If you don't have one of these packages installed already, you will need to run install.packages() line
library(tidyverse)
library(vtable)
library(AER) #package that includes some interesting data
library(estimatr) #package to run linear regressions with robust SE

###################### OUTLIERS ################################################

### HMDA Example

# This is the data from 2017 HMDA for Bastrop county (https://www.consumerfinance.gov/data-research/hmda/historic-data/?geo=tx&records=first-lien-owner-occupied-1-4-family-records&field_descriptions=labels)
# (you can also find the whole dataset for Austin by changing the name of the file to hmda_2017_austin.csv)

loans <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week3/2_OLS_Issues/data/hmda_2017_austin_bastrop.csv", stringsAsFactors = FALSE)

# You can find information about the variables here: https://files.consumerfinance.gov/hmda-historic-data-dictionaries/lar_record_codes.pdf

# Let's look at loans that were approved (action_taken = 1) for home purchase (loan_purpose = 1) (hint: you will need to subset your data)

loans <- loans %>% filter(action_taken==1 & loan_purpose==1)

# Q: How could we see if we have outliers?

ggplot(data = loans, aes(x = loan_amount_000s)) +
  geom_histogram(color = "skyblue3", fill = "skyblue", lwd = 1.1) +
  xlab("Loan amount (1,000 US$)") +
  theme_minimal()

# Show a scatter plot of loan amount vs applicant's income:

ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) +
  geom_point(color = "skyblue3") +
  theme_minimal() +
  xlab("Applicant's income (M US$)") + ylab("Loan Amount (M US$)")  

# Fit a regression line to the previous plot:

ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) +
  geom_point(color = "skyblue3") +
  theme_minimal() +
  xlab("Applicant's income (M US$)") + ylab("Loan Amount (M US$)") +
  geom_smooth(method = "lm", se = FALSE, color = "blue", lwd = 1.1)
# geom_smooth() fits a smooth function for our data!
# method="lm" fits a linear regression of loan_amount ~ income
# se = FALSE is to turn off the shade for standard errors (set se = TRUE and see what happens!)
# color = "blue" sets the color of the regression line
# lwd = 1.1 sets the width of the line


# Fit a regression line but *excluding* the clear outliers

## Create new dataset excluding both income outliers 
loans_without_outliers_income <- loans %>% filter(applicant_income_000s<700)

ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) +
  geom_point(color = "skyblue3") +
  theme_minimal() +
  xlab("Applicant's income (M US$)") + ylab("Loan Amount (M US$)") +
  geom_smooth(method = "lm", se = FALSE, color = "blue", lwd = 1.1) +
  #same plot as before, but we add a new layer, which is a linear regression but for the NEW dataset we created!
  geom_smooth(data = loans_without_outliers_income,
              aes(x = applicant_income_000s, y = loan_amount_000s),
              method = "lm",
              se = FALSE,
              color = "purple")


## Create new dataset excluding both loan outliers 
loans_without_outliers_loan <- loans %>% filter(loan_amount_000s<750)

ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) +
  geom_point(color = "skyblue") +
  theme_minimal() +
  xlab("Applicant's income (1,000 US$)") + ylab("Loan Amount (1,000 US$)") +
  geom_smooth(method = "lm", se = FALSE, color = "blue", lwd = 1.1) +
  #same plot as before, but we add a new layer, which is a linear regression but for the NEW dataset we created!
  geom_smooth(data = loans_without_outliers_loan,
              aes(x = applicant_income_000s, y = loan_amount_000s),
              method = "lm",
              se = FALSE,
              color = "purple")


### Tip: If you want to add labels to your smooth lines, you can add "color" as an aes() feature (use whatever names you want!)

ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) +
  geom_point(color = "skyblue") +
  theme_minimal() +
  xlab("Applicant's income (1,000 US$)") + ylab("Loan Amount (1,000 US$)") +
  geom_smooth(aes(color = "linear fit"), method = "lm", se = FALSE, lwd = 1.1) +
  geom_smooth(data = loans_without_outliers_loan,
              aes(x = applicant_income_000s, y = loan_amount_000s,
                  color = "linear fit w/o outliers"),
              method = "lm",
              se = FALSE) +
  scale_color_manual(values = c("blue", "purple"), name = "Regression lines") #Then, you can set your preferred colors using scale_color_manual()


# Q: For your original data, create a dummy variable if the observation is an outlier
# in terms of income (1) or (0) in another case. How many outliers do you have?

loans = loans %>% mutate(outlier = ifelse(applicant_income_000s>700, 1, 0))

loans %>% select(outlier) %>% table(.)

# Q: Run a regression with and without outliers. Do your results change qualitatively?

lm_all = lm(loan_amount_000s ~ applicant_income_000s, data = loans)

lm_wo_outliers = lm(loan_amount_000s ~ applicant_income_000s, data = loans_without_outliers_income)

summary(lm_all)
summary(lm_wo_outliers)


###################### LINEAR PROBABILITY MODELS ###############################

data(HMDA) # This dataset is loaded from the AER package

# To know what the variables are, you can type ?HMDA on the console
head(HMDA)

# Let's look at the variable `deny`, which captures whether someone gets the loan
# denied ("yes") or not ("no"). Q: How many people get the loan denied?
HMDA %>% select(deny) %>% table()

# `deny` is a factor (categorical) variable, and we want to transform it to numeric.
# We can do this in different ways:

#1) Use ifelse to set it to 1 if deny is "yes" and 0 in another case
HMDA = HMDA %>% mutate(deny_num = ifelse(deny=="yes", 1, 0))
# Check numbers are right
HMDA %>% select(deny_num) %>% table()

#2) Notice that deny is a factor that takes two numeric values: 1 if it's "no" and 2 if it's "yes".
# We can transform it to numeric (1,2) and then substract 1:
HMDA = HMDA %>% mutate(deny_num = as.numeric(deny) - 1)
# Check numbers are right
HMDA %>% select(deny_num) %>% table()


## Linear Probability Model

# Let's run a simple model:
summary(lm(deny_num ~ pirat, data = HMDA))

# Let's look at the fitted regression line and the data:
ggplot(data = HMDA, aes(x = pirat, y = deny_num)) + 
  #"pch=21" selects a marker with a different outline (circle), "color" changes the outline of the marker, "fill" changes the fill color (alpha changes transparency)
  geom_point(color = "#5601A4", fill = alpha("#5601A4",0.4), pch=21, size = 3) +
  
  #geom_smooth (with method "lm") adds a regression line
  geom_smooth(method = "lm", color = "#BF3984", se = FALSE, lty = 1, lwd = 1.3) + #lty defines the line type (1 is solid line, 2 is dashed line)
  
  # Adds horizontal lines (hlines) to show where 0 and 1 are.
  geom_hline(aes(yintercept = 0), color="dark grey", lty = 2, lwd=1)+
  geom_hline(aes(yintercept = 1), color="dark grey", lty = 2, lwd=1)+
  
  theme_minimal()+
  xlab("Payment/Income ratio") + ylab("Deny")

# Let's run robust standard errors now
model2 = lm_robust(deny_num ~ pirat, data = HMDA) #lm_robust() is only available after loading the estimatr package!

summary(model2)
#Q: How do the point estimates (beta hats) and standard errors compare for the same model with lm()?

# We can add more variables too:
model3 = lm_robust(deny_num ~ pirat + afam, data = HMDA)
summary(model3)

# Q: Interpret the coefficient for pirat (payment to income ratio) and the coefficient for afam (African American == Yes).
# Q: How do these standard errors compare to the same LPM WITHOUT robust standard errors? Are they larger or smaller?