###################################################################### ### Title: "Week 3 - Outliers and Linear Probability Models" ### Course: STA 235H ### Semester: Fall 2023 ### Professor: Magdalena Bennett ####################################################################### # Clears memory rm(list = ls()) # Clears console cat("\014") # scipen=999 removes scientific notation; scipen=0 turns it on. options(scipen = 0) ### Load libraries # If you don't have one of these packages installed already, you will need to run install.packages() line library(tidyverse) library(vtable) library(AER) #package that includes some interesting data library(estimatr) #package to run linear regressions with robust SE ###################### OUTLIERS ################################################ ### HMDA Example # This is the data from 2017 HMDA for Bastrop county (https://www.consumerfinance.gov/data-research/hmda/historic-data/?geo=tx&records=first-lien-owner-occupied-1-4-family-records&field_descriptions=labels) # (you can also find the whole dataset for Austin by changing the name of the file to hmda_2017_austin.csv) loans <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week3/2_OLS_Issues/data/hmda_2017_austin_bastrop.csv", stringsAsFactors = FALSE) # You can find information about the variables here: https://files.consumerfinance.gov/hmda-historic-data-dictionaries/lar_record_codes.pdf # Let's look at loans that were approved (action_taken = 1) for home purchase (loan_purpose = 1) (hint: you will need to subset your data) loans <- loans %>% filter(action_taken==1 & loan_purpose==1) # Q: How could we see if we have outliers? ggplot(data = loans, aes(x = loan_amount_000s)) + geom_histogram(color = "skyblue3", fill = "skyblue", lwd = 1.1) + xlab("Loan amount (1,000 US$)") + theme_minimal() # Show a scatter plot of loan amount vs applicant's income: ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) + geom_point(color = "skyblue3") + theme_minimal() + xlab("Applicant's income (M US$)") + ylab("Loan Amount (M US$)") # Fit a regression line to the previous plot: ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) + geom_point(color = "skyblue3") + theme_minimal() + xlab("Applicant's income (M US$)") + ylab("Loan Amount (M US$)") + geom_smooth(method = "lm", se = FALSE, color = "blue", lwd = 1.1) # geom_smooth() fits a smooth function for our data! # method="lm" fits a linear regression of loan_amount ~ income # se = FALSE is to turn off the shade for standard errors (set se = TRUE and see what happens!) # color = "blue" sets the color of the regression line # lwd = 1.1 sets the width of the line # Fit a regression line but *excluding* the clear outliers ## Create new dataset excluding both income outliers loans_without_outliers_income <- loans %>% filter(applicant_income_000s<700) ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) + geom_point(color = "skyblue3") + theme_minimal() + xlab("Applicant's income (M US$)") + ylab("Loan Amount (M US$)") + geom_smooth(method = "lm", se = FALSE, color = "blue", lwd = 1.1) + #same plot as before, but we add a new layer, which is a linear regression but for the NEW dataset we created! geom_smooth(data = loans_without_outliers_income, aes(x = applicant_income_000s, y = loan_amount_000s), method = "lm", se = FALSE, color = "purple") ## Create new dataset excluding both loan outliers loans_without_outliers_loan <- loans %>% filter(loan_amount_000s<750) ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) + geom_point(color = "skyblue") + theme_minimal() + xlab("Applicant's income (1,000 US$)") + ylab("Loan Amount (1,000 US$)") + geom_smooth(method = "lm", se = FALSE, color = "blue", lwd = 1.1) + #same plot as before, but we add a new layer, which is a linear regression but for the NEW dataset we created! geom_smooth(data = loans_without_outliers_loan, aes(x = applicant_income_000s, y = loan_amount_000s), method = "lm", se = FALSE, color = "purple") ### Tip: If you want to add labels to your smooth lines, you can add "color" as an aes() feature (use whatever names you want!) ggplot(data = loans, aes(x = applicant_income_000s, y = loan_amount_000s)) + geom_point(color = "skyblue") + theme_minimal() + xlab("Applicant's income (1,000 US$)") + ylab("Loan Amount (1,000 US$)") + geom_smooth(aes(color = "linear fit"), method = "lm", se = FALSE, lwd = 1.1) + geom_smooth(data = loans_without_outliers_loan, aes(x = applicant_income_000s, y = loan_amount_000s, color = "linear fit w/o outliers"), method = "lm", se = FALSE) + scale_color_manual(values = c("blue", "purple"), name = "Regression lines") #Then, you can set your preferred colors using scale_color_manual() # Q: For your original data, create a dummy variable if the observation is an outlier # in terms of income (1) or (0) in another case. How many outliers do you have? loans = loans %>% mutate(outlier = ifelse(applicant_income_000s>700, 1, 0)) loans %>% select(outlier) %>% table(.) # Q: Run a regression with and without outliers. Do your results change qualitatively? lm_all = lm(loan_amount_000s ~ applicant_income_000s, data = loans) lm_wo_outliers = lm(loan_amount_000s ~ applicant_income_000s, data = loans_without_outliers_income) summary(lm_all) summary(lm_wo_outliers) ###################### LINEAR PROBABILITY MODELS ############################### data(HMDA) # This dataset is loaded from the AER package # To know what the variables are, you can type ?HMDA on the console head(HMDA) # Let's look at the variable `deny`, which captures whether someone gets the loan # denied ("yes") or not ("no"). Q: How many people get the loan denied? HMDA %>% select(deny) %>% table() # `deny` is a factor (categorical) variable, and we want to transform it to numeric. # We can do this in different ways: #1) Use ifelse to set it to 1 if deny is "yes" and 0 in another case HMDA = HMDA %>% mutate(deny_num = ifelse(deny=="yes", 1, 0)) # Check numbers are right HMDA %>% select(deny_num) %>% table() #2) Notice that deny is a factor that takes two numeric values: 1 if it's "no" and 2 if it's "yes". # We can transform it to numeric (1,2) and then substract 1: HMDA = HMDA %>% mutate(deny_num = as.numeric(deny) - 1) # Check numbers are right HMDA %>% select(deny_num) %>% table() ## Linear Probability Model # Let's run a simple model: summary(lm(deny_num ~ pirat, data = HMDA)) # Let's look at the fitted regression line and the data: ggplot(data = HMDA, aes(x = pirat, y = deny_num)) + #"pch=21" selects a marker with a different outline (circle), "color" changes the outline of the marker, "fill" changes the fill color (alpha changes transparency) geom_point(color = "#5601A4", fill = alpha("#5601A4",0.4), pch=21, size = 3) + #geom_smooth (with method "lm") adds a regression line geom_smooth(method = "lm", color = "#BF3984", se = FALSE, lty = 1, lwd = 1.3) + #lty defines the line type (1 is solid line, 2 is dashed line) # Adds horizontal lines (hlines) to show where 0 and 1 are. geom_hline(aes(yintercept = 0), color="dark grey", lty = 2, lwd=1)+ geom_hline(aes(yintercept = 1), color="dark grey", lty = 2, lwd=1)+ theme_minimal()+ xlab("Payment/Income ratio") + ylab("Deny") # Let's run robust standard errors now model2 = lm_robust(deny_num ~ pirat, data = HMDA) #lm_robust() is only available after loading the estimatr package! summary(model2) #Q: How do the point estimates (beta hats) and standard errors compare for the same model with lm()? # We can add more variables too: model3 = lm_robust(deny_num ~ pirat + afam, data = HMDA) summary(model3) # Q: Interpret the coefficient for pirat (payment to income ratio) and the coefficient for afam (African American == Yes). # Q: How do these standard errors compare to the same LPM WITHOUT robust standard errors? Are they larger or smaller?