######################################################################
### Title: "Week 3 - Outliers and Linear Probability Models"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
#######################################################################

# Clears memory: removes every object that ls() lists in the current
# environment (packages that are already attached stay attached)
rm(list = ls())
# Clears console: "\014" is the form-feed character, which RStudio interprets
# as "clear the console" (other front ends may simply print it)
cat("\014")
# scipen=999 removes scientific notation; scipen=0 turns it on.
# scipen = 0 is R's default, so scientific notation is allowed from here on.
options(scipen = 0)

### Load libraries
# If you don't have one of these packages installed already, you will need to run install.packages("package_name") for it first
library(tidyverse)
library(vtable)
library(AER) #package that includes some interesting data
library(estimatr) #package to run linear regressions with robust SE

################################################################################
######################## In-Class Exercise #####################################
################################################################################

###################### OUTLIERS ################################################

### HMDA Example

# This is the data from 2017 HMDA for Bastrop county (https://www.consumerfinance.gov/data-research/hmda/historic-data/?geo=tx&records=first-lien-owner-occupied-1-4-family-records&field_descriptions=labels)
# (you can also find the whole dataset for Austin by changing the name of the file to hmda_2017_austin.csv)

# Read the HMDA extract directly from the course repository, keeping text
# columns as character vectors instead of converting them to factors
loans <- read.csv(
  file = "https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week3/2_OLS_Issues/data/hmda_2017_austin_bastrop.csv",
  stringsAsFactors = FALSE
)

# You can find information about the variables here: https://files.consumerfinance.gov/hmda-historic-data-dictionaries/lar_record_codes.pdf

# Let's look at loans that were approved (action_taken = 1) 
# for home purchase (loan_purpose = 1) (hint: you will need to subset your data)


# Q: How could we see if we have outliers? Create a histogram of loan_amount_000s


# Show a scatter plot of loan amount vs applicant's income:


# Fit a regression line to the previous plot:


# Fit a regression line but *excluding* the clear outliers for income

# Q: Run a regression with and without outliers. Do your results change qualitatively?


###################### LINEAR PROBABILITY MODELS ###############################

# Load the HMDA dataset that ships with the AER package
data("HMDA")

# Preview the first rows; type ?HMDA in the console to read about the variables
head(HMDA)

# 1) Create deny_num with ifelse(): 1 if deny is "yes" and 0 otherwise
HMDA <- HMDA %>%
  mutate(deny_num = ) # COMPLETE THIS

## Linear Probability Model (LPM)

# Q: Run a LPM using deny (the numeric version, deny_num) as the outcome, and
# pirat (payment to income ratio), chist (credit history), single,
# hschool (high school diploma), insurance, and race (afam) as the covariates.

# The call is left empty on purpose, but the assignment is now a complete
# statement: a bare `lm_deny =` followed only by a comment makes R keep reading
# and assign the next expression in the file to lm_deny instead.
# Fill in the formula and data arguments (estimatr's lm_robust() can be swapped
# in if robust standard errors are wanted).
lm_deny <- lm() # COMPLETE THIS
  
# Q: interpret the coefficient for pirat, hschool, and afam.
  
  
#################### EXERCISE ON YOUR OWN ######################################

## Ames Housing dataset: Data for the housing market in Ames, Iowa.
## You can check the codebook here: https://sta235.com/Classes/Week3/2_OLS_Issues/data/ames_codebook.csv

# Download the Ames housing data from the course repository
housing <- read.csv(
  file = "https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week3/2_OLS_Issues/data/AmesHousing.csv"
)

# Restrict the sample to single family housing (Bldg.Type equal to "1Fam")
housing <- filter(housing, Bldg.Type == "1Fam")

# Create 1) a histogram for Lot Area and 2) a scatter plot between SalePrice (y) and lot area (x). 
# Q) How many outliers (in terms of lot area) do you have? 


# Using the entire data, run a regression relating Sale Price to Lot Area,
# Year Built, and Bedrooms above ground.
# Q: What is the association between sale price and lot area in this model?

lm_all <- lm() # COMPLETE
summary(lm_all)

# Now run the same regression, but exclude the outliers (in terms of lot area).
# Q: What is the association between sale price and lot area in this model?
# Is it similar to before?

lm_wo_outliers <- lm() # COMPLETE
summary(lm_wo_outliers)


# Create a dummy variable (price500) that takes the value of 1 if the sale price
# is greater than $500,000 and 0 otherwise.
# The mutate() argument is left blank on purpose. Unlike a bare trailing `%>%`,
# this is a complete statement, so it cannot pull the lines below into the pipe.
housing <- housing %>%
  mutate(price500 = ) # COMPLETE

# Run a regression with price500 as the outcome, and lot area, number of
# bedrooms above ground, year built, overall quality of the materials, and
# pool area as covariates.
# The empty lm_robust() call keeps the assignment self-contained: a bare
# `lm_price =` would bind to whatever expression comes next, or fail to parse
# if nothing follows it.
lm_price <- lm_robust() # COMPLETE

# Q: Interpret the coefficient for `Overall.Qual`
# Q: Run the same regression as before, but use lm() instead of lm_robust().
# Is there a change in the coefficient for Lot.Area?
# Should we use lm() or lm_robust() then? Or does it not matter?