######################################################################
### Title: "Week 2 - Multiple Regression: Interactions & Other Issues"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
#######################################################################
# Clears memory: removes every object from the global environment
rm(list = ls())
# Clears console
cat("\014")
# scipen=999 removes scientific notation; scipen=0 turns it back on (the default).
options(scipen = 0)
### Load libraries
# If you don't have one of these packages installed already, you will need to run install.packages() line
library(tidyverse) # dplyr, ggplot2, magrittr pipe (%>%), etc.
library(vtable)    # provides sumtable() for descriptive-statistics tables
################################################################################
### In-class exercises
################################################################################
# Cars, cars, cars ----
# Load the Southern California used-car listings directly from GitHub.
cars <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week2/1_OLS/data/SoCalCars.csv", stringsAsFactors = FALSE)
# Descriptive statistics for every column (sumtable comes from the vtable package).
sumtable(cars)
## Let's clean some data
## Keep only used cars from the year 1970 onwards,
# that are priced under $100k,
# and whose mileage is between 10k and 150k miles (inclusive at both ends).
## Let's create new variables:
### luxury: dummy variable for luxury brands (in the `luxury_brands` vector below) (source: https://luxe.digital/business/digital-luxury-ranking/best-luxury-car-brands/)
### Transform price from dollars to thousands of dollars, and miles to thousands of miles.
### Transform year so that it's the number of years since 1970
luxury_brands <- c("Audi", "BMW", "Cadillac", "Ferrari", "Jaguar", "Lamborghini", "Land Rover", "Lexus",
"Maserati", "Mercedes-Benz", "Porsche", "Rolls-Royce", "Tesla", "Volvo")
# Fill in filter() with the conditions listed above, then complete each mutate() line.
cars <- cars %>% filter() %>% #COMPLETE CODE
mutate(luxury = ifelse(make %in% luxury_brands, 1, 0),
price = , #COMPLETE CODE
mileage = , #COMPLETE CODE
year = ) #COMPLETE CODE
## Scatterplot of price against year. What do you see?
ggplot(cars, aes(x = year, y = price)) +
  # shape (alias of base R's pch) picks the marker type; see the options here:
  # http://www.sthda.com/english/wiki/r-plot-pch-symbols-the-different-point-shapes-available-in-r
  geom_point(shape = 21, color = "orange", fill = "white", size = 3) +
  theme_bw() +
  xlab("Year (since 1970)") +
  ylab("Price (1,000 USD)")
## Let's run a regression of price on mileage, year, rating, luxury, and the interaction of luxury and year.
# Hint: in an lm() formula, luxury*year expands to luxury + year + luxury:year.
lm2 <- lm() #COMPLETE THE CODE
summary(lm2)
#### Q: What's the change in price for one additional year for luxury-brand cars vs non-luxury-brand cars, holding other variables constant?
# Hint: combine the coefficient on year with the coefficient on the interaction term.
# Visualizing data ----
## Let's do a histogram of our outcome variable (price)
ggplot(data = cars, aes(x = price)) +
  geom_histogram(color = "#BF3984", fill = "white", lwd = 1.5, bins = 40) + #You can change "bins" depending on your data. Make sure you don't have too many or too few! Play around with it.
  theme_bw() +
  # Label fixed: price was rescaled to *thousands* of USD above, matching the
  # "Price (1,000 USD)" label used in the earlier scatterplot (not millions).
  xlab("Price (1,000 USD)") + ylab("Count")
#### Q: Describe this plot. What can you say about it?
## We can also look at some descriptive statistics for price:
summary(select(cars, price))
## Let's create a new variable, log_price
cars <- cars %>% mutate(log_price = log(price)) #Be careful here! If Y=0, then log is not defined! (log(0) = -Inf)
#### Q: Now, plot the same plot as before, but using log_price. How would you describe this plot?
ggplot() # COMPLETE THIS
## Now let's run the regression (log-linear model: outcome in logs, regressors in levels):
lm_log <- lm(log_price ~ rating + mileage + luxury + year, data = cars)
summary(lm_log)
#### Q: How do we interpret the coefficient for mileage as a percentage?
### This is the vector of estimated coefficients:
lm_log$coefficients
### This is the coefficient for mileage:
lm_log$coefficients["mileage"]
### This is the percentage change (exact): Exponentiate the coefficient, subtract one, and multiply by 100
(exp(lm_log$coefficients["mileage"]) - 1)*100
##### Q: How does this compare with \beta_1*100% ? (The approximation is close when \beta_1 is near zero.)
# Quadratic model ----
## Let's look at data from the Current Population Survey (CPS) 1985
# Use <- for assignment, consistent with the rest of this script.
CPS1985 <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week2/1_OLS/data/CPS1985_AER.csv")
# Let's run a regression between log(wage), education, experience, and experience^2
# Hint: use I(experience^2) inside the formula to include the squared term.
lm_mincer <- lm() #COMPLETE THIS CODE