#######################################################################
### Title: "Week 2 - Multiple Regression: Interactions & Other Issues"
### Course: STA 235H
### Semester: Fall 2023
### Professor: Magdalena Bennett
#######################################################################

# Clear memory
rm(list = ls())

# Clear console
cat("\014")

# scipen = 999 removes scientific notation; scipen = 0 turns it back on.
options(scipen = 0)

### Load libraries
# If you don't have one of these packages installed already, you will need to run install.packages() first.
library(tidyverse)
library(vtable)

################################################################################
### In-class exercises
################################################################################

# Cars, cars, cars ----

cars <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week2/1_OLS/data/SoCalCars.csv",
                 stringsAsFactors = FALSE)

sumtable(cars)

## Let's clean some data:
## Keep only used cars from 1970 onwards,
## that are priced under $100k,
## and have at most 150k miles (and at least 10k miles).

## Let's create new variables:
### luxury: dummy variable for luxury brands (listed in the `luxury_brands` vector)
###         (source: https://luxe.digital/business/digital-luxury-ranking/best-luxury-car-brands/)
### Transform price from dollars to thousands of dollars, and miles to thousands of miles.
### Transform year so that it's the number of years since 1970.

luxury_brands <- c("Audi", "BMW", "Cadillac", "Ferrari", "Jaguar", "Lamborghini", "Land Rover",
                   "Lexus", "Maserati", "Mercedes-Benz", "Porsche", "Rolls-Royce", "Tesla", "Volvo")

cars <- cars %>%
  filter() %>% #COMPLETE CODE
  mutate(luxury = ifelse(make %in% luxury_brands, 1, 0),
         price = , #COMPLETE CODE
         mileage = , #COMPLETE CODE
         year = ) #COMPLETE CODE
# (One possible completion of these pieces is sketched further down in this script.)

## Let's plot year against price. What do you see?

ggplot(data = cars, aes(y = price, x = year)) +
  geom_point(fill = "white", color = "orange", size = 3, pch = 21) + # pch changes the marker type (options here: http://www.sthda.com/english/wiki/r-plot-pch-symbols-the-different-point-shapes-available-in-r)
  theme_bw() +
  xlab("Year (since 1970)") + ylab("Price (1,000 USD)")

## Let's run a regression of price on mileage, year, rating, luxury,
## and the interaction of luxury and year.

lm2 <- lm() #COMPLETE THE CODE

summary(lm2)

#### Q: What's the change in price for one additional year for luxury-brand cars
####    vs. non-luxury-brand cars, holding other variables constant?

# Visualizing data ----

## Let's do a histogram of our outcome variable.

ggplot(data = cars, aes(x = price)) +
  geom_histogram(color = "#BF3984", fill = "white", lwd = 1.5, bins = 40) + # Adjust "bins" to your data: make sure you don't have too many or too few. Play around with it!
  theme_bw() +
  xlab("Price (1,000 USD)") + ylab("Count")

#### Q: Describe this plot. What can you say about it?

## We can also look at some descriptive statistics:
cars %>% select(price) %>% summary(.)

## Let's create a new variable, log_price.
cars <- cars %>% mutate(log_price = log(price)) # Be careful here! If Y = 0, then log(Y) is not defined!

#### Q: Now, make the same plot as before, but using log_price. How would you describe it?

ggplot() # COMPLETE THIS

## Now let's run the regression:
lm_log <- lm(log_price ~ rating + mileage + luxury + year, data = cars)
summary(lm_log)

#### Q: How do we interpret the coefficient for mileage as a percentage?
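
## A possible completion of the "#COMPLETE CODE" pieces above (a sketch, not the
## official solution). It follows the instructions in the comments earlier in this
## script and re-reads the raw data into separate objects (cars_sketch, lm2_sketch:
## names introduced here for illustration) so it does not overwrite your own answers.
## It assumes the raw columns price, mileage, and year are numeric, and it omits the
## used-vs.-new condition because that column's name is not shown in this script
## (check sumtable(cars) and add it inside filter()).

cars_sketch <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week2/1_OLS/data/SoCalCars.csv",
                        stringsAsFactors = FALSE) %>%
  filter(year >= 1970,                            # 1970 onwards
         price < 100000,                          # under $100k
         mileage >= 10000, mileage <= 150000) %>% # between 10k and 150k miles
  mutate(luxury = ifelse(make %in% luxury_brands, 1, 0),
         price = price / 1000,      # dollars -> thousands of dollars
         mileage = mileage / 1000,  # miles -> thousands of miles
         year = year - 1970,        # years since 1970
         log_price = log(price))    # log of price (in 1,000 USD)

## Regression of price on mileage, year, rating, luxury, and the luxury x year interaction:
lm2_sketch <- lm(price ~ mileage + rating + luxury * year, data = cars_sketch)
summary(lm2_sketch)

## For the interaction question above: the change in price for one additional year is
## the coefficient on year for non-luxury cars, and the coefficient on year plus the
## coefficient on luxury:year for luxury cars.
coef(lm2_sketch)["year"]                                   # non-luxury cars
coef(lm2_sketch)["year"] + coef(lm2_sketch)["luxury:year"] # luxury cars

## Histogram of log_price, mirroring the price histogram above:
ggplot(data = cars_sketch, aes(x = log_price)) +
  geom_histogram(color = "#BF3984", fill = "white", lwd = 1.5, bins = 40) +
  theme_bw() +
  xlab("Log price") + ylab("Count")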

## Interpreting the mileage coefficient from lm_log as a percentage:

### This is the vector of coefficients:
lm_log$coefficients

### This is the coefficient for mileage:
lm_log$coefficients["mileage"]

### This is the exact percentage change: exponentiate the coefficient, subtract one, and multiply by 100.
(exp(lm_log$coefficients["mileage"]) - 1)*100

##### Q: How does this compare with beta_1 * 100%?

# Quadratic model ----

## Let's look at data from the Current Population Survey (CPS) 1985.
CPS1985 <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week2/1_OLS/data/CPS1985_AER.csv")

# Let's run a regression of log(wage) on education, experience, and experience^2.
lm_mincer <- lm() #COMPLETE THIS CODE
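
## The approximation beta * 100% vs. the exact percentage change for the mileage
## coefficient above (the two are close when the coefficient is small in absolute value):
lm_log$coefficients["mileage"]*100              # approximate % change
(exp(lm_log$coefficients["mileage"]) - 1)*100   # exact % change

## A possible completion of the Mincer regression above (a sketch). It assumes the
## CPS1985 file has columns named wage, education, and experience (check names(CPS1985)
## if it does not run); lm_mincer_sketch is a name introduced here so it does not
## overwrite your own answer. I() is needed so that ^2 means squaring, not formula syntax.
lm_mincer_sketch <- lm(log(wage) ~ education + experience + I(experience^2), data = CPS1985)
summary(lm_mincer_sketch)

## In this quadratic model, the marginal effect of one more year of experience is
## b_exp + 2 * b_exp2 * experience (where b_exp2 is the coefficient on experience^2),
## so it changes with experience. The effect switches sign at experience = -b_exp / (2 * b_exp2):
b <- coef(lm_mincer_sketch)
-b["experience"] / (2 * b["I(experience^2)"])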