########################################################## ### Title: "Week 1 - Multiple Regression" ### Course: STA 235H ### Semester: Fall 2023 ### Professor: Magdalena Bennett ########################################################## # Clears memory rm(list = ls()) # Clears console cat("\014") ### Load libraries # If you don't have one of these packages installed already, you will need to run install.packages() line library(tidyverse) library(ggplot2) ################################################################################ ### In-class exercises ################################################################################ # Load data rawData <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week1/2_OLS/data/bechdel.csv") # Select movies post 1990 bechdel <- rawData %>% filter(Year>1989) # Passes Bechdel test: ## 1) Create a binary variable called bechdel test that takes the value "PASS" if the rating == 3, and "FAIL" otherwise. bechdel <- bechdel %>% mutate(bechdel_test = ifelse(rating==3, "PASS", "FAIL"), Adj_Revenue = Adj_Revenue/10^6, Adj_Budget = Adj_Budget/10^6) # Q: Why do we divide revenue and budget by 10^6? Does it change our results and how? # Let's run some models lm_simple <- lm(Adj_Revenue ~ bechdel_test, data = bechdel) summary(lm_simple) # Q: Recover the coefficient and interpret it # Now include other covariates (like budget, metascore, and imdb rating) lm_multi <- lm()#... complete # Q: Show the results of your new model. Interpret the coefficients. ################################################################################ ### Additional code (if you want to check it at home) ################################################################################ # Load advertising data from ISLR d <- read.csv("https://raw.githubusercontent.com/maibennett/sta235/main/exampleSite/content/Classes/Week1/2_OLS/data/advertising.csv") # Let's run a simple linear model # - Use lm() to run a linear model # - The function `summary()` provides additional information (like SE and R2) summary(lm(Sales ~ TV, data = d)) # Obtain fitted values and residuals d <- d %>% mutate(Sales_hat = predict(lm(Sales ~ TV, data = .)), residual = Sales - Sales_hat) # Scatter plot (base plot) adv_base <- ggplot(data = d, aes(x = TV, y = Sales)) + # pch: changes the shape/type of marker # color and fill: define the outline and fill color of the marker geom_point(size = 4, color="dark grey", pch=21, fill=alpha("dark grey",0.4)) + # Clean theme (removes grey background) theme_bw()+ xlab("TV") + ylab("Sales")+ggtitle("Simple regression") # Add regression line and error terms adv_with_residual <- adv_base + geom_smooth(method = "lm", color = "orange", se = FALSE) + geom_segment(aes(xend = TV, yend = Sales_hat), color = alpha("#BF3984",0.5), size = 0.8) adv_with_residual # Multiple regression # Let's run the same model, but let's include Newspaper summary(lm(Sales ~ TV + Newspaper, data = d)) ## Q: How does the model change with respect to the simple linear model? # Now let's build a 3D plot # We will use the plotly package. If you don't have it installed, run the following line: # install.packages("plotly) library(plotly) # Create a grid for the values of TV and Newspaper, from their min to their max x_grid <- seq(from = min(d$TV), to = max(d$TV), length = 50) y_grid <- seq(from = min(d$Newspaper), to = max(d$Newspaper), length = 50) # Recover the estimates of beta from the previous model: beta_hat <- d %>% lm(Sales ~ TV + Newspaper, data = .) %>% coef() # Now for each value of x and y, generate the fitted Sales value: fitted_values <- crossing(y_grid, x_grid) %>% mutate(z_grid = beta_hat[1] + beta_hat[2]*x_grid + beta_hat[3]*y_grid) # Finally, generate the grid for the plane from the fitted values (it's a matrix!) z_grid <- fitted_values %>% pull(z_grid) %>% matrix(nrow = length(x_grid)) %>% t() # Plot the 3D scatter plot + regression plane (this is an interactive plot!): plot_ly(data = d, z = ~Sales, x = ~TV, y = ~Newspaper, opacity = 1, showlegend = FALSE) %>% add_markers() %>% add_surface(x = x_grid, y = y_grid, z = z_grid, showscale = FALSE, opacity = 0.5) %>% layout( scene = list( xaxis = list(title = "TV"), yaxis = list(title = "Newspaper"), zaxis = list(title = "Sales") ))