library(ggplot2)
library(dplyr)

data(state)
state <- as.data.frame(state.x77)

# review
fit <- lm(Income ~ Illiteracy, data = state)
summary(fit)

dfr <- data.frame(
  r = rstandard(fit)
)

ggplot(data = dfr) +
  geom_histogram(aes(r), bins = 11)

state <- state |>
  mutate(
    r = dfr$r,
    yhat = predict(fit)
  )

ggplot(data = state, aes(Illiteracy, Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

ggplot(data = state) +
  geom_point(aes(yhat, r), alpha = 0.1)

# multiple linear regression
fit <- lm(Income ~ Population + `Life Exp` + Illiteracy + Murder + `HS Grad` + Frost + Area, data = state)
summary(fit)

mp <- mean(state$Population)
mi <- mean(state$Illiteracy)
mle <- mean(state$`Life Exp`)
mm <- mean(state$Murder)
mhg <- mean(state$`HS Grad`)
mf <- mean(state$Frost)
ma <- mean(state$Area)

nd <- data.frame(Population = c(mp, mp),
                 Illiteracy = c(mi, mi),
                 `Life Exp` = c(mle, mle + 1), # one unit change in Life Expectancy
                 Murder = c(mm, mm),
                 `HS Grad` = c(mhg, mhg),
                 Frost = c(mf, mf),
                 Area = c(ma, ma),
                 check.names = FALSE # this (and backticks) required if variables have spaces
                 )

diff(predict(fit, newdata = nd))


# cars data set
# linear doesn't mean what you previously might have thought
data(mtcars)

# line doesn't fit data well
ggplot(data=mtcars, aes(wt, mpg)) +
  geom_point() + 
  geom_smooth(se = FALSE, method = "lm")

# look for the non-linearity in the 
# scatter plot: standardized residuals on yhat
fit <- lm(mpg ~ wt, data = mtcars)
mtcars$r <- rstandard(fit)
mtcars$yhat <- predict(fit)

ggplot(data=mtcars) + geom_point(aes(yhat, r))

# add a quadratic term in wt
fit <- lm(mpg ~  wt + I(wt * wt), data = mtcars)
summary(fit)

# the I(...) treats * as multiply
# it's as if we fit the following model
# mtcars$wt2 <- mtcars$wt * mtcars$wt
# fit <- lm(mpg ~  wt + wt2, data = mtcars)
# summary(fit)
# but doing it this way has annoying side effects

o <- order(mtcars$wt)
x <- seq(min(mtcars$wt), max(mtcars$wt), length.out=101)
mtcars <- mtcars |>
  mutate(
    wt = wt[o],
    mpg = mpg[o])

dfr <- data.frame(x = x,
                   yhat = predict(fit, newdata = data.frame(wt = x)))

ggplot() +
  geom_point(data=mtcars, aes(wt, mpg)) + 
 geom_line(data = dfr, aes(x, yhat), color = "blue")

dfr <- data.frame(
  r = rstandard(fit)
)

# we haven't yet solved all problems, but our model does predict better
ggplot(data = dfr) + geom_histogram(aes(r), bins = 11)

# for non-linear models (in the class of linear regression)
# the rate of change in yhat changes dependent on from where
# you make an interpretation; compare to wt = c(2, 3)
diff(predict(fit, newdata = data.frame(wt = c(4, 5))))

# back to state data

# inside lm(...) * means interaction term
# all lower order terms are included, up to and including, 
# the highest order term specified
fit <- lm(Income ~ Population * Area, data = state)
summary(fit)

# non-linear in multiple ways
fit <- lm(log10(mpg) ~ log10(wt), data = mtcars)
summary(fit)

dfl <- data.frame(
  x = x,
  yhat = 10^predict(fit, newdata = data.frame(wt = x))
)

ggplot() +
  geom_point(data = mtcars, aes(wt, mpg)) +
  geom_line(data = dfl, aes(x, yhat), color = "blue")

yhat <- predict(fit)
mean((10^yhat - mtcars$mpg)^2) # Mean Squared Error for mpg; undo log10(mpg)

# interaction terms between a categorical and numeric explanatory variable
df <- read.csv("https://raw.githubusercontent.com/roualdes/data/refs/heads/master/carnivora.csv")

ggplot(data = df, aes(GL, LY, color = SuperFamily)) +
  geom_point()


fit <- lm(LY ~ SuperFamily * GL, data = df)
summary(fit)

# looking forward to optimization
model.matrix(fit)