# README: the interpretation of model parameters in the Understanding part 
# are NOT in line with frequentist framework (the frequentist would predict
# the point estimate always). To emphasize uncertainty I include the SE in 
# prediction.
theme_set(theme_bw(base_size = 20))
library(tidyverse)
library(glue)
  

#####################################################################
##########         Understanding Linear Regression         ##########
#####################################################################


mu <- 10
sigma <- 1
y <- rnorm(mean = mu, sd = sigma, n = 1e5)

y[1:10]

df <- tibble(y = y)

ggplot(df, aes(y = y)) +
  geom_density() +
  coord_flip()
  
  
library(mice)
?mice()


# to illustrate how linear regression makes use of the gaussian distribution
# let us model an example for the purpose of prediction.


data(Howell1, package = "rethinking")

d <- Howell1
adults <- filter(d, age > 20)
ggplot(adults, aes(height)) +
  geom_histogram(aes(y = ..density..), fill = "white", color = "black") +
  geom_density(color = "black", size = 3)

# another way of showing the data 
adults %>% mutate(id = 1:dim(adults)[1]) %>%
  ggplot(aes(id, height)) +
    geom_point()  +
    geom_hline(yintercept = mean(adults$height), size = 3, color = "red") 


# If we have only the outcome "height" and nothing more, what would be our guess
# if someone asked us about the height of a random person of the same
# population (not the same sample of adults)? Lets ask our regression robot:

model1 <- lm(formula = height ~ 1, data = adults, )
summary(model1)


# what happens if we standardize the outcome?
# adults <- mutate(adults, height_s = scale(height)[, 1])
# model1_s <- lm(formula = height_s ~ 1, data = adults)
# summary(model1_s)

# example functions that work on the lm object
model1$coefficients

coef(model1)

# given our data and this model (linear regression), our best guess would be 
# 154.75. We are pretty confident that this is our best guess (see standard
# error). But we also know that our guess will only be good on average and 
# that each single guess can be far off (residual standard error). Our model 
# "thinks" like this: (illustrating why 157.75 is it's best guess): 


height <- rnorm(mean = 154.7477, sd = 7.799, n = 1e5)
df_height <- tibble(
  height = height,
  id = 1:length(height)
)
df_height  %>%
  ggplot(aes(height)) +
    geom_histogram(aes(y = ..density..), fill = "white", color = "black") +
    geom_density(color = "black", size = 3)

# another way of showing how the model operates"
df_height %>% 
  ggplot(aes(id, height)) +
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = model1$coefficients, color = "red", size = 3)


# luckily we have more information than that and that is where the real power
# of linear regression comes from! We model the parameter mu as a linear combi-
# nation of other variables:

model2 <- lm(height ~ 1 + weight, data = adults)

# Given that we know the weight of people, what would be our best guess
# of their height?

summary(model2)

# given our data and model2, our best guess would be 
# 113.37 + weight * 0.92. We are pretty confident that this is our best guess
# (see standard errors intercept and weight). Instead of guessing one value for
# any person, now our best guess depends on the weight of that person. 
# Each average guess will be better this time and less far off than
# before (residual standard error). Our model "thinks" like this: 


# for a person with weight 40:
mu <- 113.36888 + 40 * 0.91651
sigma <- 5.101
height <- rnorm(mean = mu, sd = sigma, n = 1e5)
df_height <- tibble(
  height = height,
  id = 1:length(height)
)
df_height  %>%
  ggplot(aes(height)) +
    geom_histogram(aes(y = ..density..), fill = "white", color = "black") +
    geom_density(color = "black", size = 3)
# another way of showing the same data
df_height %>% 
  ggplot(aes(id, height)) +
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = mu, color = "red", size = 3)


# or for a person with weight = 60
height <- rnorm(mean = 113.36888 + 60 * 0.91651, sd = 5.101, n = 1e5)
df_height <- tibble(
  height = height,
  id = 1:length(height)
)
df_height  %>%
  ggplot(aes(height)) +
    geom_histogram(aes(y = ..density..), fill = "white", color = "black") +
    geom_density(color = "black", size = 3)

df_height %>% 
  ggplot(aes(id, height)) +
    geom_point(alpha = 0.1) +
    geom_hline(yintercept = 
      model2$coefficients[1] + 60 * model2$coefficients[2],
      color = "red", size = 3)


# we can plot this for a range of weight values:
weight_seq <- seq(min(adults$weight) - 5, max(adults$weight) + 5, length.out = 30)


heights <- list()
for (w in seq_along(weight_seq)) {
    df <- tibble(
      height = rnorm(
        mean = model2$coefficients[1] + weight_seq[w] * model2$coefficients[2],
        sd = 5.101,
        n = 1e3),
      weight = weight_seq[w]
  )
  heights[[w]] <- df
}


heights <- bind_rows(heights)


ggplot(heights, aes(x = weight, y = height)) +
  geom_point(alpha = 0.1)

# and we can calculate our best guess for each weight, which is the regression 
# line!
best_guess <- tibble(
  weight_seq, 
  mu = model2$coefficients[1] + weight_seq * model2$coefficients[2])

ggplot(heights, aes(x = weight, y = height)) +
  geom_point(alpha = 0.1) +
  geom_line(data = best_guess, aes(x = weight_seq, y = mu), size = 3, color = "red")

# adding the real data 
ggplot(heights, aes(x = weight, y = height)) +
  geom_point(alpha = 0.1, color = "lightgrey") +
  geom_line(data = best_guess, aes(x = weight_seq, y = mu), size = 3, color = "red") +
  geom_point(data = adults, aes(x = weight, y = height),
    alpha = 0.5, 
      size = 2, 
      color = "firebrick", 
      shape = 1,
      stroke = 1)

# the same concept applies to categorical variables
model3 <- lm(height ~ 1 + male, data = adults)
summary(model3)
149.4578 + 11.1526

# given our data and model3, our best guess would be 149.46 for females and
# 160.61 for males. We are pretty confident etc...

df_height <- tibble(
  male = sample(c(0, 1), 1e4, replace = T),
  height = rnorm(1e4, mean = 149.4578 + male * 11.1526)
)
mu <- tibble(male = c(0, 1), mu = c(149.4578, 160.6104))
ggplot(df_height, aes(male, height)) +
  geom_point() +
  geom_line(data = mu, aes(male, mu), size = 3, color = "red")

# again adding the real data: 
ggplot(df_height, aes(male, height)) +
  geom_point(alpha = 0.1, color = "lightgrey") +
  geom_line(data = mu, aes(male, mu), size = 3, color = "red") +
  geom_jitter(data = adults, aes(x = male, y = height),
    alpha = 0.5, 
      size = 2, 
      color = "firebrick", 
      shape = 1,
      stroke = 1,
      width = 0.1
    )


# to show that this is nothing different than a STANDARD t test:
summary(model3)
t.test(height ~ male, data = adults, paired = FALSE, var.equal = TRUE)


# Now lets explore what our model predicts if we add both:
model4 <- lm(height ~ weight + male, data = adults)
summary(model4)

betas <- model4$coefficients
betas

heights <- map_dfr(weight_seq, function(w) {
  tibble(
    male = rep(c(0, 1), 5e3),
    height = rnorm(
      mean = betas[1] + w * betas[2] + male * betas[3],
      sd = 4.929,
      n = 1e4),
    weight = w,
    mu = betas[1] + w * betas[2] + male * betas[3]
  )
})

head(heights)

ggplot(heights, aes(weight, height)) +
  geom_point(alpha = 0.1) +
  geom_line(aes(y = mu), size = 3, color = "red") +
  facet_wrap(~male)
  
  
ggplot(heights, aes(weight, height)) +
  geom_point(alpha = 0.1, color = "lightgrey") +
  geom_line(aes(y = mu), size = 3, color = "red") +
  geom_point(data = adults, aes(x = weight, y = height),
    alpha = 0.5, 
      size = 2, 
      color = "firebrick", 
      shape = 1,
      stroke = 1) +
  facet_wrap(~male)


# But not all can be described well by a straight line.
# if you like, try to fit a polynomial regression!
ggplot(heights, aes(weight, height)) +
  geom_point(alpha = 0.1, color = "lightgrey") +
  geom_line(aes(y = mu), size = 3, color = "red") +
  geom_point(data = d, aes(x = weight, y = height),
    alpha = 0.5, 
      size = 2, 
      color = "firebrick", 
      shape = 1,
      stroke = 1) +
  facet_wrap(~male)


#####################################################################
##########               Performing Linear Regression      ##########
#####################################################################

# as we have seen, there are assumptions to linear regression that need to be
# tested:

final <- lm(height ~ 1 + weight + male, data = adults)

# 1. Is the gaussian model appropriate (often referred to as normality of
#    residuals)?

# lets first extract the residuals and the fitted values and then add them 
# to our dataframe:

# resid(model)
# fitted(model)


diag_df <- adults %>%
  mutate( 
    sresid = resid(final), 
    fitted = fitted(final)
  ) %>% 
  mutate(sresid = scale(sresid)[, 1])
  

# do you think the error (the residuals) are normally distributed?
ggplot(diag_df, aes(sresid)) +
    geom_density() +
    ylab('Density') + xlab('Standardized Redsiduals') 

# this plot lets us answer this easier:
ggplot(diag_df, aes(sample = sresid)) +
    geom_qq() 

# there are also statistical tests for this but think the analyst should 
# argue and decide whether or not the model is suited for the purpose of
# interest

# 2. Homogeneity of variance
#    The parameter sigma is estimated once (unlike mu which is estimated
#    for each new set of predictor value). Thus, it is assumed that sigma
#    is constant or "homogeneous" across the predictor space. Lets inspect 
#    if this is the case here:


diag_df$id <- 1:dim(diag_df)[1]

ggplot(diag_df, aes(fitted, sresid)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = F, color = "#f94c39") +
  geom_point(
    data = filter(diag_df, abs(sresid) > 3.5), 
    aes(fitted, sresid), color='red')  +
    ggrepel::geom_text_repel(
      data = filter(diag_df, abs(sresid) > 3.5), 
      aes(fitted, y = sresid, label = id), size = 3
    ) 

# Fitted vs observed
ggplot(diag_df, aes(fitted, height)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = F, color = '#f94c39')
  
  
# how about this model?
model <- lm(height ~ 1 + weight + male, data = d)

# 1. Is the gaussian model appropriate (often referred to as normality of
#    residuals)?


diag_df <- d %>%
  mutate( 
    sresid = resid(model), 
    fitted = fitted(model)
  ) %>% 
  mutate(sresid = scale(sresid)[, 1])
  

# do you think the error (the residuals) are normally distributed?
ggplot(diag_df, aes(sresid)) +
    geom_density() +
    ylab('Density') + xlab('Standardized Redsiduals') 

# this plot lets us answer this easier:
ggplot(diag_df, aes(sample = sresid)) +
    geom_qq() 


# 2. Homogeneity of variance


diag_df$id <- 1:dim(diag_df)[1]

ggplot(diag_df, aes(fitted, sresid)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = F, color = "#f94c39") +
  geom_point(
    data = filter(diag_df, abs(sresid) > 3.5), 
    aes(fitted, sresid), color='red')  +
    ggrepel::geom_text_repel(
      data = filter(diag_df, abs(sresid) > 3.5), 
      aes(fitted, y = sresid, label = id), size = 3
    ) 

# Fitted vs observed
ggplot(diag_df, aes(fitted, height)) +
  geom_point(alpha = 0.6) +
  geom_smooth(se = F, color = '#f94c39')
betas <- model$coefficients
summary(model)
weight_seq <- seq(min(d$weight) - 5, max(d$weight) + 5, length.out = 30)
heights <- map_dfr(weight_seq, function(w) {
  tibble(
    male = rep(c(0, 1), 5e3),
    height = rnorm(
      mean = betas[1] + w * betas[2] + male * betas[3],
      sd = 9.37,
      n = 1e4),
    weight = w,
    mu = betas[1] + w * betas[2] + male * betas[3]
  )
})
  
ggplot(heights, aes(weight, height)) +
  geom_point(alpha = 0.1, color = "lightgrey") +
  geom_line(aes(y = mu), size = 3, color = "red") +
  geom_point(data = d, aes(x = weight, y = height),
    alpha = 0.5, 
      size = 2, 
      color = "firebrick", 
      shape = 1,
      stroke = 1) +
  facet_wrap(~male)


# I used ggplot2 to follow up on your ggplot skills. However, for such 
# common plots, there will always be a package or as in this case a base 
# r function that is so much shorter:
plot(model)

# Now you know how to perform a regression. What else do you need to know 
# how to do in R in the context of regression?


# Hilde: How to do t-test:

# the standard t-test (as we saw) assumes homogeneity of variance
# you need to test this:
library(car)

leveneTest(adults$height, adults$male)
# then you can use the standard test 
males <- filter(adults, male == 1) %>% .$height 
females <- filter(adults, male == 0) %>% .$height
t.test(males, females, var.equal = TRUE)

# or here easier: 
t.test(height ~ male, data = adults, var.equal = TRUE)

# (problem with homogeneity of variance)
# a great feature in R is that the default is actually Welch t-test:
# (see https://www.rips-irsp.com/articles/10.5334/irsp.82/)
t.test(height ~ male, data = adults)

males <- na.omit(males)
males_post <- males + rnorm(n = length(males), 4, 1)
# perform a paired t test 
t.test(males, males_post, var.equal = TRUE, paired = TRUE)


# always check the function documentation no matter which analyses you use 
# e.g.: how does the model treat missingness? Often listwise deletion is
# default. But also some model impute by default or throw an error.


# as homework, please go through this page to see how to do x in R:
# https://www.statmethods.net/stats/index.html