# 1
# Stephan Huber, 000, 2020-May-30

# 2
# setwd("/home/sthu/Dropbox/hsf/22-ss/dsb_bac/work/")

# 3
rm(list = ls())

# 4
if (!require(pacman)) install.packages("pacman")
pacman::p_load(tidyverse, haven)

# 5
# cross-section

# 6
sex <- c("f", "f", "f", "m", "m", "m")
age <- c(21, 19, 23, 18, 20, 61)
weight <- c(48, 55, 63, 71, 77, 85)
calories <- c(1700, 1800, 2300, 2000, 2800, 2500)
sport <- c(60, 120, 180, 60, 240, 30)
df <- data.frame(sex, age, weight, calories, sport)

# write_csv(df, file = "/home/sthu/Dropbox/hsf/exams/21-04/stuff/df.csv")
# write_csv(df, file = "/home/sthu/Dropbox/hsf/github/courses/dta/df-calories.csv")
df <- read_csv("https://raw.githubusercontent.com/hubchev/courses/main/dta/df-calories.csv")

# 7
summary(df)

# 8
df |>
  group_by(sex) |>
  summarise(
    mcal = mean(calories),
    sdcal = sd(calories),
    mweight = mean(weight),
    sdweight = sd(weight)
  )

# 9
# discussed in class

# 10
# Many things can be mentioned here such as the use of colors
# (red/blue is not a good choice for color blind people),
# the legend makes no sense as red and green both refer to \textit{sport},
# the label of `f' and `m' is not explained in the legend,
# rotating the labels of the y-axis would increase readability, and
# both axes do not start at zero which is hard to see.
# Also, it is a common to draw the variable you want to explain
# (here: calories) on the y-axis.

# 11
plot(df)

# 12
cor(df$calories, df$sport, method = c("pearson"))
cor(df$weight, df$calories, method = c("pearson"))

# 13
ggplot(df, aes(x = calories, y = weight, label = sex)) +
  geom_point() +
  geom_text(hjust = 0, vjust = 0) +
  stat_smooth(formula = y ~ x, method = "lm", se = FALSE)

# 14
reg_base <- lm(weight ~ calories, data = df)
summary(reg_base)

# 15
# 1) An increase of 100 calories (taken on average on a daily basis) is associated
# - on average and ceteris paribus - with 2.69 more of kg the participants are
# pretended to weight.
# 2) The estimated coefficient $beta_1$ is statistically significantly different to zero
# on a significance level of 5%.
# 3) About 60 % of the variation of the weight is explained by the
# estimated coefficients of the empirical model.

# 16
# For omitted variable bias to occur, the omitted variable `Z` must satisfy
# two conditions:
#   1) The omitted variable is correlated with the included regressor
#   2) The omitted variable is a determinant of the dependent variable

# 17
# discussed in class


# unload packages
suppressMessages(pacman::p_unload(tidyverse, haven))