## -----------------------------------------------------------------------------
## Name: 07-appendix.R
## Description: Script for Chapter 7 of "Notes for Nonparametric Statistics"
## Link: https://egarpor.github.io/NP-UC3M/
## License: https://creativecommons.org/licenses/by-nc-nd/4.0/
## Author: Eduardo García-Portugués
## Version: 6.13.3
## -----------------------------------------------------------------------------

## ----ci-1, fig.cap = '(ref:ci-1-title)', fig.margin = FALSE-------------------
# R(K) for a normal
Rk <- 1 / (2 * sqrt(pi))

# Generate a sample from a N(mu, sigma^2)
n <- 100
mu <- 0
sigma <- 1
set.seed(123456)
x <- rnorm(n = n, mean = mu, sd = sigma)

# Compute the kde (NR bandwidth)
kde <- density(x = x, from = -4, to = 4, n = 1024, bw = "nrd")

# Selected bandwidth
h <- kde$bw

# Estimate the variance
var_kde_hat <- kde$y * Rk / (n * h)

# True expectation and variance (because the density is a normal)
E_kde <- dnorm(x = kde$x, mean = mu, sd = sqrt(sigma^2 + h^2))
var_kde <- (dnorm(kde$x, mean = mu, sd = sqrt(h^2 / 2 + sigma^2)) /
              (2 * sqrt(pi) * h) - E_kde^2) / n

# CI with estimated variance
alpha <- 0.05
z_alpha2 <- qnorm(1 - alpha / 2)
ci_low_1 <- kde$y - z_alpha2 * sqrt(var_kde_hat)
ci_up_1 <- kde$y + z_alpha2 * sqrt(var_kde_hat)

# CI with known variance
ci_low_2 <- kde$y - z_alpha2 * sqrt(var_kde)
ci_up_2 <- kde$y + z_alpha2 * sqrt(var_kde)

# Plot estimate, CIs and expectation
plot(kde, main = "Density and CIs", ylim = c(0, 1))
lines(kde$x, ci_low_1, col = "gray")
lines(kde$x, ci_up_1, col = "gray")
lines(kde$x, ci_low_2, col = "gray", lty = 2)
lines(kde$x, ci_up_2, col = "gray", lty = 2)
lines(kde$x, E_kde, col = "red")
legend("topright", legend = c("Estimate", "CI estimated var",
                              "CI known var", "Smoothed density"),
       col = c("black", "gray", "gray", "red"), lwd = 2, lty = c(1, 1, 2, 1))

## ----ci-2, fig.cap = '(ref:ci-2-title)', fig.margin = FALSE-------------------
# Simulation setting
n <- 200; h <- 0.15
mu <- 0; sigma <- 1 # Normal parameters
M <- 5e2 # Number of replications in the simulation
n_grid <- 512 # Number of x's for computing the kde
alpha <- 0.05; z_alpha2 <- qnorm(1 - alpha / 2) # alpha for CI

# Compute expectation and variance
kde <- density(x = 0, bw = h, from = -4, to = 4, n = n_grid) # Just for kde$x
E_kde <- dnorm(x = kde$x, mean = mu, sd = sqrt(sigma^2 + h^2))
var_kde <- (dnorm(kde$x, mean = mu, sd = sqrt(h^2 / 2 + sigma^2)) /
              (2 * sqrt(pi) * h) - E_kde^2) / n

# For storing if the mean is inside the CI
inside_ci_1 <- inside_ci_2 <- matrix(nrow = M, ncol = n_grid)

# Simulation
set.seed(12345)
for (i in 1:M) {

  # Sample & kde
  x <- rnorm(n = n, mean = mu, sd = sigma)
  kde <- density(x = x, bw = h, from = -4, to = 4, n = n_grid)
  sd_kde_hat <- sqrt(kde$y * Rk / (n * h))

  # CI with estimated variance
  ci_low_1 <- kde$y - z_alpha2 * sd_kde_hat
  ci_up_1 <- kde$y + z_alpha2 * sd_kde_hat

  # CI with known variance
  ci_low_2 <- kde$y - z_alpha2 * sqrt(var_kde)
  ci_up_2 <- kde$y + z_alpha2 * sqrt(var_kde)

  # Check if for each x the mean is inside the CI
  inside_ci_1[i, ] <- E_kde > ci_low_1 & E_kde < ci_up_1
  inside_ci_2[i, ] <- E_kde > ci_low_2 & E_kde < ci_up_2

}

# Plot results
plot(kde$x, colMeans(inside_ci_1), ylim = c(0.25, 1), type = "l",
     main = "Empirical coverage of CIs", xlab = "x", ylab = "Coverage")
lines(kde$x, colMeans(inside_ci_2), col = 4)
abline(h = 1 - alpha, col = 2)
abline(h = 1 - alpha + c(-1, 1) * qnorm(0.975) *
         sqrt(alpha * (1 - alpha) / M), col = 2, lty = 2)
legend(x = "bottom", legend = c("CI estimated var", "CI known var",
                                "Nominal level",
                                "95% CI for the nominal level"),
       col = c(1, 4, 2, 2), lwd = 2, lty = c(1, 1, 1, 2))

## ----manipulate-1, eval = FALSE-----------------------------------------------
# # Sample
# x <- rnorm(100)
# 
# # Simple plot of kde for varying h
# manipulate::manipulate({
# 
#   kde <- density(x = x, from = -4, to = 4, bw = h)
#   plot(kde, ylim = c(0, 1), type = "l", main = "")
#   curve(dnorm(x), from = -4, to = 4, col = 2, add = TRUE)
#   rug(x)
# 
# }, h = manipulate::slider(min = 0.01, max = 2, initial = 0.5, step = 0.01))

## ----lm-----------------------------------------------------------------------
# Generates 50 points from a N(0, 1): predictors and error
set.seed(34567)
x1 <- rnorm(50)
x2 <- rnorm(50)
x3 <- x1 + rnorm(50, sd = 0.05) # Make variables dependent
eps <- rnorm(50)

# Responses
y_lin <- -0.5 + 0.5 * x1 + 0.5 * x2 + eps
y_qua <- -0.5 + x1^2 + 0.5 * x2 + eps
y_exp <- -0.5 + 0.5 * exp(x2) + x3 + eps

# Data
data_animation <- data.frame(x1 = x1, x2 = x2, y_lin = y_lin,
                             y_qua = y_qua, y_exp = y_exp)

# Call lm
# lm employs formula = response ~ predictor1 + predictor2 + ...
# (names according to the data frame names) for denoting the regression
# to be done
mod <- lm(y_lin ~ x1 + x2, data = data_animation)
summary(mod)

# mod is a list with a lot of information
# str(mod) # Long output

# Coefficients
mod$coefficients

# Application of formula (3.4)

# Matrix X
X <- cbind(1, x1, x2)

# Vector Y
Y <- y_lin

# Coefficients
beta <- solve(t(X) %*% X) %*% t(X) %*% Y
beta

## ----glm-1--------------------------------------------------------------------
# Data
set.seed(34567)
x <- rnorm(50, sd = 1.5)
y1 <- -0.5 + 3 * x
y2 <- 0.5 - 2 * x
y3 <- -2 + 5 * x
y1 <- rbinom(50, size = 1, prob = 1 / (1 + exp(-y1)))
y2 <- rbinom(50, size = 1, prob = 1 / (1 + exp(-y2)))
y3 <- rbinom(50, size = 1, prob = 1 / (1 + exp(-y3)))

# Data
data_mle <- data.frame(x = x, y1 = y1, y2 = y2, y3 = y3)

## ----glm-2--------------------------------------------------------------------
# Call glm
# glm employs formula = response ~ predictor1 + predictor2 + ...
# (names according to the data frame names) for denoting the regression
# to be done. We need to specify family = "binomial" to make a
# logistic regression
mod <- glm(y1 ~ x, family = "binomial", data = data_mle)
summary(mod)

# mod is a list with a lot of information
# str(mod) # Long output

# Coefficients
mod$coefficients

# Plot the fitted regression curve
x_grid <- seq(-5, 5, l = 200)
y_grid <- 1 / (1 + exp(-(mod$coefficients[1] + mod$coefficients[2] * x_grid)))
plot(x_grid, y_grid, type = "l", col = 2, xlab = "x", ylab = "y")
points(x, y1)