#######################################################
###       Statistical Computing with R  2e          ###
###       Maria L. Rizzo                            ###
###       Chapman & Hall/CRC The R Series           ###
###       ISBN 9781466553323 - CAT# K15269          ###
###       January 2019                              ###
###                                                 ###
###       R code for Chapter 12                     ###
###       Probability Density Estimation            ###
#######################################################

# packages to install: ash, ggplot2 

### Example 12.1 (Histogram density estimates using Sturges' Rule)

    set.seed(12345) 
    n <- 25
    x <- rnorm(n)
    # calc breaks according to Sturges' Rule
    nclass <- ceiling(1 + log2(n))
    cwidth <- diff(range(x) / nclass)
    breaks <- min(x) + cwidth * 0:nclass
    h.default <- hist(x, freq = FALSE, xlab = "default",
        main = "hist: default")
    z <- qnorm(ppoints(1000))
    lines(z, dnorm(z))
    h.sturges <- hist(x, breaks = breaks, freq = FALSE,
        main = "hist: Sturges")
    lines(z, dnorm(z))

    print(h.default$breaks)
    print(h.default$counts)
    print(round(h.sturges$breaks, 1))
    print(h.sturges$counts)
    print(cwidth)

    print(h.default$density[5])
    print(h.sturges$density[4])


### Example 12.2 (Density estimates from a histogram)

    #using histograms from Example 12.1
    x0 <- .1
    b <- which.min(h.default$breaks <= x0) - 1
    print(c(b, h.default$density[b]))
    b <- which.min(h.sturges$breaks <= x0) - 1
    print(c(b, h.sturges$density[b]))

    h.default$counts[7] / (n * 0.5)
    h.sturges$counts[6] / (n * cwidth)


### Example 12.3 (Density estimation for Old Faithful)

    library(MASS)  #for geyser and truehist
    waiting <- geyser$waiting
    n <- length(waiting)
    # rounding the constant in Scott's rule
    # and using sample standard deviation to estimate sigma
    h <- 3.5 * sd(waiting) * n^(-1/3)

    # number of classes is determined by the range and h
    m <- min(waiting)
    M <- max(waiting)
    nclass <- ceiling((M - m) / h)
    breaks <- m + h * 0:nclass

    par(ask = TRUE)  #prompt to see next graph
    h.scott <- hist(waiting, breaks = breaks, freq = FALSE,
        main = "")
    truehist(waiting, nbins = "Scott", x0 = 0, prob=TRUE, col = 0)
    hist(waiting, breaks = "scott", prob=TRUE, density=5,
        add=TRUE)
    par(ask = FALSE)


### Example 12.4 (Frequency polygon density estimate)

    waiting <- geyser$waiting   #in MASS
    n <- length(waiting)
    # freq poly bin width using normal ref rule
    h <- 2.15 * sqrt(var(waiting)) * n^(-1/5)

    # calculate the sequence of breaks and histogram
    br <- pretty(waiting, diff(range(waiting)) / h)
    brplus <- c(min(br)-h, max(br+h))
    histg <- hist(waiting, breaks = br, freq = FALSE,
        main = "", xlim = brplus)

    vx <- histg$mids     #density est at vertices of polygon
    vy <- histg$density
    delta <- diff(vx)[1] # h after pretty is applied
    k <- length(vx)
    vx <- vx + delta     # the bins on the ends
    vx <- c(vx[1] - 2 * delta, vx[1] - delta, vx)
    vy <- c(0, vy, 0)

    # add the polygon to the histogram
    polygon(vx, vy)

    # check estimates by numerical integration
    fpoly <- approxfun(vx, vy)
    print(integrate(fpoly, lower=min(vx), upper=max(vx)))
    
    library(ggplot2)
    ggplot(geyser, aes(waiting)) + geom_freqpoly(binsize=h)
    
### Example 12.6 (ASH density estimate)

    library(MASS)
    waiting <- geyser$waiting
    n <- length(waiting)
    m <- 20
    a <- min(waiting) - .5
    b <- max(waiting) + .5
    h <- 7.27037
    delta <- h / m

    #get the bin counts on the delta-width mesh.
    br <- seq(a - delta*m, b + 2*delta*m, delta)
    histg <- hist(waiting, breaks = br, plot = FALSE)
    nk <- histg$counts
    K <- abs((1-m):(m-1))

    fhat <- function(x) {
        # locate the leftmost interval containing x
        i <- max(which(x > br))
        k <- (i - m + 1):(i + m - 1)
        # get the 2m-1 bin counts centered at x
        vk <- nk[k]
        sum((1 - K / m) * vk) / (n * h)   #f.hat
        }

    # density can be computed at any points in range of data
    z <- as.matrix(seq(a, b + h, .1))
    f.ash <- apply(z, 1, fhat)   #density estimates at midpts

    # plot ASH density estimate over histogram
    br2 <- seq(a, b + h, h)
    hist(waiting, breaks = br2, freq = FALSE, main = "",
        ylim = c(0, max(f.ash)))
    lines(z, f.ash, xlab = "waiting")
    
    
### Example 12.7 (KDE of Old Faithful waiting time)

    library(MASS)
    waiting <- geyser$waiting
    n <- length(waiting)

    h1 <- 1.06 * sd(waiting) * n^(-1/5)
    h2 <- .9 * min(c(IQR(waiting)/1.34, sd(waiting))) * n^(-1/5)
    plot(density(waiting))

    print(density(waiting))

    sdK <- density(kernel = "gaussian", give.Rkern = TRUE)
    print(c(sdK, sdK * sd(waiting)))
    print(c(sd(waiting), IQR(waiting)))
    print(c(h1, h2))


### Example 12.8 (KDE of precipitation data)

    n <- length(precip)
    h1 <- 1.06 * sd(precip) * n^(-1/5)
    h2 <- .9 * min(c(IQR(precip)/1.34, sd(precip))) * n^(-1/5)
    h0 <- bw.nrd0(precip)

    par(mfrow = c(2, 2))
    plot(density(precip))           #default Gaussian (h0)
    plot(density(precip, bw = h1))  #Gaussian, bandwidth h1
    plot(density(precip, bw = h2))  #Gaussian, bandwidth h2
    plot(density(precip, kernel = "cosine"))
    par(mfrow = c(1,1))

    print(c(h0, h1, h2))
    
    
### Example 12.9 (Computing $\hat f(x)$ for arbitrary x)

    d <- density(precip)
    xnew <- seq(0, 70, 10)
    approx(d$x, d$y, xout = xnew)

    fhat <- approxfun(d$x, d$y)
    fhat(xnew)    


### Example 12.10 (Exponential density)

    x <- rexp(1000, 1)
    plot(density(x), xlim = c(-1, 6), ylim = c(0, 1), main="")
    abline(v = 0)

    # add the true density to compare
    y <- seq(.001, 6, .01)
    lines(y, dexp(y, 1), lty = 2)
    

### Example 12.11 (Reflection boundary technique)

    xx <- c(x, -x)
    g <- density(xx, bw = bw.nrd0(x))
    a <- seq(0, 6, .01)

    ghat <- approx(g$x, g$y, xout = a)
    fhat <- 2 * ghat$y       # density estimate along a

    bw <- paste("Bandwidth = ", round(g$bw, 5))
    plot(a, fhat, type="l", xlim=c(-1, 6), ylim=c(0, 1),
        main = "", xlab = bw, ylab = "Density")
    abline(v = 0)

    # add the true density to compare
    y <- seq(.001, 6, .01)
    lines(y, dexp(y, 1), lty = 2)
    

### Example 12.12 (Bivariate frequency table: bin2d)

    bin2d <-
      function(x, breaks1 = "Sturges", breaks2 = "Sturges"){
      # Data matrix x is n by 2
      # breaks1, breaks2: any valid breaks for hist function
      # using same defaults as hist
      histg1 <- hist(x[,1], breaks = breaks1, plot = FALSE)
      histg2 <- hist(x[,2], breaks = breaks2, plot = FALSE)
      brx <- histg1$breaks
      bry <- histg2$breaks

      # bin frequencies
      freq <- table(cut(x[,1], brx),  cut(x[,2], bry))

      return(list(call = match.call(), freq = freq,
                breaks1 = brx, breaks2 = bry,
                mids1 = histg1$mids, mids2 = histg2$mids))
      }
    
    bin2d(iris[1:50,1:2])


### Example 12.13 (Bivariate density polygon)

    #generate standard bivariate normal random sample
    n <- 2000;   d <- 2
    x <- matrix(rnorm(n*d), n, d)

    # compute the frequency table and density estimates
    # using bin2d function from the previous example
    b <- bin2d(x)
    h1 <- diff(b$breaks1)
    h2 <- diff(b$breaks2)

    # matrix h contains the areas of the bins in b
    h <- outer(h1, h2, "*")

    Z <- b$freq / (n * h)  # the density estimate

    persp(x=b$mids1, y=b$mids2, z=Z, shade=TRUE,
          xlab="X", ylab="Y", main="",
          theta=45, phi=30, ltheta=60)


### Example 12.14 (Bivariate ASH density estimate)

    library(ash)  # for bivariate ASH density est.
    # generate N_2(0,Sigma) data
    n <- 2000
    d <- 2
    nbin <- c(30, 30)          # number of bins
    m <- c(5, 5)               # smoothing parameters

    # First example with positive correlation
    Sigma <- matrix(c(1, .9, .9, 1), 2, 2)
    set.seed(345)
    
    #rmvn.eigen from Chapter 3 used to generate data
    #alternately mvrnorm (MASS) can be used here
    
#   x <- rmvn.eigen(n, c(0, 0), Sigma=Sigma)
    x <- MASS::mvrnorm(n, c(0, 0), Sigma)
    b <- bin2(x, nbin = nbin)
    # kopt is the kernel type, here triangular
    est <- ash2(b, m = m, kopt = c(1,0))

    persp(x = est$x, y = est$y, z = est$z, shade=TRUE,
          xlab = "X", ylab = "Y", zlab = "", main="",
          theta = 30, phi = 75, ltheta = 30, box = FALSE)
    contour(x = est$x, y = est$y, z = est$z, main="")

    # Second example with negative correlation
    Sigma <- matrix(c(1, -.9, -.9, 1), 2, 2)
    set.seed(345)
#   x <- rmvn.eigen(n, c(0, 0), Sigma=Sigma)  #source from ch 3 or use mvrnorm
    x <- MASS::mvrnorm(n, c(0, 0), Sigma)
    b <- bin2(x, nbin = nbin)
    est <- ash2(b, m = m, kopt = c(1,0))

    persp(x = est$x, y = est$y, z = est$z, shade=TRUE,
          xlab = "X", ylab = "Y", zlab = "", main="",
          theta = 30, phi = 75, ltheta = 30, box = FALSE)
    contour(x = est$x, y = est$y, z = est$z, main="")
    par(ask = FALSE)
    detach(package:ash)
    
        
### Example 12.15 (Product kernel estimate of a BVN mixture)

    library(MASS)  #for mvrnorm and kde2d
    #generate the normal mixture data
    n <- 2000
    p <- c(.2, .3, .5)
    mu <- matrix(c(0, 1, 4, 0, 3, -1), 3, 2)
    Sigma <- diag(2)
    i <- sample(1:3, replace = TRUE, prob = p, size = n)
    k <- table(i)

    x1 <- mvrnorm(k[1], mu = mu[1,], Sigma)
    x2 <- mvrnorm(k[2], mu = mu[2,], Sigma)
    x3 <- mvrnorm(k[3], mu = mu[3,], Sigma)
    X <-  rbind(x1, x2, x3)   #the mixture data
    x <- X[,1]
    y <- X[,2]
    
    print(c(bandwidth.nrd(x), bandwidth.nrd(y)))

    # accepting the default normal reference bandwidth
    fhat <- kde2d(x, y)
    contour(fhat)
    persp(fhat, phi = 30, theta = 20, d = 5, xlab = "x")

    # select bandwidth by unbiased cross-validation
    h = c(ucv(x), ucv(y))
    fhat <- kde2d(x, y, h = h)
    contour(fhat)
    persp(fhat, phi = 30, theta = 20, d = 5, xlab = "x")


### Code to generate data as shown in Table 10.1

    N <- c(10, 20, 30, 50, 100, 200, 500, 1000, 5000, 10000)
    m <- length(N)
    out <- matrix(0, nrow = m, ncol = 8)
    out[ ,1] <- N
    out[ ,5] <- N
    for (i in 1:m) {
        x <- rnorm(N[i])
        out[i, 2:4] <- c(nclass.Sturges(x),
            nclass.scott(x), nclass.FD(x))
        x <- rexp(N[i])
        out[i, 6:8] <- c(nclass.Sturges(x),
            nclass.scott(x), nclass.FD(x))
    }
    print(out)


### Code to plot the histograms in Figure 12.4

    library(MASS)  #for truehist
    par(mfrow = c(2, 2))
    x <- sort(rnorm(1000))
    y <- dnorm(x)
    o <- (1:4) / 4
    h <- .35
    for (i in 1:4) {
        truehist(x, prob = TRUE, h = .35, x0 = o[i],
            xlim = c(-3.5, 3.5), ylim = c(0, 0.45),
            ylab = "Density", main = "")
        lines(x, y)
    }
    par(mfrow = c(1, 1))


### Code to plot Figure 12.6

    #set.seed(7555)
    n <- 10
    y <- rnorm(n)

    par(mfrow = c(2, 2))
    for (h in c(.25, .4, .6, 1)) {
        x <- seq(-4, 4, .01)
        fhat <- rep(0, length(x))
        # set up the plot window first
        plot(x, fhat, type="n", xlab="", ylab="",
            main=paste("h=",h), xlim=c(-4,4), ylim=c(0, .5))
        for (i in 1:n) {
            # plot a normal density at each sample pt
            z <- (x - y[i]) / h
            f <- dnorm(z)
            lines(x, f / (n * h))
            # sum the densities to get the estimates
            fhat <- fhat + f / (n * h)
        }
        lines(x, fhat, lwd=2) # add density estimate to plot
    }

    par(mfrow = c(1, 1))


### Code to plot kernels in Figure 12.7

    #see examples for density, kernels in S parametrization
    (kernels <- eval(formals(density.default)$kernel))

    plot(density(0, from=-1.2, to=1.2, width=2,
        kern="gaussian"), type="l", ylim=c(0, 1),
        xlab="", main="")
    for(i in 2:5)
        lines(density(0, width=2, kern=kernels[i]), lty=i)
    legend("topright", legend=kernels[1:5],
        lty=1:5, inset=.02)

################################################################################
    n <- 1000; x <- rnorm(n); x <- sort(x); h <- 1
    grids <- (-50:50)/10; df <- 2
    dtdf <- function(x,mu) {dt((x-mu)/h,df=df)/h}
    dchidf <- function(x,mu) {dchisq((x-mu)/h,df=df)/h}
    kern <- dtdf
    
    kerns <- outer(grids,x,kern)
    matplot(grids,kerns, type="l")
    
    par(ask=T)
    for (i in c(2^seq(1,log2(n)),n)) {
      est <- apply(kerns[,1:i], 1, mean)
      matplot(grids, kerns[,1:i]/i, type="l", ylim=range(c(est,dnorm(grids))))
      points(grids, est, type="l", lwd=3, col="red")
      points(grids, dnorm(grids), type="l", lwd=1, lty=2, col="green")
    }