## ----------------------------------------------------------------------------
## Name: 03-kde-ii.R
## Description: Script for Chapter 3 of "Notes for Nonparametric Statistics"
## Link: https://bookdown.org/egarpor/NP-UC3M/
## License: https://creativecommons.org/licenses/by-nc-nd/4.0/
## Author: Eduardo García-Portugués
## Version: 6.9.3
## ----------------------------------------------------------------------------

## ----kde-2d-1---------------------------------------------------------------------------------------
# Simulated data from a bivariate normal
n <- 200
set.seed(35233)
x <- mvtnorm::rmvnorm(
  n = n,
  mean = c(0, 0),
  sigma = rbind(c(1.5, 0.25), c(0.25, 0.5))
)

# Compute kde for a diagonal bandwidth matrix (trivially positive definite)
H <- diag(c(1.25, 0.75))
kde <- ks::kde(x = x, H = H)

# The eval.points slot contains the grids on x and y
str(kde$eval.points)

# The grids in kde$eval.points are crossed in order to compute a grid matrix
# where to evaluate the estimate
dim(kde$estimate)

# Manual plotting using the kde object structure
image(
  kde$eval.points[[1]], kde$eval.points[[2]], kde$estimate,
  col = viridis::viridis(20), xlab = "x", ylab = "y"
)
points(kde$x) # The data is returned in $x

# Changing the grid size to compute the estimates to be 200 x 200 and in the
# rectangle (-4, 4) x c(-3, 3)
kde <- ks::kde(
  x = x, H = H, gridsize = c(200, 200),
  xmin = c(-4, -3), xmax = c(4, 3)
)
image(
  kde$eval.points[[1]], kde$eval.points[[2]], kde$estimate,
  col = viridis::viridis(20), xlab = "x", ylab = "y"
)
dim(kde$estimate)

# Do not confuse "gridsize" with "bgridsize". The latter controls the internal
# grid size for binning the data and speeding up the computations (compare
# with binned = FALSE for a large sample size), and is not recommended to
# modify unless you know what you are doing.
# The binning takes place if binned = TRUE or if "binned" is not specified and
# the sample size is large

# Evaluating the kde at specific points can be done with "eval.points"
kde_sample <- ks::kde(x = x, H = H, eval.points = x)
str(kde_sample$estimate)

# Assign colors automatically from quantiles to have an idea the densities of
# each one. The breaks are the empirical quantiles, so the smallest estimate
# coincides with the first break: include.lowest = TRUE closes the first bin on
# the left so that this observation is also assigned a color (cut() would
# otherwise return NA for it)
n_cols <- 20
quantiles <- quantile(kde_sample$estimate,
                      probs = seq(0, 1, length.out = n_cols + 1))
col <- viridis::viridis(n_cols)[cut(kde_sample$estimate, breaks = quantiles,
                                    include.lowest = TRUE)]
plot(x, col = col, pch = 19, xlab = "x", ylab = "y")

# Binning vs. not binning: largest absolute discrepancy between both estimates
# (the absolute value has to be taken before the maximum, otherwise large
# negative differences are ignored)
max(abs(ks::kde(x = x, H = H, eval.points = x, binned = TRUE)$estimate -
          ks::kde(x = x, H = H, eval.points = x, binned = FALSE)$estimate))

## ----kde-2d-2---------------------------------------------------------------------------------------
# Contourplot
plot(kde, display = "slice", cont = c(25, 50, 75), xlab = "x", ylab = "y")
# "cont" specifies the density contours, which are upper percentages of the
# highest density regions.
# The default contours are at 25%, 50%, and 75%

# Raw image with custom colors
plot(
  kde, display = "image", xlab = "x", ylab = "y",
  col = viridis::viridis(20)
)

# Filled contour with custom color palette in "col.fun"
plot(
  kde, display = "filled.contour2", cont = seq(5, 95, by = 10),
  xlab = "x", ylab = "y", col.fun = viridis::viridis
)
# Alternatively: col = viridis::viridis(length(cont) + 1)

# Add contourlevels
plot(
  kde, display = "filled.contour", cont = seq(5, 95, by = 10),
  xlab = "x", ylab = "y", col.fun = viridis::viridis
)
plot(kde, display = "slice", cont = seq(5, 95, by = 10), add = TRUE)

# Perspective plot
plot(kde, display = "persp", col.fun = viridis::viridis, xlab = "x", ylab = "y")

## ----kde-3d, cache = TRUE, eval = knitr:::is_html_output()------------------------------------------
# Simulated data from a trivariate normal
n <- 500
set.seed(213212)
x <- mvtnorm::rmvnorm(
  n = n,
  mean = c(0, 0, 0),
  sigma = rbind(c(1.5, 0.25, 0.5), c(0.25, 0.75, 1), c(0.5, 1, 2))
)

# Show nested contours of high-density regions
plot(ks::kde(x = x, H = diag(rep(1.25, 3))), drawpoints = TRUE, col.pt = 1)

# Beware!
# Incorrect (not symmetric or positive definite) bandwidths do not
# generate an error, but they return a non-sense kde
head(ks::kde(x = x, H = diag(c(1, 1, -1)), eval.points = x)$estimate)
head(ks::kde(x = x, H = diag(c(1, 1, 0)), eval.points = x)$estimate)

# H not positive definite
H <- rbind(
  c(1.5, 0.25, 0.5),
  c(0.25, 0.75, -1.5),
  c(0.5, -1.5, 2)
)
eigen(H)$values
head(ks::kde(x = x, H = H, eval.points = x)$estimate)

# H semipositive definite but not positive definite
H <- rbind(
  c(1.5, 0.25, 0.5),
  c(0.25, 0.5, 1),
  c(0.5, 1, 2)
)
eigen(H)$values
head(ks::kde(x = x, H = H, eval.points = x)$estimate) # Numerical instabilities

## ----ks-bug, eval = FALSE---------------------------------------------------------------------------
## # Sample test data
## p <- 4
## data <- mvtnorm::rmvnorm(n = 10, mean = rep(0, p))
## kde <- ks::kde(x = data, H = diag(rep(1, p))) # Error on the verbose argument

## ----ks-bug-patch, eval = FALSE---------------------------------------------------------------------
## # Create the replacement function.
## # In this case, we just set the default
## # argument of ks:::kde.points to F (FALSE)
## kde.points.fixed <- function (x, H, eval.points, w, verbose = FALSE) {
##   n <- nrow(x)
##   d <- ncol(x)
##   ne <- nrow(eval.points)
##   Hs <- replicate(n, H, simplify = FALSE)
##   Hs <- do.call(rbind, Hs)
##   fhat <- dmvnorm.mixt(x = eval.points, mus = x, Sigmas = Hs,
##                        props = w / n, verbose = verbose)
##   return(list(x = x, eval.points = eval.points, estimate = fhat,
##               H = H, gridded = FALSE))
## }
##
## # Assign package environment to the replacement function
## environment(kde.points.fixed) <- environment(ks:::kde.points)
##
## # Overwrite original function with replacement (careful -- you will have to
## # restart session to come back to the original object)
## assignInNamespace(x = "kde.points", value = kde.points.fixed, ns = "ks",
##                   pos = 3)
## # ns = "ks" to indicate the package namespace, pos = 3 to indicate :::
##
## # Check the result
## ks:::kde.points

## ----kdde-1, fig.cap = '(ref:kdde-1-title)', fig.margin = FALSE-------------------------------------
# Simulated univariate data
n <- 1e3
set.seed(324178)
x <- nor1mix::rnorMix(n = n, obj = nor1mix::MW.nm8)

# Location of relative extrema: nlm() minimizes, so the two maxima are found
# by minimizing -dens and the minimum in between by minimizing dens
dens <- function(x) nor1mix::dnorMix(x, obj = nor1mix::MW.nm8)
minus_dens <- function(x) -dens(x)
extrema <- c(
  nlm(f = minus_dens, p = 0)$estimate,
  nlm(f = dens, p = 0.75)$estimate,
  nlm(f = minus_dens, p = 1.5)$estimate
)

# Plot target density
par(mfrow = c(2, 2))
plot(nor1mix::MW.nm8, p.norm = FALSE)
rug(x)
abline(v = extrema, col = c(3, 2, 3))

# Density estimation (automatically chosen bandwidth)
kdde_0 <- ks::kdde(x = x, deriv.order = 0)
plot(kdde_0, xlab = "x", main = "Density estimation")
abline(v = extrema, col = c(3, 2, 3))

# Density derivative estimation (automatically chosen bandwidth, but different
# from kdde_0!)
kdde_1 <- ks::kdde(x = x, deriv.order = 1)
plot(kdde_1, xlab = "x", main = "Density derivative estimation")
abline(v = extrema, col = c(3, 2, 3))

# Density second derivative estimation
kdde_2 <- ks::kdde(x = x, deriv.order = 2)
plot(kdde_2, xlab = "x", main = "Density second derivative estimation")
abline(v = extrema, col = c(3, 2, 3))

## ----kdde-2-----------------------------------------------------------------------------------------
# Simulated bivariate data
n <- 1e3
mu_1 <- rep(1, 2)
mu_2 <- rep(-1.5, 2)
Sigma_1 <- matrix(c(1, -0.75, -0.75, 3), nrow = 2, ncol = 2)
Sigma_2 <- matrix(c(2, 0.75, 0.75, 3), nrow = 2, ncol = 2)
w <- 0.45
set.seed(324178)
x <- ks::rmvnorm.mixt(n = n, mus = rbind(mu_1, mu_2),
                      Sigmas = rbind(Sigma_1, Sigma_2), props = c(w, 1 - w))

# Density estimation
kdde_0 <- ks::kdde(x = x, deriv.order = 0)
plot(kdde_0, display = "filled.contour2", xlab = "x", ylab = "y")

# Density derivative estimation
kdde_1 <- ks::kdde(x = x, deriv.order = 1)
str(kdde_1$estimate)
# $estimate is now a list of two matrices with each of the derivatives

# Plot of the gradient field - arrows pointing towards the modes
plot(kdde_1, display = "quiver", xlab = "x", ylab = "y")

# Plot of the two components of the gradient field
for (i in 1:2) {
  plot(kdde_1, display = "filled.contour2", which.deriv.ind = i,
       xlab = "x", ylab = "y")
}

# Second density derivative estimation
kdde_2 <- ks::kdde(x = x, deriv.order = 2)
str(kdde_2$estimate)
# $estimate is now a list of four matrices with each of the derivatives

# Plot of the four second-order derivative estimates ("which.deriv.ind"
# indicates the index in the Kronecker product)
par(mfcol = c(2, 2))
for (i in 1:4) {
  plot(kdde_2, display = "filled.contour2", which.deriv.ind = i,
       xlab = "x", ylab = "y")
}

## ----grad-hess-norm---------------------------------------------------------------------------------
# Gradient of a N(mu, Sigma) density (vectorized on x)
#
# Arguments:
#   x:     a vector of length p (a single point) or a matrix with p columns
#          (one point per row).
#   mu:    mean vector of length p.
#   Sigma: covariance matrix of size p x p.
# Value: a matrix with one row per point in x and p columns, the i-th row
#   being -dmvnorm(x_i) * (x_i - mu)' Sigma^{-1}.
grad_norm <- function(x, mu, Sigma) {

  # Check dimensions. Each condition is passed separately so that the error
  # message names the failing one; is.matrix() guards against nrow()/ncol()
  # returning NULL, which would make the comparisons zero-length and let
  # stopifnot() pass silently
  x <- rbind(x)
  p <- length(mu)
  stopifnot(is.matrix(Sigma), ncol(x) == p, nrow(Sigma) == p,
            ncol(Sigma) == p)

  # Gradient: the density values (one per row of x) multiply the rows of
  # (x - mu)' Sigma^{-1} by recycling along the columns
  grad <- -mvtnorm::dmvnorm(x = x, mean = mu, sigma = Sigma) *
    t(t(x) - mu) %*% solve(Sigma)
  return(grad)

}

# Hessian of a N(mu, Sigma) density (vectorized on x)
#
# Arguments: as in grad_norm().
# Value: an array of size p x p x nrow(x) whose i-th slice is
#   dmvnorm(x_i) * (Sigma^{-1} (x_i - mu)(x_i - mu)' Sigma^{-1} - Sigma^{-1}).
Hess_norm <- function(x, mu, Sigma) {

  # Check dimensions (see grad_norm() for the rationale of the checks)
  x <- rbind(x)
  p <- length(mu)
  stopifnot(is.matrix(Sigma), ncol(x) == p, nrow(Sigma) == p,
            ncol(Sigma) == p)

  # Hessian: apply() stacks each p x p Hessian as a column of length p^2
  Sigma_inv <- solve(Sigma)
  H <- apply(x, 1, function(y) {
    mvtnorm::dmvnorm(x = y, mean = mu, sigma = Sigma) *
      (Sigma_inv %*% tcrossprod(y - mu) %*% Sigma_inv - Sigma_inv)
  })

  # As an array
  return(array(data = c(H), dim = c(p, p, nrow(x))))

}

## ----bwd-pi, fig.margin = FALSE---------------------------------------------------------------------
# Simulated data
n <- 500
Sigma_1 <- matrix(c(1, -0.75, -0.75, 2), nrow = 2, ncol = 2)
Sigma_2 <- matrix(c(2, -0.25, -0.25, 1), nrow = 2, ncol = 2)
set.seed(123456)
samp <- ks::rmvnorm.mixt(n = n, mus = rbind(c(2, 2), c(-2, -2)),
                         Sigmas = rbind(Sigma_1, Sigma_2), props = c(0.5, 0.5))

# Normal scale bandwidth
(Hns <- ks::Hns(x = samp))

# PI bandwidth unconstrained
(Hpi <- ks::Hpi(x = samp))

# PI bandwidth diagonal
(Hpi_diag <- ks::Hpi.diag(x = samp))

# Compare kdes
par(mfrow = c(2, 2))
cont <- seq(0, 0.05, length.out = 20)
col <- viridis::viridis
plot(ks::kde(x = samp, H = Hns), display = "filled.contour2",
     abs.cont = cont, col.fun = col, main = "NS")
plot(ks::kde(x = samp, H = diag(diag(Hns))), display = "filled.contour2",
     abs.cont = cont, col.fun = col, main = "NS diagonal")
plot(ks::kde(x = samp, H = Hpi), display = "filled.contour2",
     abs.cont = cont, col.fun = col, main = "PI")
plot(ks::kde(x = samp, H = Hpi_diag), display = "filled.contour2",
     abs.cont = cont, col.fun = col, main = "PI diagonal")

## ----bwd-pi-der, fig.margin = FALSE-----------------------------------------------------------------
# Normal scale bandwidth (compare with Hns)
(Hns1 <- ks::Hns(x = samp, deriv.order = 1))

# PI bandwidth unconstrained (compare with Hpi)
(Hpi1 <- ks::Hpi(x = samp, deriv.order = 1))

# PI bandwidth diagonal (compare with Hpi_diag)
(Hpi_diag1 <- ks::Hpi.diag(x = samp, deriv.order = 1))

# Compare kddes
par(mfrow = c(2, 2))
cont <- seq(-0.02, 0.02, length.out = 21)
plot(
  ks::kdde(x = samp, H = Hns1, deriv.order = 1),
  display = "filled.contour2", main = "NS", abs.cont = cont
)
plot(
  ks::kdde(x = samp, H = diag(diag(Hns1)), deriv.order = 1),
  display = "filled.contour2", main = "NS diagonal", abs.cont = cont
)
plot(
  ks::kdde(x = samp, H = Hpi1, deriv.order = 1),
  display = "filled.contour2", main = "PI", abs.cont = cont
)
plot(
  ks::kdde(x = samp, H = Hpi_diag1, deriv.order = 1),
  display = "filled.contour2", main = "PI diagonal", abs.cont = cont
)

## ----bwd-cv, fig.margin = FALSE---------------------------------------------------------------------
# LSCV bandwidth unconstrained
Hlscv <- ks::Hlscv(x = samp)

# LSCV bandwidth diagonal
Hlscv_diag <- ks::Hlscv.diag(x = samp)

# BCV bandwidth unconstrained
Hbcv <- ks::Hbcv(x = samp)

# BCV bandwidth diagonal
Hbcv_diag <- ks::Hbcv.diag(x = samp)

# Compare kdes
par(mfrow = c(2, 2))
cont <- seq(0, 0.03, length.out = 20)
col <- viridis::viridis
plot(
  ks::kde(x = samp, H = Hlscv),
  display = "filled.contour2", abs.cont = cont, col.fun = col, main = "LSCV"
)
plot(
  ks::kde(x = samp, H = Hlscv_diag),
  display = "filled.contour2", abs.cont = cont, col.fun = col,
  main = "LSCV diagonal"
)
plot(
  ks::kde(x = samp, H = Hbcv),
  display = "filled.contour2", abs.cont = cont, col.fun = col, main = "BCV"
)
plot(
  ks::kde(x = samp, H = Hbcv_diag),
  display = "filled.contour2", abs.cont = cont, col.fun = col,
  main = "BCV diagonal"
)

## ----level-set-1, fig.cap = '(ref:level-set-1-title)'-----------------------------------------------
# Simulated sample
n <- 100
set.seed(12345)
samp <- rnorm(n = n)

# Kde as usual, but force to evaluate it at seq(-4, 4, length = 4096)
bw <- bw.nrd(x = samp)
kde <- density(x = samp, bw = bw, n = 4096, from = -4, to = 4)

# For a given c, what is the theoretical level set?
# Since we know that the
# real density is symmetric and unimodal, then the level set is an interval
# of the form [-x_c, x_c]
c <- 0.2
x_c <- tryCatch(uniroot(function(x) dnorm(x) - c, lower = 0, upper = 4)$root,
                error = function(e) NA)

# Show theoretical level set
x <- seq(-4, 4, by = 0.01)
plot(x, dnorm(x), type = "l", ylim = c(0, 0.5), ylab = "Density")
rug(samp)
polygon(x = c(-x_c, -x_c, x_c, x_c), y = c(0, c, c, 0),
        col = rgb(0, 0, 0, alpha = 0.5), density = 10)

# Function to compute and plot a kde level set. Observe that kde stands for an
# object containing the output of density(), although obvious modifications
# could be done to the function to receive a ks::kde object
# as the main argument
#
# Arguments:
#   kde:      a list with components $x (evaluation grid) and $y (estimated
#             density on the grid), as returned by density().
#   c:        the level; the level set is {x : kde$y >= c}.
#   add_plot: if TRUE, a polygon() is added to the current plot for each
#             interval of the level set.
#   ...:      further arguments passed to polygon().
# Value: a matrix with K rows and two columns containing the endpoints
#   [a_i, b_i], i = 1, ..., K, of the K intervals that form the level set
#   (zero rows if the kde never reaches c).
kde_level_set <- function(kde, c, add_plot = FALSE, ...) {

  # Runs of consecutive grid points above/below the level. The last index of
  # each run is the cumulative sum of the run lengths and the first one
  # follows from the run length. Both are taken from the same runs, so they
  # always pair up, also when the kde is already above c at the first grid
  # point (a case in which there is no rising edge to detect)
  runs <- rle(kde$y >= c)
  run_end <- cumsum(runs$lengths)
  run_begin <- run_end - runs$lengths + 1

  # Keep only the runs for which kde$y >= c; which() also drops NA runs
  above <- which(runs$values)
  begin <- run_begin[above]
  end <- run_end[above]

  # Add polygons to a density plot? If so, ... are the additional parameters
  # for polygon(). seq_along() makes the loop a no-op for an empty level set
  if (add_plot) {

    for (k in seq_along(begin)) {

      polygon(x = kde$x[c(begin[k], begin[k], end[k], end[k])],
              y = c(0, kde$y[begin[k]], kde$y[end[k]], 0), ...)

    }

  }

  # Return the [a_i, b_i], i = 1, ..., K in the K rows
  return(cbind(kde$x[begin], kde$x[end]))

}

# Add kde and level set
lines(kde, col = 2)
kde_level_set(kde = kde, c = c, add_plot = TRUE,
              col = rgb(1, 0, 0, alpha = 0.5))
abline(h = c, col = 4) # Level
legend("topright", legend = c("True density", "Kde", "True level set",
                              "Kde level set", "Level c"),
       lwd = 2, col = c(1, 2, rgb(0:1, 0, 0, alpha = 0.5), 4))

## ----level-set-2, eval = FALSE----------------------------------------------------------------------
## # Simulated sample
## n <- 100
## set.seed(12345)
## samp <- rnorm(n = n)
##
## # Interactive visualization
## x <- seq(-4, 4, by = 0.01)
## manipulate::manipulate({
##
##   # Show theoretical level set
##   plot(x, dnorm(x), type = "l", ylim = c(0, 0.5), ylab = "Density")
##   rug(samp)
##   x_c <- tryCatch(uniroot(function(x) dnorm(x) - c, lower = 0, upper = 4)$root,
##                   error = function(e) NA) # tryCatch() to bypass errors
##   polygon(x = c(-x_c, -x_c, x_c, x_c), y = c(0, c, c, 0),
##           col = rgb(0, 0, 0, alpha = 0.5), density = 10)
##
##   # Add estimation
##   kde <- density(x = samp, bw = bw, n = 1e5, from = -4, to = 4)
##   lines(kde, col = 2)
##   kde_level_set(kde = kde, c = c, add_plot = TRUE,
##                 col = rgb(1, 0, 0, alpha = 0.5))
##   abline(h = c, col = 4) # Level
##   legend("topright", legend = c("True density", "Kde", "True level set",
##                                 "Kde level set", "Level c"),
##          lwd = 2, col = c(1, 2, rgb(0:1, 0, 0, alpha = 0.5), 4))
##
## }, c = manipulate::slider(min = 0.01, max = 0.5, initial = 0.2, step = 0.01),
## bw = manipulate::slider(min = 0.01, max = 1, initial = 0.25, step = 0.01))

## ----level-set-3, fig.cap = '(ref:level-set-3-title)'-----------------------------------------------
# Simulate sample
n <- 200
set.seed(12345)
samp <- rnorm(n = n)

# We want to estimate the highest density region containing 0.75 probability
alpha <- 0.25

# For the N(0, 1), we know that this region is the interval [-x_c, x_c] with
x_c <- qnorm(1 - alpha / 2)
c_alpha <- dnorm(x_c)
c_alpha # This corresponds to the c_alpha

# Theoretical level set
x <- seq(-4, 4, by = 0.01)
plot(x, dnorm(x), type = "l", ylim = c(0, 0.5), ylab = "Density")
rug(samp)
polygon(x = c(-x_c, -x_c, x_c, x_c), y = c(0, c_alpha, c_alpha, 0),
        col = rgb(0, 0, 0, alpha = 0.5), density = 10)
abline(h = c_alpha, col = 3, lty = 2) # Level

# Kde: the estimated level is the alpha-quantile of the kde evaluated at the
# sample points
bw <- bw.nrd(x = samp)
c_alpha_hat <- quantile(ks::kde(x = samp, h = bw, eval.points = samp)$estimate,
                        probs = alpha)
c_alpha_hat
kde <- density(x = samp, bw = bw, n = 4096, from = -4, to = 4)
lines(kde, col = 2)
kde_level_set(kde = kde, c = c_alpha_hat, add_plot = TRUE,
              col = rgb(1, 0, 0, alpha = 0.5))
abline(h = c_alpha_hat, col = 4, lty = 2) # Level
# Six legend entries: four solid (densities and level sets) and two dashed
# (levels), matching the six colors in "col"
legend("topright", legend = expression("True density", "Kde", "True level set",
                                       "Kde level set", "Level " * c[alpha],
                                       "Level " * hat(c)[alpha]),
       lwd = 2, col = c(1, 2, rgb(0:1, 0, 0, alpha = 0.5), 3:4),
       lty = c(rep(1, 4), rep(2, 2)))

## ----level-set-4------------------------------------------------------------------------------------
# N(0, 1) case
alpha <- 0.3
x_c <- qnorm(1 - alpha / 2)
c_alpha <- dnorm(x_c)
c_alpha

# Approximates c_alpha
quantile(dnorm(samp), probs = alpha)

# True integral: 1 - alpha (by construction)
1 - 2 * pnorm(-x_c)

# Monte Carlo integration, approximates 1 - alpha
mean(dnorm(samp) >= c_alpha)

## ----level-set-5------------------------------------------------------------------------------------
# Simulated sample from a mixture of normals
n <- 200
set.seed(123456)
mu <- c(2, 2)
Sigma1 <- diag(c(2, 2))
Sigma2 <- diag(c(1, 1))
samp <- rbind(mvtnorm::rmvnorm(n = n / 2, mean = mu, sigma = Sigma1),
              mvtnorm::rmvnorm(n = n / 2, mean = -mu, sigma = Sigma2))

# Level set of the true density at levels c
c <- c(0.01, 0.03)
x <- seq(-5, 5, by = 0.1)
xx <- as.matrix(expand.grid(x, x))
contour(x, x, 0.5 * matrix(mvtnorm::dmvnorm(xx, mean = mu, sigma = Sigma1) +
                             mvtnorm::dmvnorm(xx, mean = -mu, sigma = Sigma2),
                           nrow = length(x), ncol = length(x)),
        levels = c)

# Plot of the
# contour level
H <- ks::Hpi(x = samp)
kde <- ks::kde(x = samp, H = H)
plot(kde, display = "slice", abs.cont = c, add = TRUE, col = 4) # Argument
# "abs.cont" for specifying c rather than (1 - alpha) * 100 in "cont"
legend(
  "topleft", lwd = 2, col = c(1, 4),
  legend = expression(L * "(" * f * ";" * c * ")",
                      L * "(" * hat(f) * "(" %.% ";" * H * "), " * c *")")
)

# Computation of the probability accumulated in the level sets by numerical
# integration
ks::contourSizes(kde, abs.cont = c)

## ----level-set-6a, cache = TRUE---------------------------------------------------------------------
# Simulate a sample from a mixture of normals
n <- 5e2
set.seed(123456)
mu <- c(2, 2, 2)
Sigma1 <- rbind(
  c(1, 0.5, 0.2),
  c(0.5, 1, 0.5),
  c(0.2, 0.5, 1)
)
Sigma2 <- rbind(
  c(1, 0.25, -0.5),
  c(0.25, 1, 0.5),
  c(-0.5, 0.5, 1)
)
samp <- rbind(
  mvtnorm::rmvnorm(n = n / 2, mean = mu, sigma = Sigma1),
  mvtnorm::rmvnorm(n = n / 2, mean = -mu, sigma = Sigma2)
)

# Plot of the contour level, changing the color palette
H <- ks::Hns(x = samp)
kde <- ks::kde(x = samp, H = H)
plot(
  kde, cont = 100 * c(0.99, 0.95, 0.5), col.fun = viridis::viridis,
  drawpoints = TRUE, col.pt = 1, theta = 20, phi = 20
)

## ----level-set-6b, cache = TRUE---------------------------------------------------------------------
# Simulate a large sample from a single normal
n <- 5e4
set.seed(123456)
mu <- c(0, 0, 0)
Sigma <- rbind(
  c(1, 0.5, 0.2),
  c(0.5, 1, 0.5),
  c(0.2, 0.5, 1)
)
samp <- mvtnorm::rmvnorm(n = n, mean = mu, sigma = Sigma)

# Plot of the contour level
H <- ks::Hns(x = samp)
kde <- ks::kde(x = samp, H = H)
plot(
  kde, cont = 100 * c(0.75, 0.5, 0.25),
  xlim = c(-2.5, 2.5), ylim = c(-2.5, 2.5), zlim = c(-2.5, 2.5)
)

## ----level-set-7------------------------------------------------------------------------------------
# Compute kde of unicef dataset
data(unicef, package = "ks")
kde <- ks::kde(x = unicef)

# ks::ksupp evaluates whether the points in the grid spanned by ks::kde belong
# to the level set for alpha = 0.05 and then returns
# the points that belong to
# the level set (when convex.hull = FALSE)
supp <- as.matrix(ks::ksupp(fhat = kde, cont = 95, convex.hull = FALSE))
plot(supp) # Effective support except for a 5% of data

## ----level-set-8------------------------------------------------------------------------------------
# The convex hull boundary of the level set can be computed with chull()
# It returns the indexes of the points passed that form the corners of the
# polygon of the convex hull
ch <- chull(supp)
plot(supp)

# One extra point for closing the polygon
lines(supp[c(ch, ch[1]), ], col = 2, lwd = 2)

# Alternatively, use convex.hull = TRUE (default)
plot(supp)
plot(
  ks::ksupp(fhat = kde, cont = 95, convex.hull = TRUE),
  border = 3, lwd = 2
) # The plotting method of ks::ksupp calls to polygon()

## ----level-set-9------------------------------------------------------------------------------------
# Compute the convex hull of supp via geometry::convhulln()
C <- geometry::convhulln(p = supp)
# The output of geometry::convhulln() is different from chull()

# The geometry::inhulln() allows to check if points are inside the convex hull
geometry::inhulln(ch = C, p = rbind(c(50, 50), c(150, 50)))

# The convex hull works as well in R^p.
# An example in which the level set is
# evaluated by Monte Carlo and then the convex hull of the points in the level
# set is computed

# Sample
set.seed(2134)
samp <- mvtnorm::rmvnorm(n = 1e2, mean = rep(0, 3))

# Evaluation sample: random data in [-3, 3]^3
M <- 1e3
eval_set <- matrix(runif(n = 3 * M, -3, 3), M, 3)

# Kde of samp, evaluated at eval_set
H <- ks::Hns.diag(samp)
kde <- ks::kde(x = samp, H = H, eval.points = eval_set)

# Convex hull of points in the level set for a given c
c <- 0.01
C <- geometry::convhulln(p = eval_set[kde$estimate > c, ])

# We can test if a new point belongs to the level set by just checking if
# it belongs to the convex hull, which is much more efficient as it avoids
# re-evaluating the kde
new_points <- rbind(c(1, 1, 1), c(2, 2, 2))
geometry::inhulln(ch = C, p = new_points)
ks::kde(x = samp, H = H, eval.points = new_points)$estimate > c

# # Performance evaluation
# microbenchmark::microbenchmark(
#   geometry::inhulln(ch = C, p = new_points),
#   ks::kde(x = samp, H = H, eval.points = new_points)$estimate > c)

## ----ref:level-set-11-------------------------------------------------------------------------------
# NOTE(review): this uses whatever "Sigma" is defined in the workspace at this
# point; the formula presumably expects a p x p covariance matrix -- confirm
alpha <- 0.4
p <- 2
c_alpha <- exp(-0.5 * qchisq(p = 1 - alpha, df = p)) /
  (sqrt(det(Sigma)) * (2 * pi)^(p / 2))

## ----kmeans, echo = FALSE, fig.margin = FALSE, fig.cap = '(ref:kmeans-title)'-----------------------
# Data with 3 clusters
set.seed(23456789)
n <- 20
x <- rbind(matrix(rnorm(n, sd = 0.3), ncol = 2),
           cbind(rnorm(n, sd = 0.3), rnorm(n, mean = 2, sd = 0.3)),
           matrix(rnorm(n, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
par(mfrow = c(2, 2))
for (k in 1:4) {

  # Same seed for every k so that the random starts are comparable
  set.seed(23456789)
  cl <- kmeans(x, centers = k, nstart = 20)
  plot(x, col = cl$cluster, pch = 16, main = paste("k =", k))
  points(cl$centers, col = 1:k, pch = 8, cex = 2)

}

## ----kmeans-claw, echo = FALSE, fig.cap = '(ref:kmeans-claw-title)'---------------------------------
set.seed(12345679)
n <- 1e4
x <- nor1mix::rnorMix(n = n, obj = nor1mix::MW.nm10)
cl <- kmeans(x, centers = 5, nstart = 20)
plot(nor1mix::MW.nm10, main = "", p.norm = FALSE)
points(x, rep(0, n), col = cl$cluster, pch = 15)

## ----gravity, fig.cap = '(ref:gravity-title)'-------------------------------------------------------
# Planets
th <- 2 * pi / 3
r <- 2
xi_1 <- r * c(cos(th + 0.5), sin(th + 0.5))
xi_2 <- r * c(cos(2 * th + 0.5), sin(2 * th + 0.5))
xi_3 <- r * c(cos(3 * th + 0.5), sin(3 * th + 0.5))

# Gravity force: equally-weighted mixture of three bivariate normal densities
# centered at the planets xi_1, xi_2, xi_3 (read from the workspace)
gravity <- function(x) {

  (mvtnorm::dmvnorm(x = x, mean = xi_1, sigma = diag(rep(0.5, 2))) +
     mvtnorm::dmvnorm(x = x, mean = xi_2, sigma = diag(rep(0.5, 2))) +
     mvtnorm::dmvnorm(x = x, mean = xi_3, sigma = diag(rep(0.5, 2)))) / 3

}

# Compute numerically the gradient of an arbitrary function
attraction <- function(x) numDeriv::grad(func = gravity, x = x)

# Evaluate the vector field
x <- seq(-4, 4, length.out = 20)
xy <- expand.grid(x = x, y = x)
dir <- apply(xy, 1, attraction)

# Scale arrows to unit length for better visualization
len <- sqrt(colSums(dir^2))
dir <- 0.25 * scale(dir, center = FALSE, scale = len)

# Colors of the arrows according to their original magnitude. The breaks are
# the empirical quantiles of len, so min(len) equals the first break:
# include.lowest = TRUE is needed for the weakest arrow to get a color
# (cut() would otherwise return NA for it)
brk <- quantile(len, probs = seq(0, 1, length.out = 21))
cuts <- cut(x = len, breaks = brk, include.lowest = TRUE)
cols <- viridis::viridis(20)[cuts]

# Vector field plot
plot(0, 0, type = "n", xlim = c(-4, 4), ylim = c(-4, 4),
     xlab = "x", ylab = "y")
arrows(x0 = xy$x, y0 = xy$y,
       x1 = xy$x + dir[1, ], y1 = xy$y + dir[2, ],
       angle = 10, length = 0.1, col = cols, lwd = 2)
points(rbind(xi_1, xi_2, xi_3), pch = 19, cex = 1.5)

## ----euler, fig.margin = FALSE, fig.cap = '(ref:euler-title)'---------------------------------------
# Mixture parameters
mu_1 <- rep(1, 2)
mu_2 <- rep(-1.5, 2)
Sigma_1 <- matrix(c(1, -0.75, -0.75, 3), nrow = 2, ncol = 2)
Sigma_2 <- matrix(c(2, 0.75, 0.75, 3), nrow = 2, ncol = 2)
Sigma_1_inv <- solve(Sigma_1)
Sigma_2_inv <- solve(Sigma_2)
w <- 0.45

# Density of the two-component normal mixture (parameters read from the
# workspace)
f <- function(x) {

  w * mvtnorm::dmvnorm(x = x, mean = mu_1, sigma = Sigma_1) +
    (1 - w) * mvtnorm::dmvnorm(x = x, mean = mu_2, sigma = Sigma_2)

}

# Gradient (caution: only works adequately for x a vector, it is not
# vectorized; observe that in the Sigma_inv %*% (x - mu) part the subtraction
# of mu and premultiplication by Sigma_inv are specific to a *single* point x)
Df <- function(x) {

  -(w * mvtnorm::dmvnorm(x = x, mean = mu_1, sigma = Sigma_1) *
      Sigma_1_inv %*% (x - mu_1) +
      (1 - w) * mvtnorm::dmvnorm(x = x, mean = mu_2, sigma = Sigma_2) *
      Sigma_2_inv %*% (x - mu_2))

}

# Plot density
ks::plotmixt(mus = rbind(mu_1, mu_2), Sigmas = rbind(Sigma_1, Sigma_2),
             props = c(w, 1 - w), display = "filled.contour2",
             gridsize = rep(251, 2), xlim = c(-5, 5), ylim = c(-5, 5),
             cont = seq(0, 90, by = 10), col.fun = viridis::viridis)

# Euler solution
x <- c(-2, 2)
# x <- c(-4, 0)
# x <- c(-4, 4)
N <- 1e3
h <- 0.5
phi <- matrix(nrow = N + 1, ncol = 2)
phi[1, ] <- x
for (t in seq_len(N)) {

  phi[t + 1, ] <- phi[t, ] + h * Df(phi[t, ]) # / f(phi[t, ])

}
lines(phi, type = "l")
points(rbind(x), pch = 19)
text(rbind(x), labels = "x", pos = 3)

# Mean of the components
points(rbind(mu_1, mu_2), pch = 16, col = 4)
text(rbind(mu_1, mu_2), labels = expression(mu[1], mu[2]), pos = 4, col = 4)

# The modes are different from the mean of the components!
# -- see the gradients
cbind(Df(mu_1), Df(mu_2))

# Modes: found as the minimizers of the squared norm of the gradient, starting
# the search at the component means
xi_1 <- optim(par = mu_1, fn = function(x) sum(Df(x)^2))$par
xi_2 <- optim(par = mu_2, fn = function(x) sum(Df(x)^2))$par
points(rbind(xi_1, xi_2), pch = 16, col = 2)
text(rbind(xi_1, xi_2), labels = expression(xi[1], xi[2]), col = 2, pos = 2)

## ----gradfields, echo = FALSE, fig.cap = '(ref:gradfields-title)', fig.show = 'hold'----------------
# Evaluate the vector fields
x <- seq(-5, 5, length.out = 15)
xy <- expand.grid(x = x, y = x)
grad_un <- apply(xy, 1, function(x) Df(x))
grad_no <- apply(xy, 1, function(x) Df(x) / f(x))
grad_un <- 2 * grad_un / max(sqrt(colSums(grad_un^2)))
grad_no <- 2 * grad_no / max(sqrt(colSums(grad_no^2)))

# Unnormalized gradient field
ks::plotmixt(mus = rbind(mu_1, mu_2), Sigmas = rbind(Sigma_1, Sigma_2),
             props = c(w, 1 - w), display = "filled.contour2",
             gridsize = rep(251, 2), xlim = c(-5, 5), ylim = c(-5, 5),
             cont = seq(0, 90, by = 10), col.fun = viridis::viridis)
arrows(x0 = xy$x, y0 = xy$y,
       x1 = xy$x + grad_un[1, ], y1 = xy$y + grad_un[2, ],
       angle = 10, length = 0.1, col = 1, lwd = 2)
points(rbind(xi_1, xi_2), pch = 16, col = 2)
text(rbind(xi_1, xi_2), labels = expression(xi[1], xi[2]), col = 2, pos = 2)

# Normalized gradient field
ks::plotmixt(mus = rbind(mu_1, mu_2), Sigmas = rbind(Sigma_1, Sigma_2),
             props = c(w, 1 - w), display = "filled.contour2",
             gridsize = rep(251, 2), xlim = c(-5, 5), ylim = c(-5, 5),
             cont = seq(0, 90, by = 10), col.fun = viridis::viridis)
arrows(x0 = xy$x, y0 = xy$y,
       x1 = xy$x + grad_no[1, ], y1 = xy$y + grad_no[2, ],
       angle = 10, length = 0.1, col = 1, lwd = 2)
points(rbind(xi_1, xi_2), pch = 16, col = 2)
text(rbind(xi_1, xi_2), labels = expression(xi[1], xi[2]), col = 2, pos = 2)

## ----kms-1------------------------------------------------------------------------------------------
# A simulated example for which the population clusters are known
# Extracted from ?ks::dmvnorm.mixt
mus <- rbind(c(-1, 0), c(1, 2 / sqrt(3)), c(1, -2 / sqrt(3)))
Sigmas <- 1/25 * rbind(ks::invvech(c(9, 63/10, 49/4)),
                       ks::invvech(c(9, 0, 49/4)),
                       ks::invvech(c(9, 0, 49/4)))
props <- c(3, 3, 1) / 7

# Sample the mixture
set.seed(123456)
x <- ks::rmvnorm.mixt(n = 1000, mus = mus, Sigmas = Sigmas, props = props)

# Kernel mean shift clustering. If H is not specified, then
# H = ks::Hpi(x, deriv.order = 1) is employed. Its computation may take some
# time, so it is advisable to compute it separately for later reuse
H <- ks::Hpi(x = x, deriv.order = 1)
kms <- ks::kms(x = x, H = H)

# Plot clusters
plot(kms, col = viridis::viridis(kms$nclust), pch = 19, xlab = "x", ylab = "y")

# Summary
summary(kms)

# Objects in the kms object
kms$nclust # Number of clusters found
kms$nclust.table # Sizes of clusters
kms$mode # Estimated modes

# With keep.path = TRUE the ascending paths are returned
kms <- ks::kms(x = x, H = H, keep.path = TRUE)
cols <- viridis::viridis(kms$nclust, alpha = 0.5)[kms$label]
plot(x, col = cols, pch = 19, xlab = "x", ylab = "y")
for (i in seq_len(nrow(x))) lines(kms$path[[i]], col = cols[i])
points(kms$mode, pch = 8, cex = 2, lwd = 2)

## ----kms-2------------------------------------------------------------------------------------------
# Partition of the whole sample space
kms_part <- ks::kms.part(x = x, H = H, xmin = c(-3, -3), xmax = c(3, 4),
                         gridsize = c(150, 150))
plot(kms_part, display = "filled.contour2", col = viridis::viridis(kms$nclust),
     xlab = "x", ylab = "y")
points(kms_part$mode, pch = 8, cex = 2, lwd = 2)

# Partition of the population
mixt_part <- ks::mvnorm.mixt.part(mus = mus, Sigmas = Sigmas, props = props,
                                  xmin = c(-3, -3), xmax = c(3, 4),
                                  gridsize = c(150, 150))
plot(mixt_part, display = "filled.contour2", col = viridis::viridis(kms$nclust),
     xlab = "x", ylab = "y")

# Obtain the modes of a mixture of normals automatically
modes <- ks::mvnorm.mixt.mode(mus = mus, Sigmas = Sigmas, props = props)
points(modes, pch = 8, cex = 2, lwd = 2)
modes
mus

## ----kms-3a, cache = TRUE---------------------------------------------------------------------------
# Obtain PI bandwidth
H <- ks::Hpi(x = iris[, 1:3], deriv.order = 1)

# Many (8) clusters: probably due to the repetitions in the data
kms_iris <- ks::kms(x = iris[, 1:3], H = H)
summary(kms_iris)

# Force to only find clusters that contain at least 10% of the data
# kms merges internally the small clusters with the closest ones
kms_iris <- ks::kms(x = iris[, 1:3], H = H, min.clust.size = 15)
summary(kms_iris)

# Pairs plot -- good match of clustering with Species
plot(kms_iris, pch = as.numeric(iris$Species) + 1,
     col = viridis::viridis(kms_iris$nclust))

## ----kms-3b, eval = knitr:::is_html_output(), cache = TRUE------------------------------------------
# See ascending paths
kms_iris <- ks::kms(x = iris[, 1:3], H = H, min.clust.size = 15,
                    keep.path = TRUE)
cols <- viridis::viridis(kms_iris$nclust)[kms_iris$label]
rgl::plot3d(kms_iris$x, col = cols)
for (i in seq_len(nrow(iris))) rgl::lines3d(kms_iris$path[[i]], col = cols[i])
rgl::points3d(kms_iris$mode, size = 5)
rgl::rglwidget()

## ----kda-1, fig.cap = '(ref:kda-1-title)'-----------------------------------------------------------
# Univariate example
x <- iris$Sepal.Length
groups <- iris$Species

# By default, the ks::hpi bandwidths are computed
kda_1 <- ks::kda(x = x, x.group = groups)

# Manual specification of bandwidths via ks::hkda (we have univariate data)
hs <- ks::hkda(x = x, x.group = groups, bw = "plugin")
kda_1 <- ks::kda(x = x, x.group = groups, hs = hs)

# Estimated class probabilities
kda_1$prior.prob

# Classification
head(kda_1$x.group.estimate)

# (Training) classification error
ks::compare(x.group = kda_1$x.group, est.group = kda_1$x.group.estimate)

# Classification of new observations
ind_1 <- c(5, 55, 105)
newx <- x[ind_1]
predict(kda_1, x = newx)
groups[ind_1] # Reality

# Classification regions (points on the bottom)
plot(kda_1, xlab = "Sepal length", drawpoints = TRUE, col = rainbow(3))
legend("topright", legend = c("Setosa", "Versicolor", "Virginica"),
       lwd = 2, col = rainbow(3))

## ----kda-2, fig.cap = '(ref:kda-2-title)', fig.show = 'hold'----------------------------------------
# Bivariate example
x <- iris[, 1:2]
groups <- iris$Species

# By default, the ks::Hpi bandwidths are computed
kda_2 <- ks::kda(x = x, x.group = groups)

# Manual specification of bandwidths via ks::Hkda
Hs <- ks::Hkda(x = x, x.group = groups, bw = "plugin")
kda_2 <- ks::kda(x = x, x.group = groups, Hs = Hs)

# Classification of new observations
ind_2 <- c(5, 55, 105)
newx <- x[ind_2, ]
predict(kda_2, x = newx)
groups[ind_2] # Reality

# Classification error
ks::compare(x.group = kda_2$x.group, est.group = kda_2$x.group.estimate)

# Plot of classification regions
plot(kda_2, col = rainbow(3), lwd = 2, col.pt = 1, cont = seq(5, 85, by = 20),
     col.part = rainbow(3, alpha = 0.25), drawpoints = TRUE)
# The artifacts on the corners (low-density regions) are caused by
# numerically-unstable divisions close to 0/0

# The artifacts can be avoided by enlarging the effective support of the normal
# kernel that ks considers with supp (by default it is 3.7). Setting supp to
# a larger value (~10) will avoid the normal kernel to reach the value 0
# exactly (but it may be required that the default gridsize has to be enlarged
# to display the surface adequately if supp is quite large). This is a useful
# practical tweak!
kda_2 <- ks::kda(x = x, x.group = groups, Hs = Hs, supp = 10)
plot(kda_2, col = rainbow(3), lwd = 2, col.pt = 1, cont = seq(5, 85, by = 20),
     col.part = rainbow(3, alpha = 0.25), drawpoints = TRUE)

## ----kda-3, cache = TRUE----------------------------------------------------------------------------

# Trivariate example
x <- iris[, 1:3]
groups <- iris$Species

# Normal scale bandwidths to avoid undersmoothing
Hs <- rbind(ks::Hns(x = x[groups == "setosa", ]),
            ks::Hns(x = x[groups == "versicolor", ]),
            ks::Hns(x = x[groups == "virginica", ]))
kda_3 <- ks::kda(x = x, x.group = groups, Hs = Hs)

# Classification of new observations
ind_3 <- c(5, 55, 105)
newx <- x[ind_3, ]
predict(kda_3, x = newx)
groups[ind_3] # Reality

# Classification regions
plot(kda_3, drawpoints = TRUE, col.pt = c(2, 3, 4),
     cont = seq(5, 85, by = 20), phi = 10, theta = 10)

## ----proj-grad-hess---------------------------------------------------------------------------------

# Projected gradient into the Hessian s-th eigenvector subspace
#
# Arguments:
#   x, mu, Sigma: forwarded as-is to grad_norm() and Hess_norm(), which are
#     defined elsewhere in the script.
#   s: index of the eigenvector of each Hessian to project onto. eigen() sorts
#     eigenvalues decreasingly, so the default s = 2 selects the eigenvector
#     of the second-largest eigenvalue.
# Value: a matrix with one row per evaluation point containing the gradient
#   projected onto the selected eigenvector.
proj_grad_norm <- function(x, mu, Sigma, s = 2) {

  # Gradient (used below as a matrix with one row per evaluation point)
  grad <- grad_norm(x = x, mu = mu, Sigma = Sigma)

  # Hessian (used below as an array whose third margin indexes the points)
  Hess <- Hess_norm(x = x, mu = mu, Sigma = Sigma)

  # Eigenvectors Hessian: one row per evaluation point
  eig_Hess <- t(apply(Hess, 3, function(A) {
    eigen(x = A, symmetric = TRUE)$vectors[, s]
  }))

  # Projected gradient: (v v') grad for each point. vapply() pins the shape
  # and type of each projection, and seq_len() is safe for zero rows
  proj_grad <- t(vapply(seq_len(nrow(eig_Hess)), function(i) {
    drop(tcrossprod(eig_Hess[i, ]) %*% grad[i, ])
  }, FUN.VALUE = numeric(ncol(eig_Hess))))

  # One row per evaluation point
  return(proj_grad)

}

## ----euler-ridge-1, fig.margin = FALSE, fig.cap = '(ref:euler-ridge-1-title)', fig.show = 'hold'----

mu <- c(0, 0)
Sigma <- matrix(c(1, -0.71, -0.71, 2), nrow = 2, ncol = 2)
ks::plotmixt(mus = mu, Sigmas = Sigma, props = 1, display = "filled.contour2",
             gridsize = rep(251, 2), xlim = c(-5, 5), ylim = c(-5, 5),
             cont = seq(0, 90, by = 10), col.fun = viridis::viridis)

# Euler solution
x0 <- as.matrix(expand.grid(seq(-3, 3, length.out = 12),
                            seq(-3, 3, length.out = 12)))
x <- matrix(NA, nrow = nrow(x0), ncol = 2)
N <- 500
h <- 0.5
phi <- matrix(nrow = N + 1, ncol = 2)
eps <- 1e-4
for (i in seq_len(nrow(x0))) {

  # Move along the flow curve
  phi[1, ] <- x0[i, ]
  for (t in 1:N) {

    # Euler update
    phi[t + 1, ] <- phi[t, ] +
      h * proj_grad_norm(phi[t, ], mu = mu, Sigma = Sigma) /
      mvtnorm::dmvnorm(x = phi[t, ], mean = mu, sigma = Sigma)

    # Stopping criterion (to save computing time!)
    # Both tolerances are scalars, hence the scalar "||"
    abs_tol <- max(abs(phi[t + 1, ] - phi[t, ]))
    rel_tol <- abs_tol / max(abs(phi[t, ]))
    if (abs_tol < eps || rel_tol < eps) {
      break
    }

  }

  # Save final point (t holds the index of the last performed update)
  x[i, ] <- phi[t + 1, , drop = FALSE]

  # Plot lines and x0
  lines(phi[1:(t + 1), ], type = "l")
  points(x0[i, , drop = FALSE], pch = 19)

}

# Plot final points
points(x, pch = 19, col = 2)

# Join the ridge points with lines in an automatic and sensible way:
# an Euclidean Minimum Spanning Tree (EMST) problem!
emst <- emstreeR::ComputeMST(x = x, verbose = FALSE)
segments(x0 = x[emst$from, 1], y0 = x[emst$from, 2],
         x1 = x[emst$to, 1], y1 = x[emst$to, 2], col = 2, lwd = 2)

## ----euler-ridge-2----------------------------------------------------------------------------------

# "Oval" density
#
# Arguments:
#   x: a point (vector) or several points (matrix, one per row).
#   mu, sigma: mean and standard deviation of the normal density of the radius.
#   Sigma: matrix whose Cholesky factor is inverted to transform x before
#     moving to polar coordinates.
# Value: vector with the density evaluated at each row of x.
f_oval <- function(x, mu = 2, sigma = 0.35,
                   Sigma = rbind(c(1, -0.71), c(-0.71, 2))) {

  # x always as a matrix
  x <- rbind(x)

  # Rotate x with distortion
  Sigma_inv_sqrt <- solve(chol(Sigma))
  x <- x %*% Sigma_inv_sqrt

  # Polar coordinates
  r <- sqrt(rowSums(x^2))

  # Density as conditional * marginal
  f_theta <- 1 / (2 * pi)
  f_r_theta <- dnorm(x = r, mean = mu, sd = sigma)
  jacobian <- det(Sigma_inv_sqrt) / r
  f <- f_r_theta * f_theta * jacobian
  return(f)

}

# "Croissant" density
#
# Arguments:
#   x: a point (vector) or several points (matrix, one per row).
#   mu, sigma: mean and standard deviation of the normal density of the radius.
#   mu_theta, kappa: location and concentration of the von Mises density of
#     the angle.
# Value: vector with the density evaluated at each row of x.
f_crois <- function(x, mu = 2, sigma = 0.5, mu_theta = pi / 2, kappa = 1) {

  # x always as a matrix
  x <- rbind(x)

  # Polar coordinates
  theta <- atan2(x[, 2], x[, 1])
  r <- sqrt(rowSums(x^2))

  # Density as conditional * marginal
  f_theta <- exp(kappa * cos(theta - mu_theta)) /
    (2 * pi * besselI(kappa, nu = 0))
  f_r_theta <- dnorm(x = r, mean = mu, sd = sigma)
  jacobian <- 1 / r
  f <- f_r_theta * f_theta * jacobian
  return(f)

}

# "Sin" density
#
# Arguments:
#   x: a point (vector) or several points (matrix, one per row).
#   a, b: amplitude and frequency parameters of the conditional mean
#     a * (1 + x1) * sin(b * x1) of the second coordinate.
#   sigma_x, sigma_y: standard deviations of the first coordinate and of the
#     second coordinate given the first, respectively.
# Value: vector with the density evaluated at each row of x.
f_sin <- function(x, a = 0.5, b = 1.75, sigma_x = 2, sigma_y = 0.5) {

  # x always as a matrix
  x <- rbind(x)

  # Density as conditional * marginal: marginal of the first coordinate times
  # the conditional of the second coordinate given the first
  f_x <- dnorm(x = x[, 1], mean = 0, sd = sigma_x)
  f_y_x <- dnorm(x = x[, 2], mean = a * (1 + x[, 1]) * sin(b * x[, 1]),
                 sd = sigma_y)
  f <- f_y_x * f_x
  return(f)

}

## ----kdr-1------------------------------------------------------------------------------------------

# Simulation from the "oval" density
# Returns an n x 2 matrix; the arguments have the same meaning as in f_oval()
r_oval <- function(n, mu = 2, sigma = 0.35,
                   Sigma = rbind(c(1, -0.71), c(-0.71, 2))) {

  # Sampling in polar coordinates
  r <- rnorm(n = n, mean = mu, sd = sigma)
  theta <- runif(n, 0, 2 * pi)
  x <- r * cbind(cos(theta), sin(theta))

  # Data rotation
  Sigma_sqrt <- chol(Sigma)
  return(x %*% Sigma_sqrt)

}

# Simulation from the "croissant" density
# Returns an n x 2 matrix; the arguments have the same meaning as in f_crois()
r_crois <- function(n, mu = 2, sigma = 0.5, mu_theta = pi / 2, kappa = 1) {

  # Sampling in polar coordinates as conditional * marginal
  theta <- circular:::RvonmisesRad(n = n, mu = mu_theta, kappa = kappa)
  r <- rnorm(n = n, mean = mu, sd = sigma)
  x <- r * cbind(cos(theta), sin(theta))
  return(x)

}

# Simulation from the "sin" density
# Returns an n x 2 matrix with columns "x" and "y"; the arguments have the
# same meaning as in f_sin()
r_sin <- function(n, a = 0.5, b = 1.75, sigma_x = 2, sigma_y = 0.5) {

  # Sampling as conditional * marginal
  x <- rnorm(n = n, mean = 0, sd = sigma_x)
  y <- rnorm(n = n, mean = a * (1 + x) * sin(b * x), sd = sigma_y)
  return(cbind(x, y))

}

# Oval
set.seed(123456)
samp_oval <- r_oval(n = 1e3)
kdr_oval <- ks::kdr(x = samp_oval)
plot(samp_oval)
col <- rainbow(max(kdr_oval$end.points$segment))[kdr_oval$end.points$segment]
points(kdr_oval$end.points[, 1:2], col = col)
emst <- emstreeR::ComputeMST(x = kdr_oval$end.points[, 1:2], verbose = FALSE)
segments(x0 = kdr_oval$end.points[emst$from, 1],
         y0 = kdr_oval$end.points[emst$from, 2],
         x1 = kdr_oval$end.points[emst$to, 1],
         y1 = kdr_oval$end.points[emst$to, 2], lwd = 2)
# The $end.points$segment output of ks::kdr is very useful, as it allows
# handling the components of the ridges easily

# Croissant
set.seed(526123)
samp_crois <- r_crois(n = 1e3)
kdr_crois <- ks::kdr(x = samp_crois)
plot(samp_crois)
col <- rainbow(max(kdr_crois$end.points$segment))[kdr_crois$end.points$segment]
points(kdr_crois$end.points[, 1:2], col = col)
emst <- emstreeR::ComputeMST(x = kdr_crois$end.points[, 1:2], verbose = FALSE)
segments(x0 = kdr_crois$end.points[emst$from, 1],
         y0 = kdr_crois$end.points[emst$from, 2],
         x1 = kdr_crois$end.points[emst$to, 1],
         y1 = kdr_crois$end.points[emst$to, 2], lwd = 2)

# Sin
set.seed(123456)
samp_sin <- r_sin(n = 1e3)
kdr_sin <- ks::kdr(x = samp_sin)
plot(samp_sin)
col <- rainbow(max(kdr_sin$end.points$segment))[kdr_sin$end.points$segment]
points(kdr_sin$end.points[, 1:2], col = col)
emst <- emstreeR::ComputeMST(x = kdr_sin$end.points[, 1:2], verbose = FALSE)
segments(x0 = kdr_sin$end.points[emst$from, 1],
         y0 = kdr_sin$end.points[emst$from, 2],
         x1 = kdr_sin$end.points[emst$to, 1],
         y1 = kdr_sin$end.points[emst$to, 2], lwd = 2)

## ----kdr-2, warning = FALSE-------------------------------------------------------------------------

# The initial values are chosen automatically, but they can be specified,
# which gives faster computations
y <- expand.grid(seq(-3, 3, length.out = 20), seq(-4, 4, length.out = 20))

# Use y and save paths
kdr_oval_1 <- ks::kdr(x = samp_oval, y = y, keep.path = TRUE)
plot(samp_oval)
paths <- kdr_oval_1$path
points(kdr_oval_1$y, col = 4, pch = 19, cex = 0.5)
for (i in seq_along(paths)) {
  lines(paths[[i]], col = 4, cex = 0.5)
}
points(kdr_oval_1$end.points[, 1:2], col = 2, pch = 19)
length(paths) # Ascent done only for 235 out of the 400 y's

# By default, ks::kdr employs H = ks::Hpi(..., deriv.order = 2). It can be
# precomputed to reduce the computational cost of ks::kdr().
# But care is needed:
# if H is provided, ks::kdr() needs to be called with pre = FALSE to avoid an
# internal scaling of the sample that will result in H being not adequate for
# the scaled sample
H <- ks::Hpi(x = samp_oval, deriv.order = 2)
kdr_oval_1a <- ks::kdr(x = samp_oval, H = H, pre = FALSE, keep.path = TRUE)

# There is a bug in ks 1.13.3 that prevents using pre = FALSE and y at the same
# time. A partial fix is to specify xmin/xmax and gridsize, which will determine
# y, but keeping in mind that these parameters will also affect the precision of
# the Hessian estimation (so if they are too small to save computing time, the
# accuracy of the ridge estimation will decrease)
kdr_oval_1b <- ks::kdr(x = samp_oval, H = H, xmin = c(-3, -4), xmax = c(3, 4),
                       gridsize = c(20, 20), pre = FALSE, keep.path = TRUE)

# Compare different approaches -- same main structure, different end points
# and spurious ridges depending on the size of the initial grid
plot(samp_oval, ylim = c(-4, 6))
points(kdr_oval_1a$end.points[, 1:2], col = 2, pch = 19, cex = 1)
points(kdr_oval_1b$end.points[, 1:2], col = 3, pch = 19, cex = 0.25)
points(kdr_oval_1$end.points[, 1:2], col = 4, pch = 19, cex = 0.25)
legend("topright",
       legend = c("H given, default y", "xmin/xmax/gridsize/H given",
                  "y given, default H"),
       col = 2:4, lwd = 2)
length(kdr_oval_1$path) # y given, default H
length(kdr_oval_1a$path) # H given, default y
length(kdr_oval_1b$path) # xmin/xmax/gridsize/H given

# If we want to get rid of the points outside the oval, we can identify
# them using the density level set for alpha = 0.15
plot(samp_oval)
points(kdr_oval_1$end.points[, 1:2], col = 2, pch = 19)
alpha <- 0.15
supp <- ks::ksupp(fhat = ks::kde(x = samp_oval, H = H),
                  cont = (1 - alpha) * 100, convex.hull = FALSE)
points(supp, col = 3, cex = 0.5)

# kde evaluated at the sample itself. It depends neither on alpha nor on the
# ridge estimate, so it is computed once and reused for every c_alpha below
fhat_samp <- ks::kde(x = samp_oval, H = H, eval.points = samp_oval)$estimate

# Two ways of excluding the "spurious" ridges: via convex hull and via
# fhat < c_alpha
C <- geometry::convhulln(p = supp)
out_chull <- !geometry::inhulln(ch = C,
                                p = as.matrix(kdr_oval_1$end.points)[, 1:2])
c_alpha <- quantile(fhat_samp, probs = alpha)
out_kde <- ks::kde(x = samp_oval, H = H,
                   eval.points = kdr_oval_1$end.points[, 1:2])$estimate <
  c_alpha
points(kdr_oval_1$end.points[out_chull, 1:2], col = 4, cex = 0.75, pch = 19)
points(kdr_oval_1$end.points[out_kde, 1:2], col = 5, cex = 0.75, pch = 19)

# The initial grid can also be specified with xmax, xmin, and gridsize
# (pre = FALSE because H is precomputed)
kdr_oval_2 <- ks::kdr(x = samp_oval, H = H, xmin = c(-3, -3), xmax = c(3, 3),
                      gridsize = c(20, 20), keep.path = TRUE, pre = FALSE)
plot(samp_oval)
points(kdr_oval_2$end.points[, 1:2], col = 2, pch = 19)
paths <- kdr_oval_2$path
points(kdr_oval_2$y, col = 4, pch = 19, cex = 0.5)
for (i in seq_along(paths)) {
  lines(paths[[i]], col = 4, cex = 0.5)
}

# Save also computing time by increasing density.cutoff
# NOTE(review): H is passed here together with y and the default pre, whereas
# the remark at the start of this section recommends pre = FALSE whenever H is
# given (reportedly not combinable with y in ks 1.13.3) -- confirm intended
alpha <- 0.5
c_alpha <- quantile(fhat_samp, probs = alpha)
kdr_oval_3 <- ks::kdr(x = samp_oval, y = y, H = H, density.cutoff = c_alpha)
plot(samp_oval)
points(kdr_oval_3$y, col = 4, pch = 19, cex = 0.5)
points(kdr_oval_3$end.points[, 1:2], col = 2, pch = 19)

## ----kdr-3, fig.margin = FALSE, fig.cap = '(ref:kdr-3-title)', fig.show = 'hold'--------------------

# Load data
data(quake, package = "ks") # Earthquakes locations
data(plate, package = "ks") # Tectonic plate boundaries

# Select the Pacific Ring of Fire and disregard other variables
# except location of craters
quake <- quake[quake$prof == 1, c("long", "lat")]

# Fix negative longitude: shift to the [0, 360) range
neg_long_quake <- quake$long < 0
quake$long[neg_long_quake] <- quake$long[neg_long_quake] + 360

# Select relevant plates (the NA check keeps missing longitudes out of the
# index used for the shift)
plate <- plate[plate$long < -20 | plate$long > 20, ]
neg_long_plate <- plate$long < 0 & !is.na(plate$long)
plate$long[neg_long_plate] <- plate$long[neg_long_plate] + 360

# Display raw data
maps::map("world2", xlim = c(85, 305), ylim = c(-70, 70),
          mar = c(0, 0, 0, 0), interior = FALSE, lty = 2)
# Overlay the first two columns of plate as lines and the quake locations as
# points on the current plot
lines(x = plate[, 1:2], lwd = 2, col = 3)
points(x = quake, pch = 16, cex = 0.5, col = 2)

# Density ridges of the quake locations, with explicit xmin/xmax limits
kdr_quake <- ks::kdr(
  x = quake,
  xmin = c(70, -70),
  xmax = c(310, 80)
)

# End points of the ridge estimation (first two columns are the coordinates)
points(x = kdr_quake$end.points[, 1:2], pch = 16, cex = 0.5, col = 4)