## ----echo=knitr::is_html_output()-------------------------------------------- #| code-summary: "Load libraries" source("code/setup.R") ## ---------------------------------------------------------------------------- #| label: fig-sup-example #| fig-cap: "Examples of supervised classification patterns: (a) linearly separable, (b) linear but not completely separable, (c) non-linearly separable, (d) non-linear, but not completely separable." #| fig-alt: "A set of four scatter plots labeled (a), (b), (c), and (d), each showing two groups of data points in red and blue. The distribution of points varies across the plots. In (a), the red and blue points are relatively well separated along a diagonal trend. In (b), the separation is less clear, with more overlap between the groups. In (c) and (d), the points follow a zig-zag pattern, separated in (c) but overlapping in (d)." #| echo: false #| message: false #| fig-height: 2.5 #| fig-width: 7.5 #| out-width: 100% set.seed(524) x1 <- runif(176) + 0.5 x1[1:61] <- x1[1:61] - 1.2 x2 <- 1 + 2*x1 + rnorm(176) x2[1:61] <- 2 - 3*x1[1:61] + rnorm(61) x3 <- runif(176) + 0.5 x3[1:61] <- x3[1:61] - 0.5 x4 <- 0.25 - x3 + rnorm(176) x4[1:61] <- -0.25 + 3*x3[1:61] + rnorm(61) cl <- factor(c(rep("A", 61), rep("B", 176-61))) df <- data.frame(x1, x2, x3, x4, cl) class1 <- ggplot(df, aes(x=x1, y=x2, colour = cl)) + geom_point(alpha=0.7) + scale_colour_discrete_divergingx( palette = "Zissou 1", nmax = 2, rev = TRUE) + annotate("text", -0.65, 6.6, label="a") + theme(aspect.ratio=1, legend.position = "none", axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), panel.background = element_rect("white"), panel.border = element_rect("black", fill=NA, linewidth = 0.5)) class2 <- ggplot(df, aes(x=x3, y=x4, colour = cl)) + geom_point(alpha=0.7) + scale_colour_discrete_divergingx( palette = "Zissou 1", nmax = 2, rev = TRUE) + annotate("text", 0.05, 4.1, label="b") + theme(aspect.ratio=1, legend.position = "none", axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), panel.background = element_rect("white"), panel.border = element_rect("black", fill=NA, linewidth = 0.5)) set.seed(826) x5 <- 2*(runif(176) - 0.5) x6 <- case_when(x5 < -0.4 ~ -1.2 - 3 * x5, x5 > 0.2 ~ 2.4 - 3 * x5, .default = 1.2 + 3 * x5) x5 <- 2*x5 x6 <- x6 + rnorm(176) * 0.25 x6[1:83] <- x6[1:83] - 1.5 x7 <- 2*(runif(176) - 0.5) x8 <- case_when(x7 < -0.4 ~ -1.2 - 3 * x7, x7 > 0.2 ~ 2.4 - 3 * x7, .default = 1.2 + 3 * x7) x7 <- 2*x7 x8[x7 < -0.1] <- x8[x7 < -0.1] + rnorm(length(x8[x7 < -0.1])) * 0.25 x8[x7 >= -0.1] <- x8[x7 >= -0.1] + rnorm(length(x8[x7 >= -0.1])) * 0.5 x8[1:83] <- x8[1:83] - 1.5 cl2 <- factor(c(rep("A", 83), rep("B", 176-83))) df2 <- data.frame(x5, x6, x7, x8, cl2) class3 <- ggplot(df2, aes(x=x5, y=x6, colour = cl2)) + geom_point(alpha=0.7) + scale_colour_discrete_divergingx( palette = "Zissou 1", nmax = 2, rev = TRUE) + annotate("text", 1.92, 2.07, label="c") + theme(aspect.ratio=1, legend.position = "none", axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), panel.background = element_rect("white"), panel.border = element_rect("black", fill=NA, linewidth = 0.5)) class4 <- ggplot(df2, aes(x=x7, y=x8, colour = cl2)) + geom_point(alpha=0.7) + scale_colour_discrete_divergingx( palette = "Zissou 1", nmax = 2, rev = TRUE) + annotate("text", 1.95, 1.9, label="d") + theme(aspect.ratio=1, legend.position = "none", axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank(), panel.background = element_rect("white"), panel.border = element_rect("black", fill=NA, linewidth = 0.5)) print(class1 + class2 + class3 + class4 + plot_layout(ncol=4)) ## ---------------------------------------------------------------------------- #| label: fig-bias-variance #| fig-cap: "Illustrating bias and variance using three different samples (black points) from the same distribution (in the columns) and two different model fits (rows). Colour indicates predictions from the resulting fitted model. The model used in a-c does not capture the zig-zag clusters, but is virtually the same regardless of training sample: it has high bias but low variance. The model used in d-f captures the zig-zag reasonably, but the boundary differs substantially between training samples: it has low bias but high variance." #| fig-alt: "A set of six square plots labeled (a) to (f), arranged in a 2x3 grid. Data points are overlaid as black crosses and circles. Each group follows a zig-zag pattern with crosses at top and separated from circles. The colours, red and blue, display a binary classification boundary, which is linear in (a) to (c) and different step function shapes in (d) to (f). " #| echo: false #| message: false #| fig-height: 4 #| fig-width: 6 #| out-width: 100% n <- 486 set.seed(826) x1 <- 2*(runif(n) - 0.5) x2 <- case_when(x1 < -0.4 ~ -1.2 - 3 * x1, x1 > 0.2 ~ 2.4 - 3 * x1, .default = 1.2 + 3 * x1) x1 <- 2*x1 x2 <- x2 + rnorm(n) * 0.25 x2[1:(n/2)] <- x2[1:(n/2)] - 1.5 cl <- factor(c(rep("A", n/2), rep("B", n-n/2))) df3 <- data.frame(x1, x2, cl) df_lda <- lda(cl~x1+x2, data=df3) df_lda_bnd <- explore(df_lda, df3) bv1 <- ggplot() + geom_point(data=filter(df_lda_bnd, .TYPE == "simulated"), aes(x=x1, y=x2, colour=cl), alpha=0.6) + scale_colour_discrete_divergingx(palette = "Zissou 1") + geom_point(data=df3, aes(x=x1, y=x2, shape=cl), colour="black", alpha=0.5) + scale_shape_manual(values = c(1,4)) + annotate("text", 1.8, 1.9, label="a") + theme_minimal() + theme(aspect.ratio = 1, legend.position = "none", axis.title = element_blank(), axis.text = element_blank()) df_tree <- rpart(cl~x1+x2, data=df3, control = rpart.control(minsplit = 5, cp = 0.00001)) df_tree_bnd <- explore(df_tree, df3) bv2 <- ggplot() + geom_point(data=filter(df_tree_bnd, .TYPE == "simulated"), aes(x=x1, y=x2, colour=cl), alpha=0.6) + scale_colour_discrete_divergingx(palette = "Zissou 1") + geom_point(data=df3, aes(x=x1, y=x2, shape=cl), colour="black", alpha=0.5) + scale_shape_manual(values = c(1,4)) + annotate("text", 1.8, 1.9, label="d") + theme_minimal() + theme(aspect.ratio = 1, legend.position = "none", axis.title = element_blank(), axis.text = element_blank()) set.seed(104) x1 <- 2*(runif(n) - 0.5) x2 <- case_when(x1 < -0.4 ~ -1.2 - 3 * x1, x1 > 0.2 ~ 2.4 - 3 * x1, .default = 1.2 + 3 * x1) x1 <- 2*x1 x2 <- x2 + rnorm(n) * 0.25 x2[1:(n/2)] <- x2[1:(n/2)] - 1.5 cl <- factor(c(rep("A", n/2), rep("B", n-n/2))) df3 <- data.frame(x1, x2, cl) df_lda <- lda(cl~x1+x2, data=df3) df_lda_bnd <- explore(df_lda, df3) bv3 <- ggplot() + geom_point(data=filter(df_lda_bnd, .TYPE == "simulated"), aes(x=x1, y=x2, colour=cl), alpha=0.6) + scale_colour_discrete_divergingx(palette = "Zissou 1") + geom_point(data=df3, aes(x=x1, y=x2, shape=cl), colour="black", alpha=0.5) + scale_shape_manual(values = c(1,4)) + annotate("text", 1.8, 1.9, label="b") + theme_minimal() + theme(aspect.ratio = 1, legend.position = "none", axis.title = element_blank(), axis.text = element_blank()) df_tree <- rpart(cl~x1+x2, data=df3, control = rpart.control(minsplit = 5, cp = 0.00001)) df_tree_bnd <- explore(df_tree, df3) bv4 <- ggplot() + geom_point(data=filter(df_tree_bnd, .TYPE == "simulated"), aes(x=x1, y=x2, colour=cl), alpha=0.6) + scale_colour_discrete_divergingx(palette = "Zissou 1") + geom_point(data=df3, aes(x=x1, y=x2, shape=cl), colour="black", alpha=0.5) + scale_shape_manual(values = c(1,4)) + annotate("text", 1.8, 1.9, label="e") + theme_minimal() + theme(aspect.ratio = 1, legend.position = "none", axis.title = element_blank(), axis.text = element_blank()) set.seed(601) x1 <- 2*(runif(n) - 0.5) x2 <- case_when(x1 < -0.4 ~ -1.2 - 3 * x1, x1 > 0.2 ~ 2.4 - 3 * x1, .default = 1.2 + 3 * x1) x1 <- 2*x1 x2 <- x2 + rnorm(n) * 0.25 x2[1:(n/2)] <- x2[1:(n/2)] - 1.5 cl <- factor(c(rep("A", n/2), rep("B", n-n/2))) df3 <- data.frame(x1, x2, cl) df_lda <- lda(cl~x1+x2, data=df3) df_lda_bnd <- explore(df_lda, df3) bv5 <- ggplot() + geom_point(data=filter(df_lda_bnd, .TYPE == "simulated"), aes(x=x1, y=x2, colour=cl), alpha=0.6) + scale_colour_discrete_divergingx(palette = "Zissou 1") + geom_point(data=df3, aes(x=x1, y=x2, shape=cl), colour="black", alpha=0.5) + scale_shape_manual(values = c(1,4)) + annotate("text", 1.8, 1.9, label="c") + theme_minimal() + theme(aspect.ratio = 1, legend.position = "none", axis.title = element_blank(), axis.text = element_blank()) df_tree <- rpart(cl~x1+x2, data=df3, control = rpart.control(minsplit = 5, cp = 0.00001)) df_tree_bnd <- explore(df_tree, df3) bv6 <- ggplot() + geom_point(data=filter(df_tree_bnd, .TYPE == "simulated"), aes(x=x1, y=x2, colour=cl), alpha=0.6) + scale_colour_discrete_divergingx(palette = "Zissou 1") + geom_point(data=df3, aes(x=x1, y=x2, shape=cl), colour="black", alpha=0.5) + scale_shape_manual(values = c(1,4)) + annotate("text", 1.8, 1.9, label="f") + theme_minimal() + theme(aspect.ratio = 1, legend.position = "none", axis.title = element_blank(), axis.text = element_blank()) bv1 + bv3 + bv5 + bv2 + bv4 + bv6 + plot_layout(ncol=3) ## ---------------------------------------------------------------------------- #| eval: false #| echo: false # vr <- c(-0.5, -0.3, 0.0, 0.3, 0.5) # set.seed(503) # vc1 <- matrix(c(1.5, sample(vr, 1), sample(vr, 1), sample(vr, 1), # sample(vr, 1), 1.5, sample(vr, 1), sample(vr, 1), # sample(vr, 1), sample(vr, 1), 1.5, sample(vr, 1), # sample(vr, 1), sample(vr, 1), sample(vr, 1), 1.5), # ncol=4, byrow=TRUE) # vc1[1,2] <- vc1[2,1] # vc1[1,3] <- vc1[3,1] # vc1[1,4] <- vc1[4,1] # vc1[2,3] <- vc1[3,2] # vc1[2,4] <- vc1[4,2] # vc1[3,4] <- vc1[4,3] # g1 <- rmvn(n=335, p=4, mn=c(3, 3, 3, 3), vc=vc1) # vc2 <- matrix(c(1, 0.6, 0.6, 0.6, # 0.6, 1, 0.6, 0.6, # 0.6, 0.6, 1, 0.6, # 0.6, 0.6, 0.6, 1), ncol=4, byrow=TRUE) # g2 <- rmvn(n=335, p=4, mn=c(3, -3, 3, -3), vc=vc2) # g1 <- bind_cols(as.data.frame(g1), rep("A", 335)) # colnames(g1) <- c("x1", "x2", "x3", "x4", "cl") # g2 <- bind_cols(as.data.frame(g2), rep("B", 335)) # colnames(g2) <- c("x1", "x2", "x3", "x4", "cl") # gd <- bind_rows(g1, g2) |> # mutate(cl = factor(cl)) # animate_xy(gd[,1:4], col=gd$cl) # animate_xy(gd[,1:4], guided_tour(lda_pp(gd$cl)), col=gd$cl) # GGally::ggscatmat(gd, columns=1:4, color="cl") + # scale_color_discrete_divergingx(palette="Zissou 1") # # set.seed(645) # render_gif(gd[,1:4], # grand_tour(), # display_xy(col=gd$cl, # axes="bottomleft"), # gif_file = "gifs/intro_class1.gif", # frames=200, # width=400, # height=400) # render_gif(gd[,1:4], # guided_tour(lda_pp(gd$cl)), # display_xy(col=gd$cl, # axes="bottomleft"), # gif_file = "gifs/intro_class2.gif", # frames=200, # width=400, # height=400, # loop=FALSE) # # # non-linear example # knots <- c(-0.75, -0.5, 0.1, 0.7) # n <- 1000 # # Piece 1 has x2=-0.75 # # Piece 2 has x2=c1+5*x1 has to pass through (-0.75, -0.75) so c1=3 # # Piece 3 has x2=c2-1.5*x1 has to pass through (-0.5, 0.5) so c2=-0.25 # # Piece 4 has x2=c3+2*x1 has to pass through (0.1, -0.4) so c3=-0.6 # # Piece 5 has x2=0.8 # x1 <- runif(n, -1, 1) # x2 <- case_when( # x1 < (-0.75) ~ -0.75, # between(x1, -0.75, -0.5) ~ 3+5*x1, # between(x1, -0.5, 0.1) ~ -0.25-1.5*x1, # between(x1, 0.1, 0.7) ~ -0.6+2*x1, # x1 > 0.7 ~ 0.8) # border <- tibble(x1, x2) |> arrange(x1) # # ggplot(border, aes(x1, x2)) + # geom_line() + # xlim(c(-1,1)) + # ylim(c(-1,1)) # # n_obs <- 5000 # set.seed(401) # d <- tibble(x1 = runif(n_obs, -1, 1), # x2 = runif(n_obs, -1, 1)) |> # mutate(cl = "A") # d <- d |> # mutate(cl = case_when( # (x1 < (-0.75)) & (x2 < -0.75) ~ "B", # between(x1, -0.75, -0.5) & (x2 < 3+5*x1) ~ "B", # between(x1, -0.5, 0.1) & (x2 < -0.25-1.5*x1) ~ "B", # between(x1, 0.1, 0.7) & (x2 < -0.6+2*x1) ~ "B", # (x1 > 0.7) & (x2 < 0.8) ~ "B", # .default = "A") # ) |> # mutate(cl = factor(cl)) # # # Build gap # gap <- 0.1 # d <- d |> # mutate(drop = case_when( # (x1 < (-0.75)) & (abs(x2 - (-0.75)) < gap) ~ "yes", # between(x1, -0.75, -0.5) & (abs(x2-(3+5*x1)) < 5*gap) ~ "yes", # between(x1, -0.5, 0.1) & (abs(x2 -(-0.25-1.5*x1)) < 1.5*gap) ~ "yes", # between(x1, 0.1, 0.7) & (abs(x2 -(-0.6+2*x1)) < 2*gap) ~ "yes", # (x1 > 0.7) & (abs(x2 - 0.8) < gap) ~ "yes", # .default = "no")) # # d_gap <- d |> # filter(drop == "no") # ggplot() + # geom_point(data=d_gap, aes(x1, x2, colour=cl), alpha=0.7) + # scale_colour_discrete_divergingx(palette = "Zissou 1") + # geom_line(data=border, aes(x1, x2)) + # theme_minimal() + # theme(aspect.ratio = 1) # # # Raise into high-d # d_high <- d |> # mutate(x3 = runif(n_obs, -1, 1), # x4 = runif(n_obs, -1, 1), # x5 = runif(n_obs, -1, 1), # x6 = runif(n_obs, -1, 1)) |> # dplyr::select(x1,x2,x3,x4,x5,x6,cl,drop) # # d_high_gap <- d_high |> # filter(drop == "no") # # # Make boundary a combination of variables: x1-x3, x2-x4 # theta <- pi/4 # d_high_comb <- d_high |> # mutate(x7 = cos(theta)*x1 + sin(theta)*x3, # x8 = -sin(theta)*x1 + cos(theta)*x3, # x9 = cos(theta)*x2 + sin(theta)*x4, # x10 = -sin(theta)*x2 + cos(theta)*x4) |> # dplyr::select(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,cl,drop) # d_high_comb_gap <- d_high_comb |> # filter(drop == "no") # # animate_slice(d_high_comb_gap[,6:10], col=d_high_gap$cl, axes="bottomleft", v_rel=0.2) # # # Doesn't work # r_breaks <- linear_breaks(5, 0, 1) # a_breaks <- angular_breaks(10) # eps <- estimate_eps(nrow(d_high_comb_gap), # ncol(d_high_comb_gap), 0.1 / 1, 5 * 10, 10, # r_breaks) # idx <- slice_index(r_breaks, a_breaks, eps, bintype = "polar", # power = 1, reweight = TRUE, p = 5) # animate_slice(d_high_comb_gap[,6:10], # guided_section_tour(idx, v_rel = 0.2, max.tries = 50), # v_rel = 0.2) # # # Try with spheres # set.seed(637) # sg1 <- sphere.solid.random(n=313, p=4)$points # sg2 <- sphere.hollow(n=323, p=4)$points * 2 # sg1 <- bind_cols(as.data.frame(sg1), as.data.frame(rep("A", 313))) # colnames(sg1) <- c("x1", "x2", "x3", "x4", "cl") # sg2 <- bind_cols(as.data.frame(sg2), as.data.frame(rep("B", 323))) # colnames(sg2) <- c("x1", "x2", "x3", "x4", "cl") # sg <- bind_rows(sg1, sg2) |> # mutate(cl = factor(cl)) # # animate_xy(sg[,1:4], col=sg$cl) # animate_slice(sg[,1:4], col=sg$cl, v_rel=1.2) # # set.seed(645) # sph_path <- save_history(sg[,1:4]) # render_gif(sg[,1:4], # planned_tour(sph_path), # display_xy(col=sg$cl, # axes="bottomleft"), # gif_file = "gifs/intro_class3.gif", # frames=200, # width=400, # height=400) # render_gif(sg[,1:4], # planned_tour(sph_path), # display_slice(col=sg$cl, v_rel=1.2, # axes="bottomleft"), # gif_file = "gifs/intro_class4.gif", # frames=200, # width=400, # height=400) ## ---------------------------------------------------------------------------- #| echo: true #| eval: false # library(readr) # library(dplyr) # music <- read_csv("http://ggobi.org/book/data/music-sub.csv", # show_col_types = FALSE) |> # rename(title = `...1`) |> # mutate(type = factor(type)) ## ---------------------------------------------------------------------------- #| eval: false # library(mulgar) # data(aflw) # aflw_sub <- aflw |> # dplyr::filter(position %in% c("BPL", "FF", "RR")) |> # dplyr::mutate(position = factor(position)) |> # dplyr::select(goals:tackles) ## ----eval=FALSE-------------------------------------------------------------- #| echo: false # data(bushfires) # b_sub <- bushfires |> # select(se, maxt, mint, log_dist_road, cause) |> # filter(cause %in% c("accident", "lightning")) |> # rename(ldr = log_dist_road) |> # mutate(cause = factor(cause)) # animate_xy(b_sub[,-5], col=b_sub$cause, rescale=TRUE) # animate_xy(b_sub[,-5], guided_tour(lda_pp(b_sub$cause)), col=b_sub$cause, rescale=TRUE) # # data(pisa) # set.seed(441) # pisa_sub <- pisa |> # group_by(CNT) |> # sample_frac(0.10) # animate_xy(pisa_sub[,2:6], col=pisa_sub$CNT) # # animate_xy(music[,4:8], guided_tour(lda_pp(music$type)), col=music$type, rescale=TRUE) # # data(aflw) # aflw_sub <- aflw |> # filter(position %in% c("BPL", "FF", "RR")) |> # mutate(position = factor(position)) |> # select(goals:tackles) # animate_xy(aflw_sub[,1:7], col=aflw_sub$position, rescale=TRUE) # animate_xy(aflw_sub[,1:7], # guided_tour(lda_pp(aflw_sub$position)), # col=aflw_sub$position, # rescale=TRUE) #