pendigits data - interactive analysis

Now, we want to build an application which includes all the methods we see in previous sessions. Our goal is to create a tool that permits us to analyse the data, in order to chose an as good as possible number of clusters, for each digit.

The app includes 4 panels: the description of data and one for each method (HAC, \(k\)-means, mixture model). The original data are also presented. Here is the specifications for each panel:

Description: 4 sub-panels
- Distribution of the digits (bar chart)
- Distribution of the coordinates, according to the digits (boxplot)
- Average digit
- PCA projection (global plot or separated for each digit)
For each method:
- Left side:
  - Choice of the digit
  - Choice of the number of clusters
- Right side:
  - Specific graph:
    - HAC: dendrogram
    - kmeans: evolution of \(r^2\)
    - mclust: evolution of the user specified criterion (\(BIC\), \(ICL\), \(AIC\), \(AIC3\))
  - For each cluster:
    - Plot of the average digit
    - PCA projection

You can run directly this application with:

runUrl("http://fxjollois.github.io/stat-prog-R/pendigits.zip")

Here are the two files for this application:

ui.R

library(shiny)
library(shinythemes)

shinyUI(navbarPage(
    "Pen digits",
    theme = shinytheme("darkly"),
    
    # Description of the data
    tabPanel(
        "Description",
        tabsetPanel(
            tabPanel(
                "Digit distribution",
                helpText("The digits are approximatively equi-distributed"),
                plotOutput("desc.digit")
            ),
            tabPanel(
                "Coordinates distribution",
                helpText(""),
                plotOutput("desc.coord")
            ),
            tabPanel(
                "Average digits",
                helpText(""),
                plotOutput("desc.average")
            ),
            tabPanel(
                "PCA",
                radioButtons("desc.pca.type",
                             "Plot type",
                             c("Global", "Separated"),
                             inline = TRUE),
                plotOutput("desc.pca.plot")
            )
        )
    ),
    
    # HAC
    tabPanel(
        "HAC",
        sidebarLayout(
            sidebarPanel(
                selectInput("hac.digit",
                            "Digit",
                            0:9),
                radioButtons("hac.ncl",
                             "Number of clusters",
                             1:10)
            ),
            mainPanel(
                plotOutput("hac.tree"),
                plotOutput("hac.plot")
            )
        )
    ),
    
    # kmeans
    tabPanel(
        "kmeans",
        sidebarLayout(
            sidebarPanel(
                selectInput("km.digit",
                            "Digit",
                            0:9),
                radioButtons("km.ncl",
                             "Number of clusters",
                             1:10)
            ),
            mainPanel(
                plotOutput("km.r2", height = 200),
                plotOutput("km.plot")
            )
        )
    ),
    
    # Mixture model
    tabPanel(
        "mclust",
        sidebarLayout(
            sidebarPanel(
                selectInput("mclust.digit",
                            "Digit",
                            0:9),
                radioButtons("mclust.ncl",
                             "Number of clusters",
                             1:10)
            ),
            mainPanel(
                fluidRow(
                    column(3, selectInput("mclust.choice", "Criterion", c("BIC", "ICL", "AIC", "AIC3"))),
                    column(9, plotOutput("mclust.crit", height = 200))
                ),
                plotOutput("mclust.plot")
            )
        )
    ),
    
    # Original data
    navbarMenu(
        "Original data",
        tabPanel("pendigits.tra",
                 tags$pre(includeText("../../donnees/pendigits.tra"))),
        tabPanel("pendigits.tes",
                 tags$pre(includeText("../../donnees/pendigits.tes")))
    )

))

server.R

library(shiny)
library(reshape2)
library(ggplot2)
library(scales)
library(FactoMineR)
library(mclust)

load("pendigits.RData")

showDigitPartition <- function(z, mean, pca) {
    tab = table(z)
    ncl = length(tab)
    par(mfcol = c(2, ncl), mar = c(0, 0, 0, 0) + .1)
    for (k in 1:ncl) {
        if (ncl == 1) {
            drawn(mean, n = tab[k], point = TRUE)
        } else {
            drawn(mean[k,], n = tab[k], point = TRUE)
        }
        plot(Dim.2 ~ Dim.1, subset(pca, cluster == k),
             xlim = range(res2$Dim.1),
             ylim = range(res2$Dim.2),
             col = rainbow(ncl)[k], pch = 19, 
             axes = FALSE, xlab = "", ylab = "", frame.plot = TRUE)
    }
}

shinyServer(function(input, output) {
   
    # Description
    output$desc.digit <- renderPlot({
        ggplot(pen, aes(digit, fill = digit)) + 
            geom_bar() +
            xlab("") + ylab("") +
            theme_minimal() +
            theme(axis.text.x = element_blank(),
                  axis.ticks.x = element_blank())
    })
    output$desc.coord <- renderPlot({
        ggplot(pen.melt, aes(digit, value, color = digit)) + 
            geom_boxplot() +
            facet_wrap(~ variable) +
            theme_minimal() +
            xlab("") + ylab("") +
            theme(axis.text.x = element_blank(),
                  axis.ticks.x = element_blank())
    })
    output$desc.average <- renderPlot({
        par(mfrow = c(2, 5))
        for (i in 0:9) {
            s = subset(pen, digit == i)
            # Computing the average values
            mean = apply(s[-17], 2, mean)
            # Drawn the first drawn
            drawn(s[1,1:16], col = "gray90", n = i)
            # Add all other drawn
            for (j in 2:nrow(s)) 
                drawn(s[j,1:16], col = "gray90", add = TRUE)
            # Last, add the average line
            drawn(mean[-17], add = TRUE)
        }
    })
    output$desc.pca.plot <- renderPlot({
        g = ggplot(res2, aes(Dim.1, Dim.2, color = digit)) +
            geom_point() +
            theme_minimal()
        if (input$desc.pca.type == "Separated") {
            g = g + facet_wrap(~ digit) +
                guides(color = FALSE)
        } 
        g
    })
    
    # HAC
    output$hac.tree <- renderPlot({
        dig = as.numeric(input$hac.digit)
        ncl = as.numeric(input$hac.ncl)
        tree = hac.ward[[dig+1]]
        par(mar = c(2, 2, 0, 0) + .1)
        plot(tree, hang = -1, labels = FALSE,
             main = "")
        if (ncl > 1)
            rect.hclust(tree, ncl)
    })
    output$hac.plot <- renderPlot({
        dig = as.numeric(input$hac.digit)
        ncl = as.numeric(input$hac.ncl)
        z = cutree(hac.ward[[dig+1]], ncl)
        mean = apply(subset(pen, digit == dig, -digit), 2, tapply, z, mean)
        pca = data.frame(subset(res2, digit == dig, -digit),
                         cluster = z)
        showDigitPartition(z, mean, pca)
    })

    # kmeans
    output$km.r2 <- renderPlot({
        dig = as.numeric(input$km.digit)
        ncl = as.numeric(input$km.ncl)
        I = km[[dig+1]][[1]]$totss
        B = sapply(km[[dig+1]], function(res) return(res$betweenss))
        ggplot(data.frame(G = 1:10, r2 = B / I), aes(G, r2)) +
            geom_line() +
            scale_x_discrete(limits = 1:10) +
            scale_y_continuous(labels = percent) +
            theme_minimal() +
            geom_vline(xintercept = ncl, color = "grey30", linetype = 2) +
            geom_hline(yintercept = B[ncl] / I, color = "grey30", linetype = 2)
    })
    output$km.plot <- renderPlot({
        dig = as.numeric(input$km.digit)
        ncl = as.numeric(input$km.ncl)
        z = km[[dig+1]][[ncl]]$cluster
        mean = km[[dig+1]][[ncl]]$centers
        pca = data.frame(subset(res2, digit == dig, -digit),
                         cluster = z)
        showDigitPartition(z, mean, pca)
    })

    # Mixture model
    output$mclust.crit <- renderPlot({
        dig = as.numeric(input$mclust.digit)
        ncl = as.numeric(input$mclust.ncl)
        crit = data.frame(t(sapply(mclust[[dig+1]], function(m) {
            return(c(G = m$G, BIC = m$bic, ICL = m$ICL, AIC = m$AIC, AIC3 = m$AIC3))
        })))
        names(crit) = c("G", "BIC", "ICL", "AIC", "AIC3")
        ggplot(crit, aes_string("G", input$mclust.choice)) +
            geom_line() +
            scale_x_discrete(limits = 1:10) +
            theme_minimal() +
            geom_vline(xintercept = ncl, color = "grey30", linetype = 2) +
            geom_hline(yintercept = crit[ncl,input$mclust.choice], color = "grey30", linetype = 2)
    })
    output$mclust.plot <- renderPlot({
        dig = as.numeric(input$mclust.digit)
        ncl = as.numeric(input$mclust.ncl)
        z = mclust[[dig+1]][[ncl]]$classification
        mean = t(mclust[[dig+1]][[ncl]]$parameters$mean)
        pca = data.frame(subset(res2, digit == dig, -digit),
                         cluster = z)
        showDigitPartition(z, mean, pca)
    })    
})

In server.R, we load a specific file, named a RData file. This file contains the results of the computation done in a third file (code.R, presented below). It allows us to pre-compute results, instead of online computing.

code.R

# Importation
pen.tra = read.table("../../donnees/pendigits.tra", sep = ",")
pen.tes = read.table("../../donnees/pendigits.tes", sep = ",")

pen = rbind(pen.tra, pen.tes)
names(pen) = c(paste0(c("X", "Y"), rep(1:8, each = 2)), "digit")
pen$digit = factor(pen$digit)

# remove useless variables
rm(pen.tra, pen.tes)

# Melt data
pen.melt = melt(pen, id.vars = 17)

# Drawn function
drawn <- function(v, n = NULL, point = FALSE, add = FALSE, color = "black") {
    # Transformation of the data.frame if needed
    if (is.data.frame(v))
        v = unlist(v)
    # extract x and y coordinates
    x = v[seq(1, 15, by = 2)]
    y = v[seq(2, 16, by = 2)]
    # optimize space into graphics in reducing margin (sse ?par for more information)
    opar = par(mar = c(0, 0, 2, 0) + .1)
    if (!add) { # Create a graphic
        plot(x, y, 
             # Specify limits is a way to have always the same frame for plotting lines
             xlim = c(-5, 105), ylim = c(-5, 105),
             # Do not show axes
             axes = FALSE,
             # If point is TRUE, we add a space (with pch = " ") at each point
             # If not, draw a line
             type = ifelse(point, "b", "l"), 
             pch = " ", 
             # Specify color (black by default)
             col = color,
             # Add a title (NULL by default)
             main = n)
        if (point) text(x, y, 1:8)
    } else { # Add line to the plot
        # lines() add lines to an existing plot (the last produce)
        lines(x, y, 
              # same comment than before
              xlim = c(-5, 105), ylim = c(-5, 105),
              type = ifelse(point, "b", "l"), 
              pch = " ", 
              col = color)
    }
    par(opar)
}

# PCA
res = PCA(pen, quali.sup = 17, graph = FALSE)
res2 = data.frame(res$ind$coord, digit = pen$digit)

# Compute HAC
## Ward criterion
hac.ward = lapply(0:9, function(dig) {
    sub = subset(pen, digit == dig, -digit)
    h = hclust(dist(sub), "ward.D2")
    return(h)
})

# Compute k-means
km = lapply(0:9, function(dig) {
    sub = subset(pen, digit == dig, -digit)
    res = lapply(1:10, kmeans, x = sub, nstart = 30, iter.max = 20)
    return(res)
})

# Compute Mclust
mclust = lapply(0:9, function(dig) {
    sub = subset(pen, digit == dig, -digit)
    res = lapply(1:10, function(G) {
        m = Mclust(data = sub, G = G)
        p = nMclustParams(m$modelName, m$d, G)
        m$ICL = icl(m)
        m$AIC = -2 * m$loglik + 2 * p
        m$AIC3 = -2 * m$loglik + 3 * p
        return(m)
    })
    return(res)
})

# save results
save.image(file = "pendigits.RData")

`pendigits` data - interactive analysis

Statistical Programming using `R`

pendigits data - interactive analysis

Statistical Programming using R

`pendigits` data - interactive analysis

Statistical Programming using `R`