pendigits
data - interactive analysisR
Now, we want to build an application which includes all the methods we see in previous sessions. Our goal is to create a tool that permits us to analyse the data, in order to chose an as good as possible number of clusters, for each digit.
The app includes 4 panels: the description of data and one for each method (HAC, \(k\)-means, mixture model). The original data are also presented. Here is the specifications for each panel:
You can run directly this application with:
runUrl("http://fxjollois.github.io/stat-prog-R/pendigits.zip")
Here are the two files for this application:
library(shiny)
library(shinythemes)
shinyUI(navbarPage(
"Pen digits",
theme = shinytheme("darkly"),
# Description of the data
tabPanel(
"Description",
tabsetPanel(
tabPanel(
"Digit distribution",
helpText("The digits are approximatively equi-distributed"),
plotOutput("desc.digit")
),
tabPanel(
"Coordinates distribution",
helpText(""),
plotOutput("desc.coord")
),
tabPanel(
"Average digits",
helpText(""),
plotOutput("desc.average")
),
tabPanel(
"PCA",
radioButtons("desc.pca.type",
"Plot type",
c("Global", "Separated"),
inline = TRUE),
plotOutput("desc.pca.plot")
)
)
),
# HAC
tabPanel(
"HAC",
sidebarLayout(
sidebarPanel(
selectInput("hac.digit",
"Digit",
0:9),
radioButtons("hac.ncl",
"Number of clusters",
1:10)
),
mainPanel(
plotOutput("hac.tree"),
plotOutput("hac.plot")
)
)
),
# kmeans
tabPanel(
"kmeans",
sidebarLayout(
sidebarPanel(
selectInput("km.digit",
"Digit",
0:9),
radioButtons("km.ncl",
"Number of clusters",
1:10)
),
mainPanel(
plotOutput("km.r2", height = 200),
plotOutput("km.plot")
)
)
),
# Mixture model
tabPanel(
"mclust",
sidebarLayout(
sidebarPanel(
selectInput("mclust.digit",
"Digit",
0:9),
radioButtons("mclust.ncl",
"Number of clusters",
1:10)
),
mainPanel(
fluidRow(
column(3, selectInput("mclust.choice", "Criterion", c("BIC", "ICL", "AIC", "AIC3"))),
column(9, plotOutput("mclust.crit", height = 200))
),
plotOutput("mclust.plot")
)
)
),
# Original data
navbarMenu(
"Original data",
tabPanel("pendigits.tra",
tags$pre(includeText("../../donnees/pendigits.tra"))),
tabPanel("pendigits.tes",
tags$pre(includeText("../../donnees/pendigits.tes")))
)
))
library(shiny)
library(reshape2)
library(ggplot2)
library(scales)
library(FactoMineR)
library(mclust)
load("pendigits.RData")
showDigitPartition <- function(z, mean, pca) {
tab = table(z)
ncl = length(tab)
par(mfcol = c(2, ncl), mar = c(0, 0, 0, 0) + .1)
for (k in 1:ncl) {
if (ncl == 1) {
drawn(mean, n = tab[k], point = TRUE)
} else {
drawn(mean[k,], n = tab[k], point = TRUE)
}
plot(Dim.2 ~ Dim.1, subset(pca, cluster == k),
xlim = range(res2$Dim.1),
ylim = range(res2$Dim.2),
col = rainbow(ncl)[k], pch = 19,
axes = FALSE, xlab = "", ylab = "", frame.plot = TRUE)
}
}
shinyServer(function(input, output) {
# Description
output$desc.digit <- renderPlot({
ggplot(pen, aes(digit, fill = digit)) +
geom_bar() +
xlab("") + ylab("") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
})
output$desc.coord <- renderPlot({
ggplot(pen.melt, aes(digit, value, color = digit)) +
geom_boxplot() +
facet_wrap(~ variable) +
theme_minimal() +
xlab("") + ylab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
})
output$desc.average <- renderPlot({
par(mfrow = c(2, 5))
for (i in 0:9) {
s = subset(pen, digit == i)
# Computing the average values
mean = apply(s[-17], 2, mean)
# Drawn the first drawn
drawn(s[1,1:16], col = "gray90", n = i)
# Add all other drawn
for (j in 2:nrow(s))
drawn(s[j,1:16], col = "gray90", add = TRUE)
# Last, add the average line
drawn(mean[-17], add = TRUE)
}
})
output$desc.pca.plot <- renderPlot({
g = ggplot(res2, aes(Dim.1, Dim.2, color = digit)) +
geom_point() +
theme_minimal()
if (input$desc.pca.type == "Separated") {
g = g + facet_wrap(~ digit) +
guides(color = FALSE)
}
g
})
# HAC
output$hac.tree <- renderPlot({
dig = as.numeric(input$hac.digit)
ncl = as.numeric(input$hac.ncl)
tree = hac.ward[[dig+1]]
par(mar = c(2, 2, 0, 0) + .1)
plot(tree, hang = -1, labels = FALSE,
main = "")
if (ncl > 1)
rect.hclust(tree, ncl)
})
output$hac.plot <- renderPlot({
dig = as.numeric(input$hac.digit)
ncl = as.numeric(input$hac.ncl)
z = cutree(hac.ward[[dig+1]], ncl)
mean = apply(subset(pen, digit == dig, -digit), 2, tapply, z, mean)
pca = data.frame(subset(res2, digit == dig, -digit),
cluster = z)
showDigitPartition(z, mean, pca)
})
# kmeans
output$km.r2 <- renderPlot({
dig = as.numeric(input$km.digit)
ncl = as.numeric(input$km.ncl)
I = km[[dig+1]][[1]]$totss
B = sapply(km[[dig+1]], function(res) return(res$betweenss))
ggplot(data.frame(G = 1:10, r2 = B / I), aes(G, r2)) +
geom_line() +
scale_x_discrete(limits = 1:10) +
scale_y_continuous(labels = percent) +
theme_minimal() +
geom_vline(xintercept = ncl, color = "grey30", linetype = 2) +
geom_hline(yintercept = B[ncl] / I, color = "grey30", linetype = 2)
})
output$km.plot <- renderPlot({
dig = as.numeric(input$km.digit)
ncl = as.numeric(input$km.ncl)
z = km[[dig+1]][[ncl]]$cluster
mean = km[[dig+1]][[ncl]]$centers
pca = data.frame(subset(res2, digit == dig, -digit),
cluster = z)
showDigitPartition(z, mean, pca)
})
# Mixture model
output$mclust.crit <- renderPlot({
dig = as.numeric(input$mclust.digit)
ncl = as.numeric(input$mclust.ncl)
crit = data.frame(t(sapply(mclust[[dig+1]], function(m) {
return(c(G = m$G, BIC = m$bic, ICL = m$ICL, AIC = m$AIC, AIC3 = m$AIC3))
})))
names(crit) = c("G", "BIC", "ICL", "AIC", "AIC3")
ggplot(crit, aes_string("G", input$mclust.choice)) +
geom_line() +
scale_x_discrete(limits = 1:10) +
theme_minimal() +
geom_vline(xintercept = ncl, color = "grey30", linetype = 2) +
geom_hline(yintercept = crit[ncl,input$mclust.choice], color = "grey30", linetype = 2)
})
output$mclust.plot <- renderPlot({
dig = as.numeric(input$mclust.digit)
ncl = as.numeric(input$mclust.ncl)
z = mclust[[dig+1]][[ncl]]$classification
mean = t(mclust[[dig+1]][[ncl]]$parameters$mean)
pca = data.frame(subset(res2, digit == dig, -digit),
cluster = z)
showDigitPartition(z, mean, pca)
})
})
In server.R
, we load a specific file, named a RData
file. This file contains the results of the computation done in a third file (code.R
, presented below). It allows us to pre-compute results, instead of online computing.
# Importation
pen.tra = read.table("../../donnees/pendigits.tra", sep = ",")
pen.tes = read.table("../../donnees/pendigits.tes", sep = ",")
pen = rbind(pen.tra, pen.tes)
names(pen) = c(paste0(c("X", "Y"), rep(1:8, each = 2)), "digit")
pen$digit = factor(pen$digit)
# remove useless variables
rm(pen.tra, pen.tes)
# Melt data
pen.melt = melt(pen, id.vars = 17)
# Drawn function
drawn <- function(v, n = NULL, point = FALSE, add = FALSE, color = "black") {
# Transformation of the data.frame if needed
if (is.data.frame(v))
v = unlist(v)
# extract x and y coordinates
x = v[seq(1, 15, by = 2)]
y = v[seq(2, 16, by = 2)]
# optimize space into graphics in reducing margin (sse ?par for more information)
opar = par(mar = c(0, 0, 2, 0) + .1)
if (!add) { # Create a graphic
plot(x, y,
# Specify limits is a way to have always the same frame for plotting lines
xlim = c(-5, 105), ylim = c(-5, 105),
# Do not show axes
axes = FALSE,
# If point is TRUE, we add a space (with pch = " ") at each point
# If not, draw a line
type = ifelse(point, "b", "l"),
pch = " ",
# Specify color (black by default)
col = color,
# Add a title (NULL by default)
main = n)
if (point) text(x, y, 1:8)
} else { # Add line to the plot
# lines() add lines to an existing plot (the last produce)
lines(x, y,
# same comment than before
xlim = c(-5, 105), ylim = c(-5, 105),
type = ifelse(point, "b", "l"),
pch = " ",
col = color)
}
par(opar)
}
# PCA
res = PCA(pen, quali.sup = 17, graph = FALSE)
res2 = data.frame(res$ind$coord, digit = pen$digit)
# Compute HAC
## Ward criterion
hac.ward = lapply(0:9, function(dig) {
sub = subset(pen, digit == dig, -digit)
h = hclust(dist(sub), "ward.D2")
return(h)
})
# Compute k-means
km = lapply(0:9, function(dig) {
sub = subset(pen, digit == dig, -digit)
res = lapply(1:10, kmeans, x = sub, nstart = 30, iter.max = 20)
return(res)
})
# Compute Mclust
mclust = lapply(0:9, function(dig) {
sub = subset(pen, digit == dig, -digit)
res = lapply(1:10, function(G) {
m = Mclust(data = sub, G = G)
p = nMclustParams(m$modelName, m$d, G)
m$ICL = icl(m)
m$AIC = -2 * m$loglik + 2 * p
m$AIC3 = -2 * m$loglik + 3 * p
return(m)
})
return(res)
})
# save results
save.image(file = "pendigits.RData")