R
library(reshape2)   # melt(), used below for the boxplots
library(ggplot2)    # graphics
library(FactoMineR) # PCA(), used below for the factorial representation
stats
We use the kmeans() function, from the stats package (already installed and loaded in each R session).
Example with the iris data and 3 clusters:
res = kmeans(iris[-5], 3)
res
## K-means clustering with 3 clusters of sizes 33, 21, 96
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.175758 3.624242 1.472727 0.2727273
## 2 4.738095 2.904762 1.790476 0.3523810
## 3 6.314583 2.895833 4.973958 1.7031250
##
## Clustering vector:
## [1] 1 2 2 2 1 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 2
## [36] 1 1 1 2 1 1 2 2 1 1 2 1 2 1 1 3 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 6.432121 17.669524 118.651875
## (between_SS / total_SS = 79.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
names(res)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(res$cluster)
##
## 1 2 3
## 33 21 96
res$size
## [1] 33 21 96
res$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.175758 3.624242 1.472727 0.2727273
## 2 4.738095 2.904762 1.790476 0.3523810
## 3 6.314583 2.895833 4.973958 1.7031250
res$totss # total inertia (I)
## [1] 681.3706
res$tot.withinss # within-cluster inertia (W)
## [1] 142.7535
res$betweenss # between-cluster inertia (B)
## [1] 538.6171
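These three quantities satisfy the decomposition \(I = W + B\), which we can check directly:
res$tot.withinss + res$betweenss # equals res$totss, since I = W + B
## [1] 681.3706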
With multiple random initializations, and on a standardized dataset:
res.mult = kmeans(scale(iris[-5]), 3, nstart = 30, iter.max = 20)
res.mult
## K-means clustering with 3 clusters of sizes 50, 53, 47
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2
## [71] 3 2 2 2 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
With defined starting points (here, the mean of each species):
m = aggregate(. ~ Species, iris, mean)
res.init = kmeans(iris[-5], m[-1])
res.init
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
cluster
This package is already installed with R.
library(cluster)
fanny() computes a fuzzy clustering into \(k\) clusters (see ?fanny for more details).
res.fanny = fanny(iris[-5], 3)
names(res.fanny)
## [1] "membership" "coeff" "memb.exp" "clustering" "k.crisp"
## [6] "objective" "convergence" "diss" "call" "silinfo"
## [11] "data"
table(res.fanny$clustering)
##
## 1 2 3
## 50 45 55
head(res.fanny$membership)
## [,1] [,2] [,3]
## [1,] 0.9142273 0.03603116 0.04974153
## [2,] 0.8594576 0.05854637 0.08199602
## [3,] 0.8700857 0.05463714 0.07527719
## [4,] 0.8426296 0.06555926 0.09181118
## [5,] 0.9044503 0.04025288 0.05529687
## [6,] 0.7680227 0.09717445 0.13480286
plot(res.fanny)
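The membership matrix can also be summarized, for instance by the maximal membership of each observation (values close to 1 indicate an almost crisp assignment); a minimal sketch:
crisp = apply(res.fanny$membership, 1, max) # strongest membership per observation
summary(crisp)
res.fanny$coeff # Dunn's partition coefficient and its normalized version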
pam() partitions the data into \(k\) clusters, using medoids instead of means, to obtain more robust results (see ?pam for more details).
res.pam = pam(iris[-5], 3)
res.pam
## Medoids:
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 8 5.0 3.4 1.5 0.2
## [2,] 79 6.0 2.9 4.5 1.5
## [3,] 113 6.8 3.0 5.5 2.1
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## Objective function:
## build swap
## 0.6709391 0.6542077
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
table(res.pam$clustering)
##
## 1 2 3
## 50 62 38
res.pam$medoids
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.0 3.4 1.5 0.2
## [2,] 6.0 2.9 4.5 1.5
## [3,] 6.8 3.0 5.5 2.1
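The quality of this partition can also be inspected with a silhouette plot; silhouette() is provided by the cluster package as well:
plot(silhouette(res.pam))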
clara() is able to deal with large datasets (see ?clara for more details). It divides the dataset into subsets of fixed size and applies the pam() algorithm to each of them.
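A minimal sketch on iris (which is too small for clara() to be really needed; the samples parameter controls the number of sub-datasets drawn):
res.clara = clara(iris[-5], 3, samples = 10)
table(res.clara$clustering)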
We also use the NbClust package to look for a suitable number of clusters. For direct clustering, only the \(k\)-means method is available.
library(NbClust)
nb = NbClust(iris[-5], method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
As for HAC (and with the same code), we can explore the results in more detail:
t(nb$Best.nc)
## Number_clusters Value_Index
## KL 4 7.2495
## CH 3 561.6278
## Hartigan 3 82.4072
## CCC 3 37.6701
## Scott 3 202.0631
## Marriot 3 149982.8597
## TrCovW 3 796.9882
## TraceW 3 51.8735
## Friedman 14 222.0920
## Rubin 8 -29.5317
## Cindex 2 0.2728
## DB 2 0.4744
## Silhouette 2 0.6810
## Duda 2 1.9253
## PseudoT2 2 -52.8667
## Beale 2 -1.1380
## Ratkowsky 2 0.5462
## Ball 3 49.8902
## PtBiserial 2 0.8345
## Frey 5 1.3762
## McClain 2 0.2723
## Dunn 4 0.1365
## Hubert 0 0.0000
## SDindex 2 1.6370
## Dindex 0 0.0000
## SDbw 15 0.0243
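The majority rule used in the conclusion above simply tabulates the first row of this matrix:
table(nb$Best.nc["Number_clusters", ])
##
##  0  2  3  4  5  8 14 15
##  2 10  8  2  1  1  1  1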
par(mfrow = c(4, 7), mar = c(1, 1, 2, 0) + .1)
k = as.numeric(rownames(nb$All.index)) # candidate numbers of clusters
for (i in 1:ncol(nb$All.index)) {
    plot(k, nb$All.index[,i], type = "l",
         main = colnames(nb$All.index)[i], axes = FALSE)
    axis(1, at = k, labels = k, lwd = 0, padj = -2)
    best = nb$Best.nc[1,i]
    if (best != 0) # 0 means the index does not propose a number of clusters
        points(best, nb$All.index[as.character(best),i], col = "red")
}
We also get the corresponding best partition:
nb$Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2
table(nb$Best.partition)
##
## 1 2
## 53 97
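As the clustering vector suggests, this two-cluster partition essentially separates setosa from the two other species; we can check by crossing it with the species:
table(iris$Species, nb$Best.partition)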
Finally, we compare the three-cluster partition obtained with defined starting points to the known species:
table(iris$Species, res.init$cluster)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 14 36
We represent the clusters on the scatterplot matrix:
pairs(iris[-5], col = rainbow(3)[res.init$cluster], pch = 19)
Then we compare the distribution of each variable between clusters:
dres = data.frame(iris[-5], cluster = factor(res.init$cluster))
dres.melt = melt(dres, id.vars = "cluster")
ggplot(dres.melt, aes(cluster, value, fill = cluster)) +
geom_boxplot() +
facet_wrap(~ variable, scales = "free")
Finally, we represent the clusters on the first factorial plane of a PCA:
pca = PCA(iris, quali.sup = 5, graph = FALSE)
res.pca = data.frame(pca$ind$coord, cluster = factor(res.init$cluster))
ggplot(res.pca, aes(Dim.1, Dim.2, color = cluster)) +
geom_point() +
stat_ellipse()
As for hierarchical clustering, using the previous pendigits data, apply direct clustering to find, for each digit, a number of types of writing, and represent them.
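A possible outline for one digit, assuming pendigits is a data frame whose first 16 columns are the coordinates and whose last column, digit, is the label (these names are assumptions):
sub = subset(pendigits, digit == 0)[1:16]
nb0 = NbClust(sub, method = "kmeans") # choose a number of types of writing
res0 = kmeans(sub, max(nb0$Best.partition)) # cluster with that number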