R
library(reshape2)   # melt(), used below for the boxplots
library(ggplot2)    # graphics
library(FactoMineR) # PCA(), used below for the factorial representation
stats
We use the kmeans() function, from the stats package (already installed and loaded in each R session).
Example with the iris data and 3 clusters:
res = kmeans(iris[-5], 3)
res
## K-means clustering with 3 clusters of sizes 33, 21, 96
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.175758 3.624242 1.472727 0.2727273
## 2 4.738095 2.904762 1.790476 0.3523810
## 3 6.314583 2.895833 4.973958 1.7031250
##
## Clustering vector:
## [1] 1 2 2 2 1 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 2
## [36] 1 1 1 2 1 1 2 2 1 1 2 1 2 1 1 3 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 6.432121 17.669524 118.651875
## (between_SS / total_SS = 79.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
names(res)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(res$cluster)
##
## 1 2 3
## 33 21 96
res$size
## [1] 33 21 96
res$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.175758 3.624242 1.472727 0.2727273
## 2 4.738095 2.904762 1.790476 0.3523810
## 3 6.314583 2.895833 4.973958 1.7031250
res$totss # total inertia (I)
## [1] 681.3706
res$tot.withinss # within-cluster inertia (W)
## [1] 142.7535
res$betweenss # between-cluster inertia (B)
## [1] 538.6171
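These three quantities satisfy the decomposition \(I = W + B\), which we can check directly:
res$tot.withinss + res$betweenss # equals res$totss, since I = W + B
## [1] 681.3706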
With multiple random initializations, and on a standardized dataset:
res.mult = kmeans(scale(iris[-5]), 3, nstart = 30, iter.max = 20)
res.mult
## K-means clustering with 3 clusters of sizes 50, 53, 47
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2
## [71] 3 2 2 2 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
With defined starting points (here, the mean of each species):
m = aggregate(. ~ Species, iris, mean)
res.init = kmeans(iris[-5], m[-1])
res.init
## K-means clustering with 3 clusters of sizes 50, 62, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
cluster
This package is already installed with R.
library(cluster)
fanny() computes a fuzzy clustering into \(k\) clusters (see ?fanny for more details).
res.fanny = fanny(iris[-5], 3)
names(res.fanny)
## [1] "membership" "coeff" "memb.exp" "clustering" "k.crisp"
## [6] "objective" "convergence" "diss" "call" "silinfo"
## [11] "data"
table(res.fanny$clustering)
##
## 1 2 3
## 50 45 55
head(res.fanny$membership)
## [,1] [,2] [,3]
## [1,] 0.9142273 0.03603116 0.04974153
## [2,] 0.8594576 0.05854637 0.08199602
## [3,] 0.8700857 0.05463714 0.07527719
## [4,] 0.8426296 0.06555926 0.09181118
## [5,] 0.9044503 0.04025288 0.05529687
## [6,] 0.7680227 0.09717445 0.13480286
plot(res.fanny)
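The membership matrix can also be summarized, for instance by the maximal membership of each observation (values close to 1 indicate an almost crisp assignment); a minimal sketch:
crisp = apply(res.fanny$membership, 1, max) # strongest membership per observation
summary(crisp)
res.fanny$coeff # Dunn's partition coefficient and its normalized version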
pam() partitions the data into \(k\) clusters, using medoids instead of means, to obtain more robust results (see ?pam for more details).
res.pam = pam(iris[-5], 3)
res.pam
## Medoids:
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 8 5.0 3.4 1.5 0.2
## [2,] 79 6.0 2.9 4.5 1.5
## [3,] 113 6.8 3.0 5.5 2.1
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
## Objective function:
## build swap
## 0.6709391 0.6542077
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
table(res.pam$clustering)
##
## 1 2 3
## 50 62 38
res.pam$medoids
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.0 3.4 1.5 0.2
## [2,] 6.0 2.9 4.5 1.5
## [3,] 6.8 3.0 5.5 2.1
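The quality of this partition can also be inspected with a silhouette plot; silhouette() is provided by the cluster package as well:
plot(silhouette(res.pam))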
clara() is able to deal with large datasets (see ?clara for more details). It divides the dataset into subsets of fixed size and applies the pam() algorithm to each of them.
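A minimal sketch on iris (which is too small for clara() to be really needed; the samples parameter controls the number of sub-datasets drawn):
res.clara = clara(iris[-5], 3, samples = 10)
table(res.clara$clustering)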
We also use the NbClust package to look for a suitable number of clusters. For direct clustering, only the \(k\)-means method is available.
library(NbClust)
nb = NbClust(iris[-5], method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 14 as the best number of clusters
## * 1 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
As for HAC (and with the same code), we can explore the results in more detail:
t(nb$Best.nc)
## Number_clusters Value_Index
## KL 4 7.2495
## CH 3 561.6278
## Hartigan 3 82.4072
## CCC 3 37.6701
## Scott 3 202.0631
## Marriot 3 149982.8597
## TrCovW 3 796.9882
## TraceW 3 51.8735
## Friedman 14 222.0920
## Rubin 8 -29.5317
## Cindex 2 0.2728
## DB 2 0.4744
## Silhouette 2 0.6810
## Duda 2 1.9253
## PseudoT2 2 -52.8667
## Beale 2 -1.1380
## Ratkowsky 2 0.5462
## Ball 3 49.8902
## PtBiserial 2 0.8345
## Frey 5 1.3762
## McClain 2 0.2723
## Dunn 4 0.1365
## Hubert 0 0.0000
## SDindex 2 1.6370
## Dindex 0 0.0000
## SDbw 15 0.0243
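The majority rule used in the conclusion above simply tabulates the first row of this matrix:
table(nb$Best.nc["Number_clusters", ])
##
##  0  2  3  4  5  8 14 15
##  2 10  8  2  1  1  1  1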
par(mfrow = c(4, 7), mar = c(1, 1, 2, 0) + .1)
k = as.numeric(rownames(nb$All.index)) # candidate numbers of clusters
for (i in 1:ncol(nb$All.index)) {
    plot(k, nb$All.index[,i], type = "l",
         main = colnames(nb$All.index)[i], axes = FALSE)
    axis(1, at = k, labels = k, lwd = 0, padj = -2)
    best = nb$Best.nc[1,i]
    if (best != 0) # 0 means the index does not propose a number of clusters
        points(best, nb$All.index[as.character(best),i], col = "red")
}
We also get the corresponding best partition:
nb$Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 2 2 2 2
table(nb$Best.partition)
##
## 1 2
## 53 97
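As the clustering vector suggests, this two-cluster partition essentially separates setosa from the two other species; we can check by crossing it with the species:
table(iris$Species, nb$Best.partition)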
Finally, we compare the three-cluster partition obtained with defined starting points to the known species:
table(iris$Species, res.init$cluster)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 14 36
We represent the clusters on the scatterplot matrix:
pairs(iris[-5], col = rainbow(3)[res.init$cluster], pch = 19)
Then we compare the distribution of each variable between clusters:
dres = data.frame(iris[-5], cluster = factor(res.init$cluster))
dres.melt = melt(dres, id.vars = "cluster")
ggplot(dres.melt, aes(cluster, value, fill = cluster)) +
geom_boxplot() +
facet_wrap(~ variable, scales = "free")
Finally, we represent the clusters on the first factorial plane of a PCA:
pca = PCA(iris, quali.sup = 5, graph = FALSE)
res.pca = data.frame(pca$ind$coord, cluster = factor(res.init$cluster))
ggplot(res.pca, aes(Dim.1, Dim.2, color = cluster)) +
geom_point() +
stat_ellipse()
As for hierarchical clustering, using the previous pendigits data, apply direct clustering to find, for each digit, a number of types of writing, and represent them.
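A possible outline for one digit, assuming pendigits is a data frame whose first 16 columns are the coordinates and whose last column, digit, is the label (these names are assumptions):
sub = subset(pendigits, digit == 0)[1:16]
nb0 = NbClust(sub, method = "kmeans") # choose a number of types of writing
res0 = kmeans(sub, max(nb0$Best.partition)) # cluster with that number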