R
For mixture model clustering, we use the following packages:
library(mclust)
library(Rmixmod)
# We also need
library(ggplot2)
mclust
In this package, the function mclustBIC() runs the \(EM\) algorithm for a range of numbers of clusters (set with the G parameter, 1 to 9 by default) and models (set with the modelNames parameter, all available models by default). It uses the \(BIC\) criterion to choose the best model; note that mclust computes \(BIC = 2\log L - p\log n\), with \(p\) the number of free parameters, so larger (i.e. less negative) values are better.
mBIC = mclustBIC(iris[-5])
summary(mBIC)
## Best BIC values:
## VEV,2 VEV,3 VVV,2
## BIC -561.7285 -562.5514380 -574.01783
## BIC diff 0.0000 -0.8229759 -12.28937
plot(mBIC)
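If needed, the search can be restricted; here is a minimal sketch (the G range and the two model names below are just illustrative choices):
# restrict the search to 1-5 clusters and two specific models
mBICsmall = mclustBIC(iris[-5], G = 1:5, modelNames = c("EII", "VVV"))
summary(mBICsmall)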
Now, we apply the Mclust() function to get the results of the best model, passing the mclustBIC object through the x parameter.
mBIC1 = Mclust(iris[-5], x = mBIC)
summary(mBIC1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 2 components:
##
## log.likelihood n df BIC ICL
## -215.726 150 26 -561.7285 -561.7289
##
## Clustering table:
## 1 2
## 50 100
##
## Mixing probabilities:
## 1 2
## 0.333332 0.666668
##
## Means:
## [,1] [,2]
## Sepal.Length 5.0060021 6.261996
## Sepal.Width 3.4280046 2.871999
## Petal.Length 1.4620006 4.905993
## Petal.Width 0.2459998 1.675997
##
## Variances:
## [,,1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.15065097 0.13080108 0.020844624 0.013091029
## Sepal.Width 0.13080108 0.17604544 0.016032479 0.012214539
## Petal.Length 0.02084462 0.01603248 0.028082603 0.006015675
## Petal.Width 0.01309103 0.01221454 0.006015675 0.010423651
## [,,2]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.4000437 0.10865439 0.3994013 0.14368238
## Sepal.Width 0.1086544 0.10928074 0.1238902 0.07284378
## Petal.Length 0.3994013 0.12389025 0.6109012 0.25738947
## Petal.Width 0.1436824 0.07284378 0.2573895 0.16808166
table(mBIC1$classification)
##
## 1 2
## 50 100
t(mBIC1$parameters$mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.006002 3.428005 1.462001 0.2459998
## [2,] 6.261996 2.871999 4.905993 1.6759972
plot(mBIC1, what = "classification")
plot(mBIC1, what = "uncertainty")
plot(mBIC1, what = "density")
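Besides the plots, the fitted object directly exposes the posterior membership probabilities (z) and the per-observation uncertainty; a short sketch:
# posterior probabilities, one column per cluster
head(round(mBIC1$z, 3))
# observations the model is least certain about
head(sort(mBIC1$uncertainty, decreasing = TRUE))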
table(iris$Species, mBIC1$classification)
##
## 1 2
## setosa 50 0
## versicolor 0 50
## virginica 0 50
We see that the same model with 3 clusters is the second-best choice.
mBIC2 = Mclust(iris[-5], G = 3, modelNames = "VEV")
table(mBIC2$classification)
##
## 1 2 3
## 50 45 55
t(mBIC2$parameters$mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.006000 3.428000 1.462000 0.246000
## [2,] 5.914879 2.777504 4.203758 1.298819
## [3,] 6.546670 2.949495 5.481901 1.985322
plot(mBIC2, what = "classification")
table(iris$Species, mBIC2$classification)
##
## 1 2 3
## setosa 50 0 0
## versicolor 0 45 5
## virginica 0 0 50
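To summarize the agreement with the true species in a single number, we can use the adjusted Rand index provided by mclust; a short sketch comparing the 2- and 3-cluster solutions:
# 1 means perfect agreement with the species labels
adjustedRandIndex(iris$Species, mBIC1$classification)
adjustedRandIndex(iris$Species, mBIC2$classification)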
If you prefer to use the \(ICL\) criterion, you can apply the mclustICL() function.
mICL = mclustICL(iris[-5])
summary(mICL)
## Best ICL values:
## VEV,2 VEV,3 VVV,2
## ICL -561.7289 -566.45770 -574.01910
## ICL diff 0.0000 -4.72882 -12.29022
plot(mICL)
Again, we apply the Mclust() function to get the results of the best model. This time, however, we have to indicate the number of clusters and the model manually in the Mclust() call.
mICL1 = Mclust(iris[-5],
               G = 2,
               modelNames = "VEV")
summary(mICL1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEV (ellipsoidal, equal shape) model with 2 components:
##
## log.likelihood n df BIC ICL
## -215.726 150 26 -561.7285 -561.7289
##
## Clustering table:
## 1 2
## 50 100
##
## Mixing probabilities:
## 1 2
## 0.333332 0.666668
##
## Means:
## [,1] [,2]
## Sepal.Length 5.0060021 6.261996
## Sepal.Width 3.4280046 2.871999
## Petal.Length 1.4620006 4.905993
## Petal.Width 0.2459998 1.675997
##
## Variances:
## [,,1]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.15065097 0.13080108 0.020844624 0.013091029
## Sepal.Width 0.13080108 0.17604544 0.016032479 0.012214539
## Petal.Length 0.02084462 0.01603248 0.028082603 0.006015675
## Petal.Width 0.01309103 0.01221454 0.006015675 0.010423651
## [,,2]
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 0.4000437 0.10865439 0.3994013 0.14368238
## Sepal.Width 0.1086544 0.10928074 0.1238902 0.07284378
## Petal.Length 0.3994013 0.12389025 0.6109012 0.25738947
## Petal.Width 0.1436824 0.07284378 0.2573895 0.16808166
table(mICL1$classification)
##
## 1 2
## 50 100
t(mICL1$parameters$mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.006002 3.428005 1.462001 0.2459998
## [2,] 6.261996 2.871999 4.905993 1.6759972
plot(mICL1, what = "classification")
table(iris$Species, mICL1$classification)
##
## 1 2
## setosa 50 0
## versicolor 0 50
## virginica 0 50
We can also compute some other criteria, such as \(AIC = -2\log L + 2p\) and \(AIC3 = -2\log L + 3p\), where \(p\) is the number of free parameters of the model; in this form, smaller values are better.
G = attr(mBIC, "G")
modelNames = attr(mBIC, "modelNames")
d = attr(mBIC, "d")
# grid of all (number of clusters, model) combinations tried by mclustBIC()
criterion = data.frame(G = rep(G, length(modelNames)),
                       model = rep(modelNames, each = length(G)),
                       AIC = NA,
                       AIC3 = NA,
                       stringsAsFactors = FALSE)
for (i in 1:nrow(criterion)) {
  # refit each combination; Mclust() returns NULL when the model cannot be fitted
  m = Mclust(iris[-5], G = criterion$G[i],
             modelNames = criterion$model[i])
  if (!is.null(m)) {
    # number of free parameters of this model
    p = nMclustParams(criterion$model[i], d, criterion$G[i])
    criterion[i, "AIC"] = -2 * m$loglik + 2 * p
    criterion[i, "AIC3"] = -2 * m$loglik + 3 * p
  }
}
## Warning in pickBIC(object[as.character(G), modelNames, drop = FALSE], k =
## 3): none of the selected models could be fitted
head(criterion[order(criterion$AIC),], 3)
## G model AIC AIC3
## 124 7 VVV 415.4447 519.4447
## 107 8 VEV 429.1328 527.1328
## 126 9 VVV 432.4844 566.4844
ggplot(criterion, aes(G, AIC, color = model)) +
  geom_line() +
  scale_x_discrete(limits = G)
## Warning: Removed 1 rows containing missing values (geom_path).
head(criterion[order(criterion$AIC3),], 3)
## G model AIC AIC3
## 102 3 VEV 448.1473 486.1473
## 120 3 VVV 448.3719 492.3719
## 103 4 VEV 453.3948 503.3948
ggplot(criterion, aes(G, AIC3, color = model)) +
  geom_line() +
  scale_x_discrete(limits = G)
## Warning: Removed 1 rows containing missing values (geom_path).
The mclust package also contains the hc() function, which computes a model-based hierarchical clustering. You have to specify the model to use.
hm = hc(iris[-5], "VVV")
hm2 = hclass(hm, 2)
clPairs(iris[-5], cl = hm2)
table(iris$Species, hm2)
## hm2
## 1 2
## setosa 50 0
## versicolor 0 50
## virginica 0 50
hm3 = hclass(hm, 3)
clPairs(iris[-5], cl = hm3)
table(iris$Species, hm3)
## hm3
## 1 2 3
## setosa 50 0 0
## versicolor 0 50 0
## virginica 0 14 36
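The hierarchical tree can also serve as the starting point of the \(EM\) algorithm, through the initialization parameter of Mclust(); a sketch reusing the hm object above:
# EM for 3 components, initialized from the VVV hierarchical clustering
mInit = Mclust(iris[-5], G = 3, initialization = list(hcPairs = hm))
table(iris$Species, mInit$classification)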
Rmixmod
In this package, the function mixmodCluster() applies the \(EM\) algorithm (initialized by \(smallEM\)) for a range of numbers of clusters (set with the nbCluster parameter, which has no default value) and models (set with the models parameter, "Gaussian_pk_Lk_C" by default). It uses the \(BIC\) criterion by default. Beware that, contrary to mclust, Rmixmod computes \(BIC = -2\log L + p\log n\), so smaller values are better.
mixmodBIC = mixmodCluster(iris[-5], 1:9)
mixmodBIC
## ****************************************
## *** INPUT:
## ****************************************
## * nbCluster = 1 2 3 4 5 6 7 8 9
## * criterion = BIC
## ****************************************
## *** MIXMOD Models:
## * list = Gaussian_pk_Lk_C
## * This list includes only models with free proportions.
## ****************************************
## * data (limited to a 10x10 matrix) =
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.1 3.5 1.4 0.2
## [2,] 4.9 3 1.4 0.2
## [3,] 4.7 3.2 1.3 0.2
## [4,] 4.6 3.1 1.5 0.2
## [5,] 5 3.6 1.4 0.2
## [6,] 5.4 3.9 1.7 0.4
## [7,] 4.6 3.4 1.4 0.3
## [8,] 5 3.4 1.5 0.2
## [9,] 4.4 2.9 1.4 0.2
## [10,] 4.9 3.1 1.5 0.1
## * ... ...
## ****************************************
## *** MIXMOD Strategy:
## * algorithm = EM
## * number of tries = 1
## * number of iterations = 200
## * epsilon = 0.001
## *** Initialization strategy:
## * algorithm = smallEM
## * number of tries = 10
## * number of iterations = 5
## * epsilon = 0.001
## * seed = NULL
## ****************************************
##
##
## ****************************************
## *** BEST MODEL OUTPUT:
## *** According to the BIC criterion
## ****************************************
## * nbCluster = 5
## * model name = Gaussian_pk_Lk_C
## * criterion = BIC(597.3072)
## * likelihood = -203.4515
## ****************************************
## *** Cluster 1
## * proportion = 0.2297
## * means = 6.3526 2.9961 5.3257 2.0978
## * variances = | 0.2344 0.0871 0.1368 0.0516 |
## | 0.0871 0.1110 0.0603 0.0313 |
## | 0.1368 0.0603 0.1564 0.0536 |
## | 0.0516 0.0313 0.0536 0.0389 |
## *** Cluster 2
## * proportion = 0.3280
## * means = 5.9401 2.7635 4.2519 1.3189
## * variances = | 0.2212 0.0822 0.1292 0.0487 |
## | 0.0822 0.1048 0.0569 0.0295 |
## | 0.1292 0.0569 0.1477 0.0506 |
## | 0.0487 0.0295 0.0506 0.0368 |
## *** Cluster 3
## * proportion = 0.1090
## * means = 7.0398 2.9371 5.9901 1.8617
## * variances = | 0.3152 0.1171 0.1840 0.0694 |
## | 0.1171 0.1493 0.0810 0.0421 |
## | 0.1840 0.0810 0.2104 0.0722 |
## | 0.0694 0.0421 0.0722 0.0524 |
## *** Cluster 4
## * proportion = 0.1331
## * means = 5.2643 3.7517 1.4479 0.2545
## * variances = | 0.1237 0.0459 0.0722 0.0272 |
## | 0.0459 0.0586 0.0318 0.0165 |
## | 0.0722 0.0318 0.0826 0.0283 |
## | 0.0272 0.0165 0.0283 0.0206 |
## *** Cluster 5
## * proportion = 0.2002
## * means = 4.8342 3.2127 1.4714 0.2403
## * variances = | 0.1096 0.0407 0.0640 0.0241 |
## | 0.0407 0.0519 0.0282 0.0146 |
## | 0.0640 0.0282 0.0731 0.0251 |
## | 0.0241 0.0146 0.0251 0.0182 |
## ****************************************
summary(mixmodBIC)
## **************************************************************
## * Number of samples = 150
## * Problem dimension = 4
## **************************************************************
## * Number of cluster = 5
## * Model Type = Gaussian_pk_Lk_C
## * Criterion = BIC(597.3072)
## * Parameters = list by cluster
## * Cluster 1 :
## Proportion = 0.2297
## Means = 6.3526 2.9961 5.3257 2.0978
## Variances = | 0.2344 0.0871 0.1368 0.0516 |
## | 0.0871 0.1110 0.0603 0.0313 |
## | 0.1368 0.0603 0.1564 0.0536 |
## | 0.0516 0.0313 0.0536 0.0389 |
## * Cluster 2 :
## Proportion = 0.3280
## Means = 5.9401 2.7635 4.2519 1.3189
## Variances = | 0.2212 0.0822 0.1292 0.0487 |
## | 0.0822 0.1048 0.0569 0.0295 |
## | 0.1292 0.0569 0.1477 0.0506 |
## | 0.0487 0.0295 0.0506 0.0368 |
## * Cluster 3 :
## Proportion = 0.1090
## Means = 7.0398 2.9371 5.9901 1.8617
## Variances = | 0.3152 0.1171 0.1840 0.0694 |
## | 0.1171 0.1493 0.0810 0.0421 |
## | 0.1840 0.0810 0.2104 0.0722 |
## | 0.0694 0.0421 0.0722 0.0524 |
## * Cluster 4 :
## Proportion = 0.1331
## Means = 5.2643 3.7517 1.4479 0.2545
## Variances = | 0.1237 0.0459 0.0722 0.0272 |
## | 0.0459 0.0586 0.0318 0.0165 |
## | 0.0722 0.0318 0.0826 0.0283 |
## | 0.0272 0.0165 0.0283 0.0206 |
## * Cluster 5 :
## Proportion = 0.2002
## Means = 4.8342 3.2127 1.4714 0.2403
## Variances = | 0.1096 0.0407 0.0640 0.0241 |
## | 0.0407 0.0519 0.0282 0.0146 |
## | 0.0640 0.0282 0.0731 0.0251 |
## | 0.0241 0.0146 0.0251 0.0182 |
## * Log-likelihood = -203.4515
## **************************************************************
plot(mixmodBIC)
hist(mixmodBIC)
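The partition of the best model can be extracted from the bestResult slot of the returned S4 object; a short sketch comparing it to the species:
# cluster memberships of the BIC-best model
table(iris$Species, mixmodBIC@bestResult@partition)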
If you want to use the \(ICL\) criterion, or even \(NEC\), you have to specify it with the criterion parameter.
mixmodICL = mixmodCluster(iris[-5], 1:9, criterion = "ICL")
mixmodICL
## ****************************************
## *** INPUT:
## ****************************************
## * nbCluster = 1 2 3 4 5 6 7 8 9
## * criterion = ICL
## ****************************************
## *** MIXMOD Models:
## * list = Gaussian_pk_Lk_C
## * This list includes only models with free proportions.
## ****************************************
## * data (limited to a 10x10 matrix) =
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] 5.1 3.5 1.4 0.2
## [2,] 4.9 3 1.4 0.2
## [3,] 4.7 3.2 1.3 0.2
## [4,] 4.6 3.1 1.5 0.2
## [5,] 5 3.6 1.4 0.2
## [6,] 5.4 3.9 1.7 0.4
## [7,] 4.6 3.4 1.4 0.3
## [8,] 5 3.4 1.5 0.2
## [9,] 4.4 2.9 1.4 0.2
## [10,] 4.9 3.1 1.5 0.1
## * ... ...
## ****************************************
## *** MIXMOD Strategy:
## * algorithm = EM
## * number of tries = 1
## * number of iterations = 200
## * epsilon = 0.001
## *** Initialization strategy:
## * algorithm = smallEM
## * number of tries = 10
## * number of iterations = 5
## * epsilon = 0.001
## * seed = NULL
## ****************************************
##
##
## ****************************************
## *** BEST MODEL OUTPUT:
## *** According to the ICL criterion
## ****************************************
## * nbCluster = 4
## * model name = Gaussian_pk_Lk_C
## * criterion = ICL(612.7535)
## * likelihood = -216.7619
## ****************************************
## *** Cluster 1
## * proportion = 0.0999
## * means = 7.0921 2.9686 6.0552 1.8730
## * variances = | 0.2960 0.1298 0.1584 0.0605 |
## | 0.1298 0.1653 0.0623 0.0354 |
## | 0.1584 0.0623 0.1852 0.0638 |
## | 0.0605 0.0354 0.0638 0.0443 |
## *** Cluster 2
## * proportion = 0.3359
## * means = 5.9438 2.7564 4.2704 1.3245
## * variances = | 0.2353 0.1031 0.1259 0.0481 |
## | 0.1031 0.1314 0.0495 0.0282 |
## | 0.1259 0.0495 0.1472 0.0508 |
## | 0.0481 0.0282 0.0508 0.0352 |
## *** Cluster 3
## * proportion = 0.2308
## * means = 6.3657 2.9984 5.3336 2.1023
## * variances = | 0.2504 0.1098 0.1340 0.0511 |
## | 0.1098 0.1399 0.0527 0.0300 |
## | 0.1340 0.0527 0.1567 0.0540 |
## | 0.0511 0.0300 0.0540 0.0375 |
## *** Cluster 4
## * proportion = 0.3333
## * means = 5.0060 3.4280 1.4620 0.2460
## * variances = | 0.1543 0.0677 0.0826 0.0315 |
## | 0.0677 0.0862 0.0325 0.0185 |
## | 0.0826 0.0325 0.0966 0.0333 |
## | 0.0315 0.0185 0.0333 0.0231 |
## ****************************************
summary(mixmodICL)
## **************************************************************
## * Number of samples = 150
## * Problem dimension = 4
## **************************************************************
## * Number of cluster = 4
## * Model Type = Gaussian_pk_Lk_C
## * Criterion = ICL(612.7535)
## * Parameters = list by cluster
## * Cluster 1 :
## Proportion = 0.0999
## Means = 7.0921 2.9686 6.0552 1.8730
## Variances = | 0.2960 0.1298 0.1584 0.0605 |
## | 0.1298 0.1653 0.0623 0.0354 |
## | 0.1584 0.0623 0.1852 0.0638 |
## | 0.0605 0.0354 0.0638 0.0443 |
## * Cluster 2 :
## Proportion = 0.3359
## Means = 5.9438 2.7564 4.2704 1.3245
## Variances = | 0.2353 0.1031 0.1259 0.0481 |
## | 0.1031 0.1314 0.0495 0.0282 |
## | 0.1259 0.0495 0.1472 0.0508 |
## | 0.0481 0.0282 0.0508 0.0352 |
## * Cluster 3 :
## Proportion = 0.2308
## Means = 6.3657 2.9984 5.3336 2.1023
## Variances = | 0.2504 0.1098 0.1340 0.0511 |
## | 0.1098 0.1399 0.0527 0.0300 |
## | 0.1340 0.0527 0.1567 0.0540 |
## | 0.0511 0.0300 0.0540 0.0375 |
## * Cluster 4 :
## Proportion = 0.3333
## Means = 5.0060 3.4280 1.4620 0.2460
## Variances = | 0.1543 0.0677 0.0826 0.0315 |
## | 0.0677 0.0862 0.0325 0.0185 |
## | 0.0826 0.0325 0.0966 0.0333 |
## | 0.0315 0.0185 0.0333 0.0231 |
## * Log-likelihood = -216.7619
## **************************************************************
plot(mixmodICL)
hist(mixmodICL)
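When several models have been fitted, Rmixmod can also re-rank the stored results afterwards with sortByCriterion(); a sketch (here it simply re-sorts by ICL, so the first stored result becomes the ICL-best model):
mixmodSorted = sortByCriterion(mixmodICL, "ICL")
attr(mixmodSorted, "results")[[1]]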
If you want to test more models, the mixmodGaussianModel() function lists the available ones.
mixmodGaussianModel()
## ****************************************
## *** MIXMOD Models:
## * list = Gaussian_pk_L_I Gaussian_pk_Lk_I Gaussian_pk_L_B Gaussian_pk_Lk_B Gaussian_pk_L_Bk Gaussian_pk_Lk_Bk Gaussian_pk_L_C Gaussian_pk_Lk_C Gaussian_pk_L_D_Ak_D Gaussian_pk_Lk_D_Ak_D Gaussian_pk_L_Dk_A_Dk Gaussian_pk_Lk_Dk_A_Dk Gaussian_pk_L_Ck Gaussian_pk_Lk_Ck Gaussian_p_L_I Gaussian_p_Lk_I Gaussian_p_L_B Gaussian_p_Lk_B Gaussian_p_L_Bk Gaussian_p_Lk_Bk Gaussian_p_L_C Gaussian_p_Lk_C Gaussian_p_L_D_Ak_D Gaussian_p_Lk_D_Ak_D Gaussian_p_L_Dk_A_Dk Gaussian_p_Lk_Dk_A_Dk Gaussian_p_L_Ck Gaussian_p_Lk_Ck
## * This list includes models with free and equal proportions.
## ****************************************
mixmodGaussianModel(family = "general")
## ****************************************
## *** MIXMOD Models:
## * list = Gaussian_pk_L_C Gaussian_pk_Lk_C Gaussian_pk_L_D_Ak_D Gaussian_pk_Lk_D_Ak_D Gaussian_pk_L_Dk_A_Dk Gaussian_pk_Lk_Dk_A_Dk Gaussian_pk_L_Ck Gaussian_pk_Lk_Ck Gaussian_p_L_C Gaussian_p_Lk_C Gaussian_p_L_D_Ak_D Gaussian_p_Lk_D_Ak_D Gaussian_p_L_Dk_A_Dk Gaussian_p_Lk_Dk_A_Dk Gaussian_p_L_Ck Gaussian_p_Lk_Ck
## * This list includes models with free and equal proportions.
## ****************************************
mixmodGaussianModel(family = "spherical",
free.proportions = FALSE)
## ****************************************
## *** MIXMOD Models:
## * list = Gaussian_p_L_I Gaussian_p_Lk_I
## * This list includes only models with equal proportions.
## ****************************************
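Specific models can also be selected by name, using the listModels parameter (the two names below are just examples taken from the full list above):
mixmodGaussianModel(listModels = c("Gaussian_pk_Lk_C", "Gaussian_p_L_I"))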
Then, we test all the Gaussian models. We also change the initialization strategy: by default, mixmodCluster() uses a \(smallEM\) initialization (several short runs of \(EM\), as shown in the output above); here we instead run \(EM\) from 20 random initializations.
mixmodAll = mixmodCluster(
  iris[-5], 1:9,
  criterion = c("BIC", "ICL", "NEC"),
  models = mixmodGaussianModel(),
  strategy = mixmodStrategy(algo = "EM",
                            initMethod = "random",
                            nbTry = 20))
# for each fitted model, collect its size, name and criterion values
temp = sapply(attr(mixmodAll, "results"), function(mod) {
  K = attr(mod, "nbCluster")
  BIC = attr(mod, "criterionValue")[1]
  ICL = attr(mod, "criterionValue")[2]
  NEC = attr(mod, "criterionValue")[3]
  model = attr(mod, "model")
  return(c(K = K, model = model, BIC = BIC, ICL = ICL, NEC = NEC))
})
mixmodCriterion = transform(
  data.frame(t(temp), stringsAsFactors = FALSE),
  BIC = as.numeric(BIC),
  ICL = as.numeric(ICL),
  NEC = as.numeric(NEC))
ggplot(mixmodCriterion, aes(K, model, fill = BIC)) + geom_bin2d()
ggplot(mixmodCriterion, aes(K, BIC, color = model)) +
  stat_summary(aes(group = model), fun.y = mean, geom = "line")
ggplot(mixmodCriterion, aes(K, model, fill = ICL)) + geom_bin2d()
ggplot(mixmodCriterion, aes(K, ICL, color = model)) +
  stat_summary(aes(group = model), fun.y = mean, geom = "line")
ggplot(mixmodCriterion, aes(K, model, fill = NEC)) + geom_bin2d()
ggplot(mixmodCriterion, aes(K, NEC, color = model)) +
  stat_summary(aes(group = model), fun.y = mean, geom = "line")
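From the mixmodCriterion table we can read off the best combination for each criterion (all three are in their smaller-is-better form):
# best (K, model) pair according to each criterion
mixmodCriterion[which.min(mixmodCriterion$BIC), c("K", "model", "BIC")]
mixmodCriterion[which.min(mixmodCriterion$ICL), c("K", "model", "ICL")]
mixmodCriterion[which.min(mixmodCriterion$NEC), c("K", "model", "NEC")]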
Finally, we want to use these methods to search for a suitable number of clusters for each digit in the pendigits data.
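A possible starting point is sketched below; it assumes that pendigits is already loaded as a data frame whose label column is named digit (both the object and the column name are hypothetical):
# for each digit, let mclust choose the number of clusters by BIC
features = pendigits[names(pendigits) != "digit"]  # hypothetical layout
best_G = sapply(split(features, pendigits$digit), function(x) Mclust(x)$G)
best_G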