# Natural Language Toolkit: Expectation Maximization Clusterer 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Trevor Cohn <tacohn@cs.mu.oz.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

from __future__ import print_function 

 

import numpy 

 

from .util import VectorSpaceClusterer 

 

class EMClusterer(VectorSpaceClusterer): 

    """ 

    The Gaussian EM clusterer models the vectors as being produced by 

    a mixture of k Gaussian sources. The parameters of these sources 

    (prior probability, mean and covariance matrix) are then found to 

    maximise the likelihood of the given data. This is done with the 

    expectation maximisation algorithm. It starts with k arbitrarily 

    chosen means, priors and covariance matrices. It then calculates 

    the membership probabilities for each vector in each of the 

    clusters; this is the 'E' step. The cluster parameters are then 

    updated in the 'M' step using the maximum likelihood estimate from 

    the cluster membership probabilities. This process continues until 

    the likelihood of the data does not significantly increase. 
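
    A minimal usage sketch, mirroring the demo() function at the end of this
    module (illustrative only, not a verified doctest):

        >>> import numpy
        >>> from nltk import cluster
        >>> vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
        >>> clusterer = cluster.EMClusterer([[4, 2], [4, 2.01]], bias=0.1)
        >>> clusters = clusterer.cluster(vectors, True)
        >>> clusterer.classify(numpy.array([2, 2]))  # doctest: +SKIP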

    """ 

 

    def __init__(self, initial_means, priors=None, covariance_matrices=None, 

                       conv_threshold=1e-6, bias=0.1, normalise=False, 

                       svd_dimensions=None): 

        """ 

        Creates an EM clusterer with the given starting parameters, 

        convergence threshold and vector mangling parameters. 

 

        :param  initial_means: the means of the gaussian cluster centers 

        :type   initial_means: [seq of] numpy array or seq of SparseArray 

        :param  priors: the prior probability for each cluster 

        :type   priors: numpy array or seq of float 

        :param  covariance_matrices: the covariance matrix for each cluster 

        :type   covariance_matrices: [seq of] numpy array 

        :param  conv_threshold: maximum change in likelihood before deemed 

                    convergent 

        :type   conv_threshold: int or float 

        :param  bias: variance bias used to ensure non-singular covariance 

                      matrices 

        :type   bias: float 

        :param  normalise:  should vectors be normalised to length 1 

        :type   normalise:  boolean 

        :param  svd_dimensions: number of dimensions to use in reducing vector 

                               dimensionality with SVD 

        :type   svd_dimensions: int 
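
        Illustrative construction with explicit starting parameters (the
        priors and covariance matrices below are arbitrary example values,
        not defaults):

            >>> import numpy
            >>> means = [[4, 2], [4, 2.01]]
            >>> priors = numpy.array([0.5, 0.5])
            >>> covars = [numpy.identity(2) for _ in range(2)]
            >>> clusterer = EMClusterer(means, priors=priors,
            ...                         covariance_matrices=covars, bias=0.1)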

        """ 

        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) 

        self._means = numpy.array(initial_means, numpy.float64) 

        self._num_clusters = len(initial_means) 

        self._conv_threshold = conv_threshold 

        self._covariance_matrices = covariance_matrices 

        self._priors = priors 

        self._bias = bias 

 

    def num_clusters(self): 

        return self._num_clusters 

 

    def cluster_vectorspace(self, vectors, trace=False): 

        assert len(vectors) > 0 

 

        # set the parameters to initial values 

        dimensions = len(vectors[0]) 

        means = self._means 

        priors = self._priors 

        if priors is None: 

            priors = self._priors = numpy.ones(self._num_clusters, 

                                        numpy.float64) / self._num_clusters 

        covariances = self._covariance_matrices 

        if covariances is None: 

            covariances = self._covariance_matrices = \ 

                [ numpy.identity(dimensions, numpy.float64) 

                  for i in range(self._num_clusters) ] 

 

        # do the E and M steps until the likelihood plateaus 

        lastl = self._loglikelihood(vectors, priors, means, covariances) 

        converged = False 

 

        while not converged: 

            if trace: print('iteration; loglikelihood', lastl) 

            # E-step, calculate hidden variables, h[i,j] 
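            # Each h[i,j] is the responsibility P(cluster j | vector i):
            # the prior of cluster j times the Gaussian density of vector i
            # under cluster j, normalised so that each row of h sums to one.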

            h = numpy.zeros((len(vectors), self._num_clusters), 

                numpy.float64) 

            for i in range(len(vectors)): 

                for j in range(self._num_clusters): 

                    h[i,j] = priors[j] * self._gaussian(means[j], 

                                               covariances[j], vectors[i]) 

                h[i,:] /= sum(h[i,:]) 

 

            # M-step, update parameters - cvm, p, mean 
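            # Maximum-likelihood updates given the responsibilities h:
            #   mean[j]       <- responsibility-weighted average of the vectors
            #   covariance[j] <- responsibility-weighted outer products of the
            #                    deltas from the current (pre-update) mean
            #   prior[j]      <- average responsibility assigned to cluster j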

            for j in range(self._num_clusters): 

                covariance_before = covariances[j] 

                new_covariance = numpy.zeros((dimensions, dimensions), 

                            numpy.float64) 

                new_mean = numpy.zeros(dimensions, numpy.float64) 

                sum_hj = 0.0 

                for i in range(len(vectors)): 

                    delta = vectors[i] - means[j] 

                    new_covariance += h[i,j] * \ 

                        numpy.multiply.outer(delta, delta) 

                    sum_hj += h[i,j] 

                    new_mean += h[i,j] * vectors[i] 

                covariances[j] = new_covariance / sum_hj 

                means[j] = new_mean / sum_hj 

                priors[j] = sum_hj / len(vectors) 

 

                # bias term to stop covariance matrix being singular 

                covariances[j] += self._bias * \ 

                    numpy.identity(dimensions, numpy.float64) 

 

            # calculate likelihood - FIXME: may be broken 

            l = self._loglikelihood(vectors, priors, means, covariances) 

 

            # check for convergence 

            if abs(lastl - l) < self._conv_threshold: 

                converged = True 

            lastl = l 

 

    def classify_vectorspace(self, vector): 

        best = None 

        for j in range(self._num_clusters): 

            p = self._priors[j] * self._gaussian(self._means[j], 

                                    self._covariance_matrices[j], vector) 

            if not best or p > best[0]: 

                best = (p, j) 

        return best[1] 

 

    def likelihood_vectorspace(self, vector, cluster): 

        cid = self.cluster_names().index(cluster) 

        return self._priors[cid] * self._gaussian(self._means[cid], 
                                self._covariance_matrices[cid], vector) 

 

    def _gaussian(self, mean, cvm, x): 

        m = len(mean) 

        assert cvm.shape == (m, m), \ 

            'bad sized covariance matrix, %s' % str(cvm.shape) 
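        # Multivariate normal density:
        #   N(x | mean, cvm) = (2*pi)**(-m/2) * det(cvm)**(-1/2)
        #                      * exp(-0.5 * (x - mean)' inv(cvm) (x - mean))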

        try: 

            det = numpy.linalg.det(cvm) 

            inv = numpy.linalg.inv(cvm) 

            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0) 

            dx = x - mean 


            b = -0.5 * numpy.dot( numpy.dot(dx, inv), dx) 

            return a * numpy.exp(b) 

        except OverflowError: 

            # happens when cvm is nearly singular: det is close to zero, so 
            # the normalising constant and the inverse blow up; treat the 
            # density as zero for this point 

            return 0 

 

    def _loglikelihood(self, vectors, priors, means, covariances): 
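        # Log-likelihood of the data under the current mixture:
        #   sum over vectors x of
        #     log(sum_j priors[j] * N(x | means[j], covariances[j]))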

        llh = 0.0 

        for vector in vectors: 

            p = 0 

            for j in range(len(priors)): 

                p += priors[j] * \ 

                         self._gaussian(means[j], covariances[j], vector) 

            llh += numpy.log(p) 

        return llh 

 

    def __repr__(self): 

        return '<EMClusterer means=%s>' % list(self._means) 

 

def demo(): 

    """ 

    Non-interactive demonstration of the clusterers with simple 2-D data. 

    """ 

 

    from nltk import cluster 

 

    # example from figure 14.10, page 519, Manning and Schutze 

 

    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] 

    means = [[4, 2], [4, 2.01]] 

 

    clusterer = cluster.EMClusterer(means, bias=0.1) 

    clusters = clusterer.cluster(vectors, True, trace=True) 

 

    print('Clustered:', vectors) 

    print('As:       ', clusters) 

    print() 

 

    for c in range(2): 

        print('Cluster:', c) 

        print('Prior:  ', clusterer._priors[c]) 

        print('Mean:   ', clusterer._means[c]) 

        print('Covar:  ', clusterer._covariance_matrices[c]) 

        print() 

 

    # classify a new vector 

    vector = numpy.array([2, 2]) 

    print('classify(%s):' % vector, end=' ') 

    print(clusterer.classify(vector)) 

 

    # show the classification probabilities 

    vector = numpy.array([2, 2]) 

    print('classification_probdist(%s):' % vector) 

    pdist = clusterer.classification_probdist(vector) 

    for sample in pdist.samples(): 

        print('%s => %.0f%%' % (sample, 

                    pdist.prob(sample) *100)) 

 

# 

#     The following demo code is broken. 

# 

#     # use a set of tokens with 2D indices 

#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] 

 

#     # test the EM clusterer with means given by k-means (2) and 

#     # dimensionality reduction 

#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1) 

#     print 'Clusterer:', clusterer 

#     clusters = clusterer.cluster(vectors) 

#     means = clusterer.means() 

#     print 'Means:', clusterer.means() 

#     print 

 

#     clusterer = cluster.EMClusterer(means, svd_dimensions=1) 

#     clusters = clusterer.cluster(vectors, True) 

#     print 'Clusterer:', clusterer 

#     print 'Clustered:', str(vectors)[:60], '...' 

#     print 'As:', str(clusters)[:60], '...' 

#     print 

 

#     # classify a new vector 

#     vector = numpy.array([3, 3]) 

#     print 'classify(%s):' % vector, 

#     print clusterer.classify(vector) 

#     print 

 

#     # show the classification probabilities 

#     vector = numpy.array([2.2, 2]) 

#     print 'classification_probdist(%s)' % vector 

#     pdist = clusterer.classification_probdist(vector) 

#     for sample in pdist: 

#         print '%s => %.0f%%' % (sample, pdist.prob(sample) *100) 

 

if __name__ == '__main__': 

    demo()