# Natural Language Toolkit: K-Means Clusterer
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function

import copy
import numpy
import random
import sys
from functools import cmp_to_key

from nltk.cluster.util import VectorSpaceClusterer

class KMeansClusterer(VectorSpaceClusterer):
    """
    The K-means clusterer starts with k arbitrarily chosen means, then
    allocates each vector to the cluster with the closest mean. It then
    recalculates the means of each cluster as the centroid of the vectors
    in the cluster. This process repeats until the cluster memberships
    stabilise. This is a hill-climbing algorithm which may converge to a
    local rather than a global optimum, so the clustering is often repeated
    with random initial means and the most commonly occurring output means
    are chosen.
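
    A minimal usage sketch (the vectors and initial means here are
    illustrative, not from any dataset; with explicit initial means and the
    default single repeat the outcome is deterministic):

        >>> import numpy
        >>> from nltk.cluster import KMeansClusterer, euclidean_distance
        >>> vectors = [numpy.array(v) for v in [[1, 1], [1, 2], [8, 8], [8, 9]]]
        >>> clusterer = KMeansClusterer(2, euclidean_distance,
        ...                             initial_means=[[1, 1], [8, 8]])
        >>> clusterer.cluster(vectors, assign_clusters=True)
        [0, 0, 1, 1]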

    """ 

 

    def __init__(self, num_means, distance, repeats=1, 

                       conv_test=1e-6, initial_means=None, 

                       normalise=False, svd_dimensions=None, 

                       rng=None, avoid_empty_clusters=False): 

 

        """ 

        :param  num_means:  the number of means to use (may use fewer) 

        :type   num_means:  int 

        :param  distance:   measure of distance between two vectors 

        :type   distance:   function taking two vectors and returing a float 

        :param  repeats:    number of randomised clustering trials to use 

        :type   repeats:    int 

        :param  conv_test:  maximum variation in mean differences before 

                            deemed convergent 

        :type   conv_test:  number 

        :param  initial_means: set of k initial means 

        :type   initial_means: sequence of vectors 

        :param  normalise:  should vectors be normalised to length 1 

        :type   normalise:  boolean 

        :param svd_dimensions: number of dimensions to use in reducing vector 

                               dimensionsionality with SVD 

        :type svd_dimensions: int 

        :param  rng:        random number generator (or None) 

        :type   rng:        Random 

        :param avoid_empty_clusters: include current centroid in computation 

                                     of next one; avoids undefined behavior 

                                     when clusters become empty 

        :type avoid_empty_clusters: boolean 

        """ 

        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) 

        self._num_means = num_means 

        self._distance = distance 

        self._max_difference = conv_test 

        assert not initial_means or len(initial_means) == num_means 

        self._means = initial_means 

        assert repeats >= 1 

        assert not (initial_means and repeats > 1) 

        self._repeats = repeats 

        if rng: self._rng = rng 

        else:   self._rng = random.Random() 

        self._avoid_empty_clusters = avoid_empty_clusters 

 

    def cluster_vectorspace(self, vectors, trace=False):
        if self._means and self._repeats > 1:
            print('Warning: means will be discarded for subsequent trials')

        meanss = []
        for trial in range(self._repeats):
            if trace: print('k-means trial', trial)
            if not self._means or trial > 0:
                # resample the means for every trial after the first
                self._means = self._rng.sample(vectors, self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # affect the distance comparison)
            for means in meanss:
                means.sort(key=cmp_to_key(_vector_compare))

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means

    def _cluster_vectorspace(self, vectors, trace=False):
        # with fewer vectors than means there is nothing to iterate over;
        # the means are left as they are
        if self._num_means < len(vectors):
            # perform k-means clustering
            converged = False
            while not converged:
                # assign the tokens to clusters based on minimum distance to
                # the cluster means
                clusters = [[] for m in range(self._num_means)]
                for vector in vectors:
                    index = self.classify_vectorspace(vector)
                    clusters[index].append(vector)

                if trace: print('iteration')
                #for i in range(self._num_means):
                #    print('  mean', i, 'allocated', len(clusters[i]), 'vectors')

                # recalculate cluster means by computing the centroid of each
                # cluster (materialised as a list so it can be reused below)
                new_means = list(map(self._centroid, clusters, self._means))

                # measure the degree of change from the previous step for convergence
                difference = self._sum_distances(self._means, new_means)
                if difference < self._max_difference:
                    converged = True

                # remember the new means
                self._means = new_means

    def classify_vectorspace(self, vector):
        # finds the closest cluster centroid
        # returns that cluster's index
        best_distance = best_index = None
        for index in range(len(self._means)):
            mean = self._means[index]
            dist = self._distance(vector, mean)
            if best_distance is None or dist < best_distance:
                best_index, best_distance = index, dist
        return best_index

    def num_clusters(self):
        if self._means:
            return len(self._means)
        else:
            return self._num_means

    def means(self):
        """
        The means used for clustering.
        """
        return self._means

    def _sum_distances(self, vectors1, vectors2):
        difference = 0.0
        for u, v in zip(vectors1, vectors2):
            difference += self._distance(u, v)
        return difference

    def _centroid(self, cluster, mean): 
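        # When avoid_empty_clusters is set, the previous mean counts as one
        # extra pseudo-vector in the cluster, i.e. the new centroid is
        #     (mean + sum(cluster)) / (len(cluster) + 1)
        # so an empty cluster keeps its old mean instead of being undefined.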

        if self._avoid_empty_clusters:
            centroid = copy.copy(mean)
            for vector in cluster:
                centroid += vector
            return centroid / (1 + float(len(cluster)))
        else:
            if not len(cluster):
                sys.stderr.write('Error: no centroid defined for empty cluster.\n')
                sys.stderr.write("Try setting argument 'avoid_empty_clusters' to True\n")
                assert False
            centroid = copy.copy(cluster[0])
            for vector in cluster[1:]:
                centroid += vector
            return centroid / float(len(cluster))

    def __repr__(self):
        return '<KMeansClusterer means=%s repeats=%d>' % \
                    (self._means, self._repeats)


def _vector_compare(x, y):
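    # order the vectors by the sum of their components, giving the means
    # from different trials a common ordering before they are compared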

    xs, ys = sum(x), sum(y)
    if xs < ys:     return -1
    elif xs > ys:   return 1
    else:           return 0


#################################################################################

def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
    clusters = clusterer.cluster(vectors, True, trace=True)

    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    # test k-means using the euclidean distance metric, 2 means and repeat
    # clustering 10 times with random seeds
    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(vectors, True)
    print('Clustered:', vectors)
    print('As:', clusters)
    print('Means:', clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))
    print()


if __name__ == '__main__':
    demo()