Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

# Natural Language Toolkit: Clusterer Utilities 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Trevor Cohn <tacohn@cs.mu.oz.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

from __future__ import print_function 

 

import copy 

import numpy 

from sys import stdout 

from math import sqrt 

 

from nltk.cluster.api import ClusterI 

 

class VectorSpaceClusterer(ClusterI): 

    """ 

    Abstract clusterer which takes tokens and maps them into a vector space. 

    Optionally performs singular value decomposition to reduce the 

    dimensionality. 

    """ 

    def __init__(self, normalise=False, svd_dimensions=None): 

        """ 

        :param normalise:       should vectors be normalised to length 1 

        :type normalise:        boolean 

        :param svd_dimensions:  number of dimensions to use in reducing vector 

                                dimensionsionality with SVD 

        :type svd_dimensions:   int 

        """ 

        self._Tt = None 

        self._should_normalise = normalise 

        self._svd_dimensions = svd_dimensions 

 

    def cluster(self, vectors, assign_clusters=False, trace=False): 

        assert len(vectors) > 0 

 

        # normalise the vectors 

        if self._should_normalise: 

            vectors = list(map(self._normalise, vectors)) 

 

        # use SVD to reduce the dimensionality 

        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]): 

            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors))) 

            S = d[:self._svd_dimensions] * \ 

                numpy.identity(self._svd_dimensions, numpy.float64) 

            T = u[:,:self._svd_dimensions] 

            Dt = vt[:self._svd_dimensions,:] 

            vectors = numpy.transpose(numpy.dot(S, Dt)) 

            self._Tt = numpy.transpose(T) 

 

        # call abstract method to cluster the vectors 

        self.cluster_vectorspace(vectors, trace) 

 

        # assign the vectors to clusters 

        if assign_clusters: 

            print(self._Tt, vectors) 

            return [self.classify(vector) for vector in vectors] 

 

    def cluster_vectorspace(self, vectors, trace): 

        """ 

        Finds the clusters using the given set of vectors. 

        """ 

        raise NotImplementedError() 

 

    def classify(self, vector): 

        if self._should_normalise: 

            vector = self._normalise(vector) 

        if self._Tt is not None: 

            vector = numpy.dot(self._Tt, vector) 

        cluster = self.classify_vectorspace(vector) 

        return self.cluster_name(cluster) 

 

    def classify_vectorspace(self, vector): 

        """ 

        Returns the index of the appropriate cluster for the vector. 

        """ 

        raise NotImplementedError() 

 

    def likelihood(self, vector, label): 

        if self._should_normalise: 

            vector = self._normalise(vector) 

        if self._Tt is not None: 

            vector = numpy.dot(self._Tt, vector) 

        return self.likelihood_vectorspace(vector, label) 

 

    def likelihood_vectorspace(self, vector, cluster): 

        """ 

        Returns the likelihood of the vector belonging to the cluster. 

        """ 

        predicted = self.classify_vectorspace(vector) 

        if cluster == predicted: return 1.0 

        else:                    return 0.0 

 

    def vector(self, vector): 

        """ 

        Returns the vector after normalisation and dimensionality reduction 

        """ 

        if self._should_normalise: 

            vector = self._normalise(vector) 

        if self._Tt is not None: 

            vector = numpy.dot(self._Tt, vector) 

        return vector 

 

    def _normalise(self, vector): 

        """ 

        Normalises the vector to unit length. 

        """ 

        return vector / sqrt(numpy.dot(vector, vector)) 

 

def euclidean_distance(u, v): 

    """ 

    Returns the euclidean distance between vectors u and v. This is equivalent 

    to the length of the vector (u - v). 

    """ 

    diff = u - v 

    return sqrt(numpy.dot(diff, diff)) 

 

def cosine_distance(u, v): 

    """ 

    Returns 1 minus the cosine of the angle between vectors v and u. This is equal to 

    1 - (u.v / |u||v|). 

    """ 

    return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v)))) 

 

class _DendrogramNode(object): 

    """ Tree node of a dendrogram. """ 

 

    def __init__(self, value, *children): 

        self._value = value 

        self._children = children 

 

    def leaves(self, values=True): 

        if self._children: 

            leaves = [] 

            for child in self._children: 

                leaves.extend(child.leaves(values)) 

            return leaves 

        elif values: 

            return [self._value] 

        else: 

            return [self] 

 

    def groups(self, n): 

        queue = [(self._value, self)] 

 

        while len(queue) < n: 

            priority, node = queue.pop() 

            if not node._children: 

                queue.push((priority, node)) 

                break 

            for child in node._children: 

                if child._children: 

                    queue.append((child._value, child)) 

                else: 

                    queue.append((0, child)) 

            # makes the earliest merges at the start, latest at the end 

            queue.sort() 

 

        groups = [] 

        for priority, node in queue: 

            groups.append(node.leaves()) 

        return groups 

 

class Dendrogram(object): 

    """ 

    Represents a dendrogram, a tree with a specified branching order.  This 

    must be initialised with the leaf items, then iteratively call merge for 

    each branch. This class constructs a tree representing the order of calls 

    to the merge function. 

    """ 

 

    def __init__(self, items=[]): 

        """ 

        :param  items: the items at the leaves of the dendrogram 

        :type   items: sequence of (any) 

        """ 

        self._items = [_DendrogramNode(item) for item in items] 

        self._original_items = copy.copy(self._items) 

        self._merge = 1 

 

    def merge(self, *indices): 

        """ 

        Merges nodes at given indices in the dendrogram. The nodes will be 

        combined which then replaces the first node specified. All other nodes 

        involved in the merge will be removed. 

 

        :param  indices: indices of the items to merge (at least two) 

        :type   indices: seq of int 

        """ 

        assert len(indices) >= 2 

        node = _DendrogramNode(self._merge, *[self._items[i] for i in indices]) 

        self._merge += 1 

        self._items[indices[0]] = node 

        for i in indices[1:]: 

            del self._items[i] 

 

    def groups(self, n): 

        """ 

        Finds the n-groups of items (leaves) reachable from a cut at depth n. 

        :param  n: number of groups 

        :type   n: int 

        """ 

        if len(self._items) > 1: 

            root = _DendrogramNode(self._merge, *self._items) 

        else: 

            root = self._items[0] 

        return root.groups(n) 

 

    def show(self, leaf_labels=[]): 

        """ 

        Print the dendrogram in ASCII art to standard out. 

        :param leaf_labels: an optional list of strings to use for labeling the leaves 

        :type leaf_labels: list 

        """ 

 

        # ASCII rendering characters 

        JOIN, HLINK, VLINK = '+', '-', '|' 

 

        # find the root (or create one) 

        if len(self._items) > 1: 

            root = _DendrogramNode(self._merge, *self._items) 

        else: 

            root = self._items[0] 

        leaves = self._original_items 

 

        if leaf_labels: 

            last_row = leaf_labels 

        else: 

            last_row = [str(leaf._value) for leaf in leaves] 

 

        # find the bottom row and the best cell width 

        width = max(map(len, last_row)) + 1 

        lhalf = width / 2 

        rhalf = width - lhalf - 1 

 

        # display functions 

        def format(centre, left=' ', right=' '): 

            return '%s%s%s' % (lhalf*left, centre, right*rhalf) 

        def display(str): 

            stdout.write(str) 

 

        # for each merge, top down 

        queue = [(root._value, root)] 

        verticals = [ format(' ') for leaf in leaves ] 

        while queue: 

            priority, node = queue.pop() 

            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children)) 

            indices = list(map(leaves.index, child_left_leaf)) 

            if child_left_leaf: 

                min_idx = min(indices) 

                max_idx = max(indices) 

            for i in range(len(leaves)): 

                if leaves[i] in child_left_leaf: 

                    if i == min_idx:    display(format(JOIN, ' ', HLINK)) 

                    elif i == max_idx:  display(format(JOIN, HLINK, ' ')) 

                    else:               display(format(JOIN, HLINK, HLINK)) 

                    verticals[i] = format(VLINK) 

                elif min_idx <= i <= max_idx: 

                    display(format(HLINK, HLINK, HLINK)) 

                else: 

                    display(verticals[i]) 

            display('\n') 

            for child in node._children: 

                if child._children: 

                    queue.append((child._value, child)) 

            queue.sort() 

 

            for vertical in verticals: 

                display(vertical) 

            display('\n') 

 

        # finally, display the last line 

        display(''.join(item.center(width) for item in last_row)) 

        display('\n') 

 

    def __repr__(self): 

        if len(self._items) > 1: 

            root = _DendrogramNode(self._merge, *self._items) 

        else: 

            root = self._items[0] 

        leaves = root.leaves(False) 

        return '<Dendrogram with %d leaves>' % len(leaves)