Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

# Natural Language Toolkit: Clusterer Interfaces 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Trevor Cohn <tacohn@cs.mu.oz.au> 

# Porting: Steven Bird <sb@csse.unimelb.edu.au> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

from nltk.probability import DictionaryProbDist 

 

class ClusterI(object):
    """
    Interface covering basic clustering functionality.

    Subclasses are expected to override ``cluster``, ``classify`` and
    ``num_clusters``, which raise ``NotImplementedError`` here. The
    remaining methods have default implementations built on those three.
    """

    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.

        This base implementation always raises ``NotImplementedError``.
        The meaning of ``assign_clusters`` is defined by the implementing
        subclass.
        """
        raise NotImplementedError()

    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.

        This base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the vector belonging to the
        cluster identified by ``label``.

        The default is a hard assignment: 1.0 when ``classify(vector)``
        equals ``label`` and 0.0 otherwise.
        """
        if self.classify(vector) == label:
            return 1.0
        return 0.0

    def _normalized_likelihoods(self, vector):
        """
        Returns a dict mapping each cluster name to the likelihood of
        ``vector`` under that cluster, scaled so the values sum to one.

        If there are no clusters an empty dict is returned. If every
        likelihood is zero (for example when ``classify`` yields a label
        that is not one of ``cluster_names()``), the mass is spread
        uniformly over the clusters instead of dividing by zero.
        """
        likelihoods = {}
        for name in self.cluster_names():
            likelihoods[name] = self.likelihood(vector, name)
        if not likelihoods:
            return likelihoods

        total = 0.0
        for name in likelihoods:
            total += likelihoods[name]

        if total == 0:
            # No cluster claims the vector: fall back to a uniform
            # distribution rather than raising ZeroDivisionError.
            uniform = 1.0 / len(likelihoods)
            for name in likelihoods:
                likelihoods[name] = uniform
        else:
            for name in likelihoods:
                likelihoods[name] /= total
        return likelihoods

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.

        The distribution is a ``DictionaryProbDist`` built from the
        normalized per-cluster likelihoods of ``vector``.
        """
        return DictionaryProbDist(self._normalized_likelihoods(vector))

    def num_clusters(self):
        """
        Returns the number of clusters.

        This base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()

    def cluster_names(self):
        """
        Returns the names of the clusters.

        By default these are the integers ``0 .. num_clusters() - 1``.
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the name of the cluster at index.

        By default the name of a cluster is its index.
        """
        return index