# Natural Language Toolkit: Positive Naive Bayes Classifier
#
# Copyright (C) 2012 NLTK Project
# Author: Alessandro Presta <alessandro.presta@gmail.com>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

""" 

A variant of the Naive Bayes Classifier that performs binary classification with 

partially-labeled training sets. In other words, assume we want to build a classifier 

that assigns each example to one of two complementary classes (e.g., male names and 

female names). 

If we have a training set with labeled examples for both classes, we can use a 

standard Naive Bayes Classifier. However, consider the case when we only have labeled 

examples for one of the classes, and other, unlabeled, examples. 

Then, assuming a prior distribution on the two labels, we can use the unlabeled set 

to estimate the frequencies of the various features. 

 

Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1 

and unlabeled examples. We are also given an estimate of P(1). 

 

We compute P(feature|1) exactly as in the standard case. 

 

To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are 

assuming that the unlabeled examples are drawn according to the given prior distribution) 

and then express the conditional probability as: 

 

|                  P(feature) - P(feature|1) * P(1) 

|  P(feature|0) = ---------------------------------- 

|                               P(0) 
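
For instance, with purely illustrative numbers: if P(1) = 0.5, P(feature|1) = 0.8 and
the unlabeled set gives P(feature) = 0.6, then

|  P(feature|0) = (0.6 - 0.8 * 0.5) / 0.5 = 0.4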

 

Example:

    >>> from nltk.classify import PositiveNaiveBayesClassifier

Some sentences about sports:

    >>> sports_sentences = [ 'The team dominated the game',
    ...                      'They lost the ball',
    ...                      'The game was intense',
    ...                      'The goalkeeper caught the ball',
    ...                      'The other team controlled the ball' ]

Mixed topics, including sports:

    >>> various_sentences = [ 'The President did not comment',
    ...                       'I lost the keys',
    ...                       'The team won the game',
    ...                       'Sara has two kids',
    ...                       'The ball went off the court',
    ...                       'They had the ball for the whole game',
    ...                       'The show is over' ]

The features of a sentence are simply the words it contains:

    >>> def features(sentence):
    ...     words = sentence.lower().split()
    ...     return dict(('contains(%s)' % w, True) for w in words)

We use the sports sentences as positive examples and the mixed ones as unlabeled examples:

    >>> positive_featuresets = list(map(features, sports_sentences))
    >>> unlabeled_featuresets = list(map(features, various_sentences))
    >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
    ...                                                 unlabeled_featuresets)

Is the following sentence about sports?

    >>> classifier.classify(features('The cat is on the table'))
    False

What about this one?

    >>> classifier.classify(features('My team lost the game'))
    True
"""

from collections import defaultdict

from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist

from .naivebayes import NaiveBayesClassifier

##//////////////////////////////////////////////////////
##  Positive Naive Bayes Classifier
##//////////////////////////////////////////////////////

class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    @staticmethod
    def train(positive_featuresets, unlabeled_featuresets, positive_prob_prior=0.5,
              estimator=ELEProbDist):
        """ 

        :param positive_featuresets: A list of featuresets that are known as positive 

            examples (i.e., their label is ``True``). 

 

        :param unlabeled_featuresets: A list of featuresets whose label is unknown. 

 

        :param positive_prob_prior: A prior estimate of the probability of the label 

            ``True`` (default 0.5). 

        """ 

        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname].inc(fval)
                feature_values[fname].add(fval)
                fnames.add(fname)

        # Count up how many times each feature value occurred in unlabeled examples.
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname].inc(fval)
                feature_values[fname].add(fval)
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
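        # For instance (illustrative numbers): if a feature appears in only 3 of 5
        # positive featuresets, the remaining 2 are counted as having the value None.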

        num_positive_examples = len(positive_featuresets)
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname].inc(None, num_positive_examples-count)
            feature_values[fname].add(None)

        num_unlabeled_examples = len(unlabeled_featuresets)
        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname].inc(None, num_unlabeled_examples-count)
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist({True: positive_prob_prior,
                                             False: negative_prob_prior})

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        for fname, freqdist in unlabeled_feature_freqdist.items():
            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
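                # The estimate below follows the formula from the module docstring:
                # P(feature|0) = (P(feature) - P(feature|1) * P(1)) / P(0),
                # with any negative value clipped to zero.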

                prob = (global_probdist.prob(fval)
                        - positive_prob_prior *
                        feature_probdist[True, fname].prob(fval)) \
                        / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(negative_feature_probs,
                                                                normalize=True)

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)

##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////

def demo():
    from nltk.classify.util import partial_names_demo
    classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
    classifier.show_most_informative_features()

##//////////////////////////////////////////////////////
##  Test
##//////////////////////////////////////////////////////

if __name__ == '__main__':
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)