# Natural Language Toolkit: Classifier Utility Functions
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Utility functions and classes for classifiers.
"""

from __future__ import print_function

import math

#from nltk.util import Deprecated
import nltk.classify.util # for accuracy & log_likelihood
from nltk.util import LazyMap

 

######################################################################
#{ Helper Functions
######################################################################

# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or... just have users use LazyMap directly?

def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok, label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
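
    For example, featuresets for a labeled corpus can be built lazily
    as follows (``word_feats`` is an illustrative helper, not part of
    this module)::

        def word_feats(words):
            return dict((w, True) for w in words)

        labeled_tokens = [(['fine'], 'pos'), (['awful'], 'neg')]
        train_set = apply_features(word_feats, labeled_tokens)
        # train_set[0] == ({'fine': True}, 'pos'), computed on demand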

    """ 

    if labeled is None:
        labeled = toks and isinstance(toks[0], (tuple, list))
    if labeled:
        def lazy_func(labeled_token):
            return (feature_func(labeled_token[0]), labeled_token[1])
        return LazyMap(lazy_func, toks)
    else:
        return LazyMap(feature_func, toks)

 

def attested_labels(tokens):
    """
    :return: A tuple of all labels that are attested in the given list
        of tokens.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
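
    For example (the order is unspecified, since a set is used
    internally)::

        attested_labels([('dog', 'N'), ('runs', 'V'), ('cat', 'N')])
        # -> ('N', 'V') or ('V', 'N')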

    """ 

    return tuple(set(label for (tok, label) in tokens))

 

def log_likelihood(classifier, gold):
    """
    Return the log of the average probability that ``classifier``
    assigns to the correct label of each labeled featureset in
    ``gold``, a list of ``(featureset, label)`` tuples.
    """
    results = classifier.batch_prob_classify([fs for (fs, l) in gold])
    ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
    return math.log(float(sum(ll)) / len(ll))

 

def accuracy(classifier, gold):
    """
    Return the fraction of labeled featuresets in ``gold`` (a list of
    ``(featureset, label)`` tuples) that ``classifier`` labels
    correctly.
    """
    results = classifier.batch_classify([fs for (fs, l) in gold])
    correct = [l == r for ((fs, l), r) in zip(gold, results)]
    if correct:
        return float(sum(correct)) / len(correct)
    else:
        return 0

 

class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
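
    A typical training loop might use it as follows (an illustrative
    sketch; ``improve_classifier`` is a stand-in for a real update
    step, not part of this module)::

        checker = CutoffChecker({'max_iter': 100, 'min_lldelta': 1e-4})
        while True:
            classifier = improve_classifier(classifier, train_toks)
            if checker.check(classifier, train_toks):
                break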

    """ 

    def __init__(self, cutoffs):
        # Work on a copy, so the caller's dict is not modified when
        # the cutoff values are normalized below.
        self.cutoffs = cutoffs.copy()
        if 'min_ll' in self.cutoffs:
            self.cutoffs['min_ll'] = -abs(self.cutoffs['min_ll'])
        if 'min_lldelta' in self.cutoffs:
            self.cutoffs['min_lldelta'] = abs(self.cutoffs['min_lldelta'])
        self.ll = None
        self.acc = None
        self.iter = 1

 

    def check(self, classifier, train_toks):
        cutoffs = self.cutoffs
        self.iter += 1
        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
            return True # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                return True # log likelihood cutoff
            if ('min_lldelta' in cutoffs and self.ll and
                ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
                return True # log likelihood delta cutoff
            self.ll = new_ll

        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                return True # accuracy cutoff
            if ('min_accdelta' in cutoffs and self.acc and
                ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
                return True # accuracy delta cutoff
            self.acc = new_acc

        return False # no cutoff reached.

 

######################################################################
#{ Demos
######################################################################

 

def names_demo_features(name):
    features = {}
    features['alwayson'] = True
    features['startswith'] = name[0].lower()
    features['endswith'] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = name.lower().count(letter)
        features['has(%s)' % letter] = letter in name.lower()
    return features
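
# For illustration, names_demo_features('Gary') includes (abbreviated):
# {'alwayson': True, 'startswith': 'g', 'endswith': 'y',
#  'count(a)': 1, 'has(a)': True, 'count(b)': 0, 'has(b)': False, ...}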

 

def binary_names_demo_features(name):
    features = {}
    features['alwayson'] = True
    features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
    features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = name.lower().count(letter)
        features['has(%s)' % letter] = letter in name.lower()
        features['startswith(%s)' % letter] = (letter == name[0].lower())
        features['endswith(%s)' % letter] = (letter == name[-1].lower())
    return features

 

def names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-'*40)
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == 'male':
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier

 

def partial_names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Materialize the corpus views as lists so they can be shuffled.
    male_names = list(names.words('male.txt'))
    female_names = list(names.words('female.txt'))

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled
    # examples for training.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled
    # examples.
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Create a test set with correctly-labeled male and female names.
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-'*40)
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier

 

_inst_cache = {}

def wsd_demo(trainer, word, features, n=1000):
    from nltk.corpus import senseval
    import random

    # Get the instances.
    print('Reading data...')
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list(set(l for (i, l) in instances))
    print('  Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    train = instances[:int(.8 * n)]
    test = instances[int(.8 * n):n]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        # Use a distinct loop variable here so that the sample-size
        # parameter ``n`` is not shadowed.
        test_featuresets = [features(i) for (i, label) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier