# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

""" 

A classifier based on a support vector machine. This code uses Thorsten Joachims' 

SVM^light implementation (http://svmlight.joachims.org/), wrapped using 

PySVMLight (https://bitbucket.org/wcauchois/pysvmlight). The default settings are to 

train a linear classification kernel, though through minor modification, full SVMlight 

capabilities should be accessible if needed. Only binary classification is possible at present. 

""" 

from __future__ import print_function

from nltk import compat
from nltk.probability import DictionaryProbDist

from nltk.classify.api import ClassifierI

#
# Interface to Support Vector Machine
#

try:
    import svmlight
except ImportError:
    raise LookupError(
        "\n\n===========================================================================\n"
        "  NLTK was unable to import SVMlight!\n\n"
        "  For more information, see <https://bitbucket.org/wcauchois/pysvmlight>\n"
        "===========================================================================")

# create a boolean feature name for the SVM from a feature/value pair,
# that'll take on a 1.0 value if the original feature:value is asserted
def featurename(feature, value):
    """
    :param feature: a string denoting a feature name
    :param value: the value of the feature
    """
    return '|'.join([feature, type(value).__name__, str(value)])
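
# For example, featurename('last_letter', 'k') returns 'last_letter|str|k'
# and featurename('length', 6) returns 'length|int|6', so each distinct
# feature:value pair becomes its own boolean SVM dimension.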

 

# convert a set of NLTK classifier features to SVMlight format
def map_features_to_svm(features, svmfeatureindex):
    """
    :param features: a dict of features in the format {'feature': value}
    :param svmfeatureindex: a mapping from feature:value pairs to integer
        SVMlight feature labels
    """
    instancefeatures = []
    # svmlight supports sparse feature sets, so we simply omit features
    # that this instance does not assert
    for k, v in compat.iteritems(features):
        # each feature is represented as an (int, float) tuple, where the
        # int is the SVMlight feature label and the float is its value;
        # scalar features are not supported - rather, each value that a
        # feature may take on is a discrete, independent label - so the
        # value 1.0 marks the presence of a feature:value couple
        svmfeaturename = featurename(k, v)
        if svmfeaturename not in svmfeatureindex:
            # skip feature:value pairs that were not in the training data
            # and so are not included in our mappings
            continue
        instancefeatures.append((svmfeatureindex[svmfeaturename], 1.0))
    return instancefeatures
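
# As an illustration, given svmfeatureindex {'last_letter|str|k': 0,
# 'length|int|6': 1}, the call
# map_features_to_svm({'last_letter': 'k', 'length': 6}, svmfeatureindex)
# yields [(0, 1.0), (1, 1.0)] (up to dict ordering), and any feature:value
# pair missing from the index is silently dropped.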

 

 

# convert a whole instance (including label) from NLTK to SVMlight format
def map_instance_to_svm(instance, labelmapping, svmfeatureindex):
    """
    :param instance: an instance in NLTK format, i.e. a (dict, label)
        tuple, where the dict contains feature:value pairs and the label
        signifies the target attribute's value for this instance (e.g.
        its class)
    :param labelmapping: a previously-defined dict mapping from text labels
        in the NLTK instance format to SVMlight labels of either +1 or -1
    :param svmfeatureindex: a mapping from feature:value pairs to integer
        SVMlight feature labels
    """
    (features, label) = instance
    instancefeatures = map_features_to_svm(features, svmfeatureindex)
    return (labelmapping[label], instancefeatures)
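
# Continuing the illustration: with labelmapping {'female': -1, 'male': 1},
# map_instance_to_svm(({'last_letter': 'k'}, 'male'), labelmapping,
# svmfeatureindex) returns (1, [(svmfeatureindex['last_letter|str|k'], 1.0)]).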

 

 

class SvmClassifier(ClassifierI):
    """
    A Support Vector Machine classifier. To explain briefly, support
    vector machines (SVM) treat each feature as a dimension, and
    position instances in n-dimensional feature space.  An optimal
    hyperplane is then determined that best divides feature space into
    classes, and future instances are classified based on which side of
    the hyperplane they lie on, and their proximity to it.

    This implementation is for a binary SVM - that is, only two
    classes are supported. You may perform classification with more
    classes by training an SVM per class and then picking the best
    option for new instances given the results from each binary
    class-SVM.
    """

    def __init__(self, labels, labelmapping, svmfeatures, model=None):
        """
        :param labels: a list of text labels for classes
        :param labelmapping: a mapping from labels to SVM classes (-1, +1)
        :param svmfeatures: a list of SVM features, where the index is the
            integer feature number and the value a feature/value pair
        :param model: the model generated by svmlight.learn()
        """
        self._labels = labels
        self._model = model
        self._labelmapping = labelmapping
        self._svmfeatures = svmfeatures
        # _svmfeatureindex is the inverse of svmfeatures, allowing us to
        # find the integer SVM feature number given a feature/value pair
        self._svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))
        self._verbose = False

    def labels(self):
        """
        Return the list of class labels.
        """
        return self._labels

    def svm_label_name(self, label):
        """
        Search the values of _labelmapping to resolve +1 or -1 to the
        corresponding text label.

        :param label: the SVM label (+1 or -1) to look up
        """
        labelname = [k for k, v in compat.iteritems(self._labelmapping)
                     if v == label][0]
        return labelname

    def resolve_prediction(self, prediction):
        """
        Resolve a float (in this case, probably from
        svmlight.learn().classify()) to either -1 or +1, then look up
        the label for that class in _labelmapping and return the text
        label.

        :param prediction: a signed float describing classifier confidence
        """
        # take the sign of the prediction (cmp() is Python 2 only)
        classification = (prediction > 0) - (prediction < 0)
        return self.svm_label_name(classification)

    def _get_svm_classification(self, featureset):
        """
        Given a set of features, classify them with our trained model
        and return a signed float.

        :param featureset: a dict of feature/value pairs in NLTK format,
            representing a single instance
        """
        instance_to_classify = (0, map_features_to_svm(featureset, self._svmfeatureindex))
        if self._verbose:
            print('instance', instance_to_classify)
        # svmlight.classify expects a list, and returns a list of floats;
        # this should be taken advantage of when writing
        # SvmClassifier.batch_classify / .batch_prob_classify
        [prediction] = svmlight.classify(self._model, [instance_to_classify])
        return prediction

    def prob_classify(self, featureset):
        """
        Return a probability distribution of classifications.

        :param featureset: a dict of feature/value pairs in NLTK format,
            representing a single instance
        """
        if self._model is None:
            raise Exception('This classifier is not yet trained')

        # do the classification
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print('prediction', prediction)

        # sometimes the result is not within -1 .. +1; clip it so that
        # it is, and we get a sane-looking probability distribution.
        # this will upset some results with non-linear partitioning,
        # where instance-hyperplane distance can be many orders of
        # magnitude larger; I don't have a fix for that
        if prediction < -1.0:
            prediction = -1.0
        if prediction > 1.0:
            prediction = 1.0

        # map the clipped prediction from [-1, +1] onto [0, 1], so that
        # a negative prediction maximises the value of the -1 class and
        # a positive prediction that of the +1 class, in agreement with
        # classify()
        pos_prob = (1.0 + prediction) / 2.0
        distribution = {str(self.resolve_prediction(1)): pos_prob,
                        str(self.resolve_prediction(-1)): 1.0 - pos_prob}

        return DictionaryProbDist(distribution)
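
    # For example, a clipped prediction of 0.5 yields probability 0.75
    # for the +1 class and 0.25 for the -1 class, so the most probable
    # label always matches the sign of the raw prediction.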

 

 

    def classify(self, featureset):
        """
        Use a trained SVM to predict the label of an unlabelled instance.

        :param featureset: a dict of feature/value pairs in NLTK format,
            representing a single instance
        """
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print('prediction', prediction)
        return self.resolve_prediction(prediction)

    @staticmethod
    def train(featuresets):
        """
        Given a set of training instances in NLTK format:
        [ ({feature: value, ..}, str(label)) ]
        train a support vector machine.

        :param featuresets: training instances
        """
        # build a unique list of labels
        labels = set()
        for (features, label) in featuresets:
            labels.add(label)

        # this is a binary classifier only
        if len(labels) > 2:
            raise ValueError('Can only do binary classification (labels: ' +
                             str(labels) + ')')

        # we need ordering, so a set's no good
        labels = list(labels)

        # next, assign -1 and +1
        labelmapping = {labels[0]: -1, labels[1]: 1}

        # now for feature conversion: iterate through the instances,
        # building a set of feature:type:str(value) triples
        svmfeatures = set()
        for (features, label) in featuresets:
            for k, v in compat.iteritems(features):
                svmfeatures.add(featurename(k, v))
        # svmfeatures is indexable by integer SVM feature number;
        # svmfeatureindex is the inverse (SVM feature name -> number)
        svmfeatures = list(svmfeatures)
        svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))

        # build the SVM feature set case by case
        svmfeatureset = []
        for instance in featuresets:
            svmfeatureset.append(map_instance_to_svm(instance, labelmapping,
                                                     svmfeatureindex))

        # train the svm
        # TODO: implement passing of SVMlight parameters from train() to learn()
        return SvmClassifier(labels, labelmapping, svmfeatures,
                             svmlight.learn(svmfeatureset, type='classification'))
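
# A minimal usage sketch (assumes the svmlight module is installed; the
# feature dicts and labels below are illustrative only):
#
#     train_data = [({'a': 1, 'b': 1}, 'x'), ({'a': 0, 'b': 1}, 'y')]
#     classifier = SvmClassifier.train(train_data)
#     classifier.classify({'a': 1, 'b': 1})       # one of 'x' or 'y'
#     classifier.prob_classify({'a': 1, 'b': 0})  # a DictionaryProbDist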

 

 

def demo():

    def gender_features(word):
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names

    import random
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n, g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print('--- nltk.classify.svm demo ---')
    print('Number of training examples:', len(train_set))
    classifier = SvmClassifier.train(train_set)
    print('Total SVM dimensions:', len(classifier._svmfeatureindex))
    print('Label mapping:', classifier._labelmapping)
    print('--- Processing an example instance ---')
    print('Reference instance:', names[0])
    print('NLTK-format features:\n    ' + str(test_set[0]))
    print('SVMlight-format features:\n    ' +
          str(map_instance_to_svm(test_set[0], classifier._labelmapping,
                                  classifier._svmfeatureindex)))
    distr = classifier.prob_classify(test_set[0][0])
    print('Instance classification and confidence:', distr.max(),
          distr.prob(distr.max()))
    print('--- Measuring classifier performance ---')
    print('Overall accuracy:', accuracy(classifier, test_set))


if __name__ == '__main__':
    demo()