# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

""" 

A classifier based on a support vector machine. This code uses Thorsten Joachims' 

SVM^light implementation (http://svmlight.joachims.org/), wrapped using 

PySVMLight (https://bitbucket.org/wcauchois/pysvmlight). The default settings are to 

train a linear classification kernel, though through minor modification, full SVMlight 

capabilities should be accessible if needed. Only binary classification is possible at present. 

""" 

from __future__ import print_function

from nltk import compat
from nltk.probability import DictionaryProbDist

from nltk.classify.api import ClassifierI

#
# Interface to Support Vector Machine
#

try:
    import svmlight
except ImportError:
    raise LookupError(
        "\n\n===========================================================================\n"
        "  NLTK was unable to import SVMlight!\n\n"
        "  For more information, see <https://bitbucket.org/wcauchois/pysvmlight>\n"
        "===========================================================================")

# create a boolean feature name for the SVM from a feature/value pair,
# that'll take on a 1.0 value if the original feature:value is asserted
def featurename(feature, value):
    """
    :param feature: a string denoting a feature name
    :param value: the value of the feature
    """
    return '|'.join([feature, type(value).__name__, str(value)])
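
# For example, featurename('last_letter', 'k') returns 'last_letter|str|k'
# and featurename('length', 6) returns 'length|int|6', so each distinct
# feature:value pair becomes its own boolean SVM dimension.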

 

# convert a set of NLTK classifier features to SVMlight format
def map_features_to_svm(features, svmfeatureindex):
    """
    :param features: a dict of features in the format {'feature': value}
    :param svmfeatureindex: a mapping from feature:value pairs to integer
        SVMlight feature labels
    """
    instancefeatures = []
    # svmlight supports sparse feature sets, so we simply omit features
    # that this instance does not assert
    for k, v in compat.iteritems(features):
        # each feature is represented as an (int, float) tuple, where the
        # int is the SVMlight feature label and the float is its value;
        # scalar features are not supported - rather, each value that a
        # feature may take on is a discrete, independent label - so the
        # value 1.0 marks the presence of a feature:value couple
        svmfeaturename = featurename(k, v)
        if svmfeaturename not in svmfeatureindex:
            # skip feature:value pairs that were not in the training data
            # and so are not included in our mappings
            continue
        instancefeatures.append((svmfeatureindex[svmfeaturename], 1.0))
    return instancefeatures
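
# As an illustration, given svmfeatureindex {'last_letter|str|k': 0,
# 'length|int|6': 1}, the call
# map_features_to_svm({'last_letter': 'k', 'length': 6}, svmfeatureindex)
# yields [(0, 1.0), (1, 1.0)] (up to dict ordering), and any feature:value
# pair missing from the index is silently dropped.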

 

 

# convert a whole instance (including label) from NLTK to SVMlight format
def map_instance_to_svm(instance, labelmapping, svmfeatureindex):
    """
    :param instance: an instance in NLTK format, i.e. a (dict, label)
        tuple, where the dict contains feature:value pairs and the label
        signifies the target attribute's value for this instance (e.g.
        its class)
    :param labelmapping: a previously-defined dict mapping from text labels
        in the NLTK instance format to SVMlight labels of either +1 or -1
    :param svmfeatureindex: a mapping from feature:value pairs to integer
        SVMlight feature labels
    """
    (features, label) = instance
    instancefeatures = map_features_to_svm(features, svmfeatureindex)
    return (labelmapping[label], instancefeatures)
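
# Continuing the illustration: with labelmapping {'female': -1, 'male': 1},
# map_instance_to_svm(({'last_letter': 'k'}, 'male'), labelmapping,
# svmfeatureindex) returns (1, [(svmfeatureindex['last_letter|str|k'], 1.0)]).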

 

 

class SvmClassifier(ClassifierI):
    """
    A Support Vector Machine classifier. To explain briefly, support
    vector machines (SVM) treat each feature as a dimension, and
    position instances in n-dimensional feature space.  An optimal
    hyperplane is then determined that best divides feature space into
    classes, and future instances are classified based on which side of
    the hyperplane they lie on, and their proximity to it.

    This implementation is for a binary SVM - that is, only two
    classes are supported. You may perform classification with more
    classes by training an SVM per class and then picking the best
    option for new instances given the results from each binary
    class-SVM.
    """

    def __init__(self, labels, labelmapping, svmfeatures, model=None):
        """
        :param labels: a list of text labels for classes
        :param labelmapping: a mapping from labels to SVM classes (-1, +1)
        :param svmfeatures: a list of SVM features, where the index is the
            integer feature number and the value a feature/value pair
        :param model: the model generated by svmlight.learn()
        """
        self._labels = labels
        self._model = model
        self._labelmapping = labelmapping
        self._svmfeatures = svmfeatures
        # _svmfeatureindex is the inverse of svmfeatures, allowing us to
        # find the integer SVM feature number given a feature/value pair
        self._svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))
        self._verbose = False

    def labels(self):
        """
        Return the list of class labels.
        """
        return self._labels

    def svm_label_name(self, label):
        """
        Search the values of _labelmapping to resolve +1 or -1 to the
        corresponding text label.

        :param label: the SVM label (+1 or -1) to look up
        """
        labelname = [k for k, v in compat.iteritems(self._labelmapping)
                     if v == label][0]
        return labelname

    def resolve_prediction(self, prediction):
        """
        Resolve a float (in this case, probably from
        svmlight.learn().classify()) to either -1 or +1, then look up
        the label for that class in _labelmapping and return the text
        label.

        :param prediction: a signed float describing classifier confidence
        """
        # take the sign of the prediction (cmp() is Python 2 only)
        classification = (prediction > 0) - (prediction < 0)
        return self.svm_label_name(classification)

    def _get_svm_classification(self, featureset):
        """
        Given a set of features, classify them with our trained model
        and return a signed float.

        :param featureset: a dict of feature/value pairs in NLTK format,
            representing a single instance
        """
        instance_to_classify = (0, map_features_to_svm(featureset, self._svmfeatureindex))
        if self._verbose:
            print('instance', instance_to_classify)
        # svmlight.classify expects a list, and returns a list of floats;
        # this should be taken advantage of when writing
        # SvmClassifier.batch_classify / .batch_prob_classify
        [prediction] = svmlight.classify(self._model, [instance_to_classify])
        return prediction

    def prob_classify(self, featureset):
        """
        Return a probability distribution of classifications.

        :param featureset: a dict of feature/value pairs in NLTK format,
            representing a single instance
        """
        if self._model is None:
            raise Exception('This classifier is not yet trained')

        # do the classification
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print('prediction', prediction)

        # sometimes the result is not within -1 .. +1; clip it so that
        # it is, and we get a sane-looking probability distribution.
        # this will upset some results with non-linear partitioning,
        # where instance-hyperplane distance can be many orders of
        # magnitude larger; I don't have a fix for that
        if prediction < -1.0:
            prediction = -1.0
        if prediction > 1.0:
            prediction = 1.0

        # map the clipped prediction from [-1, +1] onto [0, 1], so that
        # a negative prediction maximises the value of the -1 class and
        # a positive prediction that of the +1 class, in agreement with
        # classify()
        pos_prob = (1.0 + prediction) / 2.0
        distribution = {str(self.resolve_prediction(1)): pos_prob,
                        str(self.resolve_prediction(-1)): 1.0 - pos_prob}

        return DictionaryProbDist(distribution)
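
    # For example, a clipped prediction of 0.5 yields probability 0.75
    # for the +1 class and 0.25 for the -1 class, so the most probable
    # label always matches the sign of the raw prediction.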

 

 

    def classify(self, featureset):
        """
        Use a trained SVM to predict the label of an unlabelled instance.

        :param featureset: a dict of feature/value pairs in NLTK format,
            representing a single instance
        """
        prediction = self._get_svm_classification(featureset)
        if self._verbose:
            print('prediction', prediction)
        return self.resolve_prediction(prediction)

    @staticmethod
    def train(featuresets):
        """
        Given a set of training instances in NLTK format:
        [ ({feature: value, ..}, str(label)) ]
        train a support vector machine.

        :param featuresets: training instances
        """
        # build a unique list of labels
        labels = set()
        for (features, label) in featuresets:
            labels.add(label)

        # this is a binary classifier only
        if len(labels) > 2:
            raise ValueError('Can only do binary classification (labels: ' +
                             str(labels) + ')')

        # we need ordering, so a set's no good
        labels = list(labels)

        # next, assign -1 and +1
        labelmapping = {labels[0]: -1, labels[1]: 1}

        # now for feature conversion: iterate through the instances,
        # building a set of feature:type:str(value) triples
        svmfeatures = set()
        for (features, label) in featuresets:
            for k, v in compat.iteritems(features):
                svmfeatures.add(featurename(k, v))
        # svmfeatures is indexable by integer SVM feature number;
        # svmfeatureindex is the inverse (SVM feature name -> number)
        svmfeatures = list(svmfeatures)
        svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))

        # build the SVM feature set case by case
        svmfeatureset = []
        for instance in featuresets:
            svmfeatureset.append(map_instance_to_svm(instance, labelmapping,
                                                     svmfeatureindex))

        # train the svm
        # TODO: implement passing of SVMlight parameters from train() to learn()
        return SvmClassifier(labels, labelmapping, svmfeatures,
                             svmlight.learn(svmfeatureset, type='classification'))
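
# A minimal usage sketch (assumes the svmlight module is installed; the
# feature dicts and labels below are illustrative only):
#
#     train_data = [({'a': 1, 'b': 1}, 'x'), ({'a': 0, 'b': 1}, 'y')]
#     classifier = SvmClassifier.train(train_data)
#     classifier.classify({'a': 1, 'b': 1})       # one of 'x' or 'y'
#     classifier.prob_classify({'a': 1, 'b': 0})  # a DictionaryProbDist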

 

 

def demo():

    def gender_features(word):
        return {'last_letter': word[-1], 'penultimate_letter': word[-2]}

    from nltk.classify import accuracy
    from nltk.corpus import names

    import random
    names = ([(name, 'male') for name in names.words('male.txt')] +
             [(name, 'female') for name in names.words('female.txt')])
    random.seed(60221023)
    random.shuffle(names)

    featuresets = [(gender_features(n), g) for (n, g) in names]
    train_set, test_set = featuresets[500:], featuresets[:500]

    print('--- nltk.classify.svm demo ---')
    print('Number of training examples:', len(train_set))
    classifier = SvmClassifier.train(train_set)
    print('Total SVM dimensions:', len(classifier._svmfeatureindex))
    print('Label mapping:', classifier._labelmapping)
    print('--- Processing an example instance ---')
    print('Reference instance:', names[0])
    print('NLTK-format features:\n    ' + str(test_set[0]))
    print('SVMlight-format features:\n    ' +
          str(map_instance_to_svm(test_set[0], classifier._labelmapping,
                                  classifier._svmfeatureindex)))
    distr = classifier.prob_classify(test_set[0][0])
    print('Instance classification and confidence:', distr.max(),
          distr.prob(distr.max()))
    print('--- Measuring classifier performance ---')
    print('Overall accuracy:', accuracy(classifier, test_set))


if __name__ == '__main__':
    demo()