# Natural Language Toolkit: Classifier Utility Functions
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor additions)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Utility functions and classes for classifiers.
"""

from __future__ import print_function

import math

#from nltk.util import Deprecated
import nltk.classify.util # for accuracy & log_likelihood
from nltk.util import LazyMap

 

######################################################################
#{ Helper Functions
######################################################################

# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or... just have users use LazyMap directly?

def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok, label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
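
    For example, featuresets for a labeled corpus can be built lazily
    as follows (``word_feats`` is an illustrative helper, not part of
    this module)::

        def word_feats(words):
            return dict((w, True) for w in words)

        labeled_tokens = [(['fine'], 'pos'), (['awful'], 'neg')]
        train_set = apply_features(word_feats, labeled_tokens)
        # train_set[0] == ({'fine': True}, 'pos'), computed on demand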

    """ 

    if labeled is None:
        labeled = toks and isinstance(toks[0], (tuple, list))
    if labeled:
        def lazy_func(labeled_token):
            return (feature_func(labeled_token[0]), labeled_token[1])
        return LazyMap(lazy_func, toks)
    else:
        return LazyMap(feature_func, toks)

 

def attested_labels(tokens):
    """
    :return: A tuple of all labels that are attested in the given list
        of tokens.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
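
    For example (the order is unspecified, since a set is used
    internally)::

        attested_labels([('dog', 'N'), ('runs', 'V'), ('cat', 'N')])
        # -> ('N', 'V') or ('V', 'N')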

    """ 

    return tuple(set(label for (tok, label) in tokens))

 

def log_likelihood(classifier, gold):
    """
    Return the log of the average probability that ``classifier``
    assigns to the correct label of each labeled featureset in
    ``gold``, a list of ``(featureset, label)`` tuples.
    """
    results = classifier.batch_prob_classify([fs for (fs, l) in gold])
    ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
    return math.log(float(sum(ll)) / len(ll))

 

def accuracy(classifier, gold):
    """
    Return the fraction of labeled featuresets in ``gold`` (a list of
    ``(featureset, label)`` tuples) that ``classifier`` labels
    correctly.
    """
    results = classifier.batch_classify([fs for (fs, l) in gold])
    correct = [l == r for ((fs, l), r) in zip(gold, results)]
    if correct:
        return float(sum(correct)) / len(correct)
    else:
        return 0

 

class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
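
    A typical training loop might use it as follows (an illustrative
    sketch; ``improve_classifier`` is a stand-in for a real update
    step, not part of this module)::

        checker = CutoffChecker({'max_iter': 100, 'min_lldelta': 1e-4})
        while True:
            classifier = improve_classifier(classifier, train_toks)
            if checker.check(classifier, train_toks):
                break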

    """ 

    def __init__(self, cutoffs):
        # Work on a copy, so the caller's dict is not modified when
        # the cutoff values are normalized below.
        self.cutoffs = cutoffs.copy()
        if 'min_ll' in self.cutoffs:
            self.cutoffs['min_ll'] = -abs(self.cutoffs['min_ll'])
        if 'min_lldelta' in self.cutoffs:
            self.cutoffs['min_lldelta'] = abs(self.cutoffs['min_lldelta'])
        self.ll = None
        self.acc = None
        self.iter = 1

 

    def check(self, classifier, train_toks):
        cutoffs = self.cutoffs
        self.iter += 1
        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
            return True # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                return True # log likelihood cutoff
            if ('min_lldelta' in cutoffs and self.ll and
                ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
                return True # log likelihood delta cutoff
            self.ll = new_ll

        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                return True # accuracy cutoff
            if ('min_accdelta' in cutoffs and self.acc and
                ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
                return True # accuracy delta cutoff
            self.acc = new_acc

        return False # no cutoff reached.

 

######################################################################
#{ Demos
######################################################################

 

def names_demo_features(name):
    features = {}
    features['alwayson'] = True
    features['startswith'] = name[0].lower()
    features['endswith'] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = name.lower().count(letter)
        features['has(%s)' % letter] = letter in name.lower()
    return features
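
# For illustration, names_demo_features('Gary') includes (abbreviated):
# {'alwayson': True, 'startswith': 'g', 'endswith': 'y',
#  'count(a)': 1, 'has(a)': True, 'count(b)': 0, 'has(b)': False, ...}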

 

def binary_names_demo_features(name):
    features = {}
    features['alwayson'] = True
    features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
    features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = name.lower().count(letter)
        features['has(%s)' % letter] = letter in name.lower()
        features['startswith(%s)' % letter] = (letter == name[0].lower())
        features['endswith(%s)' % letter] = (letter == name[-1].lower())
    return features

 

def names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-'*40)
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == 'male':
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier

 

def partial_names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Materialize the corpus views as lists so they can be shuffled.
    male_names = list(names.words('male.txt'))
    female_names = list(names.words('female.txt'))

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled
    # examples for training.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled
    # examples.
    unlabeled = [features(name)
                 for name in male_names[2000:2500] + female_names[:500]]

    # Create a test set with correctly-labeled male and female names.
    test = [(name, True) for name in male_names[2500:2750]] \
        + [(name, False) for name in female_names[500:750]]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-'*40)
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier

 

_inst_cache = {}

def wsd_demo(trainer, word, features, n=1000):
    from nltk.corpus import senseval
    import random

    # Get the instances.
    print('Reading data...')
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list(set(l for (i, l) in instances))
    print('  Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    train = instances[:int(.8 * n)]
    test = instances[int(.8 * n):n]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        # Use a distinct loop variable here so that the sample-size
        # parameter ``n`` is not shadowed.
        test_featuresets = [features(i) for (i, label) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier