Coverage for nltk.classify.maxent: 69%
# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier model based on the maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data, and chooses the distribution
with the highest entropy.  A probability distribution is "empirically
consistent" with a set of training data if the estimated frequency with
which a class and a feature vector value co-occur is equal to the actual
frequency in the data.
Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense disambiguation,
we might define a ``'prevword'`` feature whose value is the word preceding
the target word.  However, in the context of maxent modeling, the term
*feature* is typically used to refer to a property of a "labeled" token.
In order to prevent confusion, we will introduce two distinct terms to
disambiguate these two different concepts:
- An "input-feature" is a property of an unlabeled token. - A "joint-feature" is a property of a labeled token.
In the rest of the ``nltk.classify`` module, the term "features" is used to refer to what we will call "input-features" in this module.
In literature that describes and discusses maximum entropy models, input-features are typically called "contexts", and joint-features are simply referred to as "features".
Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a set
of joint-features of the form:
|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise
For all values of ``feat_val`` and ``some_label``. This mapping is performed by classes that implement the ``MaxentFeatureEncodingI`` interface. """
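# A minimal sketch of the input-feature -> joint-feature mapping described
# above.  The feature name ('prevword'), value ('the'), and labels are
# hypothetical; real encodings are produced by the MaxentFeatureEncodingI
# implementations defined below.
def _example_joint_feat(token, label, feat_val='the', some_label='A'):
    # Fires (value 1) only for one particular (feat_val, some_label) pair.
    if token.get('prevword') == feat_val and label == some_label:
        return 1
    return 0
# _example_joint_feat({'prevword': 'the'}, 'A')  -> 1
# _example_joint_feat({'prevword': 'the'}, 'B')  -> 0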
######################################################################
#{ Classifier Model
######################################################################
""" A maximum entropy classifier (also known as a "conditional exponential classifier"). This classifier is parameterized by a set of "weights", which are used to combine the joint-features that are generated from a featureset by an "encoding". In particular, the encoding maps each ``(featureset, label)`` pair to a vector. The probability of each label is then computed using the following equation::
dotprod(weights, encode(fs,label)) prob(fs|label) = --------------------------------------------------- sum(dotprod(weights, encode(fs,l)) for l in labels)
Where ``dotprod`` is the dot product::
dotprod(a,b) = sum(x*y for (x,y) in zip(a,b)) """ """ Construct a new maxent classifier model. Typically, new classifier models are created using the ``train()`` method.
:type encoding: MaxentFeatureEncodingI
:param encoding: An encoding that is used to convert the featuresets
    that are given to the ``classify`` method into joint-feature
    vectors, which are used by the maxent classifier model.

:type weights: list of float
:param weights: The feature weight vector for this classifier.

:type logarithmic: bool
:param logarithmic: If false, then use non-logarithmic weights.
"""
#self._logarithmic = False
return self._encoding.labels()
""" Set the feature weight vector for this classifier. :param new_weights: The new feature weight vector. :type new_weights: list of float """
""" :return: The feature weight vector for this classifier. :rtype: list of float """
else:
    prod = 1.0
    for (f_id, f_val) in feature_vector:
        prod *= self._weights[f_id] ** f_val
    prob_dict[label] = prod
# Normalize the dictionary to give a probability distribution normalize=True)
""" Print a table showing the effect of each of the features in the given feature set, and how they combine to determine the probabilities of each label for that featureset. """ descr_width = 50 TEMPLATE = ' %-'+str(descr_width-2)+'s%s%8.3f'
pdist = self.prob_classify(featureset)
labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
labels = labels[:columns]
print('  Feature'.ljust(descr_width)+''.join(
    '%8s' % str(l)[:7] for l in labels))
print('  '+'-'*(descr_width-2+8*len(labels)))
sums = defaultdict(int)
for i, label in enumerate(labels):
    feature_vector = self._encoding.encode(featureset, label)
    feature_vector.sort(key=lambda fid__: abs(self._weights[fid__[0]]),
                        reverse=True)
    for (f_id, f_val) in feature_vector:
        if self._logarithmic:
            score = self._weights[f_id] * f_val
        else:
            score = self._weights[f_id] ** f_val
        descr = self._encoding.describe(f_id)
        descr = descr.split(' and label is ')[0] # hack
        descr += ' (%s)' % f_val # hack
        if len(descr) > 47:
            descr = descr[:44]+'...'
        print(TEMPLATE % (descr, i*8*' ', score))
        sums[label] += score
print('  '+'-'*(descr_width-1+8*len(labels)))
print('  TOTAL:'.ljust(descr_width)+''.join(
    '%8.3f' % sums[l] for l in labels))
print('  PROBS:'.ljust(descr_width)+''.join(
    '%8.3f' % pdist.prob(l) for l in labels))
""" :param show: all, neg, or pos (for negative-only or positive-only) """ fids = sorted(list(range(len(self._weights))), key=lambda fid: abs(self._weights[fid]), reverse=True) if show == 'pos': fids = [fid for fid in fids if self._weights[fid]>0] elif show == 'neg': fids = [fid for fid in fids if self._weights[fid]<0] for fid in fids[:n]: print('%8.3f %s' % (self._weights[fid], self._encoding.describe(fid)))
return ('<ConditionalExponentialClassifier: %d labels, %d features>' % (len(self._encoding.labels()), self._encoding.length()))
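# Sketch of the probability equation from the class docstring, using
# made-up weights and hand-written sparse joint-feature vectors; the
# feature ids, values, and labels here are hypothetical.
def _example_label_probs():
    from math import exp
    weights = [0.5, -0.2, 1.0]
    # Sparse encodings of the same featureset under two candidate labels.
    encoded = {'A': [(0, 1), (2, 1)], 'B': [(1, 1)]}
    def dotprod(w, fv):
        return sum(w[f_id] * f_val for (f_id, f_val) in fv)
    # With logarithmic weights (the default) the dot products are
    # log-scores, so exponentiate before normalizing.
    scores = dict((l, exp(dotprod(weights, fv)))
                  for (l, fv) in encoded.items())
    total = sum(scores.values())
    return dict((l, s / total) for (l, s) in scores.items())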
#: A list of the algorithm names that are accepted for the
#: ``train()`` method's ``algorithm`` parameter.
ALGORITHMS = ['GIS', 'IIS', 'CG', 'BFGS', 'Powell', 'LBFGSB',
              'Nelder-Mead', 'MEGAM', 'TADM']
labels=None, sparse=True, gaussian_prior_sigma=0, **cutoffs): """ Train a new maxent classifier based on the given corpus of training samples. This classifier will have its weights chosen to maximize entropy while remaining empirically consistent with the training corpus.
:rtype: MaxentClassifier :return: The new maxent classifier
:type train_toks: list :param train_toks: Training data, represented as a list of pairs, the first member of which is a featureset, and the second of which is a classification label.
:type algorithm: str :param algorithm: A case-insensitive string, specifying which algorithm should be used to train the classifier. The following algorithms are currently available.
  - Iterative Scaling Methods: Generalized Iterative Scaling
    (``'GIS'``), Improved Iterative Scaling (``'IIS'``)
  - Optimization Methods (requiring scipy): conjugate gradient
    (``'CG'``), the Broyden-Fletcher-Goldfarb-Shanno algorithm
    (``'BFGS'``), Powell's algorithm (``'Powell'``), a limited-memory
    variant of the BFGS algorithm (``'LBFGSB'``), and the Nelder-Mead
    algorithm (``'Nelder-Mead'``).
  - External Libraries (requiring megam): the LM-BFGS algorithm, with
    training performed by Megam (``'megam'``)
The default algorithm is ``'CG'`` if scipy is installed; and ``'IIS'`` otherwise.
:type trace: int
:param trace: The level of diagnostic tracing output to produce.
    Higher values produce more verbose output.
:type encoding: MaxentFeatureEncodingI
:param encoding: A feature encoding, used to convert featuresets
    into feature vectors.  If none is specified, then a
    ``BinaryMaxentFeatureEncoding`` will be built based on the
    features that are attested in the training corpus.
:type labels: list(str)
:param labels: The set of possible labels.  If none is given, then
    the set of all labels attested in the training data will be
    used instead.
:param sparse: If True, then use sparse matrices instead of dense
    matrices.  Currently, this is only supported by the scipy
    (optimization method) algorithms.  For other algorithms, its
    value is ignored.
:param gaussian_prior_sigma: The sigma value for a gaussian prior on
    model weights.  Currently, this is supported by the scipy
    (optimization method) algorithms and ``megam``.  For other
    algorithms, its value is ignored.
:param cutoffs: Arguments specifying various conditions under which
    the training should be halted.  (Some of the cutoff conditions
    are not supported by some algorithms.)
    - ``max_iter=v``: Terminate after ``v`` iterations.
    - ``min_ll=v``: Terminate after the negative average
      log-likelihood drops under ``v``.
    - ``min_lldelta=v``: Terminate if a single iteration improves
      log likelihood by less than ``v``.
    - ``tolerance=v``: Terminate a scipy optimization method when
      improvement drops below a tolerance level ``v``.  The exact
      meaning of this tolerance depends on the scipy algorithm
      used.  See ``scipy`` documentation for more info.  Default
      values: 1e-3 for CG, 1e-5 for LBFGSB, and 1e-4 for other
      algorithms.  (``scipy`` only)
"""
if key not in ('max_iter', 'min_ll', 'min_lldelta', 'tolerance',
               'max_acc', 'min_accdelta', 'count_cutoff',
               'norm', 'explicit', 'bernoulli'):
    raise TypeError('Unexpected keyword arg %r' % key)
if algorithm == 'iis':
    return train_maxent_classifier_with_iis(
        train_toks, trace, encoding, labels, **cutoffs)
elif algorithm == 'gis':
    return train_maxent_classifier_with_gis(
        train_toks, trace, encoding, labels, **cutoffs)
elif algorithm in cls._SCIPY_ALGS:
    return train_maxent_classifier_with_scipy(
        train_toks, trace, encoding, labels,
        cls._SCIPY_ALGS[algorithm], sparse,
        gaussian_prior_sigma, **cutoffs)
elif algorithm == 'megam':
    return train_maxent_classifier_with_megam(
        train_toks, trace, encoding, labels,
        gaussian_prior_sigma, **cutoffs)
else:
    raise ValueError('Unknown algorithm %s' % algorithm)
_SCIPY_ALGS = {'cg': 'CG', 'bfgs': 'BFGS', 'powell': 'Powell',
               'lbfgsb': 'LBFGSB', 'nelder-mead': 'Nelder-Mead'}
#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier
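# Hypothetical usage sketch for ``MaxentClassifier.train()``: a tiny
# corpus of (featureset, label) pairs, trained with IIS and an iteration
# cutoff.  The data below is made up; see ``nltk.classify.util.names_demo``
# for a fuller demonstration.
def _example_train():
    train_toks = [
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'k'}, 'male'),
        ({'last_letter': 'e'}, 'female'),
        ({'last_letter': 'o'}, 'male'),
    ]
    classifier = MaxentClassifier.train(train_toks, algorithm='IIS',
                                        trace=0, max_iter=10)
    return classifier.classify({'last_letter': 'a'})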
######################################################################
#{ Feature Encodings
######################################################################
""" A mapping that converts a set of input-feature values to a vector of joint-feature values, given a label. This conversion is necessary to translate featuresets into a format that can be used by maximum entropy models.
The set of joint-features used by a given encoding is fixed, and each index in the generated joint-feature vectors corresponds to a single joint-feature. The length of the generated joint-feature vectors is therefore constant (for a given encoding).
Because the joint-feature vectors generated by ``MaxentFeatureEncodingI`` are typically very sparse, they are represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature.
Feature encodings are generally created using the ``train()`` method,
which generates an appropriate encoding based on the input-feature
values and labels that are present in a given corpus.
"""

"""
Given a (featureset, label) pair, return the corresponding vector of
joint-feature values.  This vector is represented as a list of
``(index, value)`` tuples, specifying the value of each non-zero
joint-feature.
:type featureset: dict :rtype: list(tuple(int, int)) """ raise NotImplementedError()
""" :return: The size of the fixed-length joint-feature vectors that are generated by this encoding. :rtype: int """ raise NotImplementedError()
""" :return: A list of the \"known labels\" -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. :rtype: list """ raise NotImplementedError()
""" :return: A string describing the value of the joint-feature whose index in the generated feature vectors is ``fid``. :rtype: str """ raise NotImplementedError()
""" Construct and return new feature encoding, based on a given training corpus ``train_toks``.
:type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. """ raise NotImplementedError()
""" A feature encoding that calls a user-supplied function to map a given featureset/label pair to a sparse joint-feature vector. """ """ Construct a new feature encoding based on the given function.
:type func: (callable) :param func: A function that takes two arguments, a featureset and a label, and returns the sparse joint feature vector that encodes them:
>>> func(featureset, label) -> feature_vector
This sparse joint feature vector (``feature_vector``) is a list of ``(index,value)`` tuples.
:type length: int :param length: The size of the fixed-length joint-feature vectors that are generated by this encoding.
:type labels: list
:param labels: A list of the "known labels" for this encoding --
    i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a
    nonzero joint-feature vector for some value of ``fs``.
"""
self._length = length
self._func = func
self._labels = labels
return self._func(featureset, label)
return self._length
return self._labels
return 'no description available'
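# Hypothetical sketch of a function-backed encoding.  The function below
# emits a single always-on joint-feature per label, giving vectors of
# length len(labels); the labels themselves are made up.
def _example_function_encoding():
    labels = ['A', 'B']
    def func(featureset, label):
        # One (index, value) pair: the always-on feature for ``label``.
        return [(labels.index(label), 1)]
    encoding = FunctionBackedMaxentFeatureEncoding(func=func,
                                                   length=len(labels),
                                                   labels=labels)
    return encoding.encode({'anything': True}, 'B')   # -> [(1, 1)]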
""" A feature encoding that generates vectors containing a binary joint-features of the form:
|  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
|                      {
|                      { 0 otherwise
Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label.
Typically, these features are constructed based on a training corpus, using the ``train()`` method. This method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus.
The ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form:
|  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
|                      {      and l == label
|                      {
|                      { 0 otherwise
Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``.
The ``alwayson_features`` parameter can be used to add "always-on features", which have the form::
|  joint_feat(fs, l) = { 1 if (l == label)
|                      {
|                      { 0 otherwise
These always-on features allow the maxent model to directly model the
prior probabilities of each label.
"""
             alwayson_features=False):
"""
:param labels: A list of the "known labels" for this encoding.
:param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode(..., fname:fval, ..., label)[id]`` is 1; otherwise, it is 0.
:param unseen_features: If true, then include unseen value features in the generated joint-feature vectors.
:param alwayson_features: If true, then include always-on features
    in the generated joint-feature vectors.
"""
    raise ValueError('Mapping values must be exactly the '
                     'set of integers from 0...len(mapping)')
"""A list of attested labels."""
"""dict mapping from (fname,fval,label) -> fid"""
"""The length of generated joint feature vectors."""
"""dict mapping from label -> fid"""
"""dict mapping from fname -> fid"""
for (i,label) in enumerate(labels)])
fnames = set(fname for (fname, fval, label) in mapping)
self._unseen = dict([(fname, i+self._length)
                     for (i, fname) in enumerate(fnames)])
self._length += len(fnames)
# Inherit docs.
# Convert input-features to joint-features: # Known feature name & value:
# Otherwise, we might want to fire an "unseen-value feature".
# Have we seen this fname/fval combination with any label?
for label2 in self._labels:
    if (fname, fval, label2) in self._mapping:
        break  # we've seen this fname/fval combo
# We haven't -- fire the unseen-value feature
else:
    if fname in self._unseen:
        encoding.append((self._unseen[fname], 1))
# Add always-on features:
# Inherit docs.
if not isinstance(f_id, compat.integer_types):
    raise TypeError('describe() expected an int')
try:
    self._inv_mapping
except AttributeError:
    self._inv_mapping = [-1]*len(self._mapping)
    for (info, i) in self._mapping.items():
        self._inv_mapping[i] = info
if f_id < len(self._mapping):
    (fname, fval, label) = self._inv_mapping[f_id]
    return '%s==%r and label is %r' % (fname, fval, label)
elif self._alwayson and f_id in self._alwayson.values():
    for (label, f_id2) in self._alwayson.items():
        if f_id == f_id2:
            return 'label is %r' % label
elif self._unseen and f_id in self._unseen.values():
    for (fname, f_id2) in self._unseen.items():
        if f_id == f_id2:
            return '%s is unseen' % fname
else:
    raise ValueError('Bad feature id')
# Inherit docs.
# Inherit docs.
""" Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``BinaryMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding.
:type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label.
:type count_cutoff: int
:param count_cutoff: A cutoff value that is used to discard rare
    joint-features.  If a joint-feature's value is 1 for fewer than
    ``count_cutoff`` tokens in the training corpus, then that
    joint-feature is not included in the generated encoding.
:type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used.
:param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """
raise ValueError('Unexpected label %s' % label)
# Record each of the features.
# If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff.
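# Hypothetical sketch: build a binary encoding from a tiny corpus and
# inspect the sparse joint-feature vector for one (featureset, label)
# pair.  The feature names and labels are made up.
def _example_binary_encoding():
    train_toks = [({'color': 'red'}, 'pos'),
                  ({'color': 'blue'}, 'neg')]
    encoding = BinaryMaxentFeatureEncoding.train(train_toks,
                                                 alwayson_features=True)
    # A list of (feature_id, 1) pairs: one for color=='red' with label
    # 'pos', plus the always-on feature for 'pos'.
    return encoding.encode({'color': 'red'}, 'pos')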
""" A binary feature encoding which adds one new joint-feature to the joint-features defined by ``BinaryMaxentFeatureEncoding``: a correction feature, whose value is chosen to ensure that the sparse vector always sums to a constant non-negative number. This new feature is used to ensure two preconditions for the GIS training algorithm:
  - At least one feature vector index must be nonzero for every
    token.
  - The feature vector must sum to a constant non-negative number
    for every token.
"""
             alwayson_features=False, C=None):
"""
:param C: The correction constant.  The value of the correction
    feature is based on this value.  In particular, its value is
    ``C - sum([v for (f,v) in encoding])``.
:seealso: ``BinaryMaxentFeatureEncoding.__init__``
"""
    self, labels, mapping, unseen_features, alwayson_features)
def C(self): """The non-negative constant that all encoded feature vectors will sum to."""
# Get the basic encoding.
# Add a correction feature. raise ValueError('Correction feature is not high enough!')
# Return the result
if f_id == BinaryMaxentFeatureEncoding.length(self):
    return 'Correction feature (%s)' % self._C
else:
    return BinaryMaxentFeatureEncoding.describe(self, f_id)
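# Hypothetical sketch of the GIS correction feature: once the correction
# feature is appended, every encoded vector sums to the constant ``C``.
# The corpus below is made up.
def _example_gis_encoding():
    train_toks = [({'color': 'red'}, 'pos'),
                  ({'color': 'blue'}, 'neg')]
    encoding = GISEncoding.train(train_toks)
    vector = encoding.encode({'color': 'red'}, 'pos')
    # The last (index, value) pair is the correction feature, chosen so
    # that the values sum to encoding.C.
    return sum(v for (f_id, v) in vector) == encoding.C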
alwayson_features=False): unseen_features, alwayson_features)
self._mapping[(feature, label)] = len(self._mapping) self._label_mapping[value] = len(self._label_mapping) else: self._label_mapping[value]))
for (feature, label) in self._mapping:
    if self._mapping[(feature, label)] == fid:
        return (feature, label)
return len(self._mapping)
# This gets read twice, so compute the values in case it's lazy.
""" A feature encoding that generates vectors containing integer, float and binary joint-features of the form:
Binary (for string and boolean features):
|  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
|                      {
|                      { 0 otherwise
Value (for integer and float features):
|  joint_feat(fs, l) = { fval if (fs[fname] == type(fval))
|                      {         and (l == label)
|                      {
|                      { not encoded otherwise
Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label.
Typically, these features are constructed based on a training corpus, using the ``train()`` method.
For string and boolean features [type(fval) not in (int, float)] this method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus.
For integer and float features [type(fval) in (int, float)] this method will create one feature for each combination of ``fname`` and ``label`` that occurs at least once in the training corpus.
For binary features the ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form:
|  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
|                      {      and l == label
|                      {
|                      { 0 otherwise
Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``.
The ``alwayson_features`` parameter can be used to add "always-on features", which have the form:
|  joint_feat(fs, l) = { 1 if (l == label)
|                      {
|                      { 0 otherwise
These always-on features allow the maxent model to directly model the
prior probabilities of each label.
"""
             alwayson_features=False):
"""
:param labels: A list of the "known labels" for this encoding.
:param mapping: A dictionary mapping from ``(fname,fval,label)``
    tuples to corresponding joint-feature indexes.  These indexes
    must be the set of integers from 0...len(mapping).  If
    ``mapping[fname,fval,label]=id``, then
    ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
    otherwise, it is 0.
:param unseen_features: If true, then include unseen value features in the generated joint-feature vectors.
:param alwayson_features: If true, then include always-on features
    in the generated joint-feature vectors.
"""
    raise ValueError('Mapping values must be exactly the '
                     'set of integers from 0...len(mapping)')
"""A list of attested labels."""
"""dict mapping from (fname,fval,label) -> fid"""
"""The length of generated joint feature vectors."""
"""dict mapping from label -> fid"""
"""dict mapping from fname -> fid"""
for (i,label) in enumerate(labels)])
fnames = set(fname for (fname, fval, label) in mapping)
self._unseen = dict([(fname, i+self._length)
                     for (i, fname) in enumerate(fnames)])
self._length += len(fnames)
# Inherit docs.
# Convert input-features to joint-features:
        # Known feature name & value:
else:
    # Known feature name & value:
    if (fname, fval, label) in self._mapping:
        encoding.append((self._mapping[fname, fval, label], 1))
# Otherwise, we might want to fire an "unseen-value feature".
elif self._unseen:
    # Have we seen this fname/fval combination with any label?
    for label2 in self._labels:
        if (fname, fval, label2) in self._mapping:
            break  # we've seen this fname/fval combo
    # We haven't -- fire the unseen-value feature
    else:
        if fname in self._unseen:
            encoding.append((self._unseen[fname], 1))
# Add always-on features:
# Inherit docs.
if not isinstance(f_id, compat.integer_types):
    raise TypeError('describe() expected an int')
try:
    self._inv_mapping
except AttributeError:
    self._inv_mapping = [-1]*len(self._mapping)
    for (info, i) in self._mapping.items():
        self._inv_mapping[i] = info
if f_id < len(self._mapping):
    (fname, fval, label) = self._inv_mapping[f_id]
    return '%s==%r and label is %r' % (fname, fval, label)
elif self._alwayson and f_id in self._alwayson.values():
    for (label, f_id2) in self._alwayson.items():
        if f_id == f_id2:
            return 'label is %r' % label
elif self._unseen and f_id in self._unseen.values():
    for (fname, f_id2) in self._unseen.items():
        if f_id == f_id2:
            return '%s is unseen' % fname
else:
    raise ValueError('Bad feature id')
# Inherit docs.
# Inherit docs.
""" Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``TypedMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding.
Note: recognized feature value types are (int, float); other types are
interpreted as regular binary features.
:type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label.
:type count_cutoff: int
:param count_cutoff: A cutoff value that is used to discard rare
    joint-features.  If a joint-feature's value is 1 for fewer than
    ``count_cutoff`` tokens in the training corpus, then that
    joint-feature is not included in the generated encoding.
:type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used.
:param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """
raise ValueError('Unexpected label %s' % label)
# Record each of the features. # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff.
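# Hypothetical sketch: a typed encoding over a corpus that mixes a string
# feature with a float feature.  String values get binary joint-features;
# the float feature contributes its value directly.  The data is made up.
def _example_typed_encoding():
    train_toks = [({'word': 'cheap', 'price': 2.5}, 'low'),
                  ({'word': 'fancy', 'price': 75.0}, 'high')]
    encoding = TypedMaxentFeatureEncoding.train(train_toks)
    # The returned pairs include value 1 for word=='cheap' and the raw
    # float 2.5 for the 'price' feature.
    return encoding.encode({'word': 'cheap', 'price': 2.5}, 'low')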
######################################################################
#{ Classifier Trainer: Generalized Iterative Scaling
######################################################################
labels=None, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Generalized Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``.
:see: ``train_maxent_classifier()`` for parameter descriptions. """
# Construct an encoding from the training data.
raise TypeError('The GIS algorithm requires an encoding that ' 'defines C (e.g., GISEncoding).')
# Cinv is the inverse of the sum of each joint feature vector. # This controls the learning rate: higher Cinv (or lower C) gives # faster learning.
# Count how many times each feature occurs in the training data.
# Check for any features that are not attested in train_toks.
# Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature.
# Take the log of the empirical fcount.
# Old log-likelihood and accuracy; used to check if the change # in log-likelihood or accuracy is sufficient to indicate convergence.
print()
print('      Iteration    Log Likelihood    Accuracy')
print('      ---------------------------------------')
# Train the classifier.
ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
acc = cutoffchecker.acc or accuracy(classifier, train_toks)
iternum = cutoffchecker.iter
print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))
# Use the model to estimate the number of times each # feature should occur in the training data. classifier, train_toks, encoding)
# Take the log of estimated fcount (avoid taking log(0).)
# Update the classifier weights
# Check the log-likelihood & accuracy cutoffs.
except KeyboardInterrupt:
    print('      Training stopped: keyboard interrupt')
except:
    raise
ll = log_likelihood(classifier, train_toks)
acc = accuracy(classifier, train_toks)
print('         Final    %14.5f    %9.3f' % (ll, acc))
# Return the classifier.
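# Sketch of the GIS weight update implied by the comments above: the
# weights move by Cinv times the gap between the log empirical and log
# estimated feature counts.  The array contents are supplied by the
# caller and are hypothetical here.
def _example_gis_update(weights, log_empirical_fcount,
                        log_estimated_fcount, Cinv):
    # All array arguments have length encoding.length(); Cinv is the
    # scalar 1/C taken from the GISEncoding.
    return weights + Cinv * (log_empirical_fcount - log_estimated_fcount)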
######################################################################
#{ Classifier Trainer: Improved Iterative Scaling
######################################################################
labels=None, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Improved Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``.
:see: ``train_maxent_classifier()`` for parameter descriptions. """
# Construct an encoding from the training data.
# Count how many times each feature occurs in the training data. len(train_toks))
# Find the nf map, and related variables nfarray and nfident. # nf is the sum of the features for a given labeled text. # nfmap compresses this sparse set of values to a dense list. # nfarray performs the reverse operation. nfident is # nfarray multiplied by an identity matrix.
# Check for any features that are not attested in train_toks.
# Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature.
print()
print('      Iteration    Log Likelihood    Accuracy')
print('      ---------------------------------------')
# Old log-likelihood and accuracy; used to check if the change # in log-likelihood or accuracy is sufficient to indicate convergence.
# Train the classifier.
ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
acc = cutoffchecker.acc or accuracy(classifier, train_toks)
iternum = cutoffchecker.iter
print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))
# Calculate the deltas for this iteration, using Newton's method. train_toks, classifier, unattested, empirical_ffreq, nfmap, nfarray, nftranspose, encoding)
# Use the deltas to update our weights.
# Check the log-likelihood & accuracy cutoffs.
except KeyboardInterrupt:
    print('      Training stopped: keyboard interrupt')
except:
    raise
ll = log_likelihood(classifier, train_toks)
acc = accuracy(classifier, train_toks)
print('         Final    %14.5f    %9.3f' % (ll, acc))
# Return the classifier.
""" Construct a map that can be used to compress ``nf`` (which is typically sparse).
*nf(feature_vector)* is the sum of the feature values for *feature_vector*.
This represents the number of features that are active for a given labeled text. This method finds all values of *nf(t)* that are attested for at least one token in the given list of training tokens; and constructs a dictionary mapping these attested values to a continuous range *0...N*. For example, if the only values of *nf()* that were attested were 3, 5, and 7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.
:return: A map that can be used to compress ``nf`` to a dense vector. :rtype: dict(int -> int) """ # Map from nf to indices. This allows us to use smaller arrays.
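# Sketch of the compression described above: collect the attested values
# of nf() over the training tokens and number them densely.  The
# ``train_toks`` and ``encoding`` arguments are assumed to follow the
# interfaces used elsewhere in this module.
def _example_nfmap(train_toks, encoding):
    nfset = set()
    for (featureset, label) in train_toks:
        for lab in encoding.labels():
            nfset.add(sum(val for (f_id, val) in
                          encoding.encode(featureset, lab)))
    # e.g. attested nf values {3, 5, 7} -> {3: 0, 5: 1, 7: 2}
    return dict((nf, i) for (i, nf) in enumerate(nfset))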
nfmap, nfarray, nftranspose, encoding): """ Calculate the update values for the classifier weights for this iteration of IIS. These update weights are the value of ``delta`` that solves the equation::
  ffreq_empirical[i]
         =
  SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
             feature_vector(fs,l)[i] *
             exp(delta[i] * nf(feature_vector(fs,l))))
Where:
- *(fs,l)* is a (featureset, label) tuple from ``train_toks``
- *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
- *nf(vector)* = ``sum([val for (id,val) in vector])``
This method uses Newton's method to solve this equation for *delta[i]*. In particular, it starts with a guess of ``delta[i]`` = 1; and iteratively updates ``delta`` with:
| delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])
until convergence, where *sum1* and *sum2* are defined as:
|   sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
|   sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta) *
|                               nf(feature_vector(fs,l)))
|   f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) *
|                       feature_vector(fs,l)[i] *
|                       exp(delta[i] * nf(feature_vector(fs,l))))
Note that *sum1* and *sum2* depend on ``delta``; so they need to be re-computed each iteration.
The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are used to generate a dense encoding for *nf(ltext)*. This allows ``_deltas`` to calculate *sum1* and *sum2* using matrices, which yields a significant performance improvement.
:param train_toks: The set of training tokens.
:type train_toks: list(tuple(dict, str))
:param classifier: The current classifier.
:type classifier: ClassifierI
:param ffreq_empirical: An array containing the empirical frequency
    for each feature.  The *i*\ th element of this array is the
    empirical frequency for feature *i*.
:type ffreq_empirical: sequence of float
:param unattested: An array that is 1 for features that are not
    attested in the training data; and 0 for features that are
    attested.  In other words, ``unattested[i]==0`` iff
    ``ffreq_empirical[i]==0``.
:type unattested: sequence of int
:param nfmap: A map that can be used to compress ``nf`` to a dense
    vector.
:type nfmap: dict(int -> int)
:param nfarray: An array that can be used to uncompress ``nf`` from a
    dense vector.
:type nfarray: array(float)
:param nftranspose: The transpose of ``nfarray``
:type nftranspose: array(float)
"""
# These parameters control when we decide that we've
# converged.  It probably should be possible to set these
# manually, via keyword arguments to train.
# Precompute the A matrix: # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) # over all label,fs s.t. num_features[label,fs]=nf
# Generate the feature vector # Find the number of active features # Update the A matrix
# Iteratively solve for delta. Use the following variables: # - nf_delta[x][y] = nfarray[x] * delta[y] # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # exp(delta[i]nf) # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # nf exp(delta[i]nf)
# Avoid division by zero.
# Update the deltas.
# We can stop once we converge.
    numpy.sum(abs(deltas)))
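# Minimal sketch of the Newton update described in the docstring above,
# written element-wise for clarity; ``sum1`` and ``sum2`` stand for the
# (delta-dependent) sums defined there and must be recomputed by the
# caller before every call.
def _example_newton_step(deltas, ffreq_empirical, sum1, sum2):
    return [d - (emp - s1) / (-s2)
            for (d, emp, s1, s2) in zip(deltas, ffreq_empirical,
                                        sum1, sum2)]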
######################################################################
#{ Classifier Trainer: scipy algorithms (CG, LBFGSB, etc.)
######################################################################
# [xx] n.b.: it's possible to supply custom trace functions, which # could be used to make trace output consistent with iis/gis. labels=None, algorithm='CG', sparse=True, gaussian_prior_sigma=0, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the specified ``scipy`` optimization algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``.
:see: ``train_maxent_classifier()`` for parameter descriptions.
:require: The ``scipy`` package must be installed.
"""
except ImportError as e:
    raise ValueError('The maxent training algorithm %r requires '
                     'that the scipy package be installed.  See '
                     'http://www.scipy.org/' % algorithm)
# E.g., if libgfortran.2.dylib is not found.
except ImportError as e:
    raise ValueError('Import of scipy package failed: %s' % e)
# Construct an encoding from the training data.
    raise ValueError('Specify encoding or labels, not both')
# Decide whether to use a sparse matrix or a dense one.  Very
# limited testing has shown that the lil matrix format
# (list-of-lists) performs better than csr and csc formats.
# Limited testing also suggests that the sparse matrix format
# doesn't save much memory over the dense format in practice
# (in terms of max memory usage).
else:
    zeros = numpy.zeros
# Construct the 'F' matrix, which lists the feature values for # each training instance. F[i, j*len(labels)+k] is equal to the # value of the i'th feature for the feature vector corresponding # to (tok[j], label[k]).
# Construct the 'N' matrix, which specifies the correct label for # each training instance. N[0, j*len(labels)+k] is equal to one # iff label[k] is the correct label for tok[j].
# Fill in the 'F' and 'N' matrices (just make one pass through the # training tokens.)
# Set up the scipy model, based on the matrices F and N.
# note -- model.setsmooth() is buggy.
model.sigma2 = gaussian_prior_sigma**2
model.verbose = True
if algorithm == 'CG':
    model.avegtol = cutoffs['tolerance']
elif algorithm == 'LBFGSB':
    model.maxgtol = cutoffs['tolerance']
else:
    model.tol = cutoffs['tolerance']
# Train the model.
# Convert the model's weights from base-e to base-2 weights.
# Build the classifier
######################################################################
#{ Classifier Trainer: megam
######################################################################
# [xx] possible extension: add support for using implicit file format; # this would need to put requirements on what encoding is used. But # we may need this for other maxent classifier trainers that require # implicit formats anyway. labels=None, gaussian_prior_sigma=0, **kwargs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the external ``megam`` library. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``.
:see: ``train_maxent_classifier()`` for parameter descriptions. :see: ``nltk.classify.megam`` """
# Construct an encoding from the training data.
# Count cutoff can also be controlled by megam with the -minfc
# option.  Not sure where the best place for it is.
    labels=labels, alwayson_features=True)
elif labels is not None:
    raise ValueError('Specify encoding or labels, not both')
# Write a training file for megam.
    explicit=explicit, bernoulli=bernoulli)
except (OSError, IOError, ValueError) as e:
    raise ValueError('Error while creating megam training file: %s' % e)
# Run megam on the training file.
options += ['-fvals']
# Lambda is just the precision of the Gaussian prior, i.e. it's the
# inverse variance, so the parameter conversion is 1.0/sigma**2.
# See http://www.cs.utah.edu/~hal/docs/daume04cg-bfgs.pdf.
inv_variance = 1.0 / gaussian_prior_sigma**2
else:
    # [xx] this is actually a perplexity delta, not a log
    # likelihood delta
    options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
options += ['-multilabel']  # each possible label
# print './megam_i686.opt ', ' '.join(options)

# Delete the training file
try:
    os.remove(trainfile_name)
except (OSError, IOError) as e:
    print('Warning: unable to delete %s: %s' % (trainfile_name, e))
# Parse the generated weight vector. weights = parse_megam_weights(stdout, encoding.length(), explicit)
# Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e)
# Build the classifier return MaxentClassifier(encoding, weights)
######################################################################
#{ Classifier Trainer: tadm
######################################################################
def train(cls, train_toks, **kwargs):
# Construct an encoding from the training data. count_cutoff, labels=labels)
tempfile.mkstemp(prefix='nltk-tadm-events-', suffix='.gz')
tempfile.mkstemp(prefix='nltk-tadm-weights-')
options.extend(['-l2', '%.6f' % sigma**2])
options.extend(['-fatol', '%.6f' % abs(ll_delta)])
else:
    options.extend(['-summary'])
weightfile = open(weightfile_name, 'rb')
weights = parse_tadm_weights(weightfile)
weightfile.close()
os.remove(trainfile_name)
os.remove(weightfile_name)
# Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e)
# Build the classifier return cls(encoding, weights)
######################################################################
#{ Demo
######################################################################

def demo():
    from nltk.classify.util import names_demo
    classifier = names_demo(MaxentClassifier.train)
if __name__ == '__main__':
    demo()