Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

# Natural Language Toolkit: Interface to Megam Classifier 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

""" 

A set of functions used to interface with the external megam_ maxent 

optimization package. Before megam can be used, you should tell NLTK where it 

can find the megam binary, using the ``config_megam()`` function. Typical 

usage: 

 

.. doctest:: 

    :options: +SKIP 

 

    >>> from nltk.classify import megam 

    >>> megam.config_megam() # pass path to megam if not found in PATH 

    [Found megam: ...] 

 

Use with MaxentClassifier. Example below, see MaxentClassifier documentation 

for details. 

 

    nltk.classify.MaxentClassifier.train(corpus, 'megam') 

 

.. _megam: http://www.cs.utah.edu/~hal/megam/ 

""" 

from __future__ import print_function 

 

import os 

import os.path 

import subprocess 

 

from nltk import compat 

from nltk.internals import find_binary 

try: 

    import numpy 

except ImportError: 

    numpy = None 

 

###################################################################### 

#{ Configuration 

###################################################################### 

 

# Path of the megam executable; remains None until config_megam() is called.
_megam_bin = None


def config_megam(bin=None):
    """
    Locate the ``megam`` maxent optimization binary and remember its
    path in the module-level ``_megam_bin`` for later use.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin

    # Environment variables and executable names handed to find_binary().
    search_env_vars = ['MEGAM',  'MEGAMHOME']
    candidate_names = ['megam.opt', 'megam', 'megam_686', 'megam_i686.opt']

    _megam_bin = find_binary(
        'megam',
        bin,
        env_vars=search_env_vars,
        binary_names=candidate_names,
        url='http://www.cs.utah.edu/~hal/megam/')

 

###################################################################### 

#{ Megam Interface Functions 

###################################################################### 

 

def write_megam_file(train_toks, encoding, stream,
                     bernoulli=True, explicit=True):
    """
    Write a ``megam`` input file for the given corpus of classified
    tokens to ``stream``, one line per training instance.

    Each line starts with either the index of the instance's label
    (its position in ``encoding.labels()``) or, when the encoding has a
    ``cost`` attribute, the colon-separated costs of predicting each
    label.  The encoded features follow.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors. May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    # Map every label to its position in the encoding's label list.
    all_labels = encoding.labels()
    label_index = dict((lbl, idx) for (idx, lbl) in enumerate(all_labels))

    for featureset, label in train_toks:
        # Line prefix: per-label costs if the encoding supplies them,
        # otherwise the numeric index of the gold label.
        if hasattr(encoding, 'cost'):
            costs = [str(encoding.cost(featureset, label, candidate))
                     for candidate in all_labels]
            stream.write(':'.join(costs))
        else:
            stream.write('%d' % label_index[label])

        if explicit:
            # Explicit format: one '#'-introduced feature group for each
            # possible label.
            for candidate in all_labels:
                stream.write(' #')
                encoded = encoding.encode(featureset, candidate)
                _write_megam_features(encoded, stream, bernoulli)
        else:
            # Implicit format: only the features that fire for the
            # instance's actual label.
            encoded = encoding.encode(featureset, label)
            _write_megam_features(encoded, stream, bernoulli)

        # Terminate the instance.
        stream.write('\n')

 

def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    :param s: The text printed by ``megam``: one ``<feature id> <weight>``
        pair per line; blank lines are ignored.  Raw ``bytes`` (as read
        from a subprocess pipe on Python 3) are also accepted and are
        decoded as UTF-8 before parsing.
    :param features_count: Length of the returned weight vector.  Feature
        ids that do not appear in ``s`` keep a weight of zero.
    :param explicit: Must be true; the non-explicit format is not
        supported yet.
    :return: A ``numpy`` array of doubles, indexed by feature id.
    :raise ValueError: If ``numpy`` is not installed, or if a non-blank
        line does not consist of exactly two whitespace-separated fields.
    """
    if numpy is None:
        raise ValueError('This function requires that numpy be installed')
    assert explicit, 'non-explicit not supported yet'

    # On Python 3 a subprocess pipe yields bytes, which cannot be split on
    # a text newline.  (On Python 2 ``bytes is str``, so nothing happens.)
    if isinstance(s, bytes) and not isinstance(s, str):
        s = s.decode('utf-8')

    lines = s.strip().split('\n')
    weights = numpy.zeros(features_count, 'd')
    for line in lines:
        if line.strip():
            fid, weight = line.split()
            weights[int(fid)] = float(weight)
    return weights

 

def _write_megam_features(vector, stream, bernoulli): 

    if not vector: 

        raise ValueError('MEGAM classifier requires the use of an ' 

                         'always-on feature.') 

    for (fid, fval) in vector: 

        if bernoulli: 

            if fval == 1: 

                stream.write(' %s' % fid) 

            elif fval != 0: 

                raise ValueError('If bernoulli=True, then all' 

                                 'features must be binary.') 

        else: 

            stream.write(' %s %s' % (fid, fval)) 

 

def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    If the binary has not been located yet, ``config_megam()`` is called
    first to find it.

    :param args: The command-line arguments to pass to ``megam`` (not
        including the binary itself), as a list of strings.
    :return: Everything ``megam`` wrote to its standard output, as text.
    :raise TypeError: If ``args`` is a single string rather than a list.
    :raise OSError: If ``megam`` exits with a non-zero return code.
    """
    if isinstance(args, compat.string_types):
        raise TypeError('args should be a list of strings')
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess.  list() lets callers pass any sequence
    # of arguments (e.g. a tuple), not just a list.
    cmd = [_megam_bin] + list(args)
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    (stdout, stderr) = p.communicate()

    # Check the return code.
    if p.returncode != 0:
        print()
        # stderr is not piped, so the child writes it straight to the
        # parent's stderr and communicate() returns None for it; only
        # print when there is actually something captured.
        if stderr is not None:
            print(stderr)
        raise OSError('megam command failed!')

    # On Python 3 the pipe yields bytes; hand text back to callers so the
    # output can be parsed with ordinary string operations.  (On Python 2
    # ``bytes is str``, so the output is returned unchanged.)
    if isinstance(stdout, bytes) and not isinstance(stdout, str):
        return stdout.decode('utf-8')
    return stdout