# Natural Language Toolkit: Interface to scikit-learn classifiers
#
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
scikit-learn (http://scikit-learn.org) is a machine learning library for
Python, supporting most of the basic classification algorithms, including
SVMs, Naive Bayes, logistic regression and decision trees.

This package implements a wrapper around scikit-learn classifiers. To use
this wrapper, construct a scikit-learn classifier, then use that to
construct a SklearnClassifier. E.g., to wrap a linear SVM classifier with
default settings, do

>>> from sklearn.svm import LinearSVC
>>> from nltk.classify.scikitlearn import SklearnClassifier
>>> classif = SklearnClassifier(LinearSVC())

The scikit-learn classifier may be arbitrarily complex. E.g., the following
constructs and wraps a Naive Bayes estimator with tf-idf weighting and
chi-square feature selection:

>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> from sklearn.naive_bayes import MultinomialNB
>>> from sklearn.pipeline import Pipeline
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
...                      ('chi2', SelectKBest(chi2, k=1000)),
...                      ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)

(Such a classifier could be trained on word counts for text classification.)
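
The wrapped classifier is trained like any other NLTK classifier, on
(featureset, label) pairs, where a featureset maps feature names to values.
A minimal end-to-end sketch, reusing the MultinomialNB import above (the
toy word-count data is invented purely for illustration):

>>> train_data = [({'dog': 3, 'cat': 1}, 'pets'),
...               ({'stock': 2, 'cat': 0}, 'finance')]
>>> classif = SklearnClassifier(MultinomialNB()).train(train_data)
>>> classif.classify({'dog': 1})
'pets'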

""" 

 

import numpy as np 

from scipy.sparse import coo_matrix 

 

from nltk.classify.api import ClassifierI 

from nltk.probability import DictionaryProbDist 

from nltk import compat 

 

class SklearnClassifier(ClassifierI): 

    """Wrapper for scikit-learn classifiers.""" 

 

    def __init__(self, estimator, dtype=float, sparse=True): 

        """ 

        :param estimator: scikit-learn classifier object. 

 

        :param dtype: data type used when building feature array. 

            scikit-learn estimators work exclusively on numeric data; use bool 

            when all features are binary. 

 

        :param sparse: Whether to use sparse matrices. The estimator must 

            support these; not all scikit-learn classifiers do. The default 

            value is True, since most NLP problems involve sparse feature sets. 

        :type sparse: boolean. 
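
        For instance, to pair a Bernoulli Naive Bayes estimator with dense
        boolean feature vectors (an illustrative sketch, not the only
        sensible combination):

        >>> from sklearn.naive_bayes import BernoulliNB
        >>> classif = SklearnClassifier(BernoulliNB(), dtype=bool, sparse=False)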

        """ 

        self._clf = estimator 

        self._dtype = dtype 

        self._sparse = sparse 

 

    def __repr__(self): 

        return "<SklearnClassifier(%r)>" % self._clf 

 

    def batch_classify(self, featuresets): 

        X = self._convert(featuresets) 

        y = self._clf.predict(X) 

        return [self._index_label[int(yi)] for yi in y] 

 

    def batch_prob_classify(self, featuresets): 

        X = self._convert(featuresets) 

        y_proba_list = self._clf.predict_proba(X) 

        return [self._make_probdist(y_proba) for y_proba in y_proba_list] 

 

    def labels(self): 

        return self._label_index.keys() 

 

    def train(self, labeled_featuresets): 

        """ 

        Train (fit) the scikit-learn estimator. 

 

        :param labeled_featuresets: A list of classified featuresets, 

            i.e., a list of tuples ``(featureset, label)``. 
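
        For example, the expected shape of the training data (an
        illustrative sketch):

        >>> labeled_featuresets = [({'a': 1, 'b': 0}, 'x'),
        ...                        ({'a': 0, 'b': 1}, 'y')]

        Returns self, so that construction and training can be chained.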

        """ 

 

        self._feature_index = {} 

        self._index_label = [] 

        self._label_index = {} 

 

        for fs, label in labeled_featuresets: 

            for f in fs: 

                if f not in self._feature_index: 

                    self._feature_index[f] = len(self._feature_index) 

            if label not in self._label_index: 

                self._index_label.append(label) 

                self._label_index[label] = len(self._label_index) 

 

        featuresets, labels = zip(*labeled_featuresets) 

        X = self._convert(featuresets) 

        y = np.array([self._label_index[l] for l in labels]) 

 

        self._clf.fit(X, y) 

 

        return self 

 

    def _convert(self, featuresets): 

        if self._sparse: 

            return self._featuresets_to_coo(featuresets) 

        else: 

            return self._featuresets_to_array(featuresets) 

 

    def _featuresets_to_coo(self, featuresets): 

        """Convert featuresets to sparse matrix (COO format).""" 
        i_ind = []
        j_ind = []
        values = []

        for i, fs in enumerate(featuresets):
            for f, v in compat.iteritems(fs):
                try:
                    j = self._feature_index[f]
                    i_ind.append(i)
                    j_ind.append(j)
                    values.append(self._dtype(v))
                except KeyError:    # feature not seen in training
                    pass

        # The row count comes from the last loop index, so featuresets may
        # be any (non-empty) iterable, not just a sequence.
        shape = (i + 1, len(self._feature_index))
        return coo_matrix((values, (i_ind, j_ind)), shape=shape, dtype=self._dtype)

    def _featuresets_to_array(self, featuresets):
        """Convert featuresets to a dense numpy array."""

        X = np.zeros((len(featuresets), len(self._feature_index)),
                     dtype=self._dtype)

        for i, fs in enumerate(featuresets):
            for f, v in compat.iteritems(fs):
                try:
                    X[i, self._feature_index[f]] = self._dtype(v)
                except KeyError:    # feature not seen in training
                    pass

        return X

    def _make_probdist(self, y_proba):
        # Map the estimator's probability vector back onto the original
        # labels.
        return DictionaryProbDist(dict((self._index_label[i], p)
                                       for i, p in enumerate(y_proba)))


if __name__ == "__main__":
    from nltk.classify.util import names_demo, binary_names_demo_features
    try:
        from sklearn.linear_model.sparse import LogisticRegression
    except ImportError:     # separate sparse LR to be removed in 0.12
        from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB

    print("scikit-learn Naive Bayes:")
    names_demo(SklearnClassifier(BernoulliNB(binarize=False), dtype=bool).train,
               features=binary_names_demo_features)
    print("scikit-learn logistic regression:")
    names_demo(SklearnClassifier(LogisticRegression(), dtype=np.float64).train,
               features=binary_names_demo_features)