# Natural Language Toolkit: Tokenizers 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Edward Loper <edloper@gradient.cis.upenn.edu> 

#         Steven Bird <sb@csse.unimelb.edu.au> 

#         Trevor Cohn <tacohn@csse.unimelb.edu.au> 

# URL: <http://nltk.sourceforge.net> 

# For license information, see LICENSE.TXT 

 

r""" 

Regular-Expression Tokenizers 

 

A ``RegexpTokenizer`` splits a string into substrings using a regular expression. 

For example, the following tokenizer forms tokens out of alphabetic sequences, 

money expressions, and any other non-whitespace sequences: 

 

    >>> from nltk.tokenize import RegexpTokenizer 

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks." 

    >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

    >>> tokenizer.tokenize(s) 

    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 

    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] 

 

A ``RegexpTokenizer`` can use its regexp to match delimiters instead: 

 

    >>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)

    >>> tokenizer.tokenize(s) 

    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 

    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] 

 

Note that empty tokens are not returned when the delimiter appears at 

the start or end of the string. 
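
For example, with the gap tokenizer defined just above:

    >>> tokenizer.tokenize('  Thanks.  ')
    ['Thanks.']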

 

The material between the tokens is discarded.  For example, 

the following tokenizer selects just the capitalized words: 

 

    >>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')

    >>> capword_tokenizer.tokenize(s) 

    ['Good', 'New', 'York', 'Please', 'Thanks'] 

 

This module contains several subclasses of ``RegexpTokenizer`` 

that use pre-defined regular expressions. 

 

    >>> from nltk.tokenize import BlanklineTokenizer 

    >>> # Uses '\s*\n\s*\n\s*': 

    >>> BlanklineTokenizer().tokenize(s) 

    ['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.', 

    'Thanks.'] 

 

All of the regular expression tokenizers are also available as functions: 

 

    >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize 

    >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+')

    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 

    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] 

    >>> wordpunct_tokenize(s) 

    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', 

     '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] 

    >>> blankline_tokenize(s) 

    ['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.', 'Thanks.'] 

 

Caution: The function ``regexp_tokenize()`` takes the text as its 

first argument, and the regular expression pattern as its second 

argument.  This differs from the conventions used by Python's 

``re`` functions, where the pattern is always the first argument. 

(This is for consistency with the other NLTK tokenizers.) 
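
For example, with a simple pattern the two interfaces produce the same
tokens; only the argument order differs:

    >>> import re
    >>> regexp_tokenize(s, pattern=r'\w+') == re.findall(r'\w+', s)
    True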

""" 

 

import re 

 

from nltk.internals import convert_regexp_to_nongrouping 

from nltk.tokenize.api import TokenizerI 

from nltk.tokenize.util import regexp_span_tokenize 

 

class RegexpTokenizer(TokenizerI): 

    """ 

    A tokenizer that splits a string using a regular expression, which 

    matches either the tokens or the separators between tokens. 

 

        >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
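
    When ``gaps`` is False (the default), the pattern matches the tokens
    themselves; for example:

        >>> tokenizer.tokenize("Good muffins cost $3.88 in New York.")
        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']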

 

    :type pattern: str 

    :param pattern: The pattern used to build this tokenizer. 

        (This pattern may safely contain grouping parentheses.) 

    :type gaps: bool 

    :param gaps: True if this tokenizer's pattern should be used 

        to find separators between tokens; False if this 

        tokenizer's pattern should be used to find the tokens 

        themselves. 

    :type discard_empty: bool 

    :param discard_empty: True if any empty tokens `''` 

        generated by the tokenizer should be discarded.  Empty 

        tokens can only be generated if `_gaps == True`. 

    :type flags: int 

    :param flags: The regexp flags used to compile this 

        tokenizer's pattern.  By default, the following flags are 

        used: `re.UNICODE | re.MULTILINE | re.DOTALL`. 
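
    For example, ``discard_empty`` only has an effect when ``gaps`` is True:

        >>> RegexpTokenizer(r'\s+', gaps=True, discard_empty=False).tokenize(' a b')
        ['', 'a', 'b']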

 

    """ 

    def __init__(self, pattern, gaps=False, discard_empty=True, 

                 flags=re.UNICODE | re.MULTILINE | re.DOTALL): 

        # If they gave us a regexp object, extract the pattern. 

        pattern = getattr(pattern, 'pattern', pattern) 

 

        self._pattern = pattern 

        self._gaps = gaps 

        self._discard_empty = discard_empty 

        self._flags = flags 

        self._regexp = None 

 

        # Remove grouping parentheses -- if the regexp contains any 

        # grouping parentheses, then the behavior of re.findall and 

        # re.split will change. 
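        # (For example, re.findall(r'(ab)c', 'abcabc') returns ['ab', 'ab'],
        # not ['abc', 'abc'].)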

        nongrouping_pattern = convert_regexp_to_nongrouping(pattern) 

 

        try: 

            self._regexp = re.compile(nongrouping_pattern, flags) 

        except re.error as e: 

            raise ValueError('Error in regular expression %r: %s' % 

                             (pattern, e)) 

 

    def tokenize(self, text): 

        # If our regexp matches gaps, use re.split: 

        if self._gaps: 

            if self._discard_empty: 

                return [tok for tok in self._regexp.split(text) if tok] 

            else: 

                return self._regexp.split(text) 

 

        # If our regexp matches tokens, use re.findall: 

        else: 

            return self._regexp.findall(text) 

 

    def span_tokenize(self, text): 
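        # Like tokenize(), but yield (start, end) offsets into the text
        # rather than the token substrings themselves.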

        if self._gaps: 

            for left, right in regexp_span_tokenize(text, self._regexp): 

                if not (self._discard_empty and left == right): 

                    yield left, right 

        else: 

            for m in re.finditer(self._regexp, text): 

                yield m.span() 

 

    def __repr__(self): 

        return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' % 

                (self.__class__.__name__, self._pattern, self._gaps, 

                 self._discard_empty, self._flags)) 

 

class WhitespaceTokenizer(RegexpTokenizer): 

    r""" 

    Tokenize a string on whitespace (space, tab, newline). 

    In general, users should use the string ``split()`` method instead. 

 

        >>> from nltk.tokenize import WhitespaceTokenizer 

        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks." 

        >>> WhitespaceTokenizer().tokenize(s) 

        ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 

        'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] 
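
    For this example string, ``str.split()`` gives the same result:

        >>> s.split() == WhitespaceTokenizer().tokenize(s)
        True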

    """ 

 

    def __init__(self): 

        RegexpTokenizer.__init__(self, r'\s+', gaps=True) 

 

class BlanklineTokenizer(RegexpTokenizer): 

    """ 

    Tokenize a string, treating any sequence of blank lines as a delimiter. 

    Blank lines are defined as lines containing no characters other than

    space or tab characters.
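
    A short example (the module docstring shows a longer one):

        >>> from nltk.tokenize import BlanklineTokenizer
        >>> BlanklineTokenizer().tokenize("First paragraph.\\n\\nSecond paragraph.")
        ['First paragraph.', 'Second paragraph.']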

    """ 

    def __init__(self): 

        RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True) 

 

class WordPunctTokenizer(RegexpTokenizer): 

    """ 

    Tokenize a text into a sequence of alphabetic and 

    non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``. 

 

        >>> from nltk.tokenize import WordPunctTokenizer 

        >>> s = "Good muffins cost $3.88\\nin New York.  Please buy me\\ntwo of them.\\n\\nThanks." 

        >>> WordPunctTokenizer().tokenize(s) 

        ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', 

        '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] 

    """ 

    def __init__(self): 

        RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+') 

 

###################################################################### 

#{ Tokenization Functions 

###################################################################### 

 

def regexp_tokenize(text, pattern, gaps=False, discard_empty=True, 

                    flags=re.UNICODE | re.MULTILINE | re.DOTALL): 

    """ 

    Return a tokenized copy of *text*.  See :class:`.RegexpTokenizer` 

    for descriptions of the arguments. 

    """ 

    tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags) 

    return tokenizer.tokenize(text) 

 

blankline_tokenize = BlanklineTokenizer().tokenize 

wordpunct_tokenize = WordPunctTokenizer().tokenize 

 

 

if __name__ == "__main__": 

    import doctest 

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)