# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

r""" 

Simple Tokenizers 

 

These tokenizers divide strings into substrings using the string 

``split()`` method. 

When tokenizing using a particular delimiter string, use 

the string ``split()`` method directly, as this is more efficient. 

 

The simple tokenizers are *not* available as separate functions; 

instead, you should just use the string ``split()`` method directly: 

 

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks." 

    >>> s.split() 

    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 

    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] 

    >>> s.split(' ') 

    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 

    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] 

    >>> s.split('\n') 

    ['Good muffins cost $3.88', 'in New York.  Please buy me', 

    'two of them.', '', 'Thanks.'] 

 

The simple tokenizers are mainly useful because they follow the 

standard ``TokenizerI`` interface, and so can be used with any code 

that expects a tokenizer.  For example, these tokenizers can be used 

to specify the tokenization conventions when building a `CorpusReader`. 
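
For example, any code written against ``TokenizerI`` accepts these
tokenizers interchangeably (a minimal sketch; ``count_tokens`` is a
hypothetical helper, not part of NLTK):

    >>> from nltk.tokenize import SpaceTokenizer, LineTokenizer
    >>> def count_tokens(text, tokenizer):
    ...     # Works with any TokenizerI implementation.
    ...     return len(tokenizer.tokenize(text))
    >>> count_tokens(s, SpaceTokenizer())
    12
    >>> count_tokens(s, LineTokenizer())
    4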
"""

from nltk.tokenize.api import TokenizerI, StringTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = ' '


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = '\t'


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
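
    For example (spans are ``(start, end)`` character offsets):

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize('abc')
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize('abc'))
        [(0, 1), (1, 2), (2, 3)]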
    """

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        # Character i occupies the span (i, i+1).
        for i, j in enumerate(range(1, len(s) + 1)):
            yield i, j


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]
        >>> LineTokenizer(blanklines='discard').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']
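
    The ``span_tokenize()`` method generates the corresponding
    ``(start, end)`` offsets; for example, with ``blanklines='keep'``:

        >>> list(LineTokenizer(blanklines='keep').span_tokenize(s))
        [(0, 23), (24, 51), (52, 64), (65, 65), (66, 73)]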

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline.
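
    For example, a trailing blank line is dropped under ``discard-eof``
    but kept under ``keep``:

        >>> LineTokenizer(blanklines='discard-eof').tokenize('one\ntwo\n\n')
        ['one', 'two']
        >>> LineTokenizer(blanklines='keep').tokenize('one\ntwo\n\n')
        ['one', 'two', '']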
    """

    def __init__(self, blanklines='discard'):
        valid_blanklines = ('discard', 'keep', 'discard-eof')
        if blanklines not in valid_blanklines:
            raise ValueError('Blank lines must be one of: %s' %
                             ' '.join(valid_blanklines))

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == 'discard':
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == 'discard-eof':
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    def span_tokenize(self, s):
        # Note: 'discard-eof' is not implemented here; it is treated
        # the same as 'discard'.
        if self._blanklines == 'keep':
            # string_span_tokenize matches a literal separator string,
            # so pass a real newline rather than the regex escape r'\n'.
            for span in string_span_tokenize(s, '\n'):
                yield span
        else:
            # \s*\n (rather than \s+\n) also merges directly adjacent
            # newlines, so runs of blank lines yield no empty spans.
            for span in regexp_span_tokenize(s, r'\n(\s*\n)*'):
                yield span


######################################################################
#{ Tokenization Functions
######################################################################

def line_tokenize(text, blanklines='discard'):
    r"""Tokenize *text* into lines, using ``LineTokenizer(blanklines)``.
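
    For example, with the default ``blanklines='discard'``:

        >>> from nltk.tokenize.simple import line_tokenize
        >>> line_tokenize('one\ntwo\n\nthree')
        ['one', 'two', 'three']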
    """
    return LineTokenizer(blanklines).tokenize(text)


if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)