# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
#         Steven Bird <sb@csse.unimelb.edu.au> (minor edits)
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

""" 

S-Expression Tokenizer 

 

``SExprTokenizer`` is used to find parenthesized expressions in a 

string.  In particular, it divides a string into a sequence of 

substrings that are either parenthesized expressions (including any 

nested parenthesized expressions), or other whitespace-separated 

tokens. 

 

    >>> from nltk.tokenize import SExprTokenizer 

    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') 

    ['(a b (c d))', 'e', 'f', '(g)'] 

 

By default, `SExprTokenizer` will raise a ``ValueError`` exception if 

used to tokenize an expression with non-matching parentheses: 

 

    >>> SExprTokenizer().tokenize('c) d) e (f (g') 

    Traceback (most recent call last): 

      ... 

    ValueError: Un-matched close paren at char 1 

 

The ``strict`` argument can be set to False to allow for 

non-matching parentheses.  Any unmatched close parentheses will be 

listed as their own s-expression; and the last partial sexpr with 

unmatched open parentheses will be listed as its own sexpr: 

 

    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') 

    ['c', ')', 'd', ')', 'e', '(f (g'] 

 

The characters used for open and close parentheses may be customized 

using the ``parens`` argument to the `SExprTokenizer` constructor: 

 

    >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}') 

    ['{a b {c d}}', 'e', 'f', '{g}'] 

 

The s-expression tokenizer is also available as a function: 

 

    >>> from nltk.tokenize import sexpr_tokenize 

    >>> sexpr_tokenize('(a b (c d)) e f (g)') 

    ['(a b (c d))', 'e', 'f', '(g)'] 

 

""" 

 

import re

from nltk.tokenize.api import TokenizerI

class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into s-expressions.
    An s-expression can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.
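
    For example, a pair of multi-character delimiters may also be supplied
    as a list of two strings (a small illustration of the list form of the
    ``parens`` argument described below):

        >>> SExprTokenizer(parens=['<<', '>>']).tokenize('<<a b>> c <<d>>')
        ['<<a b>>', 'c', '<<d>>']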

 

    :param parens: A two-element sequence specifying the open and close parentheses
        that should be used to find sexprs.  This will typically be either a
        two-character string, or a list of two strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
    """

    def __init__(self, parens='()', strict=True):
        if len(parens) != 2:
            raise ValueError('parens must contain exactly two strings')
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        # Match either delimiter; re.escape() lets delimiters contain
        # regex metacharacters (e.g. the default '(' and ')').
        self._paren_regexp = re.compile('%s|%s' % (re.escape(parens[0]),
                                                   re.escape(parens[1])))

    def tokenize(self, text):
        """
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)
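
        For instance, a close parenthesis inside a quoted string still
        closes the enclosing expression, so the "real" close parenthesis
        that follows it appears unmatched (an illustrative example of this
        caveat):

            >>> SExprTokenizer().tokenize('(a ")" b) (c)')
            Traceback (most recent call last):
              ...
            ValueError: Un-matched close paren at char 8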

 

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor.  If ``strict`` is ``True``, then
        raise a ``ValueError``.  If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str
        :rtype: list(str)
        """

        result = []
        pos = 0
        depth = 0
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            # At the top level, split any text since the previous
            # s-expression into whitespace-separated tokens.
            if depth == 0:
                result += text[pos:m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError('Un-matched close paren at char %d'
                                     % m.start())
                depth = max(0, depth-1)
                # A parenthesized expression is complete when its outermost
                # close paren brings the depth back to zero.
                if depth == 0:
                    result.append(text[pos:m.end()])
                    pos = m.end()
        if self._strict and depth > 0:
            raise ValueError('Un-matched open paren at char %d' % pos)
        if pos < len(text):
            result.append(text[pos:])
        return result

sexpr_tokenize = SExprTokenizer().tokenize


if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)