proxygen
cpp.tokenize Namespace Reference

Classes

class  Token
 

Functions

def _GetString (source, start, i)
 
def _GetChar (source, start, i)
 
def GetTokens (source)
 
def main (argv)
 

Variables

string __author__ = 'nnorwitz@google.com (Neal Norwitz)'
 
string _letters = 'abcdefghijklmnopqrstuvwxyz'
 
 VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
 
 HEX_DIGITS = set('0123456789abcdefABCDEF')
 
 INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
 
 _STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
 
string UNKNOWN = 'UNKNOWN'
 
string SYNTAX = 'SYNTAX'
 
string CONSTANT = 'CONSTANT'
 
string NAME = 'NAME'
 
string PREPROCESSOR = 'PREPROCESSOR'
 
 WHENCE_STREAM
 
 WHENCE_QUEUE
 

Function Documentation

def cpp.tokenize._GetChar(source, start, i)
private

Definition at line 105 of file tokenize.py.

Referenced by cpp.tokenize.Token.__str__(), and cpp.tokenize.GetTokens().

def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
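A minimal sketch of how this helper behaves; the C++ fragment below is made up for illustration, and _GetChar returns the index one past the closing quote:

source = r"'\n' + x"          # hypothetical fragment starting with a char literal
end = _GetChar(source, 0, 0)  # skips over the escaped character
print(source[:end])           # prints '\n' -- the full character constant (end == 4)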
def cpp.tokenize._GetString(source, start, i)
private

Definition at line 89 of file tokenize.py.

Referenced by cpp.tokenize.Token.__str__(), and cpp.tokenize.GetTokens().

def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
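A minimal sketch of the escape handling; the input string is made up for illustration. The odd backslash count before the inner quote keeps the scan going, and _GetString returns the index one past the closing quote:

source = r'"a\"b" + 1'          # hypothetical fragment starting with a string literal
end = _GetString(source, 0, 0)  # skips the escaped quote inside the literal
print(source[:end])             # prints "a\"b" -- the full string constant (end == 6)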
def cpp.tokenize.GetTokens(source)
Returns a sequence of Tokens.

Args:
  source: string of C++ source code.

Yields:
  Token that represents the next token in the source.

Definition at line 119 of file tokenize.py.

References cpp.tokenize._GetChar() and cpp.tokenize._GetString().

Referenced by cpp.tokenize.Token.__str__(), and cpp.tokenize.main().

def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
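A short usage sketch; the input string here is made up for the example, and token_type and name are the Token fields that main() below also uses:

from cpp.tokenize import GetTokens

for token in GetTokens('int x = 0x1f; // trailing comment\n'):
    print('%-12s: %s' % (token.token_type, token.name))

# Expected: NAME tokens for 'int' and 'x', a SYNTAX token for '=',
# a CONSTANT token for '0x1f', and a SYNTAX token for ';'.
# The // comment is consumed without yielding a token.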
def cpp.tokenize.main(argv)
Driver mostly for testing purposes.

Definition at line 274 of file tokenize.py.

References cpp.tokenize.GetTokens().

Referenced by cpp.tokenize.Token.__str__().

def main(argv):
    """Driver mostly for testing purposes."""
    for filename in argv[1:]:
        source = utils.ReadFile(filename)
        if source is None:
            continue

        for token in GetTokens(source):
            print('%-12s: %s' % (token.token_type, token.name))
            # print('\r%6.2f%%' % (100.0 * index / token.end),)
        sys.stdout.write('\n')
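A minimal driver sketch, assuming the package is importable as cpp.tokenize (main() itself relies on utils.ReadFile to load each file, as shown above):

import sys
from cpp import tokenize

if __name__ == '__main__':
    # e.g. python this_script.py foo.h bar.cc
    tokenize.main(sys.argv)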

Variable Documentation

string cpp.tokenize.__author__ = 'nnorwitz@google.com (Neal Norwitz)'
private

Definition at line 20 of file tokenize.py.

string cpp.tokenize._letters = 'abcdefghijklmnopqrstuvwxyz'
private

Definition at line 42 of file tokenize.py.

cpp.tokenize._STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
private

Definition at line 49 of file tokenize.py.

string cpp.tokenize.CONSTANT = 'CONSTANT'

Definition at line 55 of file tokenize.py.

cpp.tokenize.HEX_DIGITS = set('0123456789abcdefABCDEF')

Definition at line 44 of file tokenize.py.

cpp.tokenize.INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

Definition at line 45 of file tokenize.py.

string cpp.tokenize.NAME = 'NAME'

Definition at line 56 of file tokenize.py.

string cpp.tokenize.PREPROCESSOR = 'PREPROCESSOR'

Definition at line 57 of file tokenize.py.

string cpp.tokenize.SYNTAX = 'SYNTAX'

Definition at line 54 of file tokenize.py.

string cpp.tokenize.UNKNOWN = 'UNKNOWN'

Definition at line 53 of file tokenize.py.


cpp.tokenize.VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')

Definition at line 43 of file tokenize.py.
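These character sets exist so the membership tests inside GetTokens() are constant-time; a quick sketch of the values that follow from the definitions above:

'f' in HEX_DIGITS                # True
'$' in VALID_IDENTIFIER_CHARS    # True; some compilers accept '$' in identifiers
'.' in INT_OR_FLOAT_DIGITS       # False; GetTokens() unions in '.' separately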

cpp.tokenize.WHENCE_QUEUE

Definition at line 61 of file tokenize.py.

cpp.tokenize.WHENCE_STREAM

Definition at line 61 of file tokenize.py.