proxygen
cpp.tokenize Namespace Reference

Classes

class  Token
 

Functions

def _GetString (source, start, i)
 
def _GetChar (source, start, i)
 
def GetTokens (source)
 
def main (argv)
 

Variables

string __author__ = 'nnorwitz@google.com (Neal Norwitz)'
 
string _letters = 'abcdefghijklmnopqrstuvwxyz'
 
 VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
 
 HEX_DIGITS = set('0123456789abcdefABCDEF')
 
 INT_OR_FLOAT_DIGITS = set('01234567890eE-+')
 
 _STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
 
string UNKNOWN = 'UNKNOWN'
 
string SYNTAX = 'SYNTAX'
 
string CONSTANT = 'CONSTANT'
 
string NAME = 'NAME'
 
string PREPROCESSOR = 'PREPROCESSOR'
 
 WHENCE_STREAM
 
 WHENCE_QUEUE
 

Function Documentation

def cpp.tokenize._GetChar(source, start, i)
private

Definition at line 105 of file tokenize.py.

Referenced by cpp.tokenize.Token.__str__(), and cpp.tokenize.GetTokens().

def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
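A minimal sketch of how this helper behaves; the C++ fragment below is made up for illustration, and _GetChar returns the index one past the closing quote:

source = r"'\n' + x"          # hypothetical fragment starting with a char literal
end = _GetChar(source, 0, 0)  # skips over the escaped character
print(source[:end])           # prints '\n' -- the full character constant (end == 4)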
def cpp.tokenize._GetString(source, start, i)
private

Definition at line 89 of file tokenize.py.

Referenced by cpp.tokenize.Token.__str__(), and cpp.tokenize.GetTokens().

def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When trailing backslashes are even, they escape each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
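A minimal sketch of the escape handling; the input string is made up for illustration. The odd backslash count before the inner quote keeps the scan going, and _GetString returns the index one past the closing quote:

source = r'"a\"b" + 1'          # hypothetical fragment starting with a string literal
end = _GetString(source, 0, 0)  # skips the escaped quote inside the literal
print(source[:end])             # prints "a\"b" -- the full string constant (end == 6)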
def cpp.tokenize.GetTokens(source)
Returns a sequence of Tokens.

Args:
  source: string of C++ source code.

Yields:
  Token that represents the next token in the source.

Definition at line 119 of file tokenize.py.

References cpp.tokenize._GetChar() and cpp.tokenize._GetString().

Referenced by cpp.tokenize.Token.__str__(), and cpp.tokenize.main().

def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == "'" and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape. This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled. Since we will ignore
            # it anyways, this is probably fine. So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
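A short usage sketch; the input string here is made up for the example, and token_type and name are the Token fields that main() below also uses:

from cpp.tokenize import GetTokens

for token in GetTokens('int x = 0x1f; // trailing comment\n'):
    print('%-12s: %s' % (token.token_type, token.name))

# Expected: NAME tokens for 'int' and 'x', a SYNTAX token for '=',
# a CONSTANT token for '0x1f', and a SYNTAX token for ';'.
# The // comment is consumed without yielding a token.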
def cpp.tokenize.main(argv)
Driver mostly for testing purposes.

Definition at line 274 of file tokenize.py.

References cpp.tokenize.GetTokens().

Referenced by cpp.tokenize.Token.__str__().

def main(argv):
    """Driver mostly for testing purposes."""
    for filename in argv[1:]:
        source = utils.ReadFile(filename)
        if source is None:
            continue

        for token in GetTokens(source):
            print('%-12s: %s' % (token.token_type, token.name))
            # print('\r%6.2f%%' % (100.0 * index / token.end),)
        sys.stdout.write('\n')
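A minimal driver sketch, assuming the package is importable as cpp.tokenize (main() itself relies on utils.ReadFile to load each file, as shown above):

import sys
from cpp import tokenize

if __name__ == '__main__':
    # e.g. python this_script.py foo.h bar.cc
    tokenize.main(sys.argv)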

Variable Documentation

string cpp.tokenize.__author__ = 'nnorwitz@google.com (Neal Norwitz)'
private

Definition at line 20 of file tokenize.py.

string cpp.tokenize._letters = 'abcdefghijklmnopqrstuvwxyz'
private

Definition at line 42 of file tokenize.py.

cpp.tokenize._STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
private

Definition at line 49 of file tokenize.py.

string cpp.tokenize.CONSTANT = 'CONSTANT'

Definition at line 55 of file tokenize.py.

cpp.tokenize.HEX_DIGITS = set('0123456789abcdefABCDEF')

Definition at line 44 of file tokenize.py.

cpp.tokenize.INT_OR_FLOAT_DIGITS = set('01234567890eE-+')

Definition at line 45 of file tokenize.py.

string cpp.tokenize.NAME = 'NAME'

Definition at line 56 of file tokenize.py.

string cpp.tokenize.PREPROCESSOR = 'PREPROCESSOR'

Definition at line 57 of file tokenize.py.

string cpp.tokenize.SYNTAX = 'SYNTAX'

Definition at line 54 of file tokenize.py.

string cpp.tokenize.UNKNOWN = 'UNKNOWN'

Definition at line 53 of file tokenize.py.


cpp.tokenize.VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')

Definition at line 43 of file tokenize.py.
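These character sets exist so the membership tests inside GetTokens() are constant-time; a quick sketch of the values that follow from the definitions above:

'f' in HEX_DIGITS                # True
'$' in VALID_IDENTIFIER_CHARS    # True; some compilers accept '$' in identifiers
'.' in INT_OR_FLOAT_DIGITS       # False; GetTokens() unions in '.' separately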

cpp.tokenize.WHENCE_QUEUE

Definition at line 61 of file tokenize.py.

cpp.tokenize.WHENCE_STREAM

Definition at line 61 of file tokenize.py.