/* vim: set sw=4 ts=4 et tw=78: */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is the Narcissus JavaScript engine.
 *
 * The Initial Developer of the Original Code is
 * Brendan Eich.
 * Portions created by the Initial Developer are Copyright (C) 2004
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Tom Austin
 *   Brendan Eich
 *   Shu-Yu Guo
 *   Dave Herman
 *   Dimitris Vardoulakis
 *   Patrick Walton
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

/*
 * Narcissus - JS implemented in JS.
 *
 * Lexical scanner.
 */

Narcissus.lexer = (function() {

    var definitions = Narcissus.definitions;

    // Set constants in the local scope.  The token type names used below
    // (END, IDENTIFIER, NUMBER, ...) are not declared in this file; they are
    // presumably introduced by this eval.  This requires non-strict code.
    eval(definitions.consts);

    // Build up a trie of operator tokens.
    //
    // Every node on the path of an operator is stamped with that operator,
    // so a node ends up holding the *last* inserted operator that passes
    // through it.  NOTE: this is only right for a prefix such as '>' if
    // definitions.opTypeNames enumerates '>' after '>>' and '>>>'.
    var opTokens = {};
    for (var op in definitions.opTypeNames) {
        // Newline and '.' are lexed by dedicated code paths in get().
        if (op === '\n' || op === '.')
            continue;

        var node = opTokens;
        for (var i = 0; i < op.length; i++) {
            var ch = op[i];
            if (!(ch in node))
                node[ch] = {};
            node = node[ch];
            node.op = op;
        }
    }

    /*
     * Tokenizer :: (source, filename, line number) -> Tokenizer
     *
     * s is coerced to a string; f defaults to "" and l to 1.
     */
    function Tokenizer(s, f, l) {
        this.cursor = 0;            // index of the next unread character
        this.source = String(s);
        this.tokens = [];           // ring buffer of four token objects
        this.tokenIndex = 0;        // slot of the current token
        this.lookahead = 0;         // number of tokens pushed back by unget
        this.scanNewlines = false;  // when true, '\n' is reported as NEWLINE
        this.unexpectedEOF = false;
        this.filename = f || "";
        this.lineno = l || 1;
    }

    Tokenizer.prototype = {
        // True when the next token is END.
        get done() {
            // We need to set scanOperand to true here because the first thing
            // might be a regexp.
            return this.peek(true) === END;
        },

        // The current token object.
        get token() {
            return this.tokens[this.tokenIndex];
        },

        /*
         * Consume the next token if its type is tt and return true;
         * otherwise push it back and return undefined (the result of unget).
         */
        match: function (tt, scanOperand) {
            return this.get(scanOperand) === tt || this.unget();
        },

        /*
         * Like match, but throw a SyntaxError when the next token is not of
         * type tt.  Returns the matched token.
         */
        mustMatch: function (tt) {
            if (!this.match(tt)) {
                throw this.newSyntaxError("Missing " +
                                          definitions.tokens[tt].toLowerCase());
            }
            return this.token;
        },

        /*
         * Consume the next token as an identifier.  Keywords are accepted
         * and retyped as IDENTIFIER; anything else is pushed back and a
         * SyntaxError is thrown.  Returns the token.
         */
        forceIdentifier: function() {
            if (!this.match(IDENTIFIER)) {
                // keywords are valid property names in ES 5
                var tt = this.get();
                var token = this.token;
                // A keyword token is one whose text maps to its own type in
                // the keyword table (the lookup lexIdent performs).  The
                // typeof guard skips NUMBER/REGEXP values and stale slots.
                if (typeof token.value === 'string' &&
                    definitions.keywords[token.value] === tt) {
                    token.type = IDENTIFIER;
                } else {
                    // Push the offending token back, as a failed match does,
                    // so the error points at it.
                    this.unget();
                    throw this.newSyntaxError("Missing identifier");
                }
            }
            return this.token;
        },

        /*
         * Return the type of the next token without consuming it.  With
         * scanNewlines set, a buffered token on a different line than the
         * current one is reported as NEWLINE.
         */
        peek: function (scanOperand) {
            var tt, next;
            if (this.lookahead) {
                next = this.tokens[(this.tokenIndex + this.lookahead) & 3];
                tt = (this.scanNewlines && next.lineno !== this.lineno)
                     ? NEWLINE
                     : next.type;
            } else {
                tt = this.get(scanOperand);
                this.unget();
            }
            return tt;
        },

        // peek, with newlines treated as significant for this one call.
        peekOnSameLine: function (scanOperand) {
            this.scanNewlines = true;
            var tt = this.peek(scanOperand);
            this.scanNewlines = false;
            return tt;
        },

        /*
         * Eat comments and whitespace.  On return the cursor is at the first
         * character that belongs to a token, or at input.length.
         * Throws a SyntaxError for an unterminated block comment.
         */
        skip: function () {
            var input = this.source;
            for (;;) {
                var ch = input[this.cursor++];
                var next = input[this.cursor];
                if (ch === '\n' && !this.scanNewlines) {
                    this.lineno++;
                } else if (ch === '/' && next === '*') {
                    this.cursor++;
                    for (;;) {
                        ch = input[this.cursor++];
                        if (ch === undefined)
                            throw this.newSyntaxError("Unterminated comment");

                        if (ch === '*') {
                            next = input[this.cursor];
                            if (next === '/') {
                                this.cursor++;
                                break;
                            }
                        } else if (ch === '\n') {
                            this.lineno++;
                        }
                    }
                } else if (ch === '/' && next === '/') {
                    this.cursor++;
                    for (;;) {
                        ch = input[this.cursor++];
                        if (ch === undefined) {
                            // The comment runs to end of input.  Step back
                            // so the cursor equals input.length and get()
                            // reports END instead of "Illegal token".
                            this.cursor--;
                            return;
                        }

                        if (ch === '\n') {
                            if (this.scanNewlines) {
                                // Leave the newline for get() so it is
                                // reported (and counted) as a NEWLINE token.
                                this.cursor--;
                            } else {
                                this.lineno++;
                            }
                            break;
                        }
                    }
                } else if (ch !== ' ' && ch !== '\t') {
                    // Not whitespace: put the character back.  This is also
                    // the path taken at end of input (ch is undefined).
                    this.cursor--;
                    return;
                }
            }
        },

        /*
         * Lex the exponential part of a number, if present. Return true iff
         * an exponential part was found.  Throws a SyntaxError when 'e'/'E'
         * is not followed by an (optionally signed) digit.
         */
        lexExponent: function() {
            var input = this.source;
            var next = input[this.cursor];
            if (next === 'e' || next === 'E') {
                this.cursor++;
                // Declared locally; assigning an undeclared name would
                // create a global.
                var ch = input[this.cursor++];
                if (ch === '+' || ch === '-')
                    ch = input[this.cursor++];

                // Written as a negated range test so that end of input
                // (ch === undefined) is rejected as well.
                if (!(ch >= '0' && ch <= '9'))
                    throw this.newSyntaxError("Missing exponent");

                do {
                    ch = input[this.cursor++];
                } while (ch >= '0' && ch <= '9');
                this.cursor--;

                return true;
            }

            return false;
        },

        /*
         * Lex a numeric literal whose first character is '0': a decimal
         * fraction, a hex literal, a legacy octal literal, or plain zero
         * with an optional exponent.
         */
        lexZeroNumber: function (ch) {
            var token = this.token, input = this.source;
            token.type = NUMBER;

            ch = input[this.cursor++];
            if (ch === '.') {
                do {
                    ch = input[this.cursor++];
                } while (ch >= '0' && ch <= '9');
                this.cursor--;

                this.lexExponent();
                token.value = parseFloat(input.substring(token.start,
                                                         this.cursor));
            } else if (ch === 'x' || ch === 'X') {
                do {
                    ch = input[this.cursor++];
                } while ((ch >= '0' && ch <= '9') ||
                         (ch >= 'a' && ch <= 'f') ||
                         (ch >= 'A' && ch <= 'F'));
                this.cursor--;

                // parseInt accepts the "0x" prefix when the radix is 16.
                token.value = parseInt(input.substring(token.start,
                                                       this.cursor), 16);
            } else if (ch >= '0' && ch <= '7') {
                do {
                    ch = input[this.cursor++];
                } while (ch >= '0' && ch <= '7');
                this.cursor--;

                // Explicit radix: ES5+ parseInt no longer treats a leading
                // zero as octal, so "017" would otherwise yield 17.
                token.value = parseInt(input.substring(token.start,
                                                       this.cursor), 8);
            } else {
                this.cursor--;
                this.lexExponent();     // 0E1, &c.
                token.value = 0;
            }
        },

        // Lex a decimal literal whose first character is '1'..'9'.
        lexNumber: function (ch) {
            var token = this.token, input = this.source;
            token.type = NUMBER;

            var floating = false;
            do {
                ch = input[this.cursor++];
                if (ch === '.' && !floating) {
                    floating = true;
                    ch = input[this.cursor++];
                }
            } while (ch >= '0' && ch <= '9');

            this.cursor--;

            var exponent = this.lexExponent();
            floating = floating || exponent;

            var str = input.substring(token.start, this.cursor);
            token.value = floating ? parseFloat(str) : parseInt(str, 10);
        },

        // Lex a token starting with '.': either a number (".5") or DOT.
        lexDot: function (ch) {
            var token = this.token, input = this.source;
            var next = input[this.cursor];
            if (next >= '0' && next <= '9') {
                do {
                    ch = input[this.cursor++];
                } while (ch >= '0' && ch <= '9');
                this.cursor--;

                this.lexExponent();

                token.type = NUMBER;
                token.value = parseFloat(input.substring(token.start,
                                                         this.cursor));
            } else {
                token.type = DOT;
                token.assignOp = null;
                token.value = '.';
            }
        },

        /*
         * Lex a string literal delimited by ch (' or ").  Throws a
         * SyntaxError when the input ends before the closing delimiter.
         */
        lexString: function (ch) {
            var token = this.token, input = this.source;
            token.type = STRING;

            var hasEscapes = false;
            var delim = ch;
            while ((ch = input[this.cursor++]) !== delim) {
                // >= rather than ==: when the opening quote is the last
                // character the cursor is already past input.length here,
                // and an equality test would loop forever.
                if (this.cursor >= input.length)
                    throw this.newSyntaxError("Unterminated string literal");
                if (ch === '\\') {
                    hasEscapes = true;
                    if (input[this.cursor] === '\n')
                        this.lineno++;      // fix for escaped newline
                    if (++this.cursor >= input.length)
                        throw this.newSyntaxError("Unterminated string literal");
                }
            }

            // NOTE: escapes are decoded by eval'ing the literal's own source
            // text (from opening to closing quote).
            token.value = hasEscapes
                          ? eval(input.substring(token.start, this.cursor))
                          : input.substring(token.start + 1, this.cursor - 1);
        },

        /*
         * Lex a regular expression literal, including trailing flags.
         * Throws a SyntaxError for an unterminated body or character class.
         */
        lexRegExp: function (ch) {
            var token = this.token, input = this.source;
            token.type = REGEXP;

            do {
                ch = input[this.cursor++];
                if (ch === '\\') {
                    // Skip the escaped character.
                    this.cursor++;
                } else if (ch === '[') {
                    // A '/' inside a character class does not end the regexp.
                    do {
                        if (ch === undefined)
                            throw this.newSyntaxError("Unterminated character class");

                        if (ch === '\\')
                            this.cursor++;

                        ch = input[this.cursor++];
                    } while (ch !== ']');
                } else if (ch === undefined) {
                    throw this.newSyntaxError("Unterminated regex");
                }
            } while (ch !== '/');

            // Flags.
            do {
                ch = input[this.cursor++];
            } while (ch >= 'a' && ch <= 'z');

            this.cursor--;

            // NOTE: the value is built by eval'ing the literal's source text.
            token.value = eval(input.substring(token.start, this.cursor));
        },

        /*
         * Lex an operator of up to three characters starting with ch, plus
         * an optional trailing '=' that turns it into an ASSIGN token.
         */
        lexOp: function (ch) {
            var token = this.token, input = this.source;

            // A bit ugly, but it seems wasteful to write a trie lookup routine
            // for only 3 characters...
            var node = opTokens[ch];
            var next = input[this.cursor];
            if (next in node) {
                node = node[next];
                this.cursor++;
                next = input[this.cursor];
                if (next in node) {
                    node = node[next];
                    this.cursor++;
                    next = input[this.cursor];
                }
            }

            var op = node.op;
            if (definitions.assignOps[op] && input[this.cursor] === '=') {
                this.cursor++;
                token.type = ASSIGN;
                token.assignOp = definitions.tokenIds[definitions.opTypeNames[op]];
                op += '=';
            } else {
                token.type = definitions.tokenIds[definitions.opTypeNames[op]];
                token.assignOp = null;
            }

            token.value = op;
        },

        // FIXME: Unicode escape sequences
        // FIXME: Unicode identifiers
        // Lex an identifier or keyword made of ASCII letters, digits, '$'
        // and '_'.
        lexIdent: function (ch) {
            var token = this.token, input = this.source;

            do {
                ch = input[this.cursor++];
            } while ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
                     (ch >= '0' && ch <= '9') || ch === '$' || ch === '_');

            this.cursor--;  // Put the non-word character back.

            var id = input.substring(token.start, this.cursor);
            token.type = definitions.keywords[id] || IDENTIFIER;
            token.value = id;
        },

        /*
         * Tokenizer.get :: void -> token type
         *
         * Consume input *only* if there is no lookahead.
         * Dispatch to the appropriate lexing function depending on the input.
         *
         * When scanOperand is true a leading '/' starts a regexp literal
         * rather than an operator.  Throws a SyntaxError ("Illegal token")
         * on a character that starts no token.
         */
        get: function (scanOperand) {
            var token;
            // Replay tokens pushed back by unget, skipping buffered NEWLINE
            // tokens unless newlines are significant.
            while (this.lookahead) {
                --this.lookahead;
                this.tokenIndex = (this.tokenIndex + 1) & 3;
                token = this.tokens[this.tokenIndex];
                if (token.type !== NEWLINE || this.scanNewlines)
                    return token.type;
            }

            this.skip();

            // Advance to the next ring-buffer slot, creating it on first use.
            this.tokenIndex = (this.tokenIndex + 1) & 3;
            token = this.tokens[this.tokenIndex];
            if (!token)
                this.tokens[this.tokenIndex] = token = {};

            var input = this.source;
            // >= so that a cursor left past the end still yields END.
            if (this.cursor >= input.length)
                return token.type = END;

            token.start = this.cursor;
            token.lineno = this.lineno;

            var ch = input[this.cursor++];
            if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
                ch === '$' || ch === '_') {
                this.lexIdent(ch);
            } else if (scanOperand && ch === '/') {
                this.lexRegExp(ch);
            } else if (ch in opTokens) {
                this.lexOp(ch);
            } else if (ch === '.') {
                this.lexDot(ch);
            } else if (ch >= '1' && ch <= '9') {
                this.lexNumber(ch);
            } else if (ch === '0') {
                this.lexZeroNumber(ch);
            } else if (ch === '"' || ch === "'") {
                this.lexString(ch);
            } else if (this.scanNewlines && ch === '\n') {
                token.type = NEWLINE;
                token.value = '\n';
                this.lineno++;
            } else {
                throw this.newSyntaxError("Illegal token");
            }

            token.end = this.cursor;
            return token.type;
        },

        /*
         * Tokenizer.unget :: void -> undefined
         *
         * Match depends on unget returning undefined.
         */
        unget: function () {
            // The ring buffer has four slots, so at most three tokens can be
            // pushed back.
            if (++this.lookahead === 4) throw "PANIC: too much lookahead!";
            this.tokenIndex = (this.tokenIndex - 1) & 3;
        },

        /*
         * Build (but do not throw) a SyntaxError whose message is
         * "filename:lineno:m".  The error carries the source text and a
         * cursor: the start of the furthest pushed-back token when there is
         * lookahead, otherwise the current cursor.
         */
        newSyntaxError: function (m) {
            var e = new SyntaxError(this.filename + ":" + this.lineno + ":" + m);
            e.source = this.source;
            e.cursor = this.lookahead
                       ? this.tokens[(this.tokenIndex + this.lookahead) & 3].start
                       : this.cursor;
            return e;
        },
    };

    return { Tokenizer: Tokenizer };

}());