// This module implements the Earley Parser, an algorithm [given on Wikipedia
// here](https://en.wikipedia.org/wiki/Earley_parser). Much of this code was
// translated from [the desktop version of Lurch](http://www.lurchmath.org).

// An Earley state is an object of the following form. The `lhs` and `rhs`
// together are the rule currently being matched, `pos` is the current
// position in that production, a zero-based index through all the interstitial
// points in `rhs` (zero being before the whole thing, 1 after the first
// entry, etc.), `ori` is the position in the input text at which the match
// began (called the origin), and `got` is the list of tokens parsed so far.
// ```
// {
//     lhs : categoryname,
//     rhs : [ ... ],
//     pos : integerindex,
//     ori : integerindex,
//     got : [ ... ]
// }
// ```
// A parsed token is either a plain string containing the terminal or an array
// whose first element is the category name and the rest of which are the
// terminals and nonterminals in its parsing.
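
// For example, with hypothetical category names `sum` and `int` (invented
// here for illustration only), the parsed token for the input `1+2` might
// look like the following nested array.
// ```
// [ 'sum', [ 'int', '1' ], '+', [ 'int', '2' ] ]
// ```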

const getNext = state =>
    state.pos < state.rhs.length ? state.rhs[state.pos] : null

// The following simple tool is used to copy state objects. It is a shallow
// copy in all except the `got` array.
const copyState = state => {
    return {
        lhs : state.lhs,
        rhs : state.rhs,
        pos : state.pos,
        ori : state.ori,
        got : state.got.slice()
    }
}

// We will later need to compare two arrays of strings and/or regular
// expressions for equality. This function does so.
const equalArrays = ( array1, array2 ) => {
    if ( array1.length != array2.length ) return false
    for ( let i = 0 ; i < array1.length ; i++ ) {
        const entry1 = array1[i]
        const entry2 = array2[i]
        if ( entry1 instanceof RegExp ) {
            if ( !( entry2 instanceof RegExp )
              || entry1.source != entry2.source ) return false
        } else {
            if ( entry1 != entry2 ) return false
        }
    }
    return true
}

// All of the functionality of this module is embedded in a class called
// `Grammar`, which lets you define new grammars and then run them on strings
// to parse those strings. This section defines that class.

// As mentioned on the Wikipedia page linked to above, a grammar is a set of
// rules of the form `C -> A1 A2 ... An` where `C` is the name of a category
// and each `Ai` can be a category name or a terminal.

// The `Grammar` class defined below stores a grammar as an object whose keys
// are category names with values of the following form.
// ```
// [
//     [ 'catname', 'catname', /terminal/, /terminal/, ... ],
//     [ 'catname', /terminal/, 'catname', /terminal/, ... ],
//     ...
// ]
// ```
// Each row in the two-dimensional array represents the right-hand side of one
// rule in the grammar, whose left-hand side is the category name under which
// the entire two-dimensional array is stored.

// The entries in the arrays can be strings (which signify the names of
// non-terminals) or regular expressions (which signify terminals, i.e., input
// tokens that must match the regular expression).

// Now we begin the class.

export class Grammar {

    // Indicate which of the categories is the starting category by passing
    // its name to a grammar when you construct one.
    constructor ( START ) {
        this.START = START
        this.rules = { }
        this.defaults = {
            addCategories : true,
            collapseBranches : false,
            showDebuggingOutput : false,
            expressionBuilder : null,
            tokenizer : null,
            comparator : sameJSON,
            maxIterations : -1 // no maximum
        }
    }

    // The default options for the parsing algorithm are initialized in the
    // constructor above, but you can change them using the following routine.
    // The first parameter is the name of the option (from the list immediately
    // above) and the second parameter is its new value. The meaning of these
    // options is documented [below](#earley-algorithm).
    setOption ( name, value ) { this.defaults[name] = value }

    // Add a rule to the grammar by specifying the category name and the
    // sequence of Ai that appear on the right-hand side. This creates/extends
    // the two-dimensional array described above.

    // You can pass more than one sequence by providing additional parameters,
    // to add them all at once. You can also provide a string instead of an
    // array, and it will be converted into an array by splitting it at
    // spaces. Regular expressions will be automatically wrapped in `^...$`
    // for you, so that they are always tested against the entire string.
    addRule ( categoryName, ...sequences ) {
        sequences.forEach( sequence => {
            if ( sequence instanceof RegExp )
                sequence = [ sequence ]
            if ( !( sequence instanceof Array ) )
                sequence = `${sequence}`.split( ' ' )
            sequence.forEach( ( entry, index ) => {
                if ( entry instanceof RegExp )
                    sequence[index] = new RegExp( `^${entry.source}$` )
            } )
            if ( !this.rules[categoryName] ) this.rules[categoryName] = [ ]
            this.rules[categoryName].push( sequence )
        } )
    }
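
    // For instance, all three calls below are hypothetical but legal ways to
    // invoke this method; the grammar and category names are invented for
    // this sketch only.
    // ```
    // const G = new Grammar( 'expr' )
    // G.addRule( 'int', /[0-9]+/ )                  // lone regular expression
    // G.addRule( 'expr', [ 'expr', /[+]/, 'int' ] ) // explicit array of items
    // G.addRule( 'expr', 'int' )                    // string, split at spaces
    // ```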

    // The following function is the workhorse of this module. It assumes
    // that the input is a string of nonzero length. Options is not a
    // required parameter, but if it is present it should be an object with
    // some subset of the following properties. Any unspecified properties
    // take the defaults given in the constructor for this class, unless you
    // changed them with `setOption`, defined [above](#constructor).
    // * `addCategories : true` iff category names should be prepended to
    //   each match sequence
    // * `collapseBranches : true` iff one-argument match sequences should be
    //   collapsed, as in `[[[[a]]]] -> a`
    // * `showDebuggingOutput : true` iff lots of debugging spam should be
    //   dumped to the console as the algorithm executes
    // * `expressionBuilder` can be set to a function that will be called
    //   each time a production is completed. It will receive as input the
    //   results of that production (wrapped in an array if
    //   `collapseBranches` is true, with the category name prepended if
    //   `addCategories` is true) and it can return any object to replace
    //   that array in the final result. Since this will be called at every
    //   level of the hierarchy, you can use this to recursively build
    //   expressions from the leaves upwards. Because it will need to be
    //   copyable, outputs are restricted to JSON data.
    // * `tokenizer` can be an instance of the `Tokenizer` class
    //   [defined later in this module](#tokenizing), and if it is, it will
    //   be applied to any string input received by the parser before the
    //   parser does anything with it. This way you can simply place the
    //   tokenizer inside the parser and forget about it; it will be run
    //   automatically.
    // * `comparator` is used to compare two results before returning the
    //   full list, so that duplicates can be removed. This defaults to a
    //   JSON-based comparison, but will therefore go into an infinite loop
    //   for circular structures. Feel free to provide a different one if
    //   the default does not meet your needs. To return duplicates, simply
    //   set this to `() => false`.
    // * `maxIterations` defaults to infinite, but can be specified as a
    //   positive integer, and the parsing algorithm will not iterate its
    //   innermost loops any more than this many times. This can be useful
    //   if you have a suspected infinite loop in a grammar, and want to
    //   debug it.

    // This algorithm is documented to some degree, but it will make much
    // more sense if you have read the Wikipedia page cited at the top of
    // this file.
    parse ( input, options = { } ) {
        // Merge the given options over the stored defaults, without
        // modifying the defaults themselves.
        options = Object.assign( { }, this.defaults, options )
        const expressionBuilderFlag = { }
        const debug = ( ...args ) => {
            if ( options.showDebuggingOutput ) console.log( ...args )
        }
        debug( '\n\n' )
        // Run the tokenizer if there is one, and the input needs it.
        if ( options.tokenizer && typeof( input ) == 'string' )
            input = options.tokenizer.tokenize( input )
        // Initialize the set of states to the array `[ [], [], ..., [] ]`,
        // one entry for each interstice between characters in `input`,
        // including one for before the first character and one for after
        // the last.
        const stateGrid = new Array( input.length + 1 ).fill( null )
            .map( () => [ ] )
        // Push all productions for the starting non-terminal onto the
        // initial state set.
        stateGrid[0].push( {
            lhs : '',
            rhs : [ this.START ],
            pos : 0,
            ori : 0,
            got : [ ]
        } )
        // Do the main nested loop which solves the whole problem.
        let numIterationsDone = 0
        stateGrid.forEach( ( stateSet, i ) => {
            debug( `processing stateSet ${i} in this stateGrid (with input ${input}):` )
            debug( '----------------------' )
            for ( let tmpi = 0 ; tmpi < stateGrid.length ; tmpi++ ) {
                debug( `| state set ${tmpi}:` )
                let skipped = 0
                for ( let tmpj = 0 ; tmpj < stateGrid[tmpi].length ; tmpj++ ) {
                    if ( stateGrid[tmpi].length < 15
                      || stateGrid[tmpi][tmpj].pos > 0 )
                        debug( `| entry ${tmpj}: ${debugState( stateGrid[tmpi][tmpj] )}` )
                    else
                        skipped++
                }
                if ( skipped > 0 )
                    debug( `| (plus ${skipped} at pos 0 not shown)` )
            }
            debug( '----------------------' )
            // The following loop is written in this indirect way (not using
            // `for`) because the contents of `stateSet` may be modified
            // within the loop, so we need to be sure that we do not
            // pre-compute its length, but allow it to grow.
            let j = 0
            while ( j < stateSet.length ) {
                const state = stateSet[j]
                debug( `entry ${j}:`, debugState( state ) )
                // There are three possibilities:
                // * the next item in the rule is a terminal,
                // * the next item is a non-terminal (a production), or
                // * there is no next item.
                // Each of these is handled by a separate sub-task of the
                // Earley algorithm.
                const next = getNext( state )
                debug( 'next:', next )
                if ( next === null ) {
                    // This is the case in which there is no next item. It is
                    // handled by running the "completer": We just completed a
                    // non-terminal, so mark progress in whichever rules
                    // spawned it by copying them into the current column of
                    // `stateGrid`, with their progress incremented one step.
                    debug( 'considering if this completion matters to state set',
                        state.ori )
                    stateGrid[state.ori].forEach( s => {
                        if ( getNext( s ) == state.lhs ) {
                            s = copyState( s )
                            s.pos++
                            let got = state.got.slice()
                            if ( options.addCategories )
                                got.unshift( state.lhs )
                            if ( options.expressionBuilder )
                                got.unshift( expressionBuilderFlag )
                            if ( options.collapseBranches && got.length == 1 )
                                got = got[0]
                            s.got.push( got )
                            stateGrid[i].push( s )
                            debug( `completer added this to ${i}:`,
                                debugState( s ) )
                            if ( numIterationsDone++ > options.maxIterations
                              && options.maxIterations > 0 )
                                throw new Error(
                                    'Maximum number of iterations reached.' )
                        }
                    } )
                    j++
                    continue
                }
                if ( i >= input.length ) {
                    j++
                    continue
                }
                debug( 'is it a terminal?', next instanceof RegExp )
                if ( next instanceof RegExp ) {
                    // This is the case in which the next item is a terminal.
                    // It is handled by running the "scanner": If the next
                    // token of the input matches that terminal, copy this
                    // state into the next column of `stateGrid`, with its
                    // progress incremented past the terminal.
                    if ( next.test( input[i] ) ) {
                        const copy = copyState( state )
                        copy.pos++
                        copy.got.push( input[i] )
                        stateGrid[i+1].push( copy )
                        debug( `scanner added this to ${i+1}:`,
                            debugState( copy ) )
                    }
                    j++
                    continue
                }
                if ( !this.rules.hasOwnProperty( next ) )
                    throw new Error(
                        `Unknown non-terminal in grammar rule: ${next}` )
                // This is the case in which the next item is a non-terminal,
                // i.e., the lhs of one or more rules. It is handled by
                // running the "predictor": For every rule that starts with
                // the non-terminal that's coming next, add that rule to the
                // current state set so that it will be explored in future
                // passes through the inner of the two main loops.
                const rhss = this.rules[next]
                debug( `rhss: [${rhss.join('],[')}]` )
                rhss.forEach( rhs => {
                    let found = false
                    for ( let s of stateSet ) {
                        if ( s.lhs == next && equalArrays( s.rhs, rhs )
                          && s.pos == 0 ) {
                            found = true
                            break
                        }
                    }
                    if ( !found ) {
                        stateSet.push( {
                            lhs : next,
                            rhs : rhs,
                            pos : 0,
                            ori : i,
                            got : [ ]
                        } )
                        debug( 'adding this state:',
                            debugState( stateSet[stateSet.length-1] ) )
                    }
                } )
                j++
                if ( numIterationsDone++ > options.maxIterations
                  && options.maxIterations > 0 )
                    throw new Error( 'Maximum number of iterations reached.' )
            }
        } )
        debug( `finished processing this stateGrid (with input ${input}):` )
        debug( '----------------------' )
        for ( let tmpi = 0 ; tmpi < stateGrid.length ; tmpi++ ) {
            debug( `| state set ${tmpi}:` )
            let skipped = 0
            for ( let tmpj = 0 ; tmpj < stateGrid[tmpi].length ; tmpj++ ) {
                if ( stateGrid[tmpi].length < 15
                  || stateGrid[tmpi][tmpj].pos > 0 )
                    debug( `| entry ${tmpj}: ${debugState( stateGrid[tmpi][tmpj] )}` )
                else
                    skipped++
            }
            if ( skipped > 0 )
                debug( `| (plus ${skipped} at pos 0 not shown)` )
        }
        debug( '----------------------' )
        // The main loop is complete. Any completed production in the final
        // state set whose left-hand side is the empty string (the sentinel we
        // used for the start rule, and thus a parse that began at position 0)
        // is a valid parsing and should be returned. We find such productions
        // with this loop:
        const results = [ ]
        stateGrid[stateGrid.length-1].forEach( state => {
            if ( state.lhs == '' && getNext( state ) == null ) {
                let result = state.got[0]
                // When we find one, we have some checks to do before
                // returning it. First, recursively apply
                // `expressionBuilder`, if the client asked us to.
                if ( options.expressionBuilder ) {
                    const recur = obj => {
                        if ( !( obj instanceof Array )
                          || obj[0] != expressionBuilderFlag ) return obj
                        let args = obj.slice( 1 ).map( recur )
                        if ( args.length == 1 && options.collapseBranches )
                            args = args[0]
                        // If the expression builder function returns
                        // undefined for any subexpression of the whole, we
                        // treat that as an error (saying the expression
                        // cannot be built for whatever application-specific
                        // reason the builder function has) and we thus do not
                        // include that result in the list.
                        if ( args.indexOf( undefined ) > -1 ) return undefined
                        return options.expressionBuilder( args )
                    }
                    result = recur( result )
                    if ( !result ) return
                }
                // Second, don't return any duplicates. So check to see if
                // we've already seen this result before we add it to the
                // final list of results to return.
                if ( !results.some( old => options.comparator( old, result ) ) )
                    results.push( result )
            }
        } )
        // Now return the final result list.
        return results
    }

}
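
// To illustrate the full cycle, here is a minimal sketch of how the class
// just defined might be used. The grammar, category names, and input are
// hypothetical; only `Grammar`, `addRule()`, and `parse()` come from this
// module. With no tokenizer installed, the parser scans the input one
// character at a time, so the terminal below matches a single digit.
// ```
// const G = new Grammar( 'sum' )
// G.addRule( 'int', /[0-9]/ )
// G.addRule( 'sum', 'int' )
// G.addRule( 'sum', [ 'sum', /[+]/, 'int' ] )
// G.parse( '1+2' )
// // e.g. [ [ 'sum', [ 'sum', [ 'int', '1' ] ], '+', [ 'int', '2' ] ] ]
// ```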

// We also provide a class for doing simple tokenization of strings into
// arrays of tokens, which can then be passed to a parser. To use this class,
// create an instance, add some token types using the `addType` function
// documented below, then either call its `tokenize` function yourself on a
// string, or just set this tokenizer as the default tokenizer on a parser.

export class Tokenizer {

    constructor () {
        this.tokenTypes = [ ]
    }

    // This function adds a token type to this object. The first parameter is
    // the regular expression used to match the tokens. The second parameter
    // can be any of three things:
    // * If it is a function, that function will be run on every instance of
    //   the token that's found in any input being tokenized, and the output
    //   of the function used in place of the token string in the return
    //   value from this tokenizer. But if the function returns null, the
    //   tokenizer will omit that token from the output array. This is
    //   useful for, say, removing whitespace:
    //   `addType( /\s/, () => null )`. The function will actually receive
    //   two parameters, the second being the regular expression match
    //   object, which can be useful if there were captured subexpressions.
    // * If it is a string, that string will be used as the output token
    //   instead of the actual matched token. All `%n` patterns in the
    //   output will be simultaneously replaced with the captured expressions
    //   of the type's regular expression (with zero being the entire match).
    //   This is useful for reformatting tokens by adding detail. Example:
    //   `addType( /-?[0-9]+/, 'Integer(%0)' )`
    // * The second parameter may be omitted, and it will be treated as the
    //   identity function, as in the first bullet point above.
    addType ( regexp, formatter = x => x ) {
        if ( regexp.source[0] != '^' )
            regexp = new RegExp( `^(?:${regexp.source})` )
        this.tokenTypes.push( { regexp, formatter } )
    }
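
    // As a brief sketch, the token types below are hypothetical, chosen only
    // to illustrate the three kinds of second parameter just described.
    // ```
    // const T = new Tokenizer()
    // T.addType( /\s+/, () => null )           // discard whitespace
    // T.addType( /-?[0-9]+/, 'Integer(%0)' )   // reformat via a string
    // T.addType( /[a-zA-Z_]+/ )                // identity formatter
    // ```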

    // Tokenizing is useful for grouping large, complex chunks of text into
    // one piece before parsing, so that the parsing rules can be simpler and
    // clearer. For example, a regular expression that groups double-quoted
    // string literals into single tokens is `/"(?:[^\\"]|\\\\|\\")*"/`.
    // That's a much shorter bit of code to write than a complex set of
    // parsing rules that accomplish the same purpose; it will also run more
    // efficiently than those rules would.

    // The following routine tokenizes the input, returning one of two things:
    // * an array of tokens, each of which was the output of the formatter
    //   function/string provided to `addType()`, above, or
    // * null, because some portion of the input string did not match any of
    //   the token types added with `addType()`.

    // The routine simply tries every regular expression of every token type
    // added with `addType()`, above, and when one succeeds, it pops that
    // text off the input string, saving it to a results list after passing
    // it through the corresponding formatter. If at any point none of the
    // regular expressions matches the beginning of the remaining input, null
    // is returned.
    tokenize ( input ) {
        const result = [ ]
        while ( input.length > 0 ) {
            const original = input.length
            for ( let type of this.tokenTypes ) {
                const match = type.regexp.exec( input )
                if ( !match ) continue
                input = input.substr( match[0].length )
                if ( type.formatter instanceof Function ) {
                    const next = type.formatter( match[0], match )
                    if ( next ) result.push( next )
                } else {
                    // Apply the `%n` replacements described above.
                    let format = `${type.formatter}`
                    let token = ''
                    let next
                    while ( next = /\%([0-9]+)/.exec( format ) ) {
                        token += format.slice( 0, next.index )
                            + match[next[1]]
                        format = format.slice( next.index + next[0].length )
                    }
                    result.push( token + format )
                }
                break
            }
            // If no token type matched, the input is not tokenizable.
            if ( input.length == original ) return null
        }
        return result
    }

}

// The following debugging routines are used in some of the code above.
const debugNestedArrays = array => {
    if ( array instanceof Array ) {
        if ( JSON.stringify( array[0] ) == '{}' ) array = array.slice( 1 )
        return '[' + array.map( debugNestedArrays ).join( ',' ) + ']'
    }
    return array
}
const debugState = state =>
    `(${state.lhs} -> ${state.pos}in[${state.rhs}], ${state.ori}) got ${debugNestedArrays(state.got)}`

// By a "JSON object" I mean an object where the only information we care
// about is that which would be preserved by `JSON.stringify` (i.e., an
// object that can be serialized and deserialized with JSON's `stringify`
// and `parse` without bringing any harm to our data).

// We wish to be able to compare such objects for semantic equality (not
// actual equality of objects in memory, as `==` would do). We cannot simply
// do this by comparing the `JSON.stringify` of each, because [documentation
// on JSON.stringify](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify)
// says that we cannot rely on a consistent ordering of the object keys.
// Thus we implement the following comparison routine.

// Note that this only works for objects that fit the requirements above; if
// equality (in your situation) is affected by the prototype chain, or if
// your object contains functions, or any other similar difficulty, then
// this routine is not guaranteed to work for you.

// It yields the same result as `JSON.stringify(x) == JSON.stringify(y)`
// would if `stringify` always gave the same ordering of object keys.
const sameJSON = ( x, y ) => {
    // If only one is an object, or only one is an array, then they're not
    // equal. If neither is an object, you can use plain `===` to compare.
    if ( ( x instanceof Object ) != ( y instanceof Object ) ) return false
    if ( ( x instanceof Array ) != ( y instanceof Array ) ) return false
    if ( !( x instanceof Object ) ) return x === y
    // So now we know that both inputs are objects. Get their keys in a
    // consistent order. If they aren't the same for both objects, then the
    // objects aren't equal.
    const xkeys = Object.keys( x ).sort()
    const ykeys = Object.keys( y ).sort()
    if ( JSON.stringify( xkeys ) != JSON.stringify( ykeys ) ) return false
    // If there's any key on which the objects don't match, then they aren't
    // equal. Otherwise, they are.
    return xkeys.every( key => sameJSON( x[key], y[key] ) )
}
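
// For instance (hypothetical checks, not part of the module):
// ```
// sameJSON( { a : 1, b : [ 2, 3 ] }, { b : [ 2, 3 ], a : 1 } ) // true
// sameJSON( { a : 1 }, { a : '1' } )                           // false
// ```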

// And the following lines test to see if this module is running in a
// [WebWorker](https://www.w3.org/TR/workers/), and if so, they install an
// event handler for messages posted from the main thread, which exposes the
// key API from this module to the outside, through message-passing.
if ( typeof( WorkerGlobalScope ) !== 'undefined'
  || ( typeof( self ) !== 'undefined'
    && typeof( self.importScripts ) !== 'undefined' ) ) {

    // We keep track of a set of named parsers in this object. Clients can
    // create them by passing messages to this thread, as defined below.
    const ParserStore = { }

    self.addEventListener( 'message', event => {
        const command = event.data[0]
        const name = event.data[1]
        switch( command ) {

            // Receiving a message of the form
            // `[ 'newParser', 'parser name', 'start token' ]`
            // creates a new parser.
            case 'newParser':
                const startToken = event.data[2]
                ParserStore[name] = new Grammar( startToken )
                break

            // Clients can add types to the parser's tokenizer (which is
            // created if there wasn't one before) with messages of the form
            // `[ 'addType', 'parser name', 'regular expression' ]`. With two
            // exceptions, these messages are converted directly into
            // function calls of `addType()` in the tokenizer (so see its
            // documentation above). The exceptions: First, because regular
            // expressions cannot be passed to workers, the client must pass
            // `regexp.source` instead, and on this end, the `RegExp`
            // constructor will be called to rebuild the object. Second,
            // there is an optional fourth argument, the transformation
            // function to be applied to any token encountered of this type.
            // Because functions cannot be passed to workers, the client must
            // convert the function to a string (e.g., `String(f)`). It will
            // be rebuilt into a function on this side, obviously without its
            // original environment/scope.
            case 'addType':
                if ( !ParserStore[name] ) return
                const regexp = event.data[2]
                let func = event.data[3]
                const funcre = /\s*function\s*\(((?:[a-zA-Z0-9,]|\s)*)\)\s*\{\s*((?:.|\n)*)\}\s*$/
                if ( func ) {
                    const match = funcre.exec( func )
                    match[1] = match[1].replace( /\s/g, '' )
                    func = new Function( ...match.slice( 1 ) )
                }
                ParserStore[name].defaults.tokenizer ||= new Tokenizer()
                ParserStore[name].defaults.tokenizer.addType(
                    new RegExp( regexp ), func )
                break

            // Clients can add rules to the parser with messages of the form
            // `[ 'addRule', 'parser name', 'category', sequences... ]`.
            // These messages are converted directly into function calls of
            // `addRule()` in the parser, so see its documentation above.
            // Because regular expressions cannot be passed to WebWorkers, we
            // modify the convention in storing the sequences. Each item in a
            // sequence must be of the form "c:category name" or "t:terminal
            // regexp" so that category names and regular expressions for
            // terminals can be distinguished. We convert them to strings or
            // regular expressions before calling `addRule()`.
            case 'addRule':
                if ( !ParserStore[name] ) return
                const category = event.data[2]
                const sequences = event.data.slice( 3 )
                sequences.forEach( ( sequence, index ) => {
                    // A sequence given as a string is split at spaces into
                    // its prefixed items.
                    if ( !( sequence instanceof Array ) )
                        sequences[index] = sequence = `${sequence}`.split( ' ' )
                    sequence.forEach( ( entry, index2 ) => {
                        let rest = entry.slice( 2 )
                        if ( entry[0] == 't' )
                            rest = new RegExp( `^${rest}$` )
                        sequence[index2] = rest
                    } )
                } )
                ParserStore[name].addRule( category, ...sequences )
                break
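
            // For example, a client might send the following hypothetical
            // messages (parser and category names invented for illustration)
            // to build a rule whose right-hand side mixes categories with a
            // terminal:
            // ```
            // worker.postMessage( [ 'newParser', 'P', 'sum' ] )
            // worker.postMessage( [ 'addRule', 'P', 'sum', 'c:int' ] )
            // worker.postMessage(
            //     [ 'addRule', 'P', 'sum', [ 'c:sum', 't:[+]', 'c:int' ] ] )
            // ```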

            // Clients passing messages of the form
            // `[ 'parse', 'parser name', 'text' ]` are requesting the named
            // parser to parse the given text and then send a message back
            // containing the results (which may be an empty list, as in the
            // documentation of the `parse()` function in the `Grammar`
            // class, above).
            case 'parse':
                if ( !ParserStore[name] ) return
                const text = event.data[2]
                self.postMessage( ParserStore[name].parse( text ) )
                break

            // Clients can pass a message `[ 'deleteParser', 'parser name' ]`
            // to remove the named parser from memory.
            case 'deleteParser':
                delete ParserStore[name]

        }
    } )

}
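
// From the main thread, the message-passing API above might be exercised as
// in the following sketch. The worker file name and the logging handler are
// hypothetical; only the message formats come from this module.
// ```
// const worker = new Worker( 'earley-parser.js' ) // hypothetical file name
// worker.addEventListener( 'message',
//     event => console.log( 'parse results:', event.data ) )
// worker.postMessage( [ 'newParser', 'P', 'int' ] )
// worker.postMessage( [ 'addType', 'P', '[0-9]+' ] )  // tokenize digit runs
// worker.postMessage( [ 'addRule', 'P', 'int', 't:[0-9]+' ] )
// worker.postMessage( [ 'parse', 'P', '2024' ] )      // posts results back
// worker.postMessage( [ 'deleteParser', 'P' ] )
// ```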