{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true, "slideshow": { "slide_type": "slide" } }, "source": [ "# Recursive-Descent Parser" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Problem\n", "\n", "Consider the following EBNF description of simple arithmetic expressions:\n", "```\n", "// | + | - \n", " {( + | - ) } \n", "// | * | / \n", " {( * | / ) } \n", " → id | int_lit | ( )\n", "```\n", "\n", "Suppose a lexical analyzer recognizes:\n", "- only arithmetic expressions which include operations: `+`, `-`, `*`, `/` \n", "- integer literals\n", "- parentheses\n", "- variable names which consist of strings of uppercase or lowercase letters\n", " - Optional: might contain digits but must begin with a letter.\n", "- white spaces are ignored \n", " \n", "Write a **lexical** and **syntactic** analyzers that process arithmetical expressions.\n", "- Example: **(sum + 47) / total**" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Lexical Analyzer\n", "\n", "- Define `STATE` type that describes states of lexer finite automaton\n", "- Define `TOKEN` type that lists of token in our language\n", "- Provide functions for identifying character types: alphabet, number, special characters or white space" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "// Lexer finite-state automaton states\n", "type STATE = START | IDENT | NUMBR | EOF\n", " \n", "// Tokens\n", "type TOKEN = INT_LIT | IDENT | ASSIGN_OP | ADD_OP | SUB_OP | MULT_OP | DIV_OP | LEFT_PAREN | RIGHT_PAREN | UNKNOWN | EOF\n", "\n", "// Simple function that classifies a character as being alphabetic or not.\n", "let Alpha = function\n", " | X when X >= 'a' && X <= 'z' -> true\n", " | X when X >= 'A' && X <= 'Z' -> true\n", " | _ -> false\n", "\n", "// Simple function that classifies a character as being number or not.\n", "let Number = function\n", " | X when X >= '0' && X <= '9' -> true\n", " | _ -> false\n", " \n", "// Simple function that classifies a character as being white space or not.\n", "let Space = function\n", " |' ' | '\\t' -> true\n", " | _ -> false" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Some active patterns for detecting binary operations and parentheses" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "source": [ "// Simple function that classifies a character as being parentheses or not.\n", "let (|Parent|_|) input =\n", " match input with\n", " |'(' -> Some(LEFT_PAREN)\n", " |')' -> Some(RIGHT_PAREN)\n", " | _ -> None\n", "\n", "// Active pattern for detecting binary operations\n", "let (|BinOp|_|) input =\n", " match input with\n", " |'+' -> Some(ADD_OP)\n", " |'-' -> Some(SUB_OP)\n", " |'/' -> Some(DIV_OP)\n", " |'*' -> Some(MULT_OP)\n", " |'=' -> Some(ASSIGN_OP)\n", " | _ -> None\n", "\n", "// Append `char` to a `string`.\n", "let append S C = S + C.ToString() " ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "// Tokenizer: implementation of FSA\n", "let rec tokenize ((source:seq), state, lexeme, token) =\n", " if Seq.isEmpty source && state = STATE.EOF then\n", " (Seq.empty, STATE.EOF, \"\", EOF) // Last token in tokenizer, repeat it infinitly\n", " else if Seq.isEmpty source then\n", " (Seq.empty, STATE.EOF, lexeme, token) // Indicate end of input\n", " else\n", " let C = Seq.head source\n", " let S = Seq.tail source\n", " match (C, state) with\n", " | (C , STATE.START) when Space C -> tokenize (S, state, \"\", token)\n", " | (BinOp Op, STATE.START) -> (S, state, append \"\" C, Op)\n", " | (Parent P, STATE.START) -> (S, state, append \"\" C, P)\n", " | (C , STATE.START) when Alpha C -> tokenize (S, STATE.IDENT, append \"\" C, TOKEN.IDENT)\n", " | (C , STATE.START) when Number C -> tokenize (S, STATE.NUMBR, append \"\" C, TOKEN.INT_LIT)\n", " | (C , STATE.IDENT) when Alpha C -> tokenize (S, state, append lexeme C, TOKEN.IDENT)\n", " | (C , STATE.IDENT) when Space C -> (S, STATE.START, lexeme, TOKEN.IDENT)\n", " | (BinOp Op, STATE.IDENT) -> (source, STATE.START, lexeme, TOKEN.IDENT)\n", " | (Parent P, STATE.IDENT) -> (source, STATE.START, lexeme, TOKEN.IDENT)\n", " | (C , STATE.NUMBR) when Space C -> (S, STATE.START, lexeme, TOKEN.INT_LIT)\n", " | (C , STATE.NUMBR) when Number C -> tokenize (S, state, append lexeme C, TOKEN.INT_LIT)\n", " | (BinOp Op, STATE.NUMBR) -> (source, STATE.START, lexeme, TOKEN.INT_LIT)\n", " | (Parent P, STATE.NUMBR) -> (source, STATE.START, lexeme, TOKEN.INT_LIT)\n", " | _ -> (Seq.empty, STATE.EOF, append lexeme C, TOKEN.UNKNOWN) " ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "// Form a sequence from a tokenizer calls\n", "let token_sequence source =\n", " let state = tokenize (source, STATE.START, \"\", TOKEN.UNKNOWN)\n", " let unfolder current_state =\n", " let _, st, lex, tkn = current_state\n", " if tkn = TOKEN.EOF then\n", " None\n", " else\n", " let next_state = tokenize current_state\n", " Some(current_state, next_state)\n", " Seq.unfold unfolder state" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "State: START,\tLexem: (,\tToken: LEFT_PAREN\n", "State: START,\tLexem: prod,\tToken: IDENT\n", "State: START,\tLexem: +,\tToken: ADD_OP\n", "State: START,\tLexem: 37,\tToken: INT_LIT\n", "State: START,\tLexem: ),\tToken: RIGHT_PAREN\n", "State: START,\tLexem: /,\tToken: DIV_OP\n", "State: EOF,\tLexem: total,\tToken: IDENT\n" ] } ], "source": [ "// Test some inputs\n", "let correct_input = \"(prod + 37)/total\"\n", "\n", "for state in token_sequence correct_input do\n", " let _, st, lex, tkn = state\n", " printfn \"State: %A,\\tLexem: %s,\\tToken: %A\" st lex tkn" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "State: START,\tLexem: (,\tToken: LEFT_PAREN\n", "State: START,\tLexem: sum,\tToken: IDENT\n", "State: START,\tLexem: +,\tToken: ADD_OP\n", "State: EOF,\tLexem: 47?,\tToken: UNKNOWN\n" ] } ], "source": [ "// This input contains error: `47?` is not a proper integer literal\n", "let error_input = \"(sum + 47?)/ total\"\n", "\n", "for state in token_sequence error_input do\n", " let _, st, lex, tkn = state\n", " printfn \"State: %A,\\tLexem: %s,\\tToken: %A\" st lex tkn" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Recursive-Descent Parsing\n", "\n", "- There is a subprogram for each nonterminal in the grammar, which can parse sentences that can be generated by that nonterminal\n", "- EBNF is ideally suited for being the basis for a recursive-descent parser, because EBNF minimizes the number of nonterminals\n", "\n", "The coding process when there is only one RHS:\n", "- For each terminal symbol in the RHS, compare it with the next input token; if they match, continue, else there is an error\n", "- For each nonterminal symbol in the RHS, call its associated parsing subprogram\n", "\n", "A nonterminal that has more than one RHS requires an initial process to determine which RHS it is to parse\n", "- The correct RHS is chosen on the basis of the next token of input (the lookahead)\n", "- The next token is compared with the first token that can be generated by each RHS until a match is found\n", "- If no match is found, it is a syntax error" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "let token (src, st, lex, tkn) = tkn // get a token from a context\n", "\n", "let tokenize_print ctx = // call tokenizer, print and return acquired token\n", " let newctx = tokenize ctx\n", " let _, st, lex, tkn = newctx\n", " printfn \"%A: %s\" tkn lex\n", " newctx " ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Implementation of a recursive-descent parser based of following grammar:\n", "```\n", " {( + | - ) }\n", " {( * | / ) }\n", " → id | int_lit | ( )\n", "```\n", "\n", "- Implement `expr`, `term` & `factor` functions" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "// Parses strings in the language generated by the rule:\n", "// -> id | int_constant | ( )\n", "let rec factor context = \n", " printfn \"Enter \"\n", "\n", " // Determine which RHS\n", " let next_context = \n", " let tkn = token context\n", " if tkn = TOKEN.IDENT || tkn = INT_LIT then\n", " // Get the next token\n", " tokenize_print context\n", " else\n", " // If the RHS is (), pass over the left parenthesis,\n", " // call expr, and check for the right parenthesis\n", " if tkn = TOKEN.LEFT_PAREN then \n", " let expr_context = context |> tokenize_print |> expr \n", " if (token expr_context) = TOKEN.RIGHT_PAREN then \n", " tokenize_print expr_context\n", " else\n", " failwith \"No right parenthesis\"\n", " else\n", " failwith \"Not an id, an integer literal, or a left parenthesis\" \n", "\n", " printfn \"Exit \"\n", " next_context\n", "\n", "// As long as the next token is * or /, get the next token and parse the next factor\n", "and next_factor context = \n", " match (token context) with\n", " | TOKEN.MULT_OP | TOKEN.DIV_OP -> context |> tokenize_print |> factor |> next_factor\n", " | _ -> context\n", "\n", "// Parses strings in the language generated by the rule:\n", "// -> {(* | /) )\n", "and term context = \n", " printfn \"Enter \"\n", " let next_context = context |> factor |> next_factor\n", " printfn \"Exit \"\n", " next_context \n", "\n", "// As long as the next token is + or -, get the next token and parse the next term\n", "and next_term context = \n", " match (token context) with\n", " | TOKEN.ADD_OP | TOKEN.SUB_OP -> context |> tokenize_print |> term |> next_term\n", " | _ -> context\n", "\n", "// Parses strings in the language generated by the rule:\n", "// -> {(+ | -) }\n", "and expr context =\n", " printfn \"Enter \"\n", " let next_context = context |> term |> next_term \n", " printfn \"Exit \"\n", " next_context\n", "\n", "// Parser function\n", "let parser source =\n", " printfn \"Start parsing: %s\" source \n", " tokenize_print (source, STATE.START, \"\", TOKEN.UNKNOWN) |> expr |> ignore" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LEFT_PAREN: (\n" ] }, { "data": { "text/html": [ "
Item1Item2Item3Item4
[ p, r, o, d, , +, , 3, 7, ), /, t, o, t, a, l ]
START
(
LEFT_PAREN
" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenize_print (correct_input, STATE.START, \"\", TOKEN.UNKNOWN)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(prod + 37)/total" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "correct_input" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Start parsing: (prod + 37)/total\n", "LEFT_PAREN: (\n", "Enter \n", "Enter \n", "Enter \n", "IDENT: prod\n", "Enter \n", "Enter \n", "Enter \n", "ADD_OP: +\n", "Exit \n", "Exit \n", "INT_LIT: 37\n", "Enter \n", "Enter \n", "RIGHT_PAREN: )\n", "Exit \n", "Exit \n", "Exit \n", "DIV_OP: /\n", "Exit \n", "IDENT: total\n", "Enter \n", "EOF: \n", "Exit \n", "Exit \n", "Exit \n" ] } ], "source": [ "parser correct_input" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Start parsing: (sum + 47?)/ total\n", "LEFT_PAREN: (\n", "Enter \n", "Enter \n", "Enter \n", "IDENT: sum\n", "Enter \n", "Enter \n", "Enter \n", "ADD_OP: +\n", "Exit \n", "Exit \n", "UNKNOWN: 47?\n", "Enter \n", "Enter \n", "Error: Not an id, an integer literal, or a left parenthesis\n" ] } ], "source": [ "try\n", " parser error_input\n", "with\n", " | Failure(msg) -> printfn \"Error: %s\" msg" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Start parsing: a +- b\n", "IDENT: a\n", "Enter \n", "Enter \n", "Enter \n", "ADD_OP: +\n", "Exit \n", "Exit \n", "SUB_OP: -\n", "Enter \n", "Enter \n", "Error: Not an id, an integer literal, or a left parenthesis\n" ] } ], "source": [ "try\n", " parser \"a +- b\"\n", "with\n", " | Failure(msg) -> printfn \"Error: %s\" msg" ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": ".NET (F#)", "language": "F#", "name": ".net-fsharp" }, "language": "fsharp", "language_info": { "file_extension": ".fs", "mimetype": "text/x-fsharp", "name": "C#", "pygments_lexer": "fsharp", "version": "4.5" } }, "nbformat": 4, "nbformat_minor": 2 }