extends Object
class_name BashParser

# Parser that groups a flat list of Bash tokens into command nodes.
# On failure, `error` holds a message and the parsing functions return an
# empty result; `error` is only reset through `clear_error()`.

# The pid to use in case '$$' is used in the input.
var pid: int
var ctx: BashContext
# Message of the last parsing error ("" when no error was recorded).
var error: String = ""


# `context` is stored in `ctx`.
# `p` is the pid to use in case '$$' is used in the input.
func _init(context: BashContext, p: int):
	ctx = context
	pid = p


# Replaces the stored context.
func set_context(context: BashContext) -> void:
	ctx = context


# Replaces the stored pid.
func set_pid(p: int) -> void:
	pid = p


# Resets `error` to the empty string.
func clear_error() -> void:
	error = ""


# Identifies the commands among a list of tokens.
#
# `input` is either a String (lexed here with BashLexer) or an Array of tokens.
# Any other type sets `error` and returns [].
#
# The very first token of a command must be a PLAIN (the name of the command)
# or a KEYWORD. Every token after a command's name belongs to this command
# and is given as one of its arguments.
# An input can hold several commands, separated by a pipe (|) or by a line
# separator such as a semicolon (;).
# If they are separated by a pipe, the standard output of the first one becomes
# the standard input of the second one.
# If they are separated by a semicolon, they are completely independent from
# one another.
# The token right after a pipe or a semicolon must be the name of the
# following command.
#
# For example, the command: "echo -n 'yoyo' | tr y t"
# would give (options hold the tokens themselves; only their values are shown):
# [
#   [
#     {
#       "type": "command",
#       "name": "echo",
#       "options": ["-n", "yoyo"],
#       "redirections": []
#     },
#     {
#       "type": "command",
#       "name": "tr",
#       "options": ["y", "t"],
#       "redirections": []
#     }
#   ]
# ]
#
# However, "echo hello ; echo world" holds two different commands that must be
# executed one after the other. If the first one fails, it does not stop the
# second one from being executed. It would give something like this:
# [
#   [
#     {
#       "type": "command",
#       "name": "echo",
#       "options": ["hello"],
#       "redirections": []
#     }
#   ],
#   [
#     {
#       "type": "command",
#       "name": "echo",
#       "options": ["world"],
#       "redirections": []
#     }
#   ]
# ]
#
# If the command has redirections, it will look something like this:
# "redirections": [{
#   "port": 1,
#   "type": Tokens.WRITING_REDIRECTION,
#   "target": BashToken(Tokens.PLAIN, "file.txt"),
#   "copied": false
# }]  # == 1>file.txt
# If "copied" is `true`: == 1>&2 (the target is the same as the redirection
# of port 2).
#
# If this is just a variable assignment ("yoyo=5" for example), then:
# [
#   [
#     {
#       "type": "variable",
#       "name": "yoyo",
#       "value": BashToken
#     }
#   ]
# ]
#
# Finally, if this is a for loop, then:
# [
#   [
#     {
#       "type": "for",
#       "variable_name": String,
#       "sequences": array of tokens,
#       "body": array of tokens
#     }
#   ]
# ]
#
# Returns [] when the input is a lone end-of-input token or when an error
# occurred (in which case `error` is set).
func parse(input) -> Array:
	var tokens_list := []
	if input is String:
		var lexer := BashLexer.new(input)
		if not lexer.error.empty():
			error = lexer.error
			return []
		tokens_list = lexer.tokens_list
	elif input is Array:
		tokens_list = input
	else:
		error = "Donnée invalides pour le parsing du code."
		return []

	if tokens_list.size() == 1 and tokens_list[0].is_eoi():
		return []

	var i := 0  # index of the next token to read
	var e := 0  # index of the node (group of piped commands) being filled
	var commands := [[]]
	var number_of_tokens := tokens_list.size()

	# Probably one of the weirdest things in Bash:
	# variable assignments are ignored when they follow, or are followed by,
	# a pipe. So the whole input is scanned once for a pipe beforehand.
	var has_pipe := false
	for t in tokens_list:
		if t.is_pipe():
			has_pipe = true
			break

	while i < number_of_tokens:
		if tokens_list[i].is_newline():
			i += 1
			continue
		var r = _parse_command(tokens_list.slice(i, number_of_tokens) if i > 0 else tokens_list)
		if r is String:
			error = r
			return []
		i += r.number_of_read_tokens
		# NOTE(review): this stops the whole parsing at the first variable
		# assignment as soon as a pipe exists anywhere in the input, so the
		# commands located after the assignment are dropped as well.
		# Confirm against the interpreter that this is the intended behaviour.
		if has_pipe and r.command.type == "variable":
			break
		commands[e].append(r.command)
		if i < number_of_tokens:
			if tokens_list[i].is_pipe():
				i += 1
			elif tokens_list[i].is_eoi():
				break
		if r.should_cut_node:
			i += 1  # jump over the separator
			if i >= number_of_tokens or tokens_list[i].is_eoi():
				break  # ending the command with a semicolon should not throw an error
			commands.append([])
			e += 1
	return commands


# Parses a single command located at the very beginning of `list`.
#
# Returns a String (the error message) on failure, otherwise a Dictionary:
# {
#   "number_of_read_tokens": number of tokens consumed from `list`,
#   "should_cut_node": true if the next command must start a new node,
#   "command": the command, variable or for-loop dictionary
# }
func _parse_command(list: Array):
	if list.empty() or list[0].is_eoi():
		return "Erreur de syntaxe : bash attendait une commande mais il n'y a rien."
	if list[0].type != Tokens.PLAIN and list[0].type != Tokens.KEYWORD:
		return "Erreur de syntaxe : le symbole '" + str(list[0].value) + "' n'était pas attendu"

	var is_variable_affectation: bool = list.size() > 1 and list[1].is_equal_sign()
	if is_variable_affectation and not list[0].value.is_valid_identifier():
		return "Erreur de syntaxe : l'identifiant '" + list[0].value + "' n'est pas un nom de variable valide."

	if list[0].is_keyword_and_equals("for"):
		var for_loop = _parse_for_loop(list.slice(1, list.size()))
		if "error" in for_loop:
			return for_loop.error
		# Index, in `list`, of the token that follows the closing `done`.
		var loop_size: int = for_loop["size"]
		for_loop.erase("size")  # we don't need it anymore
		return {
			"number_of_read_tokens": loop_size,
			# The loop may be the very last thing of the list (a nested loop
			# body has no end-of-input token), hence the bounds check.
			"should_cut_node": loop_size >= list.size() or not list[loop_size].is_pipe(),
			"command": for_loop
		}

	var c: Dictionary
	if is_variable_affectation:
		c = {
			"type": "variable",
			"name": list[0].value,
			"value": null
		}
	else:
		c = {
			"type": "command",
			"name": list[0].value,
			"options": [],
			"redirections": []
		}

	var number_of_tokens = list.size()
	var should_cut_node := false
	var found_redirection := false
	var i := 1
	while i < number_of_tokens:
		if list[i].is_pipe() or list[i].is_eoi():
			break
		if list[i].is_line_separator():
			should_cut_node = true
			break

		if list[i].is_descriptor():
			var descriptor: int = list[i].value
			i += 1  # jump over the descriptor
			if i >= number_of_tokens:
				return "Erreur de syntaxe : fin inattendue de redirection."
			# The token after a descriptor is taken as the redirection symbol.
			var redirection_type: String = list[i].type
			if descriptor == 1 and list[i].is_reading_redirection():
				return "Erreur de syntaxe : le descripteur " + str(descriptor) + " ne peut être en lecture."
			i += 1  # jump over the redirection symbol
			if i >= number_of_tokens:
				return "Erreur de syntaxe : fin inattendue de redirection."
			var copied: bool = false
			var target: BashToken
			if list[i].is_and():
				copied = true
				i += 1  # jump over the "&"
				if i >= number_of_tokens:
					return "Erreur de syntaxe : valeur attendue pour la redirection copiée."
				target = list[i]
			elif list[i].is_plain() or list[i].is_command_substitution():
				target = list[i]
			else:
				return "Erreur de syntaxe : un chemin est attendu après une redirection."
			c.redirections.append({
				"port": descriptor,
				"type": redirection_type,
				"target": target,
				"copied": copied
			})
			found_redirection = true
		else:
			# Once a redirection was read, only other redirections may follow.
			if found_redirection:
				return "Erreur de syntaxe : fin de commande attendue"
			if list[i].is_equal_sign():
				i += 1  # ignoring the "="
				# Getting the value. Even if the value is empty ("a="), the list
				# is expected to hold a PLAIN token afterwards.
				c.value = list[i]
				i += 1  # jumping over the value
				if i < number_of_tokens and list[i].is_line_separator():
					should_cut_node = true
				# We want to end it now, even if there is another assignment
				# right after ("a=5 b=7").
				break
			else:
				c.options.append(list[i])
		i += 1

	return {
		"number_of_read_tokens": i,
		"should_cut_node": should_cut_node,
		"command": c
	}


# As soon as the parser encounters a FOR keyword,
# this function is called to interpret everything after it.
# The very first token of `tokens` should be the variable name of the loop.
# As a reminder, the syntax is the following:
#   KEYWORD:for TOKEN:PLAIN KEYWORD:in ...(TOKEN:SUB | TOKEN:PLAIN | TOKEN:STRING | TOKEN:VAR) (NL | TOKEN:SEMICOLON)
#   KEYWORD:do (NL)?
#   ...COMMANDS
#   KEYWORD:done
# Which is transformed into:
# {
#   "type": "for",
#   "variable_name": String,
#   "sequences": the tokens after the 'in' keyword,
#   "body": array of tokens (the parse method has to be called later so that
#           the variables inside it don't get interpreted before their
#           initialisation),
#   "size": number of tokens of the for loop, `for` keyword included
#           (useful to continue the parsing process after the loop)
# }
# If an error occurred, only { "error": String } is returned.
func _parse_for_loop(tokens: Array) -> Dictionary:
	var var_name := ""
	var sequences := []
	var number_of_tokens := tokens.size()

	if number_of_tokens == 0 or not tokens[0].is_plain():
		return {
			"error": "Erreur de syntaxe : le nom d'une variable est attendu après le mot-clé 'for'."
		}
	if not tokens[0].value.is_valid_identifier():
		return {
			"error": "Erreur de syntaxe : le nom de la variable de contrôle de la boucle 'for' n'est pas valide."
		}
	var_name = tokens[0].value

	if number_of_tokens < 2 or not tokens[1].is_keyword_and_equals("in"):
		return {
			"error": "Erreur de syntaxe : le mot-clé 'in' est attendu après la variable de contrôle de la boucle 'for'."
		}

	# Everything up to the next line separator is a value to iterate over.
	var i := 2
	while i < number_of_tokens and not tokens[i].is_line_separator():
		if not tokens[i].is_valid_token_in_for_loop():
			return {
				"error": "Erreur de syntaxe : le symbole '" + str(tokens[i].value) + "' n'est pas valide pour une boucle 'for'."
			}
		sequences.append(tokens[i])
		i += 1
	if sequences.empty():
		return {
			"error": "Erreur de syntaxe : des valeurs sont attendues sur lesquelles itérer avec la boucle 'for'."
		}

	i += 1  # jump over the semicolon/newline
	if i >= number_of_tokens:
		return {
			"error": "Erreur de syntaxe : le corps de la boucle 'for' est attendu."
		}
	if not tokens[i].is_keyword_and_equals("do"):
		return {
			"error": "Erreur de syntaxe : le mot-clé 'do' est attendu après la définition de la boucle."
		}
	i += 1  # jump over the `do` keyword

	# Now we have to collect everything that is inside of the body:
	# all the tokens before the `done` keyword that closes this loop.
	# Note that we may have loops inside loops! Exactly like we'd do with
	# parentheses, we stop as soon as the right loop gets closed.
	var done_keywords := 1
	var beginning_index_of_body := i
	while i < number_of_tokens and done_keywords > 0:
		if tokens[i].is_keyword_and_equals("for"):
			done_keywords += 1
		elif tokens[i].is_keyword_and_equals("done"):
			done_keywords -= 1
		i += 1
	if done_keywords != 0:
		return {
			"error": "La boucle for n'a pas été fermée."
		}

	# `i` is now right after the closing `done`, which sits at `i - 1`.
	# The end index given to `slice` is `i - 2` so that `done` is left out.
	var body = tokens.slice(beginning_index_of_body, i - 2)
	return {
		"type": "for",
		"variable_name": var_name,
		"sequences": sequences,
		"body": body,
		"size": i + 1  # + 1 for the `for` keyword, which is not part of `tokens`
	}