# Run capa against loaded Ghidra database and render results in Ghidra UI
# @author Colton Gabertan (gabertan.colton@gmail.com)
# @category Python 3.capa

# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.

import sys
import json
import logging
import pathlib
from typing import Any, Dict, List

from ghidra.app.cmd.label import AddLabelCmd, CreateNamespacesCmd
from ghidra.program.model.symbol import Namespace, SourceType, SymbolType

import capa
import capa.main
import capa.rules
import capa.render.json
import capa.ghidra.helpers
import capa.capabilities.common
import capa.features.extractors.ghidra.extractor

logger = logging.getLogger("capa_explorer")


def add_bookmark(addr, txt, category="CapaExplorer"):
    """create an "Info" bookmark at addr with comment txt under the given bookmark category"""
    currentProgram().getBookmarkManager().setBookmark(addr, "Info", category, txt)  # type: ignore [name-defined] # noqa: F821


def create_namespace(namespace_str):
    """create new Ghidra namespace for each capa namespace

    namespace_str is a Namespace.DELIMITER-separated path; returns the namespace
    object reported by the CreateNamespacesCmd after it has been applied.
    """
    cmd = CreateNamespacesCmd(namespace_str, SourceType.USER_DEFINED)
    cmd.applyTo(currentProgram())  # type: ignore [name-defined] # noqa: F821
    return cmd.getNamespace()


def create_label(ghidra_addr, name, capa_namespace):
    """custom label cmd to overlay symbols under capa-generated namespaces

    Does nothing when a symbol with the same fully-qualified name already exists
    at ghidra_addr, so repeated script runs do not pile up duplicate labels.
    """

    # prevent duplicate labels under the same capa-generated namespace
    symbol_table = currentProgram().getSymbolTable()  # type: ignore [name-defined] # noqa: F821
    qualified_name = capa_namespace.getName(True) + Namespace.DELIMITER + name
    for sym in symbol_table.getSymbols(ghidra_addr):
        if sym.getName(True) == qualified_name:
            return

    # create SymbolType.LABEL at addr
    # prioritize capa-generated namespace (duplicate match @ new addr), else put under global Ghidra one (new match)
    cmd = AddLabelCmd(ghidra_addr, name, True, SourceType.USER_DEFINED)
    cmd.applyTo(currentProgram())  # type: ignore [name-defined] # noqa: F821

    symbol = cmd.getSymbol()
    if symbol is None:
        # the command did not produce a symbol; skip instead of crashing the whole run
        logger.warning("failed to create label %s at %s", name, ghidra_addr)
        return

    # assign new match overlay label to capa-generated namespace
    symbol.setNamespace(capa_namespace)


def _format_mapping(item):
    """join the "parts" and "id" of one ATT&CK/MBC mapping dict with Namespace.DELIMITER

    Missing keys are treated as empty so a sparse entry cannot raise on concatenation.
    """
    pieces = list(item.get("parts") or [])
    pieces.append(item.get("id") or "")
    return Namespace.DELIMITER.join(pieces)


class CapaMatchData:
    """matches of a single capa rule, plus the operations that render them in Ghidra

    Attributes:
        namespace: Namespace.DELIMITER-separated Ghidra namespace string for the rule
        scope: value of the rule's "static" scope (compared against "function")
        capability: rule name, embedded in the generated pre-comments
        matches: {rule match addr: [{feature addr: node_data}]}
        attack: MITRE ATT&CK mapping dicts (read via their "parts" and "id" keys)
        mbc: MBC mapping dicts (read via their "parts" and "id" keys)
    """

    def __init__(
        self,
        namespace,
        scope,
        capability,
        matches,
        attack: List[Dict[Any, Any]],
        mbc: List[Dict[Any, Any]],
    ):
        self.namespace = namespace
        self.scope = scope
        self.capability = capability
        self.matches = matches
        self.attack = attack
        self.mbc = mbc

    def bookmark_functions(self):
        """create bookmarks for MITRE ATT&CK & MBC mappings"""

        if not self.attack and not self.mbc:
            return

        for key in self.matches:
            addr = toAddr(hex(key))  # type: ignore [name-defined] # noqa: F821
            func = getFunctionContaining(addr)  # type: ignore [name-defined] # noqa: F821
            if func is None:
                continue

            # bookmark & tag MITRE ATT&CK tactics & MBC @ function scope
            func_addr = func.getEntryPoint()
            for item in self.attack:
                add_bookmark(func_addr, _format_mapping(item), "CapaExplorer::MITRE ATT&CK")
            for item in self.mbc:
                add_bookmark(func_addr, _format_mapping(item), "CapaExplorer::MBC")

    def set_plate_comment(self, ghidra_addr):
        """set plate comments at matched functions"""
        comment = getPlateComment(ghidra_addr)  # type: ignore [name-defined] # noqa: F821
        rule_path = self.namespace.replace(Namespace.DELIMITER, "/")

        # 2 cases to avoid duplicate comments via subsequent script runs
        if comment is None:
            # first comment @ function
            setPlateComment(ghidra_addr, rule_path + "\n")  # type: ignore [name-defined] # noqa: F821
        elif rule_path not in comment:
            setPlateComment(ghidra_addr, comment + rule_path + "\n")  # type: ignore [name-defined] # noqa: F821

    def set_pre_comment(self, ghidra_addr, sub_type, description):
        """set pre comments at subscoped matches of main rules"""
        comment = getPreComment(ghidra_addr)  # type: ignore [name-defined] # noqa: F821
        entry = f'capa: {sub_type}({description}) matched in "{self.capability}"\n'

        if comment is None:
            setPreComment(ghidra_addr, entry)  # type: ignore [name-defined] # noqa: F821
        elif self.capability not in comment:
            # only append when this capability is not already mentioned at this address
            setPreComment(ghidra_addr, comment + entry)  # type: ignore [name-defined] # noqa: F821

    def label_matches(self):
        """label findings at function scopes and comment on subscope matches"""
        capa_namespace = create_namespace(self.namespace)
        symbol_table = currentProgram().getSymbolTable()  # type: ignore [name-defined] # noqa: F821

        if self.scope == "function":
            # handle function main scope of matched rule
            # these will typically contain further matches within
            self._label_function_scope(capa_namespace, symbol_table)
        else:
            # resolve the encompassing function for the capa namespace
            # of non-function scoped main matches
            self._label_other_scope(capa_namespace, symbol_table)

    def _pre_comment_node(self, ghidra_addr, node):
        """pre-comment every (sub_type, description) pair parsed from node at ghidra_addr"""
        for sub_type, description in parse_node(node):
            self.set_pre_comment(ghidra_addr, sub_type, description)

    def _label_function_scope(self, capa_namespace, symbol_table):
        """label matched functions and pre-comment the sub-matches found inside them"""
        for addr, sub_matches in self.matches.items():
            ghidra_addr = toAddr(hex(addr))  # type: ignore [name-defined] # noqa: F821

            # classify new function label under capa-generated namespace
            sym = symbol_table.getPrimarySymbol(ghidra_addr)
            if sym is None or sym.getSymbolType() != SymbolType.FUNCTION:
                continue

            create_label(ghidra_addr, sym.getName(), capa_namespace)
            self.set_plate_comment(ghidra_addr)

            # parse the corresponding nodes, and pre-comment subscope matched features
            # under the encompassing function(s)
            for sub_match in sub_matches:
                for loc, node in sub_match.items():
                    sub_ghidra_addr = toAddr(hex(loc))  # type: ignore [name-defined] # noqa: F821
                    if sub_ghidra_addr == ghidra_addr:
                        # skip duplicates
                        continue

                    # precomment subscope matches under the function
                    if node != {}:
                        self._pre_comment_node(sub_ghidra_addr, node)

    def _label_other_scope(self, capa_namespace, symbol_table):
        """label the functions enclosing non-function scoped matches and pre-comment their sub-matches"""
        for addr, sub_matches in self.matches.items():
            ghidra_addr = toAddr(hex(addr))  # type: ignore [name-defined] # noqa: F821

            # basic block / insn scoped main matches
            # Ex. See "Create Process on Windows" Rule
            func = getFunctionContaining(ghidra_addr)  # type: ignore [name-defined] # noqa: F821
            if func is not None:
                func_addr = func.getEntryPoint()
                create_label(func_addr, func.getName(), capa_namespace)
                self.set_plate_comment(func_addr)

            # create subscope match precomments
            for sub_match in sub_matches:
                for loc, node in sub_match.items():
                    sub_ghidra_addr = toAddr(hex(loc))  # type: ignore [name-defined] # noqa: F821
                    if node == {}:
                        continue

                    if func is not None:
                        # basic block/ insn scope under resolved function
                        self._pre_comment_node(sub_ghidra_addr, node)
                        continue

                    # this would be a global/file scoped main match
                    # try to resolve the encompassing function via the subscope match, instead
                    # Ex. "run as service" rule
                    sub_func = getFunctionContaining(sub_ghidra_addr)  # type: ignore [name-defined] # noqa: F821
                    if sub_func is not None:
                        sub_func_addr = sub_func.getEntryPoint()
                        # place function in capa namespace & create the subscope match label in Ghidra's global namespace
                        create_label(sub_func_addr, sub_func.getName(), capa_namespace)
                        self.set_plate_comment(sub_func_addr)
                        self._pre_comment_node(sub_ghidra_addr, node)
                        continue

                    # addr is in some other file section like .data
                    # represent this location with a label symbol under the capa namespace
                    # Ex. See "Reference Base64 String" rule
                    for sub_type, description in parse_node(node):
                        # in many cases, these will be ghidra-labeled data, so just add the existing
                        # label symbol to the capa namespace
                        for sym in symbol_table.getSymbols(sub_ghidra_addr):
                            if sym.getSymbolType() == SymbolType.LABEL:
                                sym.setNamespace(capa_namespace)
                        self.set_pre_comment(sub_ghidra_addr, sub_type, description)


def get_capabilities():
    """ask for a rules directory, run capa on the current program, and render the results as JSON

    Returns:
        the string produced by capa.render.json.render, or "" when no rules directory was chosen.
    """
    rules_dir: str = ""
    try:
        selected_dir = askDirectory("Choose capa rules directory", "Ok")  # type: ignore [name-defined] # noqa: F821
        if selected_dir:
            rules_dir = selected_dir.getPath()
    except RuntimeError:
        # RuntimeError thrown when user selects "Cancel"
        pass

    if not rules_dir:
        logger.info("You must choose a capa rules directory before running capa.")
        return ""  # return empty str to avoid handling both int and str types

    rules_path: pathlib.Path = pathlib.Path(rules_dir)
    logger.info("running capa using rules from %s", str(rules_path))

    rules = capa.rules.get_rules([rules_path])
    meta = capa.ghidra.helpers.collect_metadata([rules_path])
    extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor()

    capabilities, _ = capa.capabilities.common.find_capabilities(rules, extractor, True)

    if capa.capabilities.common.has_file_limitation(rules, capabilities, is_standalone=False):
        popup("capa explorer encountered warnings during analysis. Please check the console output for more information.")  # type: ignore [name-defined] # noqa: F821
        logger.info("capa encountered warnings during analysis")

    return capa.render.json.render(meta, rules, capabilities)


def get_locations(match_dict):
    """recursively collect match addresses and associated nodes

    Yields (location value, node) for every "absolute" or "file" location in
    match_dict, then does the same for each entry under "children".
    """

    for loc in match_dict.get("locations", []):
        # either an rva (absolute)
        # or an offset into a file (file)
        if loc.get("type", "") in ("absolute", "file"):
            yield loc.get("value"), match_dict.get("node")

    for child in match_dict.get("children", []):
        yield from get_locations(child)


def parse_node(node_data):
    """pull match descriptions and sub features by parsing node dicts

    Yields ("description", text) when the inner node carries a description, and
    (feature type, value) when the inner value is a str or int (ints are hex-formatted).
    Yields nothing when node_data or its inner node is not a dict.
    """
    if not isinstance(node_data, dict):
        # callers only filter out `{}`; tolerate a missing node instead of raising
        return

    node = node_data.get(node_data.get("type"))
    if not isinstance(node, dict):
        return

    if "description" in node:
        yield "description", node.get("description")

    feat_type = node.get("type")
    data = node.get(feat_type)
    if isinstance(data, (str, int)):
        if isinstance(data, int):
            data = hex(data)
        yield feat_type, data


def parse_json(capa_data):
    """Parse json produced by capa

    Yields one CapaMatchData per entry under the "rules" key of capa_data.
    """

    for rule, capability in capa_data.get("rules", {}).items():
        # structure to contain rule match address & supporting feature data
        # {rule match addr:[{feature addr:{node_data}}]}
        rule_matches: Dict[Any, List[Any]] = {}
        for i, match in enumerate(capability.get("matches") or []):
            # grab rule match location
            match_loc = match[0].get("value")
            if match_loc is None:
                # Ex. See "Reference Base64 string"
                # {'type':'no address'}
                match_loc = i
            rule_matches[match_loc] = []

            # grab extracted feature locations & corresponding node data
            # feature[0]: location
            # feature[1]: node
            feat_dict = {}
            for feat_loc, feat_node in get_locations(match[1]):
                feat_dict[feat_loc] = feat_node
            rule_matches[match_loc].append(feat_dict)

        # dict data of currently matched rule
        meta = capability["meta"]

        # get MITRE ATT&CK and MBC
        attack = meta.get("attack")
        if attack is None:
            attack = []
        mbc = meta.get("mbc")
        if mbc is None:
            mbc = []

        # scope match for the rule
        scope = meta["scopes"].get("static")

        fmt_rule = Namespace.DELIMITER + rule.replace(" ", "-")
        if "namespace" in meta:
            # split into list to help define child namespaces
            # this requires the correct delimiter used by Ghidra
            # Ex. 'communication/named-pipe/create/create pipe' -> capa::communication::named-pipe::create::create-pipe
            namespace_str = Namespace.DELIMITER.join(meta["namespace"].split("/"))
            namespace = "capa" + Namespace.DELIMITER + namespace_str + fmt_rule
        else:
            # lib rules via the official rules repo will not contain data
            # for the "namespaces" key, so format using rule itself
            # Ex. 'contain loop' -> capa::lib::contain-loop
            namespace = "capa" + Namespace.DELIMITER + "lib" + fmt_rule

        yield CapaMatchData(namespace, scope, rule, rule_matches, attack, mbc)


def main():
    """run capa explorer against the current program

    Returns:
        0 on success, otherwise one of the capa.main.E_* exit codes.
    """
    logging.basicConfig(level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    if isRunningHeadless():  # type: ignore [name-defined] # noqa: F821
        logger.error("unsupported Ghidra execution mode")
        return capa.main.E_UNSUPPORTED_GHIDRA_EXECUTION_MODE

    if not capa.ghidra.helpers.is_supported_ghidra_version():
        logger.error("unsupported Ghidra version")
        return capa.main.E_UNSUPPORTED_GHIDRA_VERSION

    if not capa.ghidra.helpers.is_supported_file_type():
        logger.error("unsupported file type")
        return capa.main.E_INVALID_FILE_TYPE

    if not capa.ghidra.helpers.is_supported_arch_type():
        logger.error("unsupported file architecture")
        return capa.main.E_INVALID_FILE_ARCH

    capa_json = get_capabilities()
    if not capa_json:
        # no rules directory was chosen (already logged by get_capabilities);
        # json.loads("") would raise, so bail out with an explicit exit code instead
        return capa.main.E_MISSING_RULES

    # capa_data will always contain {'meta':..., 'rules':...}
    # if the 'rules' key contains no values, then there were no matches
    capa_data = json.loads(capa_json)
    if not capa_data.get("rules"):
        logger.info("capa explorer found no matches")
        popup("capa explorer found no matches.")  # type: ignore [name-defined] # noqa: F821
        return capa.main.E_EMPTY_REPORT

    for item in parse_json(capa_data):
        item.bookmark_functions()
        item.label_matches()
    logger.info("capa explorer analysis complete")
    popup("capa explorer analysis complete.\nPlease see results in the Bookmarks Window and Namespaces section of the Symbol Tree Window.")  # type: ignore [name-defined] # noqa: F821
    return 0


if __name__ == "__main__":
    if sys.version_info < (3, 8):
        from capa.exceptions import UnsupportedRuntimeError

        raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+")
    exit_code = main()
    if exit_code != 0:
        popup("capa explorer encountered errors during analysis. Please check the console output for more information.")  # type: ignore [name-defined] # noqa: F821
    sys.exit(exit_code)