#!/usr/bin/env python3 import pymongo import argparse import hashlib import sys import os import subprocess # Provide a helpful API to user arg_parser = argparse.ArgumentParser("Consume metadata for a sample into the database") arg_parser.add_argument("-f", "--filename", required=True, help="Filename to internalize") args = arg_parser.parse_args() # Try opening the file (in binary mode), and fail if we cannot malware_file = open(args.filename, "rb") if not malware_file: sys.stderr.write("There was an error reading the file\n") sys.exit(1) # Establish a connection to the MongDB server conn = pymongo.MongoClient() # Retrieve a handle to the "cs7038" database db = conn["cs7038"] # Retrieve a handle to the "malware" collection mwcoll = db["malware"] # Empty dictionary representing the file data object malware_object = {} # We decided there are a group of values that we always want to consume: # MD5, SHA-1, SHA-256, File name, File size, File type # md5_engine = hashlib.md5() sha1_engine = hashlib.sha1() sha256_engine = hashlib.sha256() malware_bytes = malware_file.read() md5_engine.update(malware_bytes) sha1_engine.update(malware_bytes) sha256_engine.update(malware_bytes) malware_object["md5"] = md5_engine.hexdigest() malware_object["sha1"] = sha1_engine.hexdigest() malware_object["sha256"] = sha256_engine.hexdigest() malware_object["size"] = os.stat(args.filename).st_size # Check to see if there's already an object in the DB matching this # content: cur = mwcoll.find({'size': malware_object["size"], "sha256": malware_object["sha256"]}) # If the results are a list of one, then retrieve the object id and then update the object # to add the filename, if it is indeed a never-before-seen name of the file if cur.count() == 1: db_obj = cur[0] obj_id = db_obj['_id'] mwcoll.update({'_id': obj_id}, {'$addToSet': {'names': args.filename}}) # then leave, because we can assume all the other data is already there from # content analysis sys.exit(0) # Create new list with the filename provided malware_object["names"] = [os.path.basename(args.filename)] # Next, execute the "exiftool" program to identify file type and other data exiftool_proc = subprocess.Popen(["exiftool", "-t", args.filename], stdout=subprocess.PIPE) for result_line in exiftool_proc.stdout: # All entries, because of -t, are key\tvalue formatted # If these exist, also import them: # Company Name, Author, File Description, Creation or Modification time data = result_line.decode('utf-8').strip().split('\t') if data[0] == 'File Type': malware_object["type"] = data[1] elif data[0] == 'Time Stamp': malware_object["creation_time"] = data[1] elif data[0] == 'Create Date': malware_object["creation_time"] = data[1] elif data[0] == 'Modify Date': malware_object["modification_time"] = data[1] elif data[0] == 'Author': malware_object["author"] = data[1] elif data[0] == 'Company Name': malware_object["company"] = data[1] elif data[0] == 'File Description': malware_object["file description"] = data[1] # Complete exectution, then close handle exiftool_proc.wait() exiftool_proc = None # Also, we decided there are a few items that are conditional: # # Exit early if not a PE32 file if malware_object["type"] != "Win32 EXE" and malware_object["type"] != "Win32 DLL": mwcoll.insert(malware_object) print("Added to database: " + repr(malware_object)) sys.exit(0) # If file type is a PE32 EXE or DLL, then: # List of section names, compiled Time Stamp (as creation time) objdump_proc = subprocess.Popen(["objdump", "-x", args.filename], stdout=subprocess.PIPE) # Walk through results till we encounter the "Sections:" table for result_line in objdump_proc.stdout: if result_line.decode('utf-8').find("Sections:") == 0: break # Ignore heading objdump_proc.stdout.readline() malware_object['sections'] = [] # Process table for result_line in objdump_proc.stdout: cleaned_line = result_line.decode('utf-8').strip() # The SYMBOL TABLE comes after the section table, and indicates end of section table if cleaned_line.find('SYMBOL TABLE') == 0: break # Ignore lines that don't begin with a number if len(cleaned_line) == 0 or ord(cleaned_line[0]) < ord('0') or ord(cleaned_line[0]) > ord('9'): continue fields = cleaned_line.split() malware_object['sections'].append(fields[1]) objdump_proc.wait() objdump_proc = None # Insert into database mwcoll.insert(malware_object) print("Added to database: " + repr(malware_object))