#!/usr/bin/python ''' This is tekCollect! This tool will scrape specified data types out of a URL or file. @TekDefense Ian Ahl | www.TekDefense.com | 1aN0rmus@tekDefense.com *Some of the Regular Expressions were taken from http://gskinner.com/RegExr/ Version: 0.5 Changelog: .5 [+] Quick update to add the WDIR Regex. This will pull Windows directories. [+] Modified the URL regext to be less strict. .4 [+] Fixed issue where -t IP4 returned URLs [+] Added summary functions that shows what types of data are in a specified target. [+] Modified the regex for many of the data types for better results [+] Added several new data types: zip, twitter, doc, exe, MYSQL hash, Wordpress (WP) hash, IMG, FLASH [+] Modified the way summary is displayed [+] several improvements by machn1k (https://github.com/machn1k, http://twitter.com/machn1k) [+] Made some modifications based on machn1k's changes .3 [+] Added predefined data types that can be invoke with -t type .2 [+] Expanded the script to allow custom regex with a -r 'regex here' .1 [+] Replaced listTypes selction with loop [+] Tool created and can only pull md5 hashes TODO [-] Proper hash values matching [-] Ability to accept multiple --types [-] Summary sub options (Hash, Host, PII) [-] Improved menu selections & functions ''' import httplib2, re, sys, argparse dTypes = 'MD5, SHA1, SHA256, MySQL, WP (Wordpress), Domain, URL, IP4, IP6, SSN, EMAIL, CCN, Twitter, DOC, EXE, ZIP, IMG ' # Adding arguments parser = argparse.ArgumentParser(description='tekCollect is a tool that will scrape a file or website for specified data') parser.add_argument('-u', '--url', help='This option is used to search for hashes on a website') parser.add_argument('-f', '--file', help='This option is used to import a file that contains hashes') parser.add_argument('-o', '--output', help='This option will output the results to a file.') parser.add_argument('-r', '--regex', help='This option allows the user to set a custom regex value. Must encase in single or double quotes.') parser.add_argument('-t', '--type', help='This option allows a user to choose the type of data they want to pull out. Currently supports ' + dTypes) parser.add_argument('-s', '--summary', action='store_true', default=False, help='This options will show a summary of the data types in a file') args = parser.parse_args() # Setting some variables and lists regVal = '' # Initial revVal listResults = [] MD5 = '\W([a-fA-F0-9]{32})\W' SHA1 = '[a-fA-F0-9]{40}' SHA256 = '[a-fA-F0-9]{64}' LM = '[a-fA-F0-9]{32}' DOMAIN = '\W(\w+\.){1,4}(com|net|biz|cat|aero|asia|coop|info|int|jobs|mobi|museum|name|org|post|pre|tel|travel|xxx|edu|gov|mil|br|cc|ca|uk|ch|co|cx|de|fr|hk|jp|kr|nl|nr|ru|tk|ws|tw)[^a-fA-F0-9_-]' URL = '(http\:\/\/|https\:\/\/)(.+\S)' IP4 = '((?