#!/usr/bin/env python

__description__ = "Program to use Python's re.findall on files"
__author__ = 'Didier Stevens'
__version__ = '0.0.23'
__date__ = '2025/02/09'

"""
Source code put in public domain by Didier Stevens, no Copyright
https://DidierStevens.com
Use at your own risk

History:
  2013/12/06: start
  2013/12/15: added re-search.txt support, options -b -s
  2014/03/25: added ipv4
  2014/04/03: added extra regex comments
  2014/04/09: refactoring: module reextra
  2014/07/18: added manual, stdin
  2014/09/16: updated manual
  2014/09/17: added exception handling for import reextra
  2014/10/10: added options csv, grep and removeanchor
  2014/11/04: updated man
  2014/11/13: added error handling to CompileRegex
  2015/07/07: added option fullread
  2015/07/28: 0.0.2 added option dotall
  2016/07/22: fix for binary files/data
  2017/03/03: added str regex
  2017/04/10: 0.0.4 added option grepall
  2017/05/13: 0.0.5 bugfix output line
  2017/05/17: 0.0.6 added regex btc
  2017/05/18: 0.0.7 fixed regex btc, thanks @SecurityBeard
  2017/06/13: 0.0.8 added --script and --execute
  2017/09/06: 0.0.9 added option -x
  2018/06/25: 0.0.10 added regexs email-domain, url-domain and onion
  2018/06/29: 0.0.11 fixed ProcessFile for Linux/OSX
  2018/06/30: added option -e
  2018/07/28: 0.0.11 added regexes str-e, str-u and str-eu
  2018/08/28: 0.0.13 added support for user library in the current directory
  2018/09/19: Updated Quote
  2019/03/06: changed URL regex
  2020/12/08: 0.0.14 added domaintld
  2021/01/05: 0.0.15 added -n all-
  2021/01/22: added option -F
  2021/02/06: 0.0.16 changed url and url-domain regexes for _ in hostname
  2021/02/14: Fixed human language vulnerabilities
  2021/03/11: 0.0.17 added gzip support and option --encoding
  2021/04/04: fixes for binary mode
  2021/05/05: added public ips filter
  2021/09/19: 0.0.18 map Python3 fix
  2021/09/21: added sys.stdin.reconfigure
  2022/04/18: 0.0.19 Python3 fix stdin binary
  2022/05/06: 0.0.20 added input & output encoding
  2022/07/24: 0.0.21 added UNC regex
  2023/02/17: 0.0.22 added hash regexes
  2023/04/01: added str-s regexes
  2024/02/14: 0.0.23 added tab (\t) support for separator
  2025/02/09: added options --recursedir, literalfilenames and checkfilenames

Todo:
  add hostname to header
"""

import optparse
import glob
import collections
import re
import sys
import os
import pickle
import math
import textwrap
import csv
import binascii
import gzip
import fnmatch

try:
    import reextra
except:
    print("This program requires module reextra (it is a part of the re-search package).\nMake sure it is installed in Python's module repository or the same folder where re-search.py is installed.")
    exit(-1)

REGEX_STANDARD = b'[\x09\x20-\x7E]'

dLibrary = {
    'email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}',
    'email-domain': r'[a-zA-Z0-9._%+-]+@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})',
    'url': r'[a-zA-Z]+://[_-a-zA-Z0-9.]+(?:/[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?(?:\?[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?',
    'url-domain': r'[a-zA-Z]+://([_-a-zA-Z0-9.]+)(?:/[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?(?:\?[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?',
    'ipv4': r'\b(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])\b',
    'str': r'"[^"]+"',
    'str-e': r'"[^"]*"',
    'str-u': r'"([^"]+)"',
    'str-eu': r'"([^"]*)"',
    'str-s': r"'[^']+'",
    'str-se': r"'[^']*'",
    'str-su': r"'([^']+)'",
    'str-seu': r"'([^']*)'",
    'btc': r'(?#extra=P:BTCValidate)\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b',
    'onion': r'[a-zA-Z2-7]{16}\.onion',
    'domaintld': r'(?#extra=P:DomainTLDValidate)\b[a-zA-Z0-9.-]+\.[a-zA-Z-]+\b',
    'unc': r'\\\\[a-z0-9 %._-]+\\[a-z0-9 $%._-]+(?:\\[a-z0-9 $%._\\-]+)?',
    'md5': r'(?#extra=P:HashValidate)\b[0-9a-f]{32}\b',
    'sha1': r'(?#extra=P:HashValidate)\b[0-9a-f]{40}\b',
    'sha256': r'(?#extra=P:HashValidate)\b[0-9a-f]{64}\b',
    'sha512': r'(?#extra=P:HashValidate)\b[0-9a-f]{128}\b',
}
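
# Illustrative sketch, not part of the original tool: a library regex can be
# exercised directly with re.findall. The sample text below is made up.
def _ExampleLibraryLookup():
    # Only valid dotted quads match the 'ipv4' entry; every octet is validated,
    # so nothing is extracted from '999.123.1.2'.
    return re.findall(dLibrary['ipv4'], 'connect to 10.0.0.1, 8.8.8.8 or 999.123.1.2')
    # -> ['10.0.0.1', '8.8.8.8']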

dLibraryGroups = {
    'hashes': ['md5', 'sha1', 'sha256', 'sha512'],
}

excludeRegexesForAll = ['str', 'str-e', 'str-u', 'str-eu', 'str-s', 'str-se', 'str-su', 'str-seu', 'url-domain', 'email-domain', 'md5', 'sha1', 'sha256', 'sha512']

def ListLibraryNames():
    result = ''
    MergeUserLibrary()
    for key in sorted(dLibrary.keys()):
        result += ' %s%s: %s\n' % (key, IFF(key in excludeRegexesForAll, '', '*'), dLibrary[key])
    result += ' all: all names marked with *\n'
    for key in sorted(dLibraryGroups.keys()):
        result += ' %s: %s\n' % (key, ', '.join(dLibraryGroups[key]))
    return result

def PrintManual():
    manual = '''
Manual:

re-search is a program to match regular expressions. It is like grep -o: it matches regular expressions in text files, not the complete line.

It has 2 major features: a small, extendable library of regular expressions selectable by name; and extra functionality like gibberish detection, enablelists/blocklists and Python functions.

We will use this list of URLs in our examples:
http://didierstevens.com
http://zcczjhbczhbzhj.com
http://www.google.com
http://ryzaocnsyvozkd.com
http://www.microsoft.com
http://ahsnvyetdhfkg.com

Example to extract alphabetical .com domains from file list.txt with a regular expression:
re-search.py [a-z]+\.com list.txt

Output:
didierstevens.com
zcczjhbczhbzhj.com
google.com
ryzaocnsyvozkd.com
microsoft.com
ahsnvyetdhfkg.com

Example to extract URLs from file list.txt with the built-in regular expression for URLs:
re-search.py -n url list.txt

Output:
http://didierstevens.com
http://zcczjhbczhbzhj.com
http://www.google.com
http://ryzaocnsyvozkd.com
http://www.microsoft.com
http://ahsnvyetdhfkg.com

Here is a list of built-in regular expressions:
''' + ListLibraryNames() + '''
The following command will use all built-in regular expressions marked with *:
re-search.py -n all file

It's possible to exclude one or more built-in regular expressions when using -n all. Here is an example to use all regular expressions except url:
re-search.py -n all-url file

And here is an example to use all regular expressions except url and email:
re-search.py -n all-url,email file

You can also use a capture group in your regular expression. The selected text will be extracted from the first capture group:
re-search.py ([a-z]+)\.com list.txt

Output:
didierstevens
zcczjhbczhbzhj
google
ryzaocnsyvozkd
microsoft
ahsnvyetdhfkg

By default the regular expression matching is not case sensitive. You can make it case sensitive with option -c.

To surround the regular expression with boundaries (\b), use option -b.

Output can be made lowercase with option -l and unique with option -u.

Output can be saved to a file with option -o filename. And if you also want to output the regular expression used for matching, use option -d.

To get grep-like output, use option -g.

Option -r removes the anchors (^ and $) of the regular expression.

Use option -D (dotall) to make the . expression match newline characters.
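
For example (the file name is hypothetical), the following command combines several of these options to write unique, lowercased e-mail addresses from list.txt to file emails.txt:
re-search.py -n email -u -l -o emails.txt list.txt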

By default, re-search reads the file(s) line-by-line. Binary files can also be processed, but are best read completely and not line-by-line. Use option -f (fullread) to perform a full binary read of the file (and not line-by-line).

Option -e (extractstrings) will also do a full binary read of the file (like -f --fullread), and then extract all strings (ASCII and UNICODE, and at least 4 characters long) for further matching.

Option -G (grepall) will also do a full binary read of the file (like -f --fullread), but output the complete file if there is a match. This is useful to select files for further processing, like string searching.

Option -x (hex) will produce hexadecimal output.

If you have a list of regular expressions to match, put them in a csv file, and use options -v, -S, -I, -H, -R and -C. Example:
re-search.py -vHrg -o result -S , -I " " -R PCRE -C pcre.csv logs

Gibberish detection, enablelist/blocklist filtering and matching with Python functions is done by prefixing the regular expression with a comment. Regular expressions can contain comments, like programming languages. This is a comment for regular expressions: (?#comment).

If you use re-search with regular expression comments, nothing special happens:
re-search.py "(?#comment)[a-z]+\.com" list.txt

However, if your regular expression comment prefixes the regular expression, and the comment starts with keyword extra=, then you can use gibberish detection, enablelist/blocklist filtering and Python function matching.

To use gibberish detection, you use directive S (S stands for sensical). If you want to filter all strings that match the regular expression and are gibberish, you use the following regular expression comment: (?#extra=S:g). :g means that you want to filter for gibberish.

Example to extract alphabetical .com domains that are gibberish from file list.txt:
re-search.py "(?#extra=S:g)[a-z]+\.com" list.txt

Output:
zcczjhbczhbzhj.com
ryzaocnsyvozkd.com
ahsnvyetdhfkg.com

If you want to filter all strings that match the regular expression and are not gibberish, you use the following regular expression comment: (?#extra=S:s). :s means that you want to filter for sensical strings.

Example to extract alphabetical .com domains that are not gibberish from file list.txt:
re-search.py "(?#extra=S:s)[a-z]+\.com" list.txt

Output:
didierstevens.com
google.com
microsoft.com

Blocklists are defined via directive E (Exclude). If you want to filter all strings that match the regular expression and are not in the blocklist, you use the following regular expression comment: (?#extra=E:blocklist). Blocklist is a text file you provide containing all the strings to be blocklisted.

Example to extract alphabetical .com domains from file list.txt that are not in file blocklist (blocklist contains google.com):
re-search.py "(?#extra=E:blocklist)[a-z]+\.com" list.txt

Output:
didierstevens.com
zcczjhbczhbzhj.com
ryzaocnsyvozkd.com
microsoft.com
ahsnvyetdhfkg.com

Enablelists are defined via directive I (Include). If you want to filter all strings that match the regular expression and are in the enablelist, you use the following regular expression comment: (?#extra=I:enablelist). Enablelist is a text file you provide containing all the strings to be enablelisted.

Example to extract alphabetical .com domains from file list.txt that are in file enablelist (enablelist contains didierstevens.com):
re-search.py "(?#extra=I:enablelist)[a-z]+\.com" list.txt

Output:
didierstevens.com
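
An enablelist or blocklist file is expected to contain one string per line. For example, a hypothetical blocklist excluding two domains would contain the lines google.com and microsoft.com.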

Python function matching is defined via directive P (Python). If you want to validate a string with a Python function, you use the following regular expression comment: (?#extra=P:Validate). Validate is a Python function that takes a string as argument and returns a boolean: True for a match and False if there is no match. You can provide your custom Python function(s) in a file via option --script or as a commandline argument via option --execute.

Example: Bitcoin address matching.
Regular expression [13][a-km-zA-HJ-NP-Z1-9]{25,34} will match Bitcoin addresses, but also other strings that look like a Bitcoin address but are not a valid Bitcoin address. A valid Bitcoin address has a particular syntax, and a valid checksum. The regular expression can check the syntax, but not validate the checksum. Python function BTCValidate can check the checksum of a Bitcoin address. The following regular expression matches Bitcoin addresses with a valid syntax and uses Python function BTCValidate to validate the checksum:
(?#extra=P:BTCValidate)[13][a-km-zA-HJ-NP-Z1-9]{25,34}

You can use more than one directive in a regular expression. Directives are separated by the ; character.

Example to extract alphabetical .com domains from file list.txt that are not gibberish and that are not blocklisted:
re-search.py "(?#extra=S:s;E:blocklist)[a-z]+\.com" list.txt

Output:
didierstevens.com
microsoft.com

Classifying a string as gibberish or not is done with a set of classes that I developed based on work done by rrenaud at https://github.com/rrenaud/Gibberish-Detector. The training text is a public domain book in the Sherlock Holmes series. This means that English text is used for gibberish classification. You can provide your own trained pickle file with option -s.

You can extend the library of regular expressions used by re-search without changing the program source code. Create a text file named re-search.txt located in the same directory as re-search.py and/or the current directory. For each regular expression you want to add to the library, enter a line with format name=regex. Comments (lines starting with #) are ignored.

Here is an example for MAC addresses:
mac=[0-9A-F]{2}([-:]?)(?:[0-9A-F]{2}\1){4}[0-9A-F]{2}

re-search.py requires module reextra, which is part of the re-search package.

Option -F can be used to filter matched strings. Use option -F ? to get a list of available filters.

The content of folders can be processed too: use option --recursedir and provide folder names as arguments. Wildcards and here files (for folder names) can be used too.

To prevent the tool from processing file arguments with wildcard characters or special initial characters (@ and #) differently, but to process them as normal files, use option --literalfilenames.

Use --checkfilenames to test the existence of files before processing.
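
For example, assuming a custom validation function named HasDigit (hypothetical), option --execute can supply it on the command line to keep only matches containing a digit:
re-search.py --execute "def HasDigit(s): return any(c.isdigit() for c in s)" "(?#extra=P:HasDigit)[a-z0-9]+\.com" list.txt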
'''
    for line in manual.split('\n'):
        print(textwrap.fill(line))

QUOTE = '"'

def IfWIN32SetBinary(io):
    if sys.platform == 'win32':
        import msvcrt
        msvcrt.setmode(io.fileno(), os.O_BINARY)

def ToString(value):
    if type(value) == type(''):
        return value
    else:
        return str(value)

def Quote(value, separator, quote):
    value = ToString(value)
    if value[0] == quote and value[-1] == quote:
        return value
    if separator in value:
        return quote + value + quote
    else:
        return value

def MakeCSVLine(row, separator, quote):
    return separator.join([Quote(value, separator, quote) for value in row])

def File2Strings(filename):
    try:
        f = open(filename, 'r')
    except:
        return None
    try:
        return list(map(lambda line: line.rstrip('\n'), f.readlines()))
    except:
        return None
    finally:
        f.close()

def ProcessAt(argument):
    if argument.startswith('@'):
        strings = File2Strings(argument[1:])
        if strings == None:
            raise Exception('Error reading %s' % argument)
        else:
            return strings
    else:
        return [argument]

# CIC: Call If Callable
def CIC(expression):
    if callable(expression):
        return expression()
    else:
        return expression

# IFF: IF Function
def IFF(expression, valueTrue, valueFalse):
    if expression:
        return CIC(valueTrue)
    else:
        return CIC(valueFalse)

def PrintError(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

# Fix for http://bugs.python.org/issue11395
def StdoutWriteChunked(data):
    while data != '':
        sys.stdout.write(data[0:10000])
        try:
            sys.stdout.flush()
        except IOError:
            return
        data = data[10000:]

class cOutput():
    def __init__(self, grepall, filename=None, encoding=''):
        self.grepall = grepall
        self.filename = filename
        self.encoding = encoding
        self.encodingvalue, self.errorsvalue = ParseOptionEncoding('o', self.encoding)
        if self.filename and self.filename != '':
            if self.grepall:
                self.f = open(self.filename, 'wb')
            else:
                self.f = open(self.filename, 'w', encoding=self.encodingvalue, errors=self.errorsvalue)
        else:
            self.f = None
            if self.grepall:
                IfWIN32SetBinary(sys.stdout)

    def Line(self, line):
        if self.grepall:
            if self.f:
                self.f.write(line)
            else:
                StdoutWriteChunked(line)
        else:
            if self.f:
                self.f.write(line + '\n')
            else:
                print(line)

    def Close(self):
        if self.f:
            self.f.close()
            self.f = None

def ExpandFilenameArguments(filenames):
    return list(collections.OrderedDict.fromkeys(sum(map(glob.glob, sum(map(ProcessAt, filenames), [])), [])))

def PrintLibrary():
    global dLibrary
    global dLibraryGroups

    print('Valid regex library names:')
    for key in sorted(dLibrary.keys()):
        print(' %s: %s' % (key, dLibrary[key]))
    print('')
    print('Valid regex library group names:')
    for key in sorted(dLibraryGroups.keys()):
        print(' %s: %s' % (key, ', '.join(dLibraryGroups[key])))

def MergeUserLibrarySub(filename):
    global dLibrary

    lines = File2Strings(filename)
    if not lines:
        return
    for line in lines:
        if not line.startswith('#'):
            result = line.split('=', 1)
            if len(result) == 2:
                dLibrary[result[0]] = result[1]

def MergeUserLibrary():
    MergeUserLibrarySub(os.path.splitext(sys.argv[0])[0] + '.txt')
    MergeUserLibrarySub(os.path.splitext(os.path.basename(sys.argv[0]))[0] + '.txt')

def Library(name):
    global dLibrary

    MergeUserLibrary()
    try:
        return dLibrary[name]
    except KeyError:
        print('Invalid regex library name: %s' % name)
        print('')
        PrintLibrary()
        sys.exit(-1)

def LibraryAllNames():
    global dLibrary

    MergeUserLibrary()
    return sorted(dLibrary.keys())
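
# Illustrative sketch, not part of the original tool: IFF evaluates callables
# lazily (via CIC), which is why branches that must not run eagerly are passed
# as lambdas elsewhere in this program.
def _ExampleIFF():
    line = 'MiXeD'
    # The lambda is only invoked because the condition is True; with a False
    # condition the plain value branch would be returned unevaluated.
    return IFF(True, lambda: line.lower(), line)  # -> 'mixed'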
class cOutputResult():
    def __init__(self, options):
        if options.output:
            self.oOutput = cOutput(options.grepall, options.output, options.encoding)
        else:
            self.oOutput = cOutput(options.grepall, None, options.encoding)
        self.options = options
        self.dLines = {}

    def Line(self, line):
        if self.options.grepall:
            self.oOutput.Line(line)
        else:
            line = IFF(self.options.lower, lambda: line.lower(), line)
            if not line in self.dLines:
                self.oOutput.Line(line)
            if self.options.unique and not line in self.dLines:
                self.dLines[line] = True

    def Close(self):
        self.oOutput.Close()

def CompileRegex(regex, options):
    regex = IFF(options.name, lambda: Library(regex), regex)
    if options.removeanchor:
        regex = IFF(regex.startswith('^'), regex[1:], regex)
        regex = IFF(regex.endswith('$'), regex[:-1], regex)
    regex = IFF(options.boundary, '\\b%s\\b' % regex, regex)
    try:
        # Option -D must add the re.DOTALL flag when set.
        oREExtra = reextra.cREExtra(regex, IFF(options.casesensitive, 0, re.IGNORECASE) + IFF(options.dotall, re.DOTALL, 0), options.sensical)
    except:
        raise Exception('Error regex: %s' % regex)
    return regex, oREExtra

def ProcessFile(fIn, fullread, fType, options):
    if fullread:
        yield fIn.read()
    else:
        for line in fIn:
            if fType == 2:
                line = line.decode(*ParseOptionEncoding('i', options.encoding))
            yield line.strip('\n\r')

def Hex(data, dohex):
    if dohex:
        return binascii.b2a_hex(data)
    else:
        return data

def ExtractStringsASCII(data):
    regex = REGEX_STANDARD + b'{%d,}'
    return re.findall(regex % 4, data)

def ExtractStringsUNICODE(data):
    regex = b'((' + REGEX_STANDARD + b'\x00){%d,})'
    return [foundunicodestring.replace(b'\x00', b'') for foundunicodestring, dummy in re.findall(regex % 4, data)]

def ExtractStrings(data):
    return ExtractStringsASCII(data) + ExtractStringsUNICODE(data)

def DumpFunctionStrings(data):
    return ''.join([extractedstring.decode() + '\n' for extractedstring in ExtractStrings(data)])

def ParseOptionEncodingSub2(encoding):
    if encoding == '':
        encodingvalue = 'utf8'
        errorsvalue = 'surrogateescape'
    elif ':' in encoding:
        encodingvalue, errorsvalue = encoding.split(':', 1)
    else:
        encodingvalue = encoding
        errorsvalue = None
    return encodingvalue, errorsvalue

def ParseOptionEncodingSub(entry):
    if not entry.startswith('i=') and not entry.startswith('o='):
        entry = 'i=' + entry
    stream, encoding = entry.split('=', 1)
    encodingvalue, errorsvalue = ParseOptionEncodingSub2(encoding)
    return stream, encodingvalue, errorsvalue

def ParseOptionEncoding(streamId, encoding):
    dStreamsPresent = {'i': False, 'o': False}
    dStreams = {'i': ['utf8', 'surrogateescape'], 'o': ['utf8', 'surrogateescape']}
    if encoding != '':
        for entry in encoding.split(','):
            stream, encodingvalue, errorsvalue = ParseOptionEncodingSub(entry)
            if dStreamsPresent[stream]:
                raise Exception('Encoding option error: %s' % encoding)
            else:
                dStreamsPresent[stream] = True
                dStreams[stream] = [encodingvalue, errorsvalue]
    return dStreams[streamId]

def MinimalPythonVersion(major, minor):
    if sys.version_info[0] < major:
        return False
    if sys.version_info[0] > major:
        return True
    return sys.version_info[1] >= minor
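
# Illustrative sketch, not part of the original tool: what option -e extracts
# from a small, made-up byte sequence. Runs of at least 4 printable characters
# are recovered, both ASCII and UTF-16LE ("UNICODE").
def _ExampleExtractStrings():
    data = b'\x00\x01Hello\x01W\x00i\x00d\x00e\x00\x02ab'
    # -> [b'Hello', b'Wide']: 'Hello' is a 5-character ASCII run, 'Wide' a
    #    4-character UTF-16LE run; 'ab' is too short to qualify.
    return ExtractStrings(data)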
def RESearchSingle(regex, filenames, oOutput, options):
    if options.name and regex == 'all':
        regexes = [CompileRegex(name, options) for name in LibraryAllNames() if not name in excludeRegexesForAll]
    elif options.name and regex.startswith('all-'):
        regexes = [CompileRegex(name, options) for name in LibraryAllNames() if not name in excludeRegexesForAll and not name in regex[4:].split(',')]
    elif options.name and regex in dLibraryGroups:
        regexes = [CompileRegex(name, options) for name in LibraryAllNames() if name in dLibraryGroups[regex]]
    else:
        regexes = [CompileRegex(regex, options)]
    for filename in filenames:
        if filename == '':
            if options.fullread or options.extractstrings or options.grepall:
                if sys.version_info[0] == 2:
                    IfWIN32SetBinary(sys.stdin)
                    fIn = sys.stdin  # Python 2: stdin is binary after setmode on Windows
                else:
                    fIn = sys.stdin.buffer
            elif MinimalPythonVersion(3, 7):
                sys.stdin.reconfigure(encoding=ParseOptionEncoding('i', options.encoding)[0], errors=ParseOptionEncoding('i', options.encoding)[1])
                fIn = sys.stdin
            else:
                fIn = sys.stdin
            fType = 1
        elif os.path.splitext(filename)[1].lower() == '.gz':
            fIn = gzip.GzipFile(filename, 'rb')
            fType = 2
        else:
            if options.fullread or options.extractstrings or options.grepall:
                fIn = open(filename, 'rb')
            else:
                fIn = open(filename, 'r', encoding=ParseOptionEncoding('i', options.encoding)[0], errors=ParseOptionEncoding('i', options.encoding)[1])
            fType = 3
        for line in ProcessFile(fIn, options.fullread or options.extractstrings or options.grepall, fType, options):
            if options.extractstrings:
                line = DumpFunctionStrings(line)
            for regex, oREExtra in regexes:
                if options.display:
                    oOutput.Line('Regex: %s' % regex)
                results = oREExtra.Findall(line)
                results = ApplyFilter(results, options)
                if options.grepall or options.grep:
                    if results != []:
                        oOutput.Line(Hex(line, options.hex))
                else:
                    for result in results:
                        if isinstance(result, str):
                            oOutput.Line(Hex(result, options.hex))
                        if isinstance(result, tuple):
                            oOutput.Line(Hex(result[0], options.hex))
        if fType != 1:
            fIn.close()
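
# Illustrative sketch, not part of the original tool: the layout RESearchCSV
# expects from a regex CSV file (the names and patterns below are hypothetical).
# With option -H the first row is a header and -R/-C select the regex and
# comment columns by title; without -H they are numeric column indices.
# A file using the default separator ; could look like this:
#
#   PCRE;Comment
#   [a-z]+\.com;alphabetical .com domains
#   [0-9]{1,3}(\.[0-9]{1,3}){3};dotted quads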
def RESearchCSV(csvFilename, filenames, oOutput, options):
    reader = csv.reader(open(csvFilename, 'r'), delimiter=options.separatorcsv, skipinitialspace=False, quoting=IFF(options.unquoted, csv.QUOTE_NONE, csv.QUOTE_MINIMAL))
    indexRegex = 0
    indexComment = None
    if not options.header:
        if options.regexindex != '':
            indexRegex = int(options.regexindex)
        if options.commentindex != '':
            indexComment = int(options.commentindex)
    firstRow = True
    dRegex = {}
    for row in reader:
        if options.header and firstRow:
            firstRow = False
            if options.regexindex != '':
                indexRegex = row.index(options.regexindex)
            if options.commentindex != '':
                indexComment = row.index(options.commentindex)
            continue
        regex, oREExtra = CompileRegex(row[indexRegex], options)
        if options.display:
            oOutput.Line('Regex: %s' % row[indexRegex])
        dRegex[regex] = (oREExtra, IFF(indexComment == None, None, lambda: row[indexComment]))
    for filename in filenames:
        if filename == '':
            if options.fullread or options.extractstrings or options.grepall:
                if sys.version_info[0] == 2:
                    IfWIN32SetBinary(sys.stdin)
                    fIn = sys.stdin  # Python 2: stdin is binary after setmode on Windows
                else:
                    fIn = sys.stdin.buffer
            elif MinimalPythonVersion(3, 7):
                sys.stdin.reconfigure(encoding=ParseOptionEncoding('i', options.encoding)[0], errors=ParseOptionEncoding('i', options.encoding)[1])
                fIn = sys.stdin
            else:
                fIn = sys.stdin
            fType = 1
        elif os.path.splitext(filename)[1].lower() == '.gz':
            fIn = gzip.GzipFile(filename, 'rb')
            fType = 2
        else:
            # Open binary or text, mirroring RESearchSingle: an encoding argument
            # is only valid for text mode.
            if options.fullread or options.extractstrings or options.grepall:
                fIn = open(filename, 'rb')
            else:
                fIn = open(filename, 'r', encoding=ParseOptionEncoding('i', options.encoding)[0], errors=ParseOptionEncoding('i', options.encoding)[1])
            fType = 3
        for line in ProcessFile(fIn, options.fullread or options.extractstrings or options.grepall, fType, options):
            if options.extractstrings:
                line = DumpFunctionStrings(line)
            for regex, (oREExtra, comment) in dRegex.items():
                results = oREExtra.Findall(line)
                results = ApplyFilter(results, options)
                newRow = [regex]
                if comment != None:
                    newRow.append(comment)
                if options.grep:
                    if results != []:
                        if options.separatorinput == '':
                            newRow.append(line)
                            outputLine = MakeCSVLine(newRow, options.separatorcsv, QUOTE)
                        else:
                            outputLine = MakeCSVLine(newRow, options.separatorinput, QUOTE) + options.separatorinput + line
                        oOutput.Line(outputLine)
                else:
                    for result in results:
                        if isinstance(result, str):
                            if options.separatorinput == '':
                                newRow.append(result)
                                outputLine = MakeCSVLine(newRow, options.separatorcsv, QUOTE)
                            else:
                                outputLine = MakeCSVLine(newRow, options.separatorinput, QUOTE) + options.separatorinput + result
                        if isinstance(result, tuple):
                            if options.separatorinput == '':
                                newRow.append(result[0])
                                outputLine = MakeCSVLine(newRow, options.separatorcsv, QUOTE)
                            else:
                                outputLine = MakeCSVLine(newRow, options.separatorinput, QUOTE) + options.separatorinput + result[0]
                        oOutput.Line(outputLine)
        if fType != 1:
            fIn.close()

def RESearch(regex, filenames, options):
    oOutput = cOutputResult(options)
    if options.script != '':
        reextra.Script(options.script)
    if options.execute != '':
        reextra.Execute(options.execute)
    if options.csv:
        RESearchCSV(regex, filenames, oOutput, options)
    else:
        RESearchSingle(regex, filenames, oOutput, options)
    oOutput.Close()

def KeepOfficeURL(url):
    url = url.lower()
    for urlfilter in ['http://schemas.openxmlformats.org/', 'http://schemas.microsoft.com/', 'http://purl.org/', 'http://www.w3.org/']:
        if url.startswith(urlfilter):
            return False
    return True

def RemoveOfficeURLs(urls):
    return [url for url in urls if KeepOfficeURL(url)]

def IsPrivateIPv4(ipv4):
    if ipv4.startswith('192.168.'):
        return True
    if ipv4.startswith('10.'):
        return True
    if ipv4.startswith('172.'):
        result = ipv4.split('.')
        if len(result) >= 2:
            secondNumber = result[1]
            if secondNumber in ['16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']:
                return True
    if ipv4.startswith('127.'):
        return True
    if ipv4.startswith('169.254.'):
        return True
    return False

def RemovePrivateIPv4s(ipv4s):
    return [ipv4 for ipv4 in ipv4s if not IsPrivateIPv4(ipv4)]

dFilters = {
    'officeurls': ['Remove URLs that are common in OOXML Office documents', RemoveOfficeURLs],
    'publicipv4s': ['Remove IPv4 addresses that are private', RemovePrivateIPv4s],
}

def ApplyFilter(result, options):
    if options.filter == '':
        return result
    if options.filter in dFilters:
        return dFilters[options.filter][1](result)
    raise Exception('Unknown filter: %s' % options.filter)
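
# Illustrative sketch, not part of the original tool: the publicipv4s filter
# (option -F publicipv4s) drops private, loopback and link-local addresses.
# The sample addresses are made up.
def _ExamplePublicIPv4Filter():
    sample = ['10.1.2.3', '8.8.8.8', '172.16.0.1', '192.168.0.5', '192.0.2.1']
    return RemovePrivateIPv4s(sample)  # -> ['8.8.8.8', '192.0.2.1']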
class cExpandFilenameArguments():
    def __init__(self, filenames, literalfilenames=False, recursedir=False, checkfilenames=False, expressionprefix=None):
        self.containsUnixShellStyleWildcards = False
        self.warning = False
        self.message = ''
        self.filenameexpressions = []
        self.expressionprefix = expressionprefix
        self.literalfilenames = literalfilenames
        expression = ''
        if len(filenames) == 0:
            self.filenameexpressions = [['', '']]
        elif literalfilenames:
            self.filenameexpressions = [[filename, ''] for filename in filenames]
        elif recursedir:
            for dirwildcard in filenames:
                if expressionprefix != None and dirwildcard.startswith(expressionprefix):
                    expression = dirwildcard[len(expressionprefix):]
                else:
                    if dirwildcard.startswith('@'):
                        for filename in ProcessAt(dirwildcard):
                            self.filenameexpressions.append([filename, expression])
                    elif os.path.isfile(dirwildcard):
                        self.filenameexpressions.append([dirwildcard, expression])
                    else:
                        if os.path.isdir(dirwildcard):
                            dirname = dirwildcard
                            basename = '*'
                        else:
                            dirname, basename = os.path.split(dirwildcard)
                            if dirname == '':
                                dirname = '.'
                        for path, dirs, files in os.walk(dirname):
                            for filename in fnmatch.filter(files, basename):
                                self.filenameexpressions.append([os.path.join(path, filename), expression])
        else:
            for filename in list(collections.OrderedDict.fromkeys(sum(map(self.Glob, sum(map(ProcessAt, filenames), [])), []))):
                if expressionprefix != None and filename.startswith(expressionprefix):
                    expression = filename[len(expressionprefix):]
                else:
                    self.filenameexpressions.append([filename, expression])
            self.warning = self.containsUnixShellStyleWildcards and len(self.filenameexpressions) == 0
            if self.warning:
                self.message = "Your filename argument(s) contain Unix shell-style wildcards, but no files were matched.\nCheck your wildcard patterns or use option literalfilenames if you don't want wildcard pattern matching."
                return
        if self.filenameexpressions == [] and expression != '':
            self.filenameexpressions = [['', expression]]
        if checkfilenames:
            self.CheckIfFilesAreValid()

    def Glob(self, filename):
        if not ('?' in filename or '*' in filename or ('[' in filename and ']' in filename)):
            return [filename]
        self.containsUnixShellStyleWildcards = True
        return glob.glob(filename)

    def CheckIfFilesAreValid(self):
        valid = []
        doesnotexist = []
        isnotafile = []
        for filename, expression in self.filenameexpressions:
            hashfile = False
            try:
                hashfile = FilenameCheckHash(filename, self.literalfilenames)[0] == FCH_DATA
            except:
                pass
            if filename == '' or hashfile:
                valid.append([filename, expression])
            elif not os.path.exists(filename):
                doesnotexist.append(filename)
            elif not os.path.isfile(filename):
                isnotafile.append(filename)
            else:
                valid.append([filename, expression])
        self.filenameexpressions = valid
        if len(doesnotexist) > 0:
            self.warning = True
            self.message += 'The following files do not exist and will be skipped: ' + ' '.join(doesnotexist) + '\n'
        if len(isnotafile) > 0:
            self.warning = True
            self.message += 'The following files are not regular files and will be skipped: ' + ' '.join(isnotafile) + '\n'

    def Filenames(self):
        if self.expressionprefix == None:
            return [filename for filename, expression in self.filenameexpressions]
        else:
            return self.filenameexpressions
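
# Illustrative sketch, not part of the original tool: how the filename options
# map onto cExpandFilenameArguments. The file and folder names are hypothetical.
def _ExampleExpandFilenames():
    # With literalfilenames, arguments are taken as-is: no wildcard expansion.
    literal = cExpandFilenameArguments(['*.log'], literalfilenames=True)
    # With recursedir, a folder argument is walked recursively for files.
    recursed = cExpandFilenameArguments(['logs'], recursedir=True)
    return literal.Filenames(), recursed.Filenames()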
def Main():
    global dLibrary

    moredesc = '''

Arguments:
@file: process each file listed in the text file specified
wildcards are supported

Valid regex library names:
'''
    moredesc += ListLibraryNames()
    moredesc += '''
Source code put in the public domain by Didier Stevens, no Copyright
Use at your own risk
https://DidierStevens.com'''

    oParser = optparse.OptionParser(usage='usage: %prog [options] regex [[@]file ...]\n' + __description__ + moredesc, version='%prog ' + __version__)
    oParser.add_option('-n', '--name', action='store_true', default=False, help='The regex argument is a name of a library regex')
    oParser.add_option('-c', '--casesensitive', action='store_true', default=False, help='Make search case-sensitive')
    oParser.add_option('-l', '--lower', action='store_true', default=False, help='Lowercase output')
    oParser.add_option('-u', '--unique', action='store_true', default=False, help='Unique output')
    oParser.add_option('-o', '--output', type=str, default='', help='Output to file')
    oParser.add_option('-b', '--boundary', action='store_true', default=False, help='Add boundaries (\\b) around the regex')
    oParser.add_option('-d', '--display', action='store_true', default=False, help='Display the regex')
    oParser.add_option('-s', '--sensical', default='', help='Sensical pickle file')
    oParser.add_option('-m', '--man', action='store_true', default=False, help='Print manual')
    oParser.add_option('-g', '--grep', action='store_true', default=False, help='Output the complete line, like grep (without -o)')
    oParser.add_option('-r', '--removeanchor', action='store_true', default=False, help='Remove anchors of regex starting with ^ and/or ending with $')
    oParser.add_option('-v', '--csv', action='store_true', default=False, help='First argument is a CSV file with regular expressions')
    oParser.add_option('-S', '--separatorcsv', default=';', help='Separator character for CSV file (default ;)')
    oParser.add_option('-I', '--separatorinput', default='', help='Separator character for input file (default none)')
    oParser.add_option('-U', '--unquoted', action='store_true', default=False, help='No handling of quotes in CSV file')
    oParser.add_option('-H', '--header', action='store_true', default=False, help='Header')
    oParser.add_option('-R', '--regexindex', default='', help='Index or title of the regex column in the CSV file')
    oParser.add_option('-C', '--commentindex', default='', help='Index or title of the comment column in the CSV file')
    oParser.add_option('-f', '--fullread', action='store_true', default=False, help='Do a full binary read of the input, not line per line')
    oParser.add_option('-e', '--extractstrings', action='store_true', default=False, help='Do a full binary read of the input, and extract strings for matching')
    oParser.add_option('-G', '--grepall', action='store_true', default=False, help='Do a full read of the input and a full write when there is a match, not line per line')
    oParser.add_option('-D', '--dotall', action='store_true', default=False, help='. matches newline too')
    oParser.add_option('-x', '--hex', action='store_true', default=False, help='Output in hex format')
    oParser.add_option('-F', '--filter', default='', help='Filter output')
    oParser.add_option('--script', default='', help='Python script file with definitions to include')
    oParser.add_option('--execute', default='', help='Python commands to execute')
    oParser.add_option('--encoding', type=str, default='', help='Encoding for file open')
    oParser.add_option('--literalfilenames', action='store_true', default=False, help='Do not interpret filenames')
    oParser.add_option('--recursedir', action='store_true', default=False, help='Recurse directories (wildcards and here files (@...) allowed)')
    oParser.add_option('--checkfilenames', action='store_true', default=False, help='Perform check if files exist prior to file processing')
    (options, args) = oParser.parse_args()

    if options.man:
        oParser.print_help()
        PrintManual()
        return

    if options.separatorcsv == r'\t':
        options.separatorcsv = '\t'

    if options.filter != '' and not options.filter in dFilters:
        if options.filter != '?':
            print('Unknown filter: %s' % options.filter)
        print('List of available filters:')
        for key, [description, function] in dFilters.items():
            print(' %s: %s' % (key, description))
        return

    if len(args) == 0:
        oParser.print_help()
    elif len(args) == 1:
        RESearch(args[0], [''], options)
    else:
        oExpandFilenameArguments = cExpandFilenameArguments(args[1:], options.literalfilenames, options.recursedir, options.checkfilenames)
        if oExpandFilenameArguments.warning:
            PrintError('\nWarning:')
            PrintError(oExpandFilenameArguments.message)
        RESearch(args[0], oExpandFilenameArguments.Filenames(), options)

if __name__ == '__main__':
    Main()
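
# Example invocations (file names are hypothetical), summarizing common options:
#   re-search.py -n url page.html                      built-in URL regex
#   re-search.py -n all-url,email list.txt             all *-marked regexes except two
#   re-search.py -u -l "[a-z]+\.com" list.txt          unique, lowercased matches
#   re-search.py "(?#extra=S:g)[a-z]+\.com" list.txt   keep only gibberish matches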