#!/usr/bin/env python
import argparse
import re
#
# Short script to discover and list the objects from a PDF
#

# Command-line interface: a single mandatory option naming the PDF to scan
ap = argparse.ArgumentParser(description="Lists objects from PDF")
ap.add_argument(
    '-f', '--filename',
    help='PDF File name',
    action='store',
    required=True,
)
args = ap.parse_args()

# Define a global regex to look for an object definition header ("<id> <gen> obj")
objdef_re = re.compile(br'(\d+) (\d+) obj')

def get_next_object(fh):
    """Yield one summary dict per PDF object found in a binary stream.

    The stream is consumed in 4096-byte reads.  An object starts at a
    ``<id> <generation> obj`` header and runs up to the next ``endobj``
    keyword, or to the end of the stream when that keyword never appears.
    Header-like text inside an object's body is treated as part of that
    body and is never reported as an object of its own, regardless of
    where the read boundaries happen to fall.

    Args:
        fh: Binary file-like object; only ``fh.read(size)`` is called.

    Yields:
        dict with the keys
            'raw'  -- the matched header bytes, e.g. ``b'12 0 obj'``
            'id_0' -- first number of the header, as int
            'id_1' -- second number of the header, as int
            'size' -- number of bytes between the header and ``endobj``
    """
    chunk_size = 4096
    end_marker = b'endobj'
    buf = b''

    while True:
        # Always search the *current* buffer.  Matches computed on an older
        # buffer must never be reused once the buffer has been trimmed,
        # because their offsets no longer line up with the data.
        header = objdef_re.search(buf)
        if header is None:
            chunk = fh.read(chunk_size)
            if not chunk:
                # EOF and no further header in what is left
                return
            # Keep the unmatched data: it may end with the first part of a
            # header that is completed by the next read.
            buf += chunk
            continue

        body_start = header.end()
        body_end = buf.find(end_marker, body_start)
        while body_end == -1:
            chunk = fh.read(chunk_size)
            if not chunk:
                # No terminating keyword before EOF: the object takes
                # everything that remains.
                body_end = len(buf)
                break
            # Everything up to the old end was already searched; back up just
            # far enough to catch a keyword split across the two reads.
            resume_at = max(body_start, len(buf) - len(end_marker) + 1)
            buf += chunk
            body_end = buf.find(end_marker, resume_at)

        # Return the parsed object contents
        yield {'raw': header.group(0),
               'id_0': int(header.group(1)),
               'id_1': int(header.group(2)),
               'size': body_end - body_start}

        # Drop the object just reported (and its end keyword) to return
        # memory; the data after it is kept for the next header search.
        buf = buf[body_end + len(end_marker):]

def process_object(o):
    """Print a one-line summary of a parsed object.

    Args:
        o: Mapping with the keys 'id_0', 'id_1', 'size' and 'raw'.
    """
    template = "Object {id0}:{id1}, len={size} >>> {content}"
    summary = template.format(
        id0=o['id_0'],
        id1=o['id_1'],
        size=o['size'],
        content=o['raw'],
    )
    print(summary)

try:
    with open(args.filename, 'rb') as pdf_fh:
        # Stream the file: each object is reported as soon as it is parsed
        for obj in get_next_object(pdf_fh):
            process_object(obj)
except IOError:
    print("There was an IO Error")
except EOFError:
    # Silently exit if we hit EOFError
    pass