#!/usr/bin/env python
"""Short script to discover and list the objects from a PDF.

The file is scanned in fixed-size chunks for object headers of the form
``<num> <gen> obj``; each object is reported together with the number of
bytes found between its header and the following ``endobj`` keyword.
"""
import argparse
import re

# Global regex matching an object definition header, e.g. b"12 0 obj".
objdef_re = re.compile(br'(\d+) (\d+) obj')

# Keyword that terminates an object body.
ENDOBJ = b'endobj'


def get_next_object(fh, chunk_size=4096):
    """Yield one dict per object found in the binary stream *fh*.

    Args:
        fh: File-like object opened in binary mode; only ``read(n)`` is used.
        chunk_size: Number of bytes requested per read (default 4096).
            Should be a positive integer.

    Yields:
        dict with the keys:
            'raw'  -- the header bytes exactly as matched (b"12 0 obj"),
            'id_0' -- first number of the header, as int,
            'id_1' -- second number of the header, as int,
            'size' -- count of bytes between the header and the next
                      ``endobj`` (or end of file when none follows).

    Note:
        ``endobj`` is located with a plain byte search, so an occurrence of
        those bytes inside an object's body ends that object early.
    """
    marker_len = len(ENDOBJ)
    buf = b''
    while True:
        chunk = fh.read(chunk_size)
        # Tack the new data onto whatever was left over from the prior pass.
        buf += chunk

        next_end = 0
        for m in objdef_re.finditer(buf):
            if m.start() < next_end:
                # Header-looking text inside the previous object's body.
                continue

            header = m.group(0)
            body_start = m.end()
            next_end = buf.find(ENDOBJ, body_start)

            # No terminator in what we have read so far: pull in more data
            # until it shows up or the file ends.
            refilled = next_end == -1
            while next_end == -1:
                more = fh.read(chunk_size)
                if not more:
                    # EOF: the unterminated object runs to the end of data.
                    next_end = len(buf)
                    break
                # Only the new bytes (plus a small overlap for a marker
                # split across reads) can contain a not-yet-seen marker.
                search_from = max(body_start, len(buf) - (marker_len - 1))
                buf += more
                next_end = buf.find(ENDOBJ, search_from)

            obj_item = {
                'raw': header,
                'id_0': int(m.group(1)),
                'id_1': int(m.group(2)),
                'size': next_end - body_start,
            }

            if refilled:
                # Keep only the data after this object's "endobj". The match
                # iterator still refers to the old buffer, so its remaining
                # matches are stale: stop here and rescan on the next pass.
                buf = buf[next_end + marker_len:]
                next_end = 0
                yield obj_item
                break

            yield obj_item

        # Drop the already-processed objects from the buffer to free memory.
        if next_end > 0:
            buf = buf[next_end + marker_len:]

        # An empty read means we reached EOF (the leftover buffer has just
        # been scanned one final time above).
        if not chunk:
            return


def process_object(o):
    """Print a one-object summary for a dict produced by get_next_object."""
    print("Object {id0}:{id1}, len={size} >>> \n{content}".format(
        id0=o['id_0'], id1=o['id_1'], size=o['size'], content=o['raw']))


def main(argv=None):
    """Parse the command line and list every object of the given PDF.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    # Set up the cmdline args
    ap = argparse.ArgumentParser(description="Lists objects from PDF")
    ap.add_argument('-f', '--filename', required=True, action='store',
                    help='PDF File name')
    args = ap.parse_args(argv)

    try:
        with open(args.filename, 'rb') as pdf_fh:
            for obj in get_next_object(pdf_fh):
                # Called once for each object
                process_object(obj)
    except EOFError:
        pass  # Silently exit if we hit EOFError
    except IOError:
        print("There was an IO Error")


if __name__ == "__main__":
    main()