# Original idea from Hitcon 2022 web2pdf challenge:
# https://blog.splitline.tw/hitcon-ctf-2022/#%F0%9F%93%83-web2pdf-web
# Example usage: python extract_pdf_images.py Ticket-831767.pdf -v
"""Extract file contents embedded in bitmap images in a PDF file.

Each image in the PDF is exported as a BMP; the BMP bytes are then scanned
for a payload marker, after which the payload is best-effort Base64-decoded
and raw-deflate-decompressed. Extracted BMPs and payloads are written to disk.
"""
import base64
import binascii
import re
import zlib
from argparse import ArgumentParser

import fitz  # PyMuPDF
from PIL import Image

# Verbose flag for all helpers; set from the command line in main().
VERBOSE = False

# Marker preceding the embedded payload in the BMP byte stream
# (ISO-2022 escape sequence used by the original challenge).
PAYLOAD_MARKER = b'\x1b$)C'

# Compiled once: anything outside printable ASCII plus common whitespace.
_RE_UNPRINTABLE = re.compile(r'[^\x20-\x7E\n\r\t]')


def decompress(data: bytes, chunk_size: int = 1024) -> bytes:
    """Best-effort raw-deflate (zlib, wbits=-15) decompress.

    Decompresses ``data`` chunk by chunk and stops at the first zlib error,
    returning whatever was successfully decompressed up to that point.
    Returns ``b''`` if the data is not deflate-compressed at all.
    """
    decompressor = zlib.decompressobj(wbits=-15)  # raw deflate, no zlib header
    decompressed_output = b''
    for i in range(0, len(data), chunk_size):
        chunk = data[i:i + chunk_size]
        try:
            decompressed_output += decompressor.decompress(chunk)
        except zlib.error as e:
            if VERBOSE:
                print(f"Zlib error encountered at chunk {i}: {e}. Stopping decompression.")
            break  # keep the partial output decoded so far
    return decompressed_output + decompressor.flush()


def decodeb64(encoded_data: bytes, min_b64_output_bytes: int = 12) -> bytes:
    """Best-effort Base64 decode.

    Decodes 4-byte Base64 quanta one at a time so a corrupt tail does not
    discard a valid prefix. If the total successfully decoded output is
    shorter than ``min_b64_output_bytes``, the input is assumed not to be
    Base64 and is returned as cleaned printable plain text instead.
    """
    encoded_data = encoded_data.strip()
    decoded_output = b""
    block_size = 4  # one Base64 quantum decodes independently to 3 bytes

    for i in range(0, len(encoded_data), block_size):
        block = encoded_data[i:i + block_size]
        try:
            decoded_output += base64.b64decode(block, validate=True)
        except binascii.Error:
            # Decode failed mid-stream: decide between fallback and partial.
            if len(decoded_output) < min_b64_output_bytes:
                if VERBOSE:
                    print(f"B64 decode failed after only {len(decoded_output)} bytes. Falling back to plain text.")
                # Too little decoded to trust: treat the whole input as text.
                return _clean_unprintable_bytes(encoded_data)
            if VERBOSE:
                print(f"B64 decode failed after {len(decoded_output)} bytes (>= {min_b64_output_bytes}). Returning partial output.")
            return decoded_output

    # Full decode succeeded, but a very short result is still suspicious.
    if len(decoded_output) < min_b64_output_bytes:
        if VERBOSE:
            print(f"Full B64 decode resulted in only {len(decoded_output)} bytes (< {min_b64_output_bytes}). Falling back to plain text.")
        return _clean_unprintable_bytes(encoded_data)

    return decoded_output


def _clean_unprintable_bytes(data: bytes) -> bytes:
    """Strip everything outside printable ASCII (plus \\n, \\r, \\t)."""
    # 'ignore' drops non-ASCII bytes instead of raising.
    text_input = data.decode('ascii', errors='ignore')
    cleaned_text = _RE_UNPRINTABLE.sub('', text_input)
    return cleaned_text.encode()


def extract_data(filename):
    """Extract, decode and write out the payload embedded in a BMP file.

    Reads ``filename``, takes everything after PAYLOAD_MARKER (NULs removed),
    runs the best-effort Base64 decode and deflate decompress, and writes the
    result to ``<filename>.extracted``. Returns the extracted bytes, or None
    when nothing was found or an error occurred.
    """
    try:
        with open(filename, 'rb') as f:
            data = f.read()
        data = data.partition(PAYLOAD_MARKER)[2].replace(b'\x00', b'')
        b64_decoded_data = decodeb64(data)
        decompressed_data = decompress(b64_decoded_data)
        # Prefer the most-decoded form that produced any output.
        if decompressed_data:
            data = decompressed_data
        elif b64_decoded_data:
            data = b64_decoded_data
        if VERBOSE:
            print(data)
        if data:
            extracted_filename = filename + '.extracted'
            with open(extracted_filename, 'wb') as f2:
                f2.write(data)
            print(f'Wrote extracted data to: {extracted_filename}')
            return data
    except Exception as e:
        print(f'Unexpected error extracting data: {e}')
    return None


def main():
    global VERBOSE

    parser = ArgumentParser(description="Extract file contents embedded in bitmap images in PDF file. File content may be plaintext or base64/zlib compressed. Extracted bitmap images and file contents are written to disk")
    parser.add_argument('file', help='PDF file path')
    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output')
    args = parser.parse_args()
    VERBOSE = args.verbose

    try:
        pdf_file = fitz.open(args.file)
        try:
            for page_index in range(len(pdf_file)):
                page = pdf_file[page_index]
                image_list = page.get_images(full=True)
                if not image_list:
                    continue
                if VERBOSE:
                    print(f"Found {len(image_list)} images on page {page_index + 1}")
                for image_index, img in enumerate(image_list, start=1):
                    xref = img[0]
                    try:
                        # Use PyMuPDF's pixmap to get the raw image data.
                        pix = fitz.Pixmap(pdf_file, xref)
                        # Normalize to 3-channel RGB so Image.frombytes("RGB", ...)
                        # gets the sample width it expects (grayscale/CMYK images
                        # would otherwise crash, not just alpha ones).
                        if pix.colorspace is None or pix.colorspace.n != 3:
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        if pix.alpha:
                            # Drop the alpha plane; BMP output doesn't support it.
                            pix = fitz.Pixmap(pix, 0)
                        pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                        image_filename = f"page{page_index+1}_img{image_index}.bmp"
                        pil_image.save(image_filename, "BMP")
                        if VERBOSE:
                            print(f"Saved original image as BMP: {image_filename}")
                        extract_data(image_filename)
                    except Exception as e:
                        print(f"Error processing image {image_index} on page {page_index + 1}: {e}")
                print()
        finally:
            # Close even if a page/image raises an unexpected error.
            pdf_file.close()
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()