#!/usr/bin/env python3

# script to craft MD5 collisions of a PDF and a PE

# Ange Albertini 2019-2021

import hashlib
import os
import struct
import subprocess
import sys


# PDF functions ###############################################################

def EnclosedString(d, starts, ends):
    """Return the bytes found in `d` between `starts` and the next `ends`.

    Raises ValueError when either delimiter is missing, instead of returning
    a slice computed from the -1 that `bytes.find` reports on failure.
    """
    begin = d.find(starts)
    if begin == -1:
        raise ValueError("start marker %r not found" % (starts,))
    off = begin + len(starts)
    end = d.find(ends, off)
    if end == -1:
        raise ValueError("end marker %r not found" % (ends,))
    return d[off:end]


def getCount(d):
    """Return the integer that follows the first `/Count ` entry in `d`."""
    return int(EnclosedString(d, b"/Count ", b"/"))


# Head of the generated PDF: object 1 is a stream carrying the PE, object 2
# holds its length, objects 3 and 4 are the catalog and the page tree.
# NOTE(review): in the reviewed copy both the stream dictionary and the page
# tree dictionary were empty (`<>`), which looks like angle-bracket content
# lost in extraction. They are reconstructed here from how the script uses
# them (`count`, `kids`, and the `len(b"2 0 R")` correction below) - confirm
# against the upstream file.
template = b"""%%PDF-1.3
%%\xC2\xB5\xC2\xB6

1 0 obj
<</Length 2 0 R>>
stream
%(pe)s
endstream
endobj

2 0 obj
%(lenPE)i
endobj

3 0 obj
<<
  /Type /Catalog
  /Pages 4 0 R
>>
endobj

4 0 obj
<</Type/Pages/Count %(count)i/Kids[%(kids)s]>>
endobj

"""


def _render_template(pe, lenPE, count, kids):
    """Fill `template`; bytes %-formatting needs a mapping with bytes keys."""
    return template % {
        b"pe": pe,
        b"lenPE": lenPE,
        b"count": count,
        b"kids": kids,
    }


# PE functions ################################################################

def getPEhdr(d):
    """Locate the PE header in `d` and return its layout.

    Returns a tuple (PEoffset, PElen, SecCount, SecTblOff, SectsStart):
    offset of the `PE\\0\\0` signature in `d`, length from the signature to
    the end of the section table, number of sections, offset of the section
    table in `d`, and the raw file offset stored in the first section entry.

    Exits with an error message when the signature is missing or the machine
    type is neither 0x014C (32-bit) nor 0x8664 (64-bit).
    """
    PEoffset = d.find(b"PE\0\0")
    if PEoffset == -1:
        sys.exit("ERROR: PE signature not found")

    # PE fields are little-endian and unsigned whatever the host is.
    Machine = struct.unpack_from("<H", d, PEoffset + 4)[0]
    SecCount = struct.unpack_from("<H", d, PEoffset + 6)[0]

    if Machine == 0x014C:
        bits = 32
    elif Machine == 0x8664:
        bits = 64
    else:
        sys.exit("ERROR: unknown arch")

    # offset (from the signature) of the data directory count
    NumDiffOff = 0x74 if bits == 32 else 0x84
    NumDD = struct.unpack_from("<I", d, PEoffset + NumDiffOff)[0]

    # the section table follows the data directories (2 dwords each)
    SecTblOff = NumDiffOff + 4 + NumDD * 2 * 4

    # get the offset of the first section
    SectsStart = struct.unpack_from("<I", d, PEoffset + SecTblOff + 0x14)[0]

    PElen = SecTblOff + SecCount * 0x28

    return PEoffset, PElen, SecCount, PEoffset + SecTblOff, SectsStart


def relocateSections(d, SecTblOff, SecCount, delta):
    """Return a copy of `d` with every section's raw file offset moved.

    `SecTblOff` is the offset of the section table in `d`; `delta` is added
    to the dword at +0x14 of each of the `SecCount` 0x28-byte entries.
    """
    patched = bytearray(d)  # patch in place rather than re-joining per entry
    for i in range(SecCount):
        offset = SecTblOff + i * 0x28 + 0x14
        PhysOffset = struct.unpack_from("<i", patched, offset)[0]
        struct.pack_into("<i", patched, offset, PhysOffset + delta)
    return bytes(patched)


# Prefix constants ############################################################

# required offset of the PE header after the prefix
PEOFFSET = 0x2C0

# where section starts
ALIGN = 0x1000

# amount of stuff to copy before sections start in case (for UPX)
SECTIONEXTRA = 0x00


# main ########################################################################

def _read(path):
    """Return the whole content of the file at `path` as bytes."""
    with open(path, "rb") as f:
        return f.read()


def main():
    """Build collision1.pdf and collision2.exe from a PDF and a PE file."""
    if len(sys.argv) < 3:
        print("PDF-PE MD5 collider")
        print("Usage: pdf-pe.py <file.pdf> <file.exe>")
        sys.exit()

    pdf_path, pe_path = sys.argv[1], sys.argv[2]

    pe = _read(pe_path)
    if not pe.startswith(b"MZ"):
        sys.exit("ERROR: %s does not start with an MZ signature" % pe_path)

    PEoff, HdrLen, NumSec, SecTblOff, SectsStart = getPEhdr(pe)
    lenPE = len(pe) - PEoff

    # argument list rather than a shell string: safe with spaces in the path
    subprocess.run(
        ["mutool", "merge", "-o", "merged.pdf", "dummy.pdf", pdf_path]
    )
    dm = _read("merged.pdf")

    count = getCount(dm) - 1
    kids = EnclosedString(dm, b"/Kids[", b"]")

    # we skip the first dummy that should be 4 0 R because of the
    # `mutool merge`
    if not kids.startswith(b"4 0 R "):
        sys.exit("ERROR: unexpected /Kids layout in merged.pdf")
    kids = kids[6:]

    dm = dm[dm.find(b"5 0 obj"):]
    dm = dm.replace(b"/Parent 2 0 R", b"/Parent 4 0 R")
    dm = dm.replace(b"/Root 1 0 R", b"/Root 3 0 R")

    pe = relocateSections(pe, SecTblOff, NumSec, ALIGN - SectsStart)
    Sections = pe[SectsStart - SECTIONEXTRA:]
    pe = b"".join([
        pe[PEoff:PEoff + HdrLen],
        (ALIGN - HdrLen - PEOFFSET - SECTIONEXTRA) * b"\0",
        Sections,
    ])

    # we need to align the PE header
    stage1 = _render_template(pe, lenPE, count, kids)
    deltaPDF = stage1.find(b"stream\n") + len(b"stream\n")
    # "2 0 R" will be replaced by the length digits, hence the correction
    padding = PEOFFSET - deltaPDF + len(b"2 0 R") - len(b"%i" % lenPE)
    pe = b"\0" * padding + pe
    lenPE = len(pe)

    with open("hacked.pdf", "wb") as f:
        f.write(_render_template(pe, lenPE, count, kids))
        f.write(dm)

    # let's adjust offsets - don't use -g to keep the length object 2
    # temporarily unused by mutool transform
    # the direct length reference added by mutool will be replaced by a
    # reference to object 2 via the prefix
    # (yes, errors will appear because we modified objects without adjusting
    # XREF)
    print()
    print("KEEP CALM and IGNORE THE NEXT ERRORS")
    subprocess.run(["mutool", "clean", "hacked.pdf", "cleaned.pdf"])

    cleaned = _read("cleaned.pdf")
    prefix1 = _read("pdfpe1.bin")
    prefix2 = _read("pdfpe2.bin")

    # explicit checks: `assert` would vanish under `python -O`
    if hashlib.md5(prefix1).hexdigest() != hashlib.md5(prefix2).hexdigest():
        sys.exit("ERROR: the prefixes do not share the same MD5")
    if hashlib.sha1(prefix1).hexdigest() == hashlib.sha1(prefix2).hexdigest():
        sys.exit("ERROR: the prefixes are identical")
    if len(prefix1) != len(prefix2):
        sys.exit("ERROR: the prefixes have different lengths")

    lenPrefix = len(prefix1)
    file1 = prefix1 + cleaned[lenPrefix:]
    file2 = prefix2 + cleaned[lenPrefix:]

    with open("collision1.pdf", "wb") as f:
        f.write(file1)
    with open("collision2.exe", "wb") as f:
        f.write(file2)

    os.remove('merged.pdf')
    os.remove('hacked.pdf')
    os.remove('cleaned.pdf')

    md5 = hashlib.md5(file1).hexdigest()
    if md5 != hashlib.md5(file2).hexdigest():
        sys.exit("ERROR: the generated files do not collide")

    # to prove the files should be 100% valid
    print()
    subprocess.run(["mutool", "info", "-X", "collision1.pdf"])
    print()

    print()
    print("MD5: %s" % md5)
    print("Success!")


if __name__ == "__main__":
    main()