#!/usr/bin/env python3

# Zinsider: instant md5 collisions of pairs of zip+xml files

# Ange Albertini 2022 - MIT licence

# Office Open XML: docx / pptx / xlsx
# Open Container Format: epub
# Open Packaging Conventions:
# - 3D manufacturing format: 3mf
# - XML Paper Specification: xps / oxps

import extendzip

import argparse
import io
import os.path
import sys
import xml.etree.ElementTree as ET
import zipfile as ZF


# Marker file -> file type, checked in this order by getFileType().
_TYPE_MARKERS = (
    ("FixedDocumentSequence.fdseq", "oxps"),
    ("FixedDocSeq.fdseq", "xps"),
    ("word/document.xml", "docx"),
    ("xl/workbook.xml", "xlsx"),
    ("ppt/presentation.xml", "pptx"),
    ("EPUB/package.opf", "epub"),
    ("3D/3dmodel.model", "3mf"),
)

# Placeholder prefix meaning "exclude nothing from renaming".
# NOTE(review): relies on no archive member name starting with a NUL byte.
_NO_EXCLUSION = "\x00"


def _fail(message):
    """Print `message` and terminate the script with a non-zero exit status."""
    print(message)
    sys.exit(1)


def _displayName(zf):
    """Return the base name of the file behind `zf`, for error messages.

    `ZipFile.filename` is None when the archive was opened from an unnamed
    file-like object, hence the fallback.
    """
    filename = getattr(zf, "filename", None)
    if not filename:
        return "<unnamed archive>"
    return os.path.basename(filename)


def getArgs():
    """Parse the command line and return the namespace (`file1`, `file2`)."""
    parser = argparse.ArgumentParser(
        description="Generate MD5 collisions of zip+xml file formats.")
    parser.add_argument('file1', help="First input file.")
    parser.add_argument('file2', help="Second input file.")
    return parser.parse_args()


def getFileType(zip):
    """Identify the format of an opened archive from its member names.

    Returns one of "oxps", "xps", "docx", "xlsx", "pptx", "epub", "3mf",
    or None when no known marker file is present. When several markers are
    present, the first match in `_TYPE_MARKERS` order wins.
    """
    names = set(zip.namelist())
    for marker, fileType in _TYPE_MARKERS:
        if marker in names:
            return fileType
    return None


def checkFileType(zip1, zip2):
    """Return the file type shared by both archives.

    Prints an error and exits with status 1 when either archive is of an
    unknown type or when the two types differ.
    """
    # The archive names come from the ZipFile objects themselves: the
    # previous code referenced `fn1`/`fn2`, which are not defined anywhere.
    name1 = _displayName(zip1)
    name2 = _displayName(zip2)

    type1 = getFileType(zip1)
    if type1 is None:
        _fail("Error: unknown file type: '%s'" % name1)

    type2 = getFileType(zip2)
    if type2 is None:
        _fail("Error: unknown file type: '%s'" % name2)

    if type1 != type2:
        _fail("Error: file types not matching: '%s' - '%s'" % (name1, name2))
    return type1


def appendNumber(p, suffix="1", skip=1):
    """Insert `suffix` before the first "/" found after the first `skip` chars.

    e.g. "word/document.xml" -> "word1/document.xml" and
    "/word/document.xml" -> "/word1/document.xml".
    """
    # absolute paths start with a `/` that should be left unchanged
    return p[:skip] + p[skip:].replace("/", suffix + "/", 1)


def replaceNumber(p, suffix="1", skip=1):
    """Replace the first "1/" found after the first `skip` chars by `suffix`+"/".

    e.g. with suffix="2": "Documents/1/Pages/1.fpage" ->
    "Documents/2/Pages/1.fpage".
    """
    # absolute paths start with a `/` that should be left unchanged
    return p[:skip] + p[skip:].replace("1/", suffix + "/", 1)


# --- Format parameters, (re)initialised by setParams() ---------------------

# XML Manifest pointing to root file.
REL_FN = None

# Directory not to be moved - 'docProps/' in office file.
MOVE_EXCL = _NO_EXCLUSION

# Content type XML file that needs path updating
CONTENT_FN = None
XPATH = None
ATTRIB = None

# XML element to be added to content type to hide collision blocks
BLOCKS = None

# Function used to derive the per-document path of an archive member.
updatePath = appendNumber

# NOTE: `zip1`, `zip2` (input archives) and `zfSuffix` (output archive) are
# module globals bound by the script entry point at the bottom of this file;
# mergeZips() and mergeCT() read them.


def setParams(comtype):
    """Set the module-level format parameters for file type `comtype`.

    Every parameter is first reset to its default so that the result only
    depends on `comtype`. Also registers the format's XML namespace as the
    default namespace for ElementTree serialisation. Prints an error and
    exits with status 1 on an unsupported type.
    """
    global updatePath, REL_FN, MOVE_EXCL, ATTRIB, XPATH, BLOCKS, CONTENT_FN

    REL_FN = None
    MOVE_EXCL = _NO_EXCLUSION
    CONTENT_FN = None
    XPATH = None
    ATTRIB = None
    BLOCKS = None
    updatePath = appendNumber

    if comtype in ["docx", "xlsx", "pptx", "3mf"]:
        # ".//{http://schemas.openxmlformats.org/package/2006/relationships}Relationship[@Target]",
        REL_FN = "_rels/.rels"
        ET.register_namespace('', 'http://schemas.openxmlformats.org/package/2006/content-types')
        CONTENT_FN = "[Content_Types].xml"
        MOVE_EXCL = "docProps/"
        ATTRIB = 'PartName'
        XPATH = ".//{http://schemas.openxmlformats.org/package/2006/content-types}Override[@PartName]"
        BLOCKS = ET.Element("Override", attrib={"PartName": "/blocks", "ContentType": "application/octet-stream"})

    elif comtype == "epub":
        # ".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile[@full-path]"
        REL_FN = "META-INF/container.xml"
        ET.register_namespace('', 'urn:oasis:names:tc:opendocument:xmlns:container')

    elif comtype == "xps":
        # //{http://schemas.microsoft.com/xps/2005/06}DocumentReference[@Source]"
        REL_FN = "FixedDocSeq.fdseq"
        ET.register_namespace('', 'http://schemas.microsoft.com/xps/2005/06')
        MOVE_EXCL = "_rels/"
        updatePath = replaceNumber

    elif comtype == "oxps":
        # //{http://schemas.openxps.org/oxps/v1.0}DocumentReference[@Source]"
        REL_FN = "FixedDocumentSequence.fdseq"
        ET.register_namespace('', 'http://schemas.openxps.org/oxps/v1.0')
        MOVE_EXCL = "_rels/"
        updatePath = replaceNumber

    else:
        _fail("Error !")


def _targetName(name, suffix):
    """Return the name archive member `name` gets in the merged archive."""
    if name.startswith(MOVE_EXCL):
        return name
    return updatePath(name, suffix=suffix)


def mergeZips():
    """Copy the members of `zip1` and `zip2` into `zfSuffix`.

    Members are renamed with `updatePath` (suffix "1" for `zip1`, "2" for
    `zip2`) unless they live under `MOVE_EXCL`. `REL_FN` and `CONTENT_FN`
    are not copied. A `zip2` member whose final name already exists among
    the `zip1` names is skipped.
    """
    notCopied = (REL_FN, CONTENT_FN)

    for iz in zip1.infolist():
        if iz.filename in notCopied:
            continue
        # Read before renaming: the ZipInfo is reused as the output header.
        data_ = zip1.read(iz)
        iz.filename = _targetName(iz.filename, "1")
        # print("> %s => %s" % (iz.orig_filename, iz.filename))
        zfSuffix.writestr(iz, data_)

    # namelist() is built from the ZipInfo objects renamed above, so it
    # reflects the final names; compute it once instead of per member.
    existing = set(zip1.namelist())

    for iz in zip2.infolist():
        if iz.filename in notCopied:
            continue
        newName = _targetName(iz.filename, "2")
        # Skip duplicate
        if newName in existing:
            # print("File already existing:", newName)
            continue
        data_ = zip2.read(iz)
        iz.filename = newName
        zfSuffix.writestr(iz, data_)


def _renumberParts(root, suffix):
    """Apply `updatePath` in place to the `ATTRIB` of every `XPATH` match.

    Paths under `MOVE_EXCL` (with or without a leading "/") are left as is.
    """
    excluded = (MOVE_EXCL, "/" + MOVE_EXCL)
    for e in root.findall(XPATH):
        path = e.attrib[ATTRIB]
        if not path.startswith(excluded):
            e.attrib[ATTRIB] = updatePath(path, suffix=suffix)
            # print("> Content types(%s):" % suffix, path, e.attrib[ATTRIB])


def mergeCT():
    """Write a merged content-types file into `zfSuffix`.

    Does nothing when the format has no content-types file (`CONTENT_FN` is
    None). Otherwise the `zip1` file is used as the base, the `XPATH`
    entries of `zip2` that are not already present are appended, and the
    `BLOCKS` element, if any, is added last.
    """
    if CONTENT_FN is None:
        return
    filename = CONTENT_FN

    print("Copying content types")
    with zip1.open(filename) as handle:
        tree = ET.parse(handle)
    root = tree.getroot()
    _renumberParts(root, "1")

    # Getting all paths
    attribs1 = {e.attrib[ATTRIB] for e in root.findall(XPATH)}

    print("Merging content types")
    with zip2.open(filename) as handle:
        rootExtra = ET.parse(handle).getroot()
    _renumberParts(rootExtra, "2")

    for e in rootExtra.findall(XPATH):
        # avoid duplicates
        if e.attrib[ATTRIB] not in attribs1:
            root.append(ET.Element(e.tag, e.attrib))

    if BLOCKS is not None:
        print("Adding collision block exclusion")
        root.append(BLOCKS)

    # ET.indent() only exists from Python 3.9 onwards.
    if sys.version_info >= (3, 9):
        ET.indent(tree, space="\t", level=0)
    data_ = ET.tostring(root, encoding='utf-8', xml_declaration=True)
    zfSuffix.writestr(filename, data_)


if __name__ == '__main__':
    args = getArgs()

    # The input archives are only needed while building the merged suffix.
    with ZF.ZipFile(args.file1) as zip1, ZF.ZipFile(args.file2) as zip2:
        comtype = checkFileType(zip1, zip2)
        print("Common file type: %s" % (comtype))

        setParams(comtype)

        hSuffix = io.BytesIO()
        with ZF.ZipFile(hSuffix, mode='w') as zfSuffix:
            print("Merging archived files")
            mergeZips()
            mergeCT()

    print("Merging suffix with prefix pair")
    hColl1, hColl2 = extendzip.extend(comtype, hSuffix)

    print("Verifying and saving")
    extendzip.checkWrite(comtype, hColl1, hColl2)