# In memory ZIP

I recently ran into an issue with a large amount of JSON events, where the volume of data was ridicoulus simply because of the redundancy in the dataset.

Hearing Raymond Hettingers words inside my head: "There Must Be a Better Way!" I decided to spend some time shrinking the data.

There were 3 steps to the process:

1. First convert redundant JSON, such as `{'x': ..., 'y': ..., 'z':...}` to tuples.
2. Second to do a diff between the each objects state and only keep the changes.
3. Compress the JSON as quickly as possible.

The two first parts are more of less trivial. The JSON is a dict in python so doing a diff is a dict comparison


In [1]:
def dict_comp(A, B):
 """ helper for comparing json like dicts"""
 C = {}
 for k, v1 in A.items():
 v2 = B.get(k, None)
 if v2 is None:
 C[k]=v1
 elif v1!=v2:
 C[k]=v2
 elif isinstance(v1,dict):
 C.update(dict_comp(v1, v2))
 else:
 continue
 return C

In [2]:
A = {'Alice': 1, 'Bob': {'one': 1, 'two':2}}
B = {'Alice': 0, 'Bob': {'one': 1, 'two': 0}}

dict_comp(A,B)

{'Alice': 0, 'Bob': {'one': 1, 'two': 0}}

Here we only see the changes.

The third part is probably a little more novel. Let's start with the requirements:

1. To have a class with a simple api.
2. To keep everything in memory (speed)
3. To be able to append more data to the ZIP
4. To be able to write the zip from memory to disk as a single linear write.
5. To be able to lazily load the zip.
6. To be able to iterate over the filenames in the zip
7. To be able to load the data from the zip using the name as a path.

Here's the whole thing:

In [3]:
import zipfile, io


class InMemoryZip(object):
 def __init__(self):
 # Create the in-memory file-like object
 self.in_memory_zip = io.BytesIO()
 self._path = None

 def load(self, path):
 if not isinstance(path, pathlib.Path):
 raise TypeError(f"{path} is not a path object")
 if not path.name.lower().endswith('zip'):
 raise ValueError(f"{path} is not a zip")

 self._path = path

 def __iter__(self):
 assert isinstance(self._path, pathlib.Path)
 zf = zipfile.ZipFile(self._path)
 for name in zf.namelist():
 yield name

 def __getitem__(self, item):
 zf = zipfile.ZipFile(self._path)
 if item not in zf.namelist():
 raise KeyError(f"no such file: {item}")
 return zf.read(item)

 def append(self, filename_in_zip, file_contents):
 """Appends a file with name filename_in_zip and contents of
 file_contents to the in-memory zip."""
 # Get a handle to the in-memory zip in append mode
 zf = zipfile.ZipFile(self.in_memory_zip, "a", zipfile.ZIP_DEFLATED, False)

 # Write the file to the in-memory zip
 zf.writestr(filename_in_zip, file_contents)

 # Mark the files as having been created on Windows so that
 # Unix permissions are not inferred as 0000
 for zfile in zf.filelist:
 zfile.create_system = 0

 return self

 def read(self):
 """Returns a string with the contents of the in-memory zip."""
 self.in_memory_zip.seek(0)
 return self.in_memory_zip.read()

 def write(self, path):
 """Writes the in-memory zip to a file."""
 if not isinstance(path, pathlib.Path):
 raise TypeError
 with path.open('wb') as fo:
 fo.write(self.read())


To test that it works, let's first get the imports out of the way:

In [4]:
import io
import pathlib
import tempfile

Next, let's create some data and store it in memory

In [5]:
imz = InMemoryZip()
bytestream = io.BytesIO(b"123 123 ")
bytestream.seek(0)
imz.append('a/first', bytestream.read())

bytestream = io.BytesIO(b"123 456 ")
bytestream.seek(0)
imz.append('a/second', bytestream.read())

<__main__.InMemoryZip at 0x28aa67d12b0>

Now let's write it to disk

In [6]:

tempdir = tempfile.gettempdir()
path = pathlib.Path(tempdir) / "io_test.zip"
imz.write(path)
with path.open('rb') as fi:
 print(path.name, len(fi.read()), "bytes")

io_test.zip 222 bytes


Finally let's load it from disk

In [7]:

imz = InMemoryZip()
imz.load(path)

names = [name for name in imz]
assert len(names) == 2
first = imz['a/first']
assert first == b"123 123 "
second = imz['a/second']
assert second == b"123 456 "

And finally clean up the file system.

In [8]:
path.unlink()

Simple.