{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Advanced indexing" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2.1.5.dev144'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sys\n", "sys.path.insert(0, '..')\n", "import zarr\n", "import numpy as np\n", "np.random.seed(42)\n", "import cProfile\n", "zarr.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functionality and API" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing a 1D array with a Boolean (mask) array\n", "\n", "Supported via ``get/set_mask_selection()`` and ``.vindex[]``. Also supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "a = np.arange(10)\n", "za = zarr.array(a, chunks=2)\n", "ix = [False, True, False, True, False, True, False, True, False, True]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5, 7, 9])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "za.vindex[ix]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5, 7, 9])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "za.oindex[ix]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.vindex[ix] = a[ix] * 10\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 100, 2, 300, 4, 500, 6, 700, 8, 
900])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.oindex[ix] = a[ix] * 100\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5, 7, 9])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if using .oindex, indexing array can be any array-like, e.g., Zarr array\n", "zix = zarr.array(ix, chunks=2)\n", "za = zarr.array(a, chunks=2)\n", "za.oindex[zix] # will not load all zix into memory" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing a 1D array with a 1D integer (coordinate) array\n", "\n", "Supported via ``get/set_coordinate_selection()`` and ``.vindex[]``. Also supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "a = np.arange(10)\n", "za = zarr.array(a, chunks=2)\n", "ix = [1, 3, 5, 7, 9]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5, 7, 9])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "za.vindex[ix]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5, 7, 9])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "za.oindex[ix]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.vindex[ix] = a[ix] * 10\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 100, 2, 300, 4, 500, 6, 
700, 8, 900])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.oindex[ix] = a[ix] * 100\n", "za[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing a 1D array with a multi-dimensional integer (coordinate) array\n", "\n", "Supported via ``get/set_coordinate_selection()`` and ``.vindex[]``." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "a = np.arange(10)\n", "za = zarr.array(a, chunks=2)\n", "ix = np.array([[1, 3, 5], [2, 4, 6]])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 3, 5],\n", " [2, 4, 6]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "za.vindex[ix]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 10, 20, 30, 40, 50, 60, 7, 8, 9])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.vindex[ix] = a[ix] * 10\n", "za[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Slicing a 1D array with step > 1\n", "\n", "Slices with step > 1 are supported via ``get/set_basic_selection()``, ``get/set_orthogonal_selection()``, ``__getitem__`` and ``.oindex[]``. Negative steps are not supported." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "a = np.arange(10)\n", "za = zarr.array(a, chunks=2)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 5, 7, 9])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "za[1::2]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0, 10, 2, 30, 4, 50, 6, 70, 8, 90])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.oindex[1::2] = a[1::2] * 10\n", "za[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Orthogonal (outer) indexing of multi-dimensional arrays\n", "\n", "Orthogonal (a.k.a. outer) indexing is supported with either Boolean or integer arrays, in combination with integers and slices. This functionality is provided via the ``get/set_orthogonal_selection()`` methods. For convenience, this functionality is also available via the ``.oindex[]`` property." 
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [ 3, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 11],\n", " [12, 13, 14]])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = np.arange(15).reshape(5, 3)\n", "za = zarr.array(a, chunks=(3, 2))\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 3, 5],\n", " [ 9, 11]])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# orthogonal indexing with Boolean arrays\n", "ix0 = [False, True, False, True, False]\n", "ix1 = [True, False, True]\n", "za.get_orthogonal_selection((ix0, ix1))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 3, 5],\n", " [ 9, 11]])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternative API\n", "za.oindex[ix0, ix1]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 3, 5],\n", " [ 9, 11]])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# orthogonal indexing with integer arrays\n", "ix0 = [1, 3]\n", "ix1 = [0, 2]\n", "za.get_orthogonal_selection((ix0, ix1))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 3, 5],\n", " [ 9, 11]])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternative API\n", "za.oindex[ix0, ix1]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 3, 4, 5],\n", " [ 9, 10, 11]])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# combine with slice\n", "za.oindex[[1, 3], :]" 
] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 2],\n", " [ 3, 5],\n", " [ 6, 8],\n", " [ 9, 11],\n", " [12, 14]])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# combine with slice\n", "za.oindex[:, [0, 2]]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [42, 4, 42],\n", " [ 6, 7, 8],\n", " [42, 10, 42],\n", " [12, 13, 14]])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items via Boolean selection\n", "ix0 = [False, True, False, True, False]\n", "ix1 = [True, False, True]\n", "selection = ix0, ix1\n", "value = 42\n", "za.set_orthogonal_selection(selection, value)\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [44, 4, 44],\n", " [ 6, 7, 8],\n", " [44, 10, 44],\n", " [12, 13, 14]])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternative API\n", "za.oindex[ix0, ix1] = 44\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [46, 4, 46],\n", " [ 6, 7, 8],\n", " [46, 10, 46],\n", " [12, 13, 14]])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items via integer selection\n", "ix0 = [1, 3]\n", "ix1 = [0, 2]\n", "selection = ix0, ix1\n", "value = 46\n", "za.set_orthogonal_selection(selection, value)\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [48, 4, 48],\n", " [ 6, 7, 8],\n", " [48, 10, 48],\n", " [12, 13, 14]])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternative 
API\n", "za.oindex[ix0, ix1] = 48\n", "za[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Coordinate indexing of multi-dimensional arrays\n", "\n", "Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. This functionality is provided via the ``get/set_coordinate_selection()`` methods. For convenience, this functionality is also available via the ``.vindex[]`` property." ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [ 3, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 11],\n", " [12, 13, 14]])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = np.arange(15).reshape(5, 3)\n", "za = zarr.array(a, chunks=(3, 2))\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3, 11])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get items\n", "ix0 = [1, 3]\n", "ix1 = [0, 2]\n", "za.get_coordinate_selection((ix0, ix1))" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3, 11])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternative API\n", "za.vindex[ix0, ix1]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [42, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 42],\n", " [12, 13, 14]])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set items\n", "za.set_coordinate_selection((ix0, ix1), 42)\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [44, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 44],\n", " [12, 13, 14]])" ] 
}, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# alternative API\n", "za.vindex[ix0, ix1] = 44\n", "za[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mask indexing of multi-dimensional arrays\n", "\n", "Selecting arbitrary points from a multi-dimensional array by a Boolean array is supported. This functionality is provided via the ``get/set_mask_selection()`` methods. For convenience, this functionality is also available via the ``.vindex[]`` property." ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [ 3, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 11],\n", " [12, 13, 14]])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = np.arange(15).reshape(5, 3)\n", "za = zarr.array(a, chunks=(3, 2))\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3, 11])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ix = np.zeros_like(a, dtype=bool)\n", "ix[1, 0] = True\n", "ix[3, 2] = True\n", "za.get_mask_selection(ix)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 3, 11])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "za.vindex[ix]" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [42, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 42],\n", " [12, 13, 14]])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "za.set_mask_selection(ix, 42)\n", "za[:]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [44, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 
44],\n", " [12, 13, 14]])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "za.vindex[ix] = 44\n", "za[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Selecting fields from arrays with a structured dtype\n", "\n", "All ``get/set_..._selection()`` methods support a ``fields`` argument, which allows retrieving/replacing data for a specific field or fields. An h5py-like API is also supported, where fields can be provided within ``__getitem__``, ``.oindex[]`` and ``.vindex[]``." ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([(b'aaa', 1, 4.2), (b'bbb', 2, 8.4), (b'ccc', 3, 12.6)],\n", " dtype=[('foo', 'S3'), ('bar', '\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mIndexError\u001b[0m: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices" ] } ], "source": [ "a['foo', 'baz']" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([(b'aaa', 4.2), (b'bbb', 8.4), (b'ccc', 12.6)],\n", " dtype=[('foo', 'S3'), ('baz', '", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m
in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 538\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 539\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 540\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 541\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mEllipsis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 661\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 662\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 663\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 664\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 703\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 704\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 705\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m raise IndexError('unsupported selection item for basic indexing; expected integer '\n\u001b[0;32m--> 277\u001b[0;31m 'or slice, got {!r}'.format(type(dim_sel)))\n\u001b[0m\u001b[1;32m 278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: unsupported selection item for basic indexing; expected integer or slice, got " ] } ], "source": [ "za[['foo', 'baz']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1D Benchmarking" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "800000000" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c = np.arange(100000000)\n", "c.nbytes" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 480 ms, sys: 16 ms, total: 496 ms\n", "Wall time: 141 ms\n" ] }, { "data": { "text/html": [ "
Typezarr.core.Array
Data typeint64
Shape(100000000,)
Chunk shape(97657,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored11854081 (11.3M)
Storage ratio67.5
Chunks initialized1024/1024
" ], "text/plain": [ "Type : zarr.core.Array\n", "Data type : int64\n", "Shape : (100000000,)\n", "Chunk shape : (97657,)\n", "Order : C\n", "Read-only : False\n", "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 800000000 (762.9M)\n", "No. bytes stored : 11854081 (11.3M)\n", "Storage ratio : 67.5\n", "Chunks initialized : 1024/1024" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time zc = zarr.array(c)\n", "zc.info" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "121 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit c.copy()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "254 ms ± 942 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc[:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### bool dense selection" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9997476" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# relatively dense selection - 10%\n", "ix_dense_bool = np.random.binomial(1, 0.1, size=c.shape[0]).astype(bool)\n", "np.count_nonzero(ix_dense_bool)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "243 ms ± 5.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit c[ix_dense_bool]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "433 ms ± 6.49 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.oindex[ix_dense_bool]" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "548 ms ± 5.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.vindex[ix_dense_bool]" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "import tempfile\n", "import cProfile\n", "import pstats\n", "\n", "def profile(statement, sort='time', restrictions=(7,)):\n", " with tempfile.NamedTemporaryFile() as f:\n", " cProfile.run(statement, filename=f.name)\n", " pstats.Stats(f.name).sort_stats(sort).print_stats(*restrictions)\n" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:17:48 2017 /tmp/tmpruua2rs_\n", "\n", " 98386 function calls in 0.483 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 83 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1025 0.197 0.000 0.197 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1024 0.149 0.000 0.159 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1024 0.044 0.000 0.231 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1024 0.009 0.000 0.009 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", " 1025 0.007 0.000 0.238 0.000 ../zarr/indexing.py:541(__iter__)\n", " 1024 0.006 0.000 0.207 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", " 2048 0.005 0.000 0.005 0.000 ../zarr/core.py:337()\n", "\n", "\n" ] } ], "source": [ "profile('zc.oindex[ix_dense_bool]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The ``nonzero`` method is called internally by numpy to convert Boolean selections to integer selections; there is no way to avoid this."
] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:18:06 2017 /tmp/tmp7_bautep\n", "\n", " 52382 function calls in 0.592 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 2 0.219 0.110 0.219 0.110 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1024 0.096 0.000 0.101 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 2 0.094 0.047 0.094 0.047 ../zarr/indexing.py:630()\n", " 1024 0.044 0.000 0.167 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", " 1 0.021 0.021 0.181 0.181 ../zarr/indexing.py:603(__init__)\n", "\n", "\n" ] } ], "source": [ "profile('zc.vindex[ix_dense_bool]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "``.vindex[]`` is a bit slower, possibly because internally it converts to a coordinate array first." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### int dense selection" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10000000" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ix_dense_int = np.random.choice(c.shape[0], size=c.shape[0]//10, replace=True)\n", "ix_dense_int_sorted = ix_dense_int.copy()\n", "ix_dense_int_sorted.sort()\n", "len(ix_dense_int)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "62.2 ms ± 2.36 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit c[ix_dense_int_sorted]" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "355 ms ± 3.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.oindex[ix_dense_int_sorted]" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "351 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.vindex[ix_dense_int_sorted]" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "128 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit c[ix_dense_int]" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.71 s ± 5.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.oindex[ix_dense_int]" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.68 s ± 3.87 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.vindex[ix_dense_int]" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:19:09 2017 /tmp/tmpgmu5btr_\n", "\n", " 95338 function calls in 0.424 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 89 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1 0.141 0.141 0.184 0.184 ../zarr/indexing.py:369(__init__)\n", " 1024 0.099 0.000 0.106 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1024 0.046 0.000 0.175 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1025 0.027 0.000 0.027 0.000 ../zarr/indexing.py:424(__iter__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", " 1 0.010 0.010 0.010 0.010 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/function_base.py:1848(diff)\n", " 1025 0.006 0.000 0.059 0.000 ../zarr/indexing.py:541(__iter__)\n", "\n", "\n" ] } ], "source": [ "profile('zc.oindex[ix_dense_int_sorted]')" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:19:13 2017 /tmp/tmpay1gvnx8\n", "\n", " 52362 function calls in 0.398 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 85 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 2 0.107 0.054 0.107 0.054 ../zarr/indexing.py:630()\n", " 1024 0.091 0.000 0.096 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1024 0.041 0.000 0.160 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1 0.040 0.040 0.213 0.213 ../zarr/indexing.py:603(__init__)\n", " 1 0.029 0.029 0.029 0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", " 2048 0.011 0.000 0.011 0.000 
../zarr/indexing.py:695()\n", "\n", "\n" ] } ], "source": [ "profile('zc.vindex[ix_dense_int_sorted]')" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:19:20 2017 /tmp/tmpngsf6zpp\n", "\n", " 120946 function calls in 1.793 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 92 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1 1.128 1.128 1.128 1.128 {method 'argsort' of 'numpy.ndarray' objects}\n", " 1024 0.139 0.000 0.285 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1 0.132 0.132 1.422 1.422 ../zarr/indexing.py:369(__init__)\n", " 1 0.120 0.120 0.120 0.120 {method 'take' of 'numpy.ndarray' objects}\n", " 1024 0.116 0.000 0.123 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1025 0.034 0.000 0.034 0.000 ../zarr/indexing.py:424(__iter__)\n", " 1 0.023 0.023 0.023 0.023 {built-in method numpy.core.multiarray.bincount}\n", "\n", "\n" ] } ], "source": [ "profile('zc.oindex[ix_dense_int]')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:19:22 2017 /tmp/tmpbskhj8de\n", "\n", " 50320 function calls in 1.730 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 86 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1 1.116 1.116 1.116 1.116 {method 'argsort' of 'numpy.ndarray' objects}\n", " 1024 0.133 0.000 0.275 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 2 0.121 0.060 0.121 0.060 ../zarr/indexing.py:654()\n", " 1024 0.113 0.000 0.119 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 2 0.100 0.050 0.100 0.050 ../zarr/indexing.py:630()\n", " 1 0.030 0.030 0.030 0.030 {built-in method numpy.core.multiarray.ravel_multi_index}\n", " 1 0.024 0.024 1.427 1.427 ../zarr/indexing.py:603(__init__)\n", "\n", "\n" ] } ], 
"source": [ "profile('zc.vindex[ix_dense_int]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When indices are not sorted, zarr needs to partially sort them so the occur in chunk order, so we only have to visit each chunk once. This sorting dominates the processing time and is unavoidable AFAIK." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### bool sparse selection" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "9932" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# relatively sparse selection\n", "ix_sparse_bool = np.random.binomial(1, 0.0001, size=c.shape[0]).astype(bool)\n", "np.count_nonzero(ix_sparse_bool)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "15.7 ms ± 38.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit c[ix_sparse_bool]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "156 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc.oindex[ix_sparse_bool]" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "133 ms ± 2.76 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc.vindex[ix_sparse_bool]" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:20:09 2017 /tmp/tmpb7nqc9ax\n", "\n", " 98386 function calls in 0.191 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 83 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1024 0.093 0.000 0.098 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1025 0.017 0.000 0.017 0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1024 0.007 0.000 0.007 0.000 {built-in method numpy.core.multiarray.count_nonzero}\n", " 1024 0.007 0.000 0.129 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1025 0.005 0.000 0.052 0.000 ../zarr/indexing.py:541(__iter__)\n", " 1024 0.005 0.000 0.025 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", " 2048 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", "\n", "\n" ] } ], "source": [ "profile('zc.oindex[ix_sparse_bool]')" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:20:09 2017 /tmp/tmphsko8nvh\n", "\n", " 52382 function calls in 0.160 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1024 0.093 0.000 0.098 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 2 0.017 0.008 0.017 0.008 {method 'nonzero' of 'numpy.ndarray' objects}\n", " 1025 0.008 0.000 0.014 0.000 ../zarr/indexing.py:674(__iter__)\n", " 1024 0.006 0.000 0.127 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 2048 0.004 0.000 0.004 0.000 ../zarr/indexing.py:695()\n", " 2054 0.003 0.000 0.003 0.000 ../zarr/core.py:337()\n", " 1024 0.002 0.000 0.005 0.000 
/home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/core/arrayprint.py:381(wrapper)\n", "\n", "\n" ] } ], "source": [ "profile('zc.vindex[ix_sparse_bool]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### int sparse selection" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10000" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ix_sparse_int = np.random.choice(c.shape[0], size=c.shape[0]//10000, replace=True)\n", "ix_sparse_int_sorted = ix_sparse_int.copy()\n", "ix_sparse_int_sorted.sort()\n", "len(ix_sparse_int)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "18.9 µs ± 392 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], "source": [ "%timeit c[ix_sparse_int_sorted]" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "20.3 µs ± 155 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], "source": [ "%timeit c[ix_sparse_int]" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "125 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc.oindex[ix_sparse_int_sorted]" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "109 ms ± 428 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc.vindex[ix_sparse_int_sorted]" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "132 ms ± 489 µs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc.oindex[ix_sparse_int]" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "108 ms ± 579 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc.vindex[ix_sparse_int]" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:21:12 2017 /tmp/tmp0b0o2quo\n", "\n", " 120946 function calls in 0.196 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 92 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1024 0.105 0.000 0.111 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 2048 0.006 0.000 0.013 0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n", " 1025 0.006 0.000 0.051 0.000 ../zarr/indexing.py:541(__iter__)\n", " 1024 0.006 0.000 0.141 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 2048 0.005 0.000 0.005 0.000 ../zarr/core.py:337()\n", " 15373 0.004 0.000 0.010 0.000 {built-in method builtins.isinstance}\n", " 1025 0.004 0.000 0.005 0.000 ../zarr/indexing.py:424(__iter__)\n", "\n", "\n" ] } ], "source": [ "profile('zc.oindex[ix_sparse_int]')" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:21:19 2017 /tmp/tmpdwju98kn\n", "\n", " 50320 function calls in 0.167 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 86 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1024 0.105 0.000 0.111 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1025 0.009 0.000 0.017 0.000 ../zarr/indexing.py:674(__iter__)\n", " 1024 0.006 0.000 0.142 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 2048 0.005 0.000 0.005 0.000 
../zarr/indexing.py:695()\n", " 2054 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", " 1 0.003 0.003 0.162 0.162 ../zarr/core.py:591(_get_selection)\n", " 1027 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", "\n", "\n" ] } ], "source": [ "profile('zc.vindex[ix_sparse_int]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For sparse selections, processing time is dominated by decompression, so we can't do any better." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sparse bool selection as zarr array" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Typezarr.core.Array
Data typebool
Shape(100000000,)
Chunk shape(390625,)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes100000000 (95.4M)
No. bytes stored507131 (495.2K)
Storage ratio197.2
Chunks initialized256/256
" ], "text/plain": [ "Type : zarr.core.Array\n", "Data type : bool\n", "Shape : (100000000,)\n", "Chunk shape : (390625,)\n", "Order : C\n", "Read-only : False\n", "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 100000000 (95.4M)\n", "No. bytes stored : 507131 (495.2K)\n", "Storage ratio : 197.2\n", "Chunks initialized : 256/256" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "zix_sparse_bool = zarr.array(ix_sparse_bool)\n", "zix_sparse_bool.info" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "387 ms ± 5.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zc.oindex[zix_sparse_bool]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### slice with step" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "80.3 ms ± 377 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit np.array(c[::2])" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "168 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc[::2]" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "136 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc[::10]" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "104 ms ± 1.86 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc[::100]" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit zc[::1000]" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:22:44 2017 /tmp/tmpg9dxqcpg\n", "\n", " 49193 function calls in 0.211 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 55 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1024 0.104 0.000 0.110 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1024 0.067 0.000 0.195 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1025 0.005 0.000 0.013 0.000 ../zarr/indexing.py:278(__iter__)\n", " 2048 0.004 0.000 0.004 0.000 ../zarr/core.py:337()\n", " 2050 0.003 0.000 0.003 0.000 ../zarr/indexing.py:90(ceildiv)\n", " 1025 0.003 0.000 0.006 0.000 ../zarr/indexing.py:109(__iter__)\n", " 1024 0.003 0.000 0.003 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", "\n", "\n" ] } ], "source": [ "profile('zc[::2]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2D Benchmarking" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100000000,)" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c.shape" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100000, 1000)" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = c.reshape(-1, 1000)\n", "d.shape" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Typezarr.core.Array
Data typeint64
Shape(100000, 1000)
Chunk shape(3125, 32)
OrderC
Read-onlyFalse
CompressorBlosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store typebuiltins.dict
No. bytes800000000 (762.9M)
No. bytes stored39228864 (37.4M)
Storage ratio20.4
Chunks initialized1024/1024
" ], "text/plain": [ "Type : zarr.core.Array\n", "Data type : int64\n", "Shape : (100000, 1000)\n", "Chunk shape : (3125, 32)\n", "Order : C\n", "Read-only : False\n", "Compressor : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n", "Store type : builtins.dict\n", "No. bytes : 800000000 (762.9M)\n", "No. bytes stored : 39228864 (37.4M)\n", "Storage ratio : 20.4\n", "Chunks initialized : 1024/1024" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "zd = zarr.array(d)\n", "zd.info" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### bool orthogonal selection" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "ix0 = np.random.binomial(1, 0.5, size=d.shape[0]).astype(bool)\n", "ix1 = np.random.binomial(1, 0.5, size=d.shape[1]).astype(bool)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "101 ms ± 577 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit d[np.ix_(ix0, ix1)]" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "373 ms ± 5.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zd.oindex[ix0, ix1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### int orthogonal selection" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "ix0 = np.random.choice(d.shape[0], size=int(d.shape[0] * .5), replace=True)\n", "ix1 = np.random.choice(d.shape[1], size=int(d.shape[1] * .5), replace=True)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "174 ms ± 4.13 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit d[np.ix_(ix0, ix1)]" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "566 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zd.oindex[ix0, ix1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### coordinate (point) selection" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10000000" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "n = int(d.size * .1)\n", "ix0 = np.random.choice(d.shape[0], size=n, replace=True)\n", "ix1 = np.random.choice(d.shape[1], size=n, replace=True)\n", "n" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "243 ms ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit d[ix0, ix1]" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2.03 s ± 17 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" ] } ], "source": [ "%timeit zd.vindex[ix0, ix1]" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Wed Nov 8 17:24:31 2017 /tmp/tmp7c68z70p\n", "\n", " 62673 function calls in 2.065 seconds\n", "\n", " Ordered by: internal time\n", " List reduced from 88 to 7 due to restriction <7>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1 1.112 1.112 1.112 1.112 {method 'argsort' of 'numpy.ndarray' objects}\n", " 3 0.244 0.081 0.244 0.081 ../zarr/indexing.py:654()\n", " 3 0.193 0.064 0.193 0.064 ../zarr/indexing.py:630()\n", " 1024 0.170 0.000 0.350 0.000 ../zarr/core.py:849(_chunk_getitem)\n", " 1024 0.142 0.000 0.151 0.000 ../zarr/core.py:1028(_decode_chunk)\n", " 1 0.044 0.044 0.044 0.044 {built-in method numpy.core.multiarray.ravel_multi_index}\n", " 1 0.043 0.043 1.676 1.676 ../zarr/indexing.py:603(__init__)\n", "\n", "\n" ] } ], "source": [ "profile('zd.vindex[ix0, ix1]')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Points need to be partially sorted so all points in the same chunk are grouped and processed together. This requires ``argsort``, which dominates the processing time." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## h5py comparison\n", "\n", "N.B., this comparison is not really fair because h5py is using a slower compressor here, but it is included for interest..." 
] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "import h5py\n", "import tempfile" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "h5f = h5py.File(tempfile.mktemp(), driver='core', backing_store=False)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hc = h5f.create_dataset('c', data=c, compression='gzip', compression_opts=1, chunks=zc.chunks, shuffle=True)\n", "hc" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.16 s, sys: 172 ms, total: 1.33 s\n", "Wall time: 1.32 s\n" ] }, { "data": { "text/plain": [ "array([ 0, 1, 2, ..., 99999997, 99999998, 99999999])" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time hc[:]" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.11 s, sys: 0 ns, total: 1.11 s\n", "Wall time: 1.11 s\n" ] }, { "data": { "text/plain": [ "array([ 1063, 28396, 37229, ..., 99955875, 99979354, 99995791])" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time hc[ix_sparse_bool]" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "# # this is pathological, takes minutes \n", "# %time hc[ix_dense_bool]" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 38.3 s, sys: 136 ms, total: 38.4 s\n", "Wall time: 38.1 s\n" ] }, { "data": { "text/plain": [ "array([ 0, 1000, 2000, ..., 99997000, 99998000, 99999000])" ] }, "execution_count": 83, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "# this is pretty slow\n", "%time hc[::1000]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }