{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Advanced indexing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2.1.5.dev144'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sys\n",
    "sys.path.insert(0, '..')\n",
    "import zarr\n",
    "import numpy as np\n",
    "np.random.seed(42)\n",
    "import cProfile\n",
    "zarr.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functionality and API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Indexing a 1D array with a Boolean (mask) array\n",
    "\n",
    "Supported via ``get/set_mask_selection()`` and ``.vindex[]``. Also supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.arange(10)\n",
    "za = zarr.array(a, chunks=2)\n",
    "ix = [False,  True,  False,  True, False, True, False,  True,  False,  True]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3, 5, 7, 9])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "za.vindex[ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3, 5, 7, 9])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "za.oindex[ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0, 10,  2, 30,  4, 50,  6, 70,  8, 90])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.vindex[ix] = a[ix] * 10\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  0, 100,   2, 300,   4, 500,   6, 700,   8, 900])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.oindex[ix] = a[ix] * 100\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3, 5, 7, 9])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if using .oindex, indexing array can be any array-like, e.g., Zarr array\n",
    "zix = zarr.array(ix, chunks=2)\n",
    "za = zarr.array(a, chunks=2)\n",
    "za.oindex[zix]  # will not load all zix into memory"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Indexing a 1D array with a 1D integer (coordinate) array\n",
    "\n",
    "Supported via ``get/set_coordinate_selection()`` and ``.vindex[]``. Also supported via ``get/set_orthogonal_selection()`` and ``.oindex[]``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.arange(10)\n",
    "za = zarr.array(a, chunks=2)\n",
    "ix = [1, 3, 5, 7, 9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3, 5, 7, 9])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "za.vindex[ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3, 5, 7, 9])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "za.oindex[ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0, 10,  2, 30,  4, 50,  6, 70,  8, 90])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.vindex[ix] = a[ix] * 10\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  0, 100,   2, 300,   4, 500,   6, 700,   8, 900])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.oindex[ix] = a[ix] * 100\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Indexing a 1D array with a multi-dimensional integer (coordinate) array\n",
    "\n",
    "Supported via ``get/set_coordinate_selection()`` and ``.vindex[]``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.arange(10)\n",
    "za = zarr.array(a, chunks=2)\n",
    "ix = np.array([[1, 3, 5], [2, 4, 6]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 3, 5],\n",
       "       [2, 4, 6]])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "za.vindex[ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0, 10, 20, 30, 40, 50, 60,  7,  8,  9])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.vindex[ix] = a[ix] * 10\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Slicing a 1D array with step > 1\n",
    "\n",
    "Slices with step > 1 are supported via ``get/set_basic_selection()``, ``get/set_orthogonal_selection()``, ``__getitem__`` and ``.oindex[]``. Negative steps are not supported."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.arange(10)\n",
    "za = zarr.array(a, chunks=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3, 5, 7, 9])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "za[1::2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0, 10,  2, 30,  4, 50,  6, 70,  8, 90])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.oindex[1::2] = a[1::2] * 10\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Orthogonal (outer) indexing of multi-dimensional arrays\n",
    "\n",
    "Orthogonal (a.k.a. outer) indexing is supported with either Boolean or integer arrays, in combination with integers and slices. This functionality is provided via the ``get/set_orthogonal_selection()`` methods. For convenience, this functionality is also available via the ``.oindex[]`` property."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [ 3,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 11],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.arange(15).reshape(5, 3)\n",
    "za = zarr.array(a, chunks=(3, 2))\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3,  5],\n",
       "       [ 9, 11]])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# orthogonal indexing with Boolean arrays\n",
    "ix0 = [False, True, False, True, False]\n",
    "ix1 = [True, False, True]\n",
    "za.get_orthogonal_selection((ix0, ix1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3,  5],\n",
       "       [ 9, 11]])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternative API\n",
    "za.oindex[ix0, ix1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3,  5],\n",
       "       [ 9, 11]])"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# orthogonal indexing with integer arrays\n",
    "ix0 = [1, 3]\n",
    "ix1 = [0, 2]\n",
    "za.get_orthogonal_selection((ix0, ix1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3,  5],\n",
       "       [ 9, 11]])"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternative API\n",
    "za.oindex[ix0, ix1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 3,  4,  5],\n",
       "       [ 9, 10, 11]])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# combine with slice\n",
    "za.oindex[[1,  3], :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  2],\n",
       "       [ 3,  5],\n",
       "       [ 6,  8],\n",
       "       [ 9, 11],\n",
       "       [12, 14]])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# combine with slice\n",
    "za.oindex[:, [0, 2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [42,  4, 42],\n",
       "       [ 6,  7,  8],\n",
       "       [42, 10, 42],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items via Boolean selection\n",
    "ix0 = [False, True, False, True, False]\n",
    "ix1 = [True, False, True]\n",
    "selection = ix0, ix1\n",
    "value = 42\n",
    "za.set_orthogonal_selection(selection, value)\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [44,  4, 44],\n",
       "       [ 6,  7,  8],\n",
       "       [44, 10, 44],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternative API\n",
    "za.oindex[ix0, ix1] = 44\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [46,  4, 46],\n",
       "       [ 6,  7,  8],\n",
       "       [46, 10, 46],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items via integer selection\n",
    "ix0 = [1, 3]\n",
    "ix1 = [0, 2]\n",
    "selection = ix0, ix1\n",
    "value = 46\n",
    "za.set_orthogonal_selection(selection, value)\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [48,  4, 48],\n",
       "       [ 6,  7,  8],\n",
       "       [48, 10, 48],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternative API\n",
    "za.oindex[ix0, ix1] = 48\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Coordinate indexing of multi-dimensional arrays\n",
    "\n",
    "Selecting arbitrary points from a multi-dimensional array by indexing with integer (coordinate) arrays is supported. This functionality is provided via the ``get/set_coordinate_selection()`` methods. For convenience, this functionality is also available via the ``.vindex[]`` property."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [ 3,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 11],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.arange(15).reshape(5, 3)\n",
    "za = zarr.array(a, chunks=(3, 2))\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3, 11])"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get items\n",
    "ix0 = [1, 3]\n",
    "ix1 = [0, 2]\n",
    "za.get_coordinate_selection((ix0, ix1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3, 11])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternative API\n",
    "za.vindex[ix0, ix1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [42,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 42],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# set items\n",
    "za.set_coordinate_selection((ix0, ix1), 42)\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [44,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 44],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# alternative API\n",
    "za.vindex[ix0, ix1] = 44\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mask indexing of multi-dimensional arrays\n",
    "\n",
    "Selecting arbitrary points from a multi-dimensional array by a Boolean array is supported. This functionality is provided via the ``get/set_mask_selection()`` methods. For convenience, this functionality is also available via the ``.vindex[]`` property."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [ 3,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 11],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.arange(15).reshape(5, 3)\n",
    "za = zarr.array(a, chunks=(3, 2))\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3, 11])"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ix = np.zeros_like(a, dtype=bool)\n",
    "ix[1, 0] = True\n",
    "ix[3, 2] = True\n",
    "za.get_mask_selection(ix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 3, 11])"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za.vindex[ix]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [42,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 42],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za.set_mask_selection(ix, 42)\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0,  1,  2],\n",
       "       [44,  4,  5],\n",
       "       [ 6,  7,  8],\n",
       "       [ 9, 10, 44],\n",
       "       [12, 13, 14]])"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za.vindex[ix] = 44\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Selecting fields from arrays with a structured dtype\n",
    "\n",
    "All ``get/set_selection_...()`` methods support a ``fields`` argument which allows retrieving/replacing data for a specific field or fields. Also h5py-like API is supported where fields can be provided within ``__getitem__``, ``.oindex[]`` and ``.vindex[]``."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa', 1,   4.2), (b'bbb', 2,   8.4), (b'ccc', 3,  12.6)],\n",
       "      dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.array([(b'aaa', 1, 4.2),\n",
    "              (b'bbb', 2, 8.4),\n",
    "              (b'ccc', 3, 12.6)], \n",
    "             dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])\n",
    "za = zarr.array(a, chunks=2, fill_value=None)\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([b'aaa', b'bbb', b'ccc'],\n",
       "      dtype='|S3')"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za['foo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa',   4.2), (b'bbb',   8.4), (b'ccc',  12.6)],\n",
       "      dtype=[('foo', 'S3'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za['foo', 'baz']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([b'aaa', b'bbb'],\n",
       "      dtype='|S3')"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za[:2, 'foo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa',  4.2), (b'bbb',  8.4)],\n",
       "      dtype=[('foo', 'S3'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za[:2, 'foo', 'baz']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([b'aaa', b'ccc'],\n",
       "      dtype='|S3')"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za.oindex[[0, 2], 'foo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([b'aaa', b'ccc'],\n",
       "      dtype='|S3')"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za.vindex[[0, 2], 'foo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa', 42,   4.2), (b'bbb', 42,   8.4), (b'ccc', 42,  12.6)],\n",
       "      dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za['bar'] = 42\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa', 84,   4.2), (b'bbb', 84,   8.4), (b'ccc', 42,  12.6)],\n",
       "      dtype=[('foo', 'S3'), ('bar', '<i4'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za[:2, 'bar'] = 84\n",
    "za[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that this API differs from numpy when selecting multiple fields. E.g.:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "ename": "IndexError",
     "evalue": "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-51-ad64b4a52de3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices"
     ]
    }
   ],
   "source": [
    "a['foo', 'baz']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa',   4.2), (b'bbb',   8.4), (b'ccc',  12.6)],\n",
       "      dtype=[('foo', 'S3'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a[['foo', 'baz']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([(b'aaa',   4.2), (b'bbb',   8.4), (b'ccc',  12.6)],\n",
       "      dtype=[('foo', 'S3'), ('baz', '<f8')])"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "za['foo', 'baz']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "ename": "IndexError",
     "evalue": "unsupported selection item for basic indexing; expected integer or slice, got <class 'list'>",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-54-2394dac3f711>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mza\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'foo'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'baz'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, selection)\u001b[0m\n\u001b[1;32m    537\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    538\u001b[0m         \u001b[0mfields\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpop_fields\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 539\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    540\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    541\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget_basic_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mEllipsis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36mget_basic_selection\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m    661\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    662\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 663\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_basic_selection_nd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    664\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    665\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_get_basic_selection_zd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/core.py\u001b[0m in \u001b[0;36m_get_basic_selection_nd\u001b[0;34m(self, selection, out, fields)\u001b[0m\n\u001b[1;32m    701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    702\u001b[0m         \u001b[0;31m# setup indexer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 703\u001b[0;31m         \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBasicIndexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    704\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    705\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_selection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfields\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfields\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/src/github/alimanfoo/zarr/zarr/indexing.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, selection, array)\u001b[0m\n\u001b[1;32m    275\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    276\u001b[0m                 raise IndexError('unsupported selection item for basic indexing; expected integer '\n\u001b[0;32m--> 277\u001b[0;31m                                  'or slice, got {!r}'.format(type(dim_sel)))\n\u001b[0m\u001b[1;32m    278\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    279\u001b[0m             \u001b[0mdim_indexers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdim_indexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mIndexError\u001b[0m: unsupported selection item for basic indexing; expected integer or slice, got <class 'list'>"
     ]
    }
   ],
   "source": [
    "za[['foo', 'baz']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1D Benchmarking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "800000000"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = np.arange(100000000)\n",
    "c.nbytes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 480 ms, sys: 16 ms, total: 496 ms\n",
      "Wall time: 141 ms\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table class=\"zarr-info\"><tbody><tr><th style=\"text-align: left\">Type</th><td style=\"text-align: left\">zarr.core.Array</td></tr><tr><th style=\"text-align: left\">Data type</th><td style=\"text-align: left\">int64</td></tr><tr><th style=\"text-align: left\">Shape</th><td style=\"text-align: left\">(100000000,)</td></tr><tr><th style=\"text-align: left\">Chunk shape</th><td style=\"text-align: left\">(97657,)</td></tr><tr><th style=\"text-align: left\">Order</th><td style=\"text-align: left\">C</td></tr><tr><th style=\"text-align: left\">Read-only</th><td style=\"text-align: left\">False</td></tr><tr><th style=\"text-align: left\">Compressor</th><td style=\"text-align: left\">Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)</td></tr><tr><th style=\"text-align: left\">Store type</th><td style=\"text-align: left\">builtins.dict</td></tr><tr><th style=\"text-align: left\">No. bytes</th><td style=\"text-align: left\">800000000 (762.9M)</td></tr><tr><th style=\"text-align: left\">No. bytes stored</th><td style=\"text-align: left\">11854081 (11.3M)</td></tr><tr><th style=\"text-align: left\">Storage ratio</th><td style=\"text-align: left\">67.5</td></tr><tr><th style=\"text-align: left\">Chunks initialized</th><td style=\"text-align: left\">1024/1024</td></tr></tbody></table>"
      ],
      "text/plain": [
       "Type               : zarr.core.Array\n",
       "Data type          : int64\n",
       "Shape              : (100000000,)\n",
       "Chunk shape        : (97657,)\n",
       "Order              : C\n",
       "Read-only          : False\n",
       "Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n",
       "Store type         : builtins.dict\n",
       "No. bytes          : 800000000 (762.9M)\n",
       "No. bytes stored   : 11854081 (11.3M)\n",
       "Storage ratio      : 67.5\n",
       "Chunks initialized : 1024/1024"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time zc = zarr.array(c)\n",
    "zc.info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "121 ms ± 1.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "254 ms ± 942 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc[:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### bool dense selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9997476"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# relatively dense selection - 10%\n",
    "ix_dense_bool = np.random.binomial(1, 0.1, size=c.shape[0]).astype(bool)\n",
    "np.count_nonzero(ix_dense_bool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "243 ms ± 5.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c[ix_dense_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "433 ms ± 6.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[ix_dense_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "548 ms ± 5.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.vindex[ix_dense_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tempfile\n",
    "import cProfile\n",
    "import pstats\n",
    "\n",
    "def profile(statement, sort='time', restrictions=(7,)):\n",
    "    with tempfile.NamedTemporaryFile() as f:\n",
    "        cProfile.run(statement, filename=f.name)\n",
    "        pstats.Stats(f.name).sort_stats(sort).print_stats(*restrictions)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:17:48 2017    /tmp/tmpruua2rs_\n",
      "\n",
      "         98386 function calls in 0.483 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 83 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "     1025    0.197    0.000    0.197    0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n",
      "     1024    0.149    0.000    0.159    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1024    0.044    0.000    0.231    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     1024    0.009    0.000    0.009    0.000 {built-in method numpy.core.multiarray.count_nonzero}\n",
      "     1025    0.007    0.000    0.238    0.000 ../zarr/indexing.py:541(__iter__)\n",
      "     1024    0.006    0.000    0.207    0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n",
      "     2048    0.005    0.000    0.005    0.000 ../zarr/core.py:337(<genexpr>)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.oindex[ix_dense_bool]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Method ``nonzero`` is being called internally within numpy to convert bool to int selections, no way to avoid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:18:06 2017    /tmp/tmp7_bautep\n",
      "\n",
      "         52382 function calls in 0.592 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 88 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "        2    0.219    0.110    0.219    0.110 {method 'nonzero' of 'numpy.ndarray' objects}\n",
      "     1024    0.096    0.000    0.101    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "        2    0.094    0.047    0.094    0.047 ../zarr/indexing.py:630(<genexpr>)\n",
      "     1024    0.044    0.000    0.167    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "        1    0.029    0.029    0.029    0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n",
      "        1    0.023    0.023    0.023    0.023 {built-in method numpy.core.multiarray.bincount}\n",
      "        1    0.021    0.021    0.181    0.181 ../zarr/indexing.py:603(__init__)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.vindex[ix_dense_bool]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "``.vindex[]`` is a bit slower, possibly because internally it converts to a coordinate array first."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### int dense selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000000"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ix_dense_int = np.random.choice(c.shape[0], size=c.shape[0]//10, replace=True)\n",
    "ix_dense_int_sorted = ix_dense_int.copy()\n",
    "ix_dense_int_sorted.sort()\n",
    "len(ix_dense_int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "62.2 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c[ix_dense_int_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "355 ms ± 3.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[ix_dense_int_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "351 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.vindex[ix_dense_int_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "128 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c[ix_dense_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.71 s ± 5.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[ix_dense_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.68 s ± 3.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.vindex[ix_dense_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:19:09 2017    /tmp/tmpgmu5btr_\n",
      "\n",
      "         95338 function calls in 0.424 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 89 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "        1    0.141    0.141    0.184    0.184 ../zarr/indexing.py:369(__init__)\n",
      "     1024    0.099    0.000    0.106    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1024    0.046    0.000    0.175    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     1025    0.027    0.000    0.027    0.000 ../zarr/indexing.py:424(__iter__)\n",
      "        1    0.023    0.023    0.023    0.023 {built-in method numpy.core.multiarray.bincount}\n",
      "        1    0.010    0.010    0.010    0.010 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/function_base.py:1848(diff)\n",
      "     1025    0.006    0.000    0.059    0.000 ../zarr/indexing.py:541(__iter__)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.oindex[ix_dense_int_sorted]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:19:13 2017    /tmp/tmpay1gvnx8\n",
      "\n",
      "         52362 function calls in 0.398 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 85 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "        2    0.107    0.054    0.107    0.054 ../zarr/indexing.py:630(<genexpr>)\n",
      "     1024    0.091    0.000    0.096    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1024    0.041    0.000    0.160    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "        1    0.040    0.040    0.213    0.213 ../zarr/indexing.py:603(__init__)\n",
      "        1    0.029    0.029    0.029    0.029 {built-in method numpy.core.multiarray.ravel_multi_index}\n",
      "        1    0.023    0.023    0.023    0.023 {built-in method numpy.core.multiarray.bincount}\n",
      "     2048    0.011    0.000    0.011    0.000 ../zarr/indexing.py:695(<genexpr>)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.vindex[ix_dense_int_sorted]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:19:20 2017    /tmp/tmpngsf6zpp\n",
      "\n",
      "         120946 function calls in 1.793 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 92 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "        1    1.128    1.128    1.128    1.128 {method 'argsort' of 'numpy.ndarray' objects}\n",
      "     1024    0.139    0.000    0.285    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "        1    0.132    0.132    1.422    1.422 ../zarr/indexing.py:369(__init__)\n",
      "        1    0.120    0.120    0.120    0.120 {method 'take' of 'numpy.ndarray' objects}\n",
      "     1024    0.116    0.000    0.123    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1025    0.034    0.000    0.034    0.000 ../zarr/indexing.py:424(__iter__)\n",
      "        1    0.023    0.023    0.023    0.023 {built-in method numpy.core.multiarray.bincount}\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.oindex[ix_dense_int]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:19:22 2017    /tmp/tmpbskhj8de\n",
      "\n",
      "         50320 function calls in 1.730 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 86 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "        1    1.116    1.116    1.116    1.116 {method 'argsort' of 'numpy.ndarray' objects}\n",
      "     1024    0.133    0.000    0.275    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "        2    0.121    0.060    0.121    0.060 ../zarr/indexing.py:654(<genexpr>)\n",
      "     1024    0.113    0.000    0.119    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "        2    0.100    0.050    0.100    0.050 ../zarr/indexing.py:630(<genexpr>)\n",
      "        1    0.030    0.030    0.030    0.030 {built-in method numpy.core.multiarray.ravel_multi_index}\n",
      "        1    0.024    0.024    1.427    1.427 ../zarr/indexing.py:603(__init__)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.vindex[ix_dense_int]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When indices are not sorted, zarr needs to partially sort them so the occur in chunk order, so we only have to visit each chunk once. This sorting dominates the processing time and is unavoidable AFAIK."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### bool sparse selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9932"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# relatively sparse selection\n",
    "ix_sparse_bool = np.random.binomial(1, 0.0001, size=c.shape[0]).astype(bool)\n",
    "np.count_nonzero(ix_sparse_bool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15.7 ms ± 38.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c[ix_sparse_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "156 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[ix_sparse_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "133 ms ± 2.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.vindex[ix_sparse_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:20:09 2017    /tmp/tmpb7nqc9ax\n",
      "\n",
      "         98386 function calls in 0.191 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 83 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "     1024    0.093    0.000    0.098    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1025    0.017    0.000    0.017    0.000 {method 'nonzero' of 'numpy.ndarray' objects}\n",
      "     1024    0.007    0.000    0.007    0.000 {built-in method numpy.core.multiarray.count_nonzero}\n",
      "     1024    0.007    0.000    0.129    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     1025    0.005    0.000    0.052    0.000 ../zarr/indexing.py:541(__iter__)\n",
      "     1024    0.005    0.000    0.025    0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n",
      "     2048    0.004    0.000    0.004    0.000 ../zarr/core.py:337(<genexpr>)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.oindex[ix_sparse_bool]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:20:09 2017    /tmp/tmphsko8nvh\n",
      "\n",
      "         52382 function calls in 0.160 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 88 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "     1024    0.093    0.000    0.098    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "        2    0.017    0.008    0.017    0.008 {method 'nonzero' of 'numpy.ndarray' objects}\n",
      "     1025    0.008    0.000    0.014    0.000 ../zarr/indexing.py:674(__iter__)\n",
      "     1024    0.006    0.000    0.127    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     2048    0.004    0.000    0.004    0.000 ../zarr/indexing.py:695(<genexpr>)\n",
      "     2054    0.003    0.000    0.003    0.000 ../zarr/core.py:337(<genexpr>)\n",
      "     1024    0.002    0.000    0.005    0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/core/arrayprint.py:381(wrapper)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.vindex[ix_sparse_bool]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### int sparse selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ix_sparse_int = np.random.choice(c.shape[0], size=c.shape[0]//10000, replace=True)\n",
    "ix_sparse_int_sorted = ix_sparse_int.copy()\n",
    "ix_sparse_int_sorted.sort()\n",
    "len(ix_sparse_int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18.9 µs ± 392 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c[ix_sparse_int_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "20.3 µs ± 155 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit c[ix_sparse_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "125 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[ix_sparse_int_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "109 ms ± 428 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.vindex[ix_sparse_int_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "132 ms ± 489 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[ix_sparse_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "108 ms ± 579 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.vindex[ix_sparse_int]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:21:12 2017    /tmp/tmp0b0o2quo\n",
      "\n",
      "         120946 function calls in 0.196 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 92 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "     1024    0.105    0.000    0.111    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     2048    0.006    0.000    0.013    0.000 /home/aliman/pyenv/zarr_20171023/lib/python3.6/site-packages/numpy/lib/index_tricks.py:26(ix_)\n",
      "     1025    0.006    0.000    0.051    0.000 ../zarr/indexing.py:541(__iter__)\n",
      "     1024    0.006    0.000    0.141    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     2048    0.005    0.000    0.005    0.000 ../zarr/core.py:337(<genexpr>)\n",
      "    15373    0.004    0.000    0.010    0.000 {built-in method builtins.isinstance}\n",
      "     1025    0.004    0.000    0.005    0.000 ../zarr/indexing.py:424(__iter__)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.oindex[ix_sparse_int]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:21:19 2017    /tmp/tmpdwju98kn\n",
      "\n",
      "         50320 function calls in 0.167 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 86 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "     1024    0.105    0.000    0.111    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1025    0.009    0.000    0.017    0.000 ../zarr/indexing.py:674(__iter__)\n",
      "     1024    0.006    0.000    0.142    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     2048    0.005    0.000    0.005    0.000 ../zarr/indexing.py:695(<genexpr>)\n",
      "     2054    0.004    0.000    0.004    0.000 ../zarr/core.py:337(<genexpr>)\n",
      "        1    0.003    0.003    0.162    0.162 ../zarr/core.py:591(_get_selection)\n",
      "     1027    0.003    0.000    0.003    0.000 {method 'reshape' of 'numpy.ndarray' objects}\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc.vindex[ix_sparse_int]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For sparse selections, processing time is dominated by decompression, so we can't do any better."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### sparse bool selection as zarr array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"zarr-info\"><tbody><tr><th style=\"text-align: left\">Type</th><td style=\"text-align: left\">zarr.core.Array</td></tr><tr><th style=\"text-align: left\">Data type</th><td style=\"text-align: left\">bool</td></tr><tr><th style=\"text-align: left\">Shape</th><td style=\"text-align: left\">(100000000,)</td></tr><tr><th style=\"text-align: left\">Chunk shape</th><td style=\"text-align: left\">(390625,)</td></tr><tr><th style=\"text-align: left\">Order</th><td style=\"text-align: left\">C</td></tr><tr><th style=\"text-align: left\">Read-only</th><td style=\"text-align: left\">False</td></tr><tr><th style=\"text-align: left\">Compressor</th><td style=\"text-align: left\">Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)</td></tr><tr><th style=\"text-align: left\">Store type</th><td style=\"text-align: left\">builtins.dict</td></tr><tr><th style=\"text-align: left\">No. bytes</th><td style=\"text-align: left\">100000000 (95.4M)</td></tr><tr><th style=\"text-align: left\">No. bytes stored</th><td style=\"text-align: left\">507131 (495.2K)</td></tr><tr><th style=\"text-align: left\">Storage ratio</th><td style=\"text-align: left\">197.2</td></tr><tr><th style=\"text-align: left\">Chunks initialized</th><td style=\"text-align: left\">256/256</td></tr></tbody></table>"
      ],
      "text/plain": [
       "Type               : zarr.core.Array\n",
       "Data type          : bool\n",
       "Shape              : (100000000,)\n",
       "Chunk shape        : (390625,)\n",
       "Order              : C\n",
       "Read-only          : False\n",
       "Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n",
       "Store type         : builtins.dict\n",
       "No. bytes          : 100000000 (95.4M)\n",
       "No. bytes stored   : 507131 (495.2K)\n",
       "Storage ratio      : 197.2\n",
       "Chunks initialized : 256/256"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zix_sparse_bool = zarr.array(ix_sparse_bool)\n",
    "zix_sparse_bool.info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "387 ms ± 5.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc.oindex[zix_sparse_bool]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### slice with step"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "80.3 ms ± 377 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit np.array(c[::2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "168 ms ± 837 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc[::2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "136 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc[::10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "104 ms ± 1.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc[::100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zc[::1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:22:44 2017    /tmp/tmpg9dxqcpg\n",
      "\n",
      "         49193 function calls in 0.211 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 55 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "     1024    0.104    0.000    0.110    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "     1024    0.067    0.000    0.195    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     1025    0.005    0.000    0.013    0.000 ../zarr/indexing.py:278(__iter__)\n",
      "     2048    0.004    0.000    0.004    0.000 ../zarr/core.py:337(<genexpr>)\n",
      "     2050    0.003    0.000    0.003    0.000 ../zarr/indexing.py:90(ceildiv)\n",
      "     1025    0.003    0.000    0.006    0.000 ../zarr/indexing.py:109(__iter__)\n",
      "     1024    0.003    0.000    0.003    0.000 {method 'reshape' of 'numpy.ndarray' objects}\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zc[::2]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2D Benchmarking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100000000,)"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100000, 1000)"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = c.reshape(-1, 1000)\n",
    "d.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"zarr-info\"><tbody><tr><th style=\"text-align: left\">Type</th><td style=\"text-align: left\">zarr.core.Array</td></tr><tr><th style=\"text-align: left\">Data type</th><td style=\"text-align: left\">int64</td></tr><tr><th style=\"text-align: left\">Shape</th><td style=\"text-align: left\">(100000, 1000)</td></tr><tr><th style=\"text-align: left\">Chunk shape</th><td style=\"text-align: left\">(3125, 32)</td></tr><tr><th style=\"text-align: left\">Order</th><td style=\"text-align: left\">C</td></tr><tr><th style=\"text-align: left\">Read-only</th><td style=\"text-align: left\">False</td></tr><tr><th style=\"text-align: left\">Compressor</th><td style=\"text-align: left\">Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)</td></tr><tr><th style=\"text-align: left\">Store type</th><td style=\"text-align: left\">builtins.dict</td></tr><tr><th style=\"text-align: left\">No. bytes</th><td style=\"text-align: left\">800000000 (762.9M)</td></tr><tr><th style=\"text-align: left\">No. bytes stored</th><td style=\"text-align: left\">39228864 (37.4M)</td></tr><tr><th style=\"text-align: left\">Storage ratio</th><td style=\"text-align: left\">20.4</td></tr><tr><th style=\"text-align: left\">Chunks initialized</th><td style=\"text-align: left\">1024/1024</td></tr></tbody></table>"
      ],
      "text/plain": [
       "Type               : zarr.core.Array\n",
       "Data type          : int64\n",
       "Shape              : (100000, 1000)\n",
       "Chunk shape        : (3125, 32)\n",
       "Order              : C\n",
       "Read-only          : False\n",
       "Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)\n",
       "Store type         : builtins.dict\n",
       "No. bytes          : 800000000 (762.9M)\n",
       "No. bytes stored   : 39228864 (37.4M)\n",
       "Storage ratio      : 20.4\n",
       "Chunks initialized : 1024/1024"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zd = zarr.array(d)\n",
    "zd.info"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### bool orthogonal selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix0 = np.random.binomial(1, 0.5, size=d.shape[0]).astype(bool)\n",
    "ix1 = np.random.binomial(1, 0.5, size=d.shape[1]).astype(bool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "101 ms ± 577 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit d[np.ix_(ix0, ix1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "373 ms ± 5.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zd.oindex[ix0, ix1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### int orthogonal selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix0 = np.random.choice(d.shape[0], size=int(d.shape[0] * .5), replace=True)\n",
    "ix1 = np.random.choice(d.shape[1], size=int(d.shape[1] * .5), replace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "174 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit d[np.ix_(ix0, ix1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "566 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zd.oindex[ix0, ix1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### coordinate (point) selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000000"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n = int(d.size * .1)\n",
    "ix0 = np.random.choice(d.shape[0], size=n, replace=True)\n",
    "ix1 = np.random.choice(d.shape[1], size=n, replace=True)\n",
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "243 ms ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit d[ix0, ix1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.03 s ± 17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit zd.vindex[ix0, ix1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Nov  8 17:24:31 2017    /tmp/tmp7c68z70p\n",
      "\n",
      "         62673 function calls in 2.065 seconds\n",
      "\n",
      "   Ordered by: internal time\n",
      "   List reduced from 88 to 7 due to restriction <7>\n",
      "\n",
      "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
      "        1    1.112    1.112    1.112    1.112 {method 'argsort' of 'numpy.ndarray' objects}\n",
      "        3    0.244    0.081    0.244    0.081 ../zarr/indexing.py:654(<genexpr>)\n",
      "        3    0.193    0.064    0.193    0.064 ../zarr/indexing.py:630(<genexpr>)\n",
      "     1024    0.170    0.000    0.350    0.000 ../zarr/core.py:849(_chunk_getitem)\n",
      "     1024    0.142    0.000    0.151    0.000 ../zarr/core.py:1028(_decode_chunk)\n",
      "        1    0.044    0.044    0.044    0.044 {built-in method numpy.core.multiarray.ravel_multi_index}\n",
      "        1    0.043    0.043    1.676    1.676 ../zarr/indexing.py:603(__init__)\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "profile('zd.vindex[ix0, ix1]')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Points need to be partially sorted so all points in the same chunk are grouped and processed together. This requires ``argsort`` which dominates time."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## h5py comparison\n",
    "\n",
    "N.B., not really fair because using slower compressor, but for interest..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "import h5py\n",
    "import tempfile"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "h5f = h5py.File(tempfile.mktemp(), driver='core', backing_store=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<HDF5 dataset \"c\": shape (100000000,), type \"<i8\">"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hc = h5f.create_dataset('c', data=c, compression='gzip', compression_opts=1, chunks=zc.chunks, shuffle=True)\n",
    "hc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.16 s, sys: 172 ms, total: 1.33 s\n",
      "Wall time: 1.32 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([       0,        1,        2, ..., 99999997, 99999998, 99999999])"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time hc[:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.11 s, sys: 0 ns, total: 1.11 s\n",
      "Wall time: 1.11 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([    1063,    28396,    37229, ..., 99955875, 99979354, 99995791])"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%time hc[ix_sparse_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # this is pathological, takes minutes \n",
    "# %time hc[ix_dense_bool]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 38.3 s, sys: 136 ms, total: 38.4 s\n",
      "Wall time: 38.1 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([       0,     1000,     2000, ..., 99997000, 99998000, 99999000])"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this is pretty slow\n",
    "%time hc[::1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}