{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Support for dask arrays\n", "\n", "It is possible to operate on dask arrays and save memory (or perhaps even time)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Necessary imports\n", "import dask.multiprocessing\n", "import numpy as np\n", "\n", "import dask.array as da\n", "from physt import h1, h2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Create two arrays\n", "np.random.seed(42)\n", "\n", "SIZE = 2 ** 21\n", "CHUNK = int(SIZE / 16)\n", "\n", "million = np.random.rand(SIZE)#.astype(int)\n", "million2 = (3 * million + np.random.normal(0., 0.3, SIZE))#.astype(int)\n", "\n", "# Chunk them for dask\n", "chunked = da.from_array(million, chunks=(CHUNK))\n", "chunked2 = da.from_array(million2, chunks=(CHUNK))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create histograms\n", "\n", "`h1`, `h2`, ... have their alternatives in `physt.compat.dask`. They should work similarly, although they are not complete and unexpected errors may occur." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from physt.compat.dask import h1 as d1\n", "from physt.compat.dask import h2 as d2" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Check: True\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": "
", "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Use chunks to create a 1D histogram\n", "ha = d1(chunked2, \"fixed_width\", bin_width=0.2)\n", "check_ha = h1(million2, \"fixed_width\", bin_width=0.2)\n", "ok = (ha == check_ha)\n", "print(\"Check: \", ok)\n", "ha.plot()\n", "ha" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/honza/code/my/physt/src/physt/util.py:81: FutureWarning:\n", "\n", "histogramdd is deprecated, use h instead\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Check: True\n" ] }, { "data": { "text/plain": "Histogram2D(bins=(5, 28), total=2097152, dtype=int64)" }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": "
", "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Use chunks to create a 2D histogram\n", "hb = d2(chunked, chunked2, \"fixed_width\", bin_width=.2, axis_names=[\"x\", \"y\"])\n", "check_hb = h2(million, million2, \"fixed_width\", bin_width=.2, axis_names=[\"x\", \"y\"])\n", "hb.plot(show_zero=False, cmap=\"rainbow\")\n", "ok = (hb == check_hb)\n", "print(\"Check: \", ok)\n", "hb" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Check: True\n" ] }, { "data": { "text/plain": "
", "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# And another cross-check\n", "hh = hb.projection(\"y\")\n", "hh.plot()\n", "print(\"Check: \", np.array_equal(hh.frequencies, ha.frequencies)) # Just frequencies" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": "True" }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Use dask for normal arrays (will automatically split array to chunks)\n", "d1(million2, \"fixed_width\", bin_width=0.2) == ha" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Some timings\n", "\n", "Your results may vary substantially. These numbers are just for illustration, on a 4-core (8-thread) machine. The real gain comes when we have data that don't fit into memory." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Efficiency" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 399 ms, sys: 3.75 ms, total: 403 ms\n", "Wall time: 442 ms\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Standard\n", "%time h1(million2, \"fixed_width\", bin_width=0.2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 368 ms, sys: 4.08 ms, total: 372 ms\n", "Wall time: 84.5 ms\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same array, but using dask\n", "%time d1(million2, \"fixed_width\", bin_width=0.2)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ "CPU times: user 343 ms, sys: 4.18 ms, total: 347 ms\n", "Wall time: 64.5 ms\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Most efficient: dask with already chunked data\n", "%time d1(chunked2, \"fixed_width\", bin_width=0.2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Different scheduling" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 337 ms, sys: 14.1 ms, total: 351 ms\n", "Wall time: 88.3 ms\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%time d1(chunked2, \"fixed_width\", bin_width=0.2)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 317 ms, sys: 9.95 ms, total: 327 ms\n", "Wall time: 98.6 ms\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Hyper-threading or not?\n", "graph, name = d1(chunked2, \"fixed_width\", bin_width=0.2, compute=False)\n", "dask.threaded.get(graph, name, num_workers=4)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 22.8 ms, sys: 8.53 ms, total: 31.3 ms\n", "Wall time: 1.65 s\n" ] }, { "data": { "text/plain": "Histogram1D(bins=(28,), total=2097152, dtype=int64)" }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Multiprocessing not so efficient for small arrays?\n", "%time d1(chunked2, \"fixed_width\", bin_width=0.2, 
dask_method=dask.multiprocessing.get)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 1 }