{ "cells": [ { "cell_type": "markdown", "id": "59a21b23-d08f-4203-ae87-b51474a0bbaf", "metadata": {}, "source": [ "# GroupBy operation\n", "\n", "#### Germain Salvat Vallverdu\n", "\n", "This notebook compare the GroupBy efficiency using either a pandas data frame or\n", "the combination of numpy split and unique functions." ] }, { "cell_type": "code", "execution_count": 1, "id": "e559702e-9fe1-4d8e-a0fa-168a9877597b", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "id": "8aee5ccb-c5ae-4155-938c-31e7fbad027f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "numpy 1.21.2\n", "pandas 1.4.1\n" ] } ], "source": [ "print(\"numpy\", np.__version__)\n", "print(\"pandas\", pd.__version__)" ] }, { "cell_type": "code", "execution_count": 3, "id": "d8700930-5414-4451-b4c3-200a59838ea8", "metadata": {}, "outputs": [], "source": [ "# number of integers\n", "N = 1_000_000" ] }, { "cell_type": "markdown", "id": "09d4ab35-6996-4fa7-a4c0-68f6aa9d0f7c", "metadata": {}, "source": [ "## Case 1: a small number of groups\n", "\n", "10 groups over N rows." ] }, { "cell_type": "code", "execution_count": 4, "id": "b5a412a9-fd03-4187-a8df-03ee2f89af02", "metadata": {}, "outputs": [], "source": [ "group_min, group_max = 100, 110\n", "group_values = np.random.randint(group_min, group_max, N)\n", "compo = np.random.randint(1, 100, 4 * N).reshape(N, 4)\n", "values = np.concatenate((compo, group_values[:, np.newaxis]), axis=1)" ] }, { "cell_type": "code", "execution_count": 5, "id": "e8af07d7-475d-410d-b9cd-22fa41597254", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDM
04629190109
128192570104
249148485104
366276130102
459694924100
\n", "
" ], "text/plain": [ " A B C D M\n", "0 46 2 91 90 109\n", "1 28 19 25 70 104\n", "2 49 14 84 85 104\n", "3 66 27 61 30 102\n", "4 59 69 49 24 100" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "bca5406c-e6d3-42fd-a76e-b7bb4b753497", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "219 µs ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%%timeit\n", "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n", "groups = df.groupby(\"M\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "295da442-f314-48d0-a437-3edc4a391cf4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "21.2 ms ± 365 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "sorted_M = np.sort(group_values)" ] }, { "cell_type": "code", "execution_count": 8, "id": "f23602a1-9c47-47c7-b67f-39ce74ca3136", "metadata": {}, "outputs": [], "source": [ "sorted_M = np.sort(group_values)" ] }, { "cell_type": "code", "execution_count": 9, "id": "1df6fabf-c0b2-4fdd-88c7-4f09bfa25762", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8.33 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit\n", "# use already sorted\n", "keys, sections = np.unique(sorted_M, return_index=True)\n", "group_list = np.split(\n", " sorted_M, \n", " sections[1:]\n", ")\n", "groups = {keys[i]: group_list[i] for i in range(len(keys))}" ] }, { "cell_type": "code", "execution_count": 10, "id": "9a15e95d-c721-4f76-bed5-5b1ce3d1a01f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " " ] }, { "data": { "text/plain": [ " 121 function calls (109 primitive calls) in 0.034 seconds\n", "\n", " Ordered by: cumulative time\n", " List reduced from 33 to 20 due to restriction <20>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec}\n", " 1 0.001 0.001 0.034 0.034 :1()\n", " 15/3 0.000 0.000 0.033 0.011 {built-in method numpy.core._multiarray_umath.implement_array_function}\n", " 1 0.000 0.000 0.024 0.024 <__array_function__ internals>:2(sort)\n", " 1 0.000 0.000 0.024 0.024 fromnumeric.py:846(sort)\n", " 1 0.023 0.023 0.023 0.023 {method 'sort' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.009 0.009 <__array_function__ internals>:2(unique)\n", " 1 0.002 0.002 0.009 0.009 arraysetops.py:138(unique)\n", " 1 0.003 0.003 0.007 0.007 arraysetops.py:320(_unique1d)\n", " 1 0.002 0.002 0.002 0.002 {method 'argsort' of 'numpy.ndarray' objects}\n", " 1 0.002 0.002 0.002 0.002 {method 'copy' of 'numpy.ndarray' objects}\n", " 1 0.001 0.001 0.001 0.001 {method 'flatten' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(split)\n", " 1 0.000 0.000 0.000 0.000 shape_base.py:799(split)\n", " 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(array_split)\n", " 1 0.000 0.000 0.000 0.000 shape_base.py:739(array_split)\n", " 11 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(swapaxes)\n", " 11 0.000 0.000 0.000 0.000 fromnumeric.py:550(swapaxes)\n", " 11 0.000 0.000 0.000 0.000 fromnumeric.py:51(_wrapfunc)\n", " 1 0.000 0.000 0.000 0.000 arraysetops.py:125(_unpack_tuple)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%prun -s cumulative -l 20\n", "sorted_M = np.sort(group_values)\n", "keys, sections = np.unique(sorted_M, return_index=True)\n", "group_list = np.split(\n", " sorted_M, \n", " sections[1:]\n", ")\n", "groups = {keys[i]: group_list[i] for i in range(len(keys))}" ] }, { "cell_type": "markdown", "id": "fc6ec9f6-d22d-4a23-9618-83e0f6f2d117", "metadata": {}, "source": [ "Above, it is clear that `sort` is the most time consuming part." ] }, { "cell_type": "markdown", "id": "4a3f964c-d4bf-4358-9b9d-fba234acb983", "metadata": {}, "source": [ "## Case 2: a large number of groups\n", "\n", "1000 groups over N rows." ] }, { "cell_type": "code", "execution_count": 11, "id": "96d85df7-90b1-4bc4-ada0-0289240cb31d", "metadata": {}, "outputs": [], "source": [ "group_min, group_max = 100, 1200\n", "group_values = np.random.randint(group_min, group_max, N)\n", "compo = np.random.randint(1, 100, 4 * N).reshape(N, 4)\n", "values = np.concatenate((compo, group_values[:, np.newaxis]), axis=1)" ] }, { "cell_type": "code", "execution_count": 12, "id": "059940fa-390e-40f8-b1f2-82caa35e115c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCDM
019299163193
173267278300
229254353282
324139581118
454377511464
\n", "
" ], "text/plain": [ " A B C D M\n", "0 19 29 91 63 193\n", "1 73 26 72 78 300\n", "2 29 25 43 53 282\n", "3 24 13 95 8 1118\n", "4 54 37 75 11 464" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 14, "id": "578aebd1-8281-4889-96d1-d1e42b3f234c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "210 µs ± 6.12 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%%timeit\n", "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n", "groups = df.groupby(\"M\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "7c785ca6-6642-45b0-93f2-f050e73e3c37", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "40.9 ms ± 946 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "sorted_M = np.sort(group_values)" ] }, { "cell_type": "code", "execution_count": 16, "id": "feb92c5f-9a94-44b2-9c3a-e9924ad30581", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "70.3 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "group_values[group_values.argsort()]" ] }, { "cell_type": "markdown", "id": "07be9d5a-e904-4327-8372-d385ed0f9f58", "metadata": {}, "source": [ "Here, in the case of 1D array, `np.sort` is more efficient than `argsort()`." ] }, { "cell_type": "code", "execution_count": 17, "id": "c427854d-bf54-474f-8c45-5953d0178265", "metadata": {}, "outputs": [], "source": [ "sorted_M = np.sort(group_values)" ] }, { "cell_type": "code", "execution_count": 18, "id": "2f335f70-888d-4d6e-b1c9-63a88722203e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9.61 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit\n", "# run here using already sorted\n", "keys, sections = np.unique(sorted_M, return_index=True)\n", "group_list = np.split(\n", " sorted_M, \n", " sections[1:]\n", ")\n", "groups = {keys[i]: group_list[i] for i in range(len(keys))}" ] }, { "cell_type": "code", "execution_count": 19, "id": "eeec3538-f545-4930-91ff-af60020d4e2e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " " ] }, { "data": { "text/plain": [ " 8841 function calls (7739 primitive calls) in 0.063 seconds\n", "\n", " Ordered by: cumulative time\n", " List reduced from 33 to 20 due to restriction <20>\n", "\n", " ncalls tottime percall cumtime percall filename:lineno(function)\n", " 1 0.000 0.000 0.063 0.063 {built-in method builtins.exec}\n", " 1 0.001 0.001 0.063 0.063 :1()\n", " 1105/3 0.000 0.000 0.061 0.020 {built-in method numpy.core._multiarray_umath.implement_array_function}\n", " 1 0.000 0.000 0.046 0.046 <__array_function__ internals>:2(sort)\n", " 1 0.000 0.000 0.046 0.046 fromnumeric.py:846(sort)\n", " 1 0.045 0.045 0.045 0.045 {method 'sort' of 'numpy.ndarray' objects}\n", " 1 0.000 0.000 0.011 0.011 <__array_function__ internals>:2(unique)\n", " 1 0.003 0.003 0.011 0.011 arraysetops.py:138(unique)\n", " 1 0.004 0.004 0.008 0.008 arraysetops.py:320(_unique1d)\n", " 1 0.000 0.000 0.004 0.004 <__array_function__ internals>:2(split)\n", " 1 0.000 0.000 0.004 0.004 shape_base.py:799(split)\n", " 1 0.000 0.000 0.004 0.004 <__array_function__ internals>:2(array_split)\n", " 1 0.001 0.001 0.004 0.004 shape_base.py:739(array_split)\n", " 1 0.003 0.003 0.003 0.003 {method 'argsort' of 'numpy.ndarray' objects}\n", " 1101 0.001 0.000 0.003 0.000 <__array_function__ internals>:2(swapaxes)\n", " 1101 0.000 0.000 0.002 0.000 fromnumeric.py:550(swapaxes)\n", " 1 0.001 0.001 0.001 0.001 {method 'flatten' of 'numpy.ndarray' objects}\n", " 1 0.001 0.001 0.001 0.001 {method 'copy' of 'numpy.ndarray' objects}\n", " 1101 0.001 0.000 0.001 0.000 fromnumeric.py:51(_wrapfunc)\n", " 1101 0.000 0.000 0.000 0.000 {method 'swapaxes' of 'numpy.ndarray' objects}" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%prun -s cumulative -l 20\n", "sorted_M = np.sort(group_values)\n", "keys, sections = np.unique(sorted_M, return_index=True)\n", "group_list = np.split(\n", " sorted_M, \n", " sections[1:]\n", ")\n", "groups = {keys[i]: group_list[i] for i in range(len(keys))}" ] }, { "cell_type": "markdown", "id": "2fded82d-a24a-45f1-8089-4f82f32463ec", "metadata": {}, "source": [ "## Use StackOverflow example\n", "\n", "https://stackoverflow.com/questions/38013778/\n", "\n", "```python\n", ">>> a = np.random.randint(5, size=(10000, 2)) # 5 different \"groups\"\n", "\n", "# Only the sort\n", ">>> a = a[a[:, 0].argsort()]\n", "⏱ 116.9 ms\n", "\n", "# Group by on the already sorted table\n", ">>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])\n", "⏱ 35.5 ms\n", "\n", "# Total sort + groupby\n", ">>> a = a[a[:, 0].argsort()]\n", ">>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])\n", "⏱ 153.0 ms 👑\n", "\n", "# With pandas (cf Piotr answer)\n", ">>> df = pd.DataFrame(a, columns=[\"key\", \"val\"]) # no timer for this line\n", ">>> df.groupby(\"key\").val.apply(pd.Series.tolist) \n", "⏱ 362.3 ms\n", "```\n", "\n", "### 5 groups" ] }, { "cell_type": "code", "execution_count": 20, "id": "e52f2d2a-aab2-4094-9055-a2657248fd07", "metadata": {}, "outputs": [], "source": [ "a = np.random.randint(5, size=(10000, 2)) # 5 different \"groups\"" ] }, { "cell_type": "code", "execution_count": 21, "id": "cfe54057-a965-496c-8702-65e7c6097edd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "298 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%timeit a[a[:, 0].argsort()]" ] }, { "cell_type": "code", "execution_count": 22, "id": "3f7b825d-51f9-4ea5-b17a-199a4d90645f", "metadata": {}, "outputs": [], "source": [ "a = a[a[:, 0].argsort()]" ] }, { "cell_type": "code", "execution_count": 23, "id": "63cf27ef-62fc-476b-aa14-28738aa0e9c3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "54.2 µs ± 570 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], "source": [ "%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])" ] }, { "cell_type": "code", "execution_count": 24, "id": "f33ea719-2f53-465f-8be7-213e8071fb0b", "metadata": {}, "outputs": [], "source": [ "a = np.random.randint(5, size=(10000, 2)) # 5 different \"groups\"" ] }, { "cell_type": "code", "execution_count": 25, "id": "915fa73a-381c-42ce-a738-f5a64c8cce84", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.11 ms ± 67.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%%timeit \n", "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "df.groupby(\"key\").val.apply(pd.Series.tolist)" ] }, { "cell_type": "code", "execution_count": 26, "id": "4162f778-6e73-4281-925a-e2ceb183ccea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "211 µs ± 9.29 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%%timeit \n", "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "df.groupby(\"key\")" ] }, { "cell_type": "markdown", "id": "e6aa6369-3713-4c19-ae33-6a4c724cbc3e", "metadata": {}, "source": [ "The `apply` and `tolist` operation is quite expensive. Removing this operation,\n", "sorting plus grouping operation in numpy is slower than pandas.\n", "\n", "* sorting: 298 µs\n", "* grouping: 53 µs\n", "* sorting + grouping: 351 µs\n", "* pandas only grouping: 213 µs\n", "\n", "Using the groupby object of Pandas, you can access to a given group using the `get_group`\n", "method which is fast." ] }, { "cell_type": "code", "execution_count": 27, "id": "f13972ae-92e0-41af-bc6e-5d21850ed3bb", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "groupby = df.groupby(\"key\")" ] }, { "cell_type": "code", "execution_count": 28, "id": "2e498b44-4a5c-4a70-bef2-19db538e8555", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "99.7 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], "source": [ "%timeit groupby.get_group(4)" ] }, { "cell_type": "markdown", "id": "43a47781-b918-4506-9e31-da5e7152f787", "metadata": {}, "source": [ "The `get_group` is quite slow, but in the case of only 5 groups, you call it\n", "at most 5 times." ] }, { "cell_type": "markdown", "id": "368a96e1-c162-46d2-b2b7-b16d603cf95d", "metadata": {}, "source": [ "### 500 groups" ] }, { "cell_type": "code", "execution_count": 30, "id": "1dd3ffe0-0ee4-41d0-8689-ab0b58ce25cd", "metadata": {}, "outputs": [], "source": [ "a = np.random.randint(500, size=(10000, 2)) # 500 different \"groups\"" ] }, { "cell_type": "code", "execution_count": 31, "id": "07c484a5-c880-44dd-be15-4b337400917e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "544 µs ± 3.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%timeit a[a[:, 0].argsort()]" ] }, { "cell_type": "code", "execution_count": 32, "id": "2b25db57-21d2-4277-b27d-77bb57e9a57c", "metadata": {}, "outputs": [], "source": [ "a = a[a[:, 0].argsort()]" ] }, { "cell_type": "code", "execution_count": 33, "id": "3d3e5daa-5b14-4352-8e3a-644ed1a11ef5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "667 µs ± 2.33 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])" ] }, { "cell_type": "code", "execution_count": 34, "id": "82b328f7-c6c4-4692-9628-88d92ea26b9b", "metadata": {}, "outputs": [], "source": [ "a = np.random.randint(500, size=(10000, 2)) # 5 different \"groups\"" ] }, { "cell_type": "code", "execution_count": 35, "id": "05dd78a5-8425-4868-b91b-20b5873cdd8c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7.79 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit \n", "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "df.groupby(\"key\").val.apply(pd.Series.tolist)" ] }, { "cell_type": "code", "execution_count": 36, "id": "8a7af8c0-eae5-4bf0-a439-24bbe419f6ea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "225 µs ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%%timeit \n", "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "df.groupby(\"key\")" ] }, { "cell_type": "code", "execution_count": 37, "id": "d3ab391b-a996-4263-b40d-d20fd6dafd55", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "groupby = df.groupby(\"key\")" ] }, { "cell_type": "code", "execution_count": 38, "id": "fa3091ff-82d4-4635-ad4a-b306c8eeb947", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "75.3 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], "source": [ "%timeit groupby.get_group(4)" ] }, { "cell_type": "markdown", "id": "f9d8a273-8cb3-44c4-a136-020f8e9f3f80", "metadata": {}, "source": [ "The `get_group` is quite slow, but in the case of only 5 groups, you call it\n", "at most 5 times." ] }, { "cell_type": "markdown", "id": "efeaa958-1160-47e1-9401-ee363d2e1324", "metadata": {}, "source": [ "### Increasing N" ] }, { "cell_type": "code", "execution_count": 39, "id": "ca24ee47-b399-47f2-b222-40df9e22b971", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1000000\n" ] } ], "source": [ "print(N)" ] }, { "cell_type": "code", "execution_count": 42, "id": "0a26dc65-a734-4c96-8d73-95e6b7b0346d", "metadata": {}, "outputs": [], "source": [ "a = np.random.randint(500, size=(N, 2)) # 5àà different \"groups\"" ] }, { "cell_type": "code", "execution_count": 43, "id": "e675fe80-3941-4cae-aec3-35c1764051bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "109 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%timeit a[a[:, 0].argsort()]" ] }, { "cell_type": "code", "execution_count": 44, "id": "9dcd596d-db3b-4291-aab3-34e7bc4947d6", "metadata": {}, "outputs": [], "source": [ "a = a[a[:, 0].argsort()]" ] }, { "cell_type": "code", "execution_count": 45, "id": "0e6e6bcc-def8-4469-9210-e8c97efb54c9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8.22 ms ± 98.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])" ] }, { "cell_type": "code", "execution_count": 46, "id": "a948713c-ac3a-4baf-92ac-e8bfff436280", "metadata": {}, "outputs": [], "source": [ "a = np.random.randint(500, size=(N, 2)) # 5 different \"groups\"" ] }, { "cell_type": "code", "execution_count": 47, "id": "5c3ce16b-cfaf-4ec1-90b3-fdc982b49c69", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "128 ms ± 4.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit \n", "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "df.groupby(\"key\").val.apply(pd.Series.tolist)" ] }, { "cell_type": "code", "execution_count": 48, "id": "e5ef38f5-6a74-4c8f-bad0-8ad38d424022", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "210 µs ± 5.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" ] } ], "source": [ "%%timeit \n", "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "df.groupby(\"key\")" ] }, { "cell_type": "code", "execution_count": 49, "id": "a8b08873-9caf-4675-b487-a49ddc90f6bd", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n", "groupby = df.groupby(\"key\")" ] }, { "cell_type": "code", "execution_count": 52, "id": "5b655920-58db-4a8d-8fc4-5bfaaa8f62ea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "114 µs ± 670 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" ] } ], "source": [ "%timeit groupby.get_group(40)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }