{
"cells": [
{
"cell_type": "markdown",
"id": "59a21b23-d08f-4203-ae87-b51474a0bbaf",
"metadata": {},
"source": [
"# GroupBy operation\n",
"\n",
"#### Germain Salvat Vallverdu\n",
"\n",
"This notebook compare the GroupBy efficiency using either a pandas data frame or\n",
"the combination of numpy split and unique functions."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e559702e-9fe1-4d8e-a0fa-168a9877597b",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8aee5ccb-c5ae-4155-938c-31e7fbad027f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"numpy 1.21.2\n",
"pandas 1.4.1\n"
]
}
],
"source": [
"print(\"numpy\", np.__version__)\n",
"print(\"pandas\", pd.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d8700930-5414-4451-b4c3-200a59838ea8",
"metadata": {},
"outputs": [],
"source": [
"# number of integers\n",
"N = 1_000_000"
]
},
{
"cell_type": "markdown",
"id": "09d4ab35-6996-4fa7-a4c0-68f6aa9d0f7c",
"metadata": {},
"source": [
"## Case 1: a small number of groups\n",
"\n",
"10 groups over N rows."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b5a412a9-fd03-4187-a8df-03ee2f89af02",
"metadata": {},
"outputs": [],
"source": [
"group_min, group_max = 100, 110\n",
"group_values = np.random.randint(group_min, group_max, N)\n",
"compo = np.random.randint(1, 100, 4 * N).reshape(N, 4)\n",
"values = np.concatenate((compo, group_values[:, np.newaxis]), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e8af07d7-475d-410d-b9cd-22fa41597254",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" M | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 46 | \n",
" 2 | \n",
" 91 | \n",
" 90 | \n",
" 109 | \n",
"
\n",
" \n",
" 1 | \n",
" 28 | \n",
" 19 | \n",
" 25 | \n",
" 70 | \n",
" 104 | \n",
"
\n",
" \n",
" 2 | \n",
" 49 | \n",
" 14 | \n",
" 84 | \n",
" 85 | \n",
" 104 | \n",
"
\n",
" \n",
" 3 | \n",
" 66 | \n",
" 27 | \n",
" 61 | \n",
" 30 | \n",
" 102 | \n",
"
\n",
" \n",
" 4 | \n",
" 59 | \n",
" 69 | \n",
" 49 | \n",
" 24 | \n",
" 100 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D M\n",
"0 46 2 91 90 109\n",
"1 28 19 25 70 104\n",
"2 49 14 84 85 104\n",
"3 66 27 61 30 102\n",
"4 59 69 49 24 100"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bca5406c-e6d3-42fd-a76e-b7bb4b753497",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"219 µs ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
"groups = df.groupby(\"M\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "295da442-f314-48d0-a437-3edc4a391cf4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"21.2 ms ± 365 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"sorted_M = np.sort(group_values)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f23602a1-9c47-47c7-b67f-39ce74ca3136",
"metadata": {},
"outputs": [],
"source": [
"sorted_M = np.sort(group_values)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1df6fabf-c0b2-4fdd-88c7-4f09bfa25762",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.33 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"# use already sorted\n",
"keys, sections = np.unique(sorted_M, return_index=True)\n",
"group_list = np.split(\n",
" sorted_M, \n",
" sections[1:]\n",
")\n",
"groups = {keys[i]: group_list[i] for i in range(len(keys))}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9a15e95d-c721-4f76-bed5-5b1ce3d1a01f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
" 121 function calls (109 primitive calls) in 0.034 seconds\n",
"\n",
" Ordered by: cumulative time\n",
" List reduced from 33 to 20 due to restriction <20>\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 1 0.000 0.000 0.034 0.034 {built-in method builtins.exec}\n",
" 1 0.001 0.001 0.034 0.034 :1()\n",
" 15/3 0.000 0.000 0.033 0.011 {built-in method numpy.core._multiarray_umath.implement_array_function}\n",
" 1 0.000 0.000 0.024 0.024 <__array_function__ internals>:2(sort)\n",
" 1 0.000 0.000 0.024 0.024 fromnumeric.py:846(sort)\n",
" 1 0.023 0.023 0.023 0.023 {method 'sort' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.009 0.009 <__array_function__ internals>:2(unique)\n",
" 1 0.002 0.002 0.009 0.009 arraysetops.py:138(unique)\n",
" 1 0.003 0.003 0.007 0.007 arraysetops.py:320(_unique1d)\n",
" 1 0.002 0.002 0.002 0.002 {method 'argsort' of 'numpy.ndarray' objects}\n",
" 1 0.002 0.002 0.002 0.002 {method 'copy' of 'numpy.ndarray' objects}\n",
" 1 0.001 0.001 0.001 0.001 {method 'flatten' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(split)\n",
" 1 0.000 0.000 0.000 0.000 shape_base.py:799(split)\n",
" 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(array_split)\n",
" 1 0.000 0.000 0.000 0.000 shape_base.py:739(array_split)\n",
" 11 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(swapaxes)\n",
" 11 0.000 0.000 0.000 0.000 fromnumeric.py:550(swapaxes)\n",
" 11 0.000 0.000 0.000 0.000 fromnumeric.py:51(_wrapfunc)\n",
" 1 0.000 0.000 0.000 0.000 arraysetops.py:125(_unpack_tuple)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%prun -s cumulative -l 20\n",
"sorted_M = np.sort(group_values)\n",
"keys, sections = np.unique(sorted_M, return_index=True)\n",
"group_list = np.split(\n",
" sorted_M, \n",
" sections[1:]\n",
")\n",
"groups = {keys[i]: group_list[i] for i in range(len(keys))}"
]
},
{
"cell_type": "markdown",
"id": "fc6ec9f6-d22d-4a23-9618-83e0f6f2d117",
"metadata": {},
"source": [
"Above, it is clear that `sort` is the most time consuming part."
]
},
{
"cell_type": "markdown",
"id": "4a3f964c-d4bf-4358-9b9d-fba234acb983",
"metadata": {},
"source": [
"## Case 2: a large number of groups\n",
"\n",
"1000 groups over N rows."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "96d85df7-90b1-4bc4-ada0-0289240cb31d",
"metadata": {},
"outputs": [],
"source": [
"group_min, group_max = 100, 1200\n",
"group_values = np.random.randint(group_min, group_max, N)\n",
"compo = np.random.randint(1, 100, 4 * N).reshape(N, 4)\n",
"values = np.concatenate((compo, group_values[:, np.newaxis]), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "059940fa-390e-40f8-b1f2-82caa35e115c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" D | \n",
" M | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 19 | \n",
" 29 | \n",
" 91 | \n",
" 63 | \n",
" 193 | \n",
"
\n",
" \n",
" 1 | \n",
" 73 | \n",
" 26 | \n",
" 72 | \n",
" 78 | \n",
" 300 | \n",
"
\n",
" \n",
" 2 | \n",
" 29 | \n",
" 25 | \n",
" 43 | \n",
" 53 | \n",
" 282 | \n",
"
\n",
" \n",
" 3 | \n",
" 24 | \n",
" 13 | \n",
" 95 | \n",
" 8 | \n",
" 1118 | \n",
"
\n",
" \n",
" 4 | \n",
" 54 | \n",
" 37 | \n",
" 75 | \n",
" 11 | \n",
" 464 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C D M\n",
"0 19 29 91 63 193\n",
"1 73 26 72 78 300\n",
"2 29 25 43 53 282\n",
"3 24 13 95 8 1118\n",
"4 54 37 75 11 464"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "578aebd1-8281-4889-96d1-d1e42b3f234c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"210 µs ± 6.12 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
"groups = df.groupby(\"M\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "7c785ca6-6642-45b0-93f2-f050e73e3c37",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"40.9 ms ± 946 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"sorted_M = np.sort(group_values)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "feb92c5f-9a94-44b2-9c3a-e9924ad30581",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"70.3 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"group_values[group_values.argsort()]"
]
},
{
"cell_type": "markdown",
"id": "07be9d5a-e904-4327-8372-d385ed0f9f58",
"metadata": {},
"source": [
"Here, in the case of 1D array, `np.sort` is more efficient than `argsort()`."
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "c427854d-bf54-474f-8c45-5953d0178265",
"metadata": {},
"outputs": [],
"source": [
"sorted_M = np.sort(group_values)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2f335f70-888d-4d6e-b1c9-63a88722203e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9.61 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"# run here using already sorted\n",
"keys, sections = np.unique(sorted_M, return_index=True)\n",
"group_list = np.split(\n",
" sorted_M, \n",
" sections[1:]\n",
")\n",
"groups = {keys[i]: group_list[i] for i in range(len(keys))}"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "eeec3538-f545-4930-91ff-af60020d4e2e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" "
]
},
{
"data": {
"text/plain": [
" 8841 function calls (7739 primitive calls) in 0.063 seconds\n",
"\n",
" Ordered by: cumulative time\n",
" List reduced from 33 to 20 due to restriction <20>\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 1 0.000 0.000 0.063 0.063 {built-in method builtins.exec}\n",
" 1 0.001 0.001 0.063 0.063 :1()\n",
" 1105/3 0.000 0.000 0.061 0.020 {built-in method numpy.core._multiarray_umath.implement_array_function}\n",
" 1 0.000 0.000 0.046 0.046 <__array_function__ internals>:2(sort)\n",
" 1 0.000 0.000 0.046 0.046 fromnumeric.py:846(sort)\n",
" 1 0.045 0.045 0.045 0.045 {method 'sort' of 'numpy.ndarray' objects}\n",
" 1 0.000 0.000 0.011 0.011 <__array_function__ internals>:2(unique)\n",
" 1 0.003 0.003 0.011 0.011 arraysetops.py:138(unique)\n",
" 1 0.004 0.004 0.008 0.008 arraysetops.py:320(_unique1d)\n",
" 1 0.000 0.000 0.004 0.004 <__array_function__ internals>:2(split)\n",
" 1 0.000 0.000 0.004 0.004 shape_base.py:799(split)\n",
" 1 0.000 0.000 0.004 0.004 <__array_function__ internals>:2(array_split)\n",
" 1 0.001 0.001 0.004 0.004 shape_base.py:739(array_split)\n",
" 1 0.003 0.003 0.003 0.003 {method 'argsort' of 'numpy.ndarray' objects}\n",
" 1101 0.001 0.000 0.003 0.000 <__array_function__ internals>:2(swapaxes)\n",
" 1101 0.000 0.000 0.002 0.000 fromnumeric.py:550(swapaxes)\n",
" 1 0.001 0.001 0.001 0.001 {method 'flatten' of 'numpy.ndarray' objects}\n",
" 1 0.001 0.001 0.001 0.001 {method 'copy' of 'numpy.ndarray' objects}\n",
" 1101 0.001 0.000 0.001 0.000 fromnumeric.py:51(_wrapfunc)\n",
" 1101 0.000 0.000 0.000 0.000 {method 'swapaxes' of 'numpy.ndarray' objects}"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%prun -s cumulative -l 20\n",
"sorted_M = np.sort(group_values)\n",
"keys, sections = np.unique(sorted_M, return_index=True)\n",
"group_list = np.split(\n",
" sorted_M, \n",
" sections[1:]\n",
")\n",
"groups = {keys[i]: group_list[i] for i in range(len(keys))}"
]
},
{
"cell_type": "markdown",
"id": "2fded82d-a24a-45f1-8089-4f82f32463ec",
"metadata": {},
"source": [
"## Use StackOverflow example\n",
"\n",
"https://stackoverflow.com/questions/38013778/\n",
"\n",
"```python\n",
">>> a = np.random.randint(5, size=(10000, 2)) # 5 different \"groups\"\n",
"\n",
"# Only the sort\n",
">>> a = a[a[:, 0].argsort()]\n",
"⏱ 116.9 ms\n",
"\n",
"# Group by on the already sorted table\n",
">>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])\n",
"⏱ 35.5 ms\n",
"\n",
"# Total sort + groupby\n",
">>> a = a[a[:, 0].argsort()]\n",
">>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])\n",
"⏱ 153.0 ms 👑\n",
"\n",
"# With pandas (cf Piotr answer)\n",
">>> df = pd.DataFrame(a, columns=[\"key\", \"val\"]) # no timer for this line\n",
">>> df.groupby(\"key\").val.apply(pd.Series.tolist) \n",
"⏱ 362.3 ms\n",
"```\n",
"\n",
"### 5 groups"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "e52f2d2a-aab2-4094-9055-a2657248fd07",
"metadata": {},
"outputs": [],
"source": [
"a = np.random.randint(5, size=(10000, 2)) # 5 different \"groups\""
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "cfe54057-a965-496c-8702-65e7c6097edd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"298 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit a[a[:, 0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3f7b825d-51f9-4ea5-b17a-199a4d90645f",
"metadata": {},
"outputs": [],
"source": [
"a = a[a[:, 0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "63cf27ef-62fc-476b-aa14-28738aa0e9c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"54.2 µs ± 570 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "f33ea719-2f53-465f-8be7-213e8071fb0b",
"metadata": {},
"outputs": [],
"source": [
"a = np.random.randint(5, size=(10000, 2)) # 5 different \"groups\""
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "915fa73a-381c-42ce-a738-f5a64c8cce84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.11 ms ± 67.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%%timeit \n",
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"df.groupby(\"key\").val.apply(pd.Series.tolist)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "4162f778-6e73-4281-925a-e2ceb183ccea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"211 µs ± 9.29 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%%timeit \n",
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"df.groupby(\"key\")"
]
},
{
"cell_type": "markdown",
"id": "e6aa6369-3713-4c19-ae33-6a4c724cbc3e",
"metadata": {},
"source": [
"The `apply` and `tolist` operation is quite expensive. Removing this operation,\n",
"sorting plus grouping operation in numpy is slower than pandas.\n",
"\n",
"* sorting: 298 µs\n",
"* grouping: 53 µs\n",
"* sorting + grouping: 351 µs\n",
"* pandas only grouping: 213 µs\n",
"\n",
"Using the groupby object of Pandas, you can access to a given group using the `get_group`\n",
"method which is fast."
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "f13972ae-92e0-41af-bc6e-5d21850ed3bb",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"groupby = df.groupby(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "2e498b44-4a5c-4a70-bef2-19db538e8555",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"99.7 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%timeit groupby.get_group(4)"
]
},
{
"cell_type": "markdown",
"id": "43a47781-b918-4506-9e31-da5e7152f787",
"metadata": {},
"source": [
"The `get_group` is quite slow, but in the case of only 5 groups, you call it\n",
"at most 5 times."
]
},
{
"cell_type": "markdown",
"id": "368a96e1-c162-46d2-b2b7-b16d603cf95d",
"metadata": {},
"source": [
"### 500 groups"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "1dd3ffe0-0ee4-41d0-8689-ab0b58ce25cd",
"metadata": {},
"outputs": [],
"source": [
"a = np.random.randint(500, size=(10000, 2)) # 500 different \"groups\""
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "07c484a5-c880-44dd-be15-4b337400917e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"544 µs ± 3.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit a[a[:, 0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "2b25db57-21d2-4277-b27d-77bb57e9a57c",
"metadata": {},
"outputs": [],
"source": [
"a = a[a[:, 0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "3d3e5daa-5b14-4352-8e3a-644ed1a11ef5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"667 µs ± 2.33 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "82b328f7-c6c4-4692-9628-88d92ea26b9b",
"metadata": {},
"outputs": [],
"source": [
"a = np.random.randint(500, size=(10000, 2)) # 5 different \"groups\""
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "05dd78a5-8425-4868-b91b-20b5873cdd8c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7.79 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%%timeit \n",
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"df.groupby(\"key\").val.apply(pd.Series.tolist)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "8a7af8c0-eae5-4bf0-a439-24bbe419f6ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"225 µs ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%%timeit \n",
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"df.groupby(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "d3ab391b-a996-4263-b40d-d20fd6dafd55",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"groupby = df.groupby(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "fa3091ff-82d4-4635-ad4a-b306c8eeb947",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"75.3 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%timeit groupby.get_group(4)"
]
},
{
"cell_type": "markdown",
"id": "f9d8a273-8cb3-44c4-a136-020f8e9f3f80",
"metadata": {},
"source": [
"The `get_group` is quite slow, but in the case of only 5 groups, you call it\n",
"at most 5 times."
]
},
{
"cell_type": "markdown",
"id": "efeaa958-1160-47e1-9401-ee363d2e1324",
"metadata": {},
"source": [
"### Increasing N"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "ca24ee47-b399-47f2-b222-40df9e22b971",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000000\n"
]
}
],
"source": [
"print(N)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "0a26dc65-a734-4c96-8d73-95e6b7b0346d",
"metadata": {},
"outputs": [],
"source": [
"a = np.random.randint(500, size=(N, 2)) # 5àà different \"groups\""
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "e675fe80-3941-4cae-aec3-35c1764051bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"109 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%timeit a[a[:, 0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "9dcd596d-db3b-4291-aab3-34e7bc4947d6",
"metadata": {},
"outputs": [],
"source": [
"a = a[a[:, 0].argsort()]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "0e6e6bcc-def8-4469-9210-e8c97efb54c9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"8.22 ms ± 98.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
]
}
],
"source": [
"%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "a948713c-ac3a-4baf-92ac-e8bfff436280",
"metadata": {},
"outputs": [],
"source": [
"a = np.random.randint(500, size=(N, 2)) # 5 different \"groups\""
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "5c3ce16b-cfaf-4ec1-90b3-fdc982b49c69",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"128 ms ± 4.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit \n",
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"df.groupby(\"key\").val.apply(pd.Series.tolist)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "e5ef38f5-6a74-4c8f-bad0-8ad38d424022",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"210 µs ± 5.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
]
}
],
"source": [
"%%timeit \n",
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"df.groupby(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "a8b08873-9caf-4675-b487-a49ddc90f6bd",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
"groupby = df.groupby(\"key\")"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "5b655920-58db-4a8d-8fc4-5bfaaa8f62ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"114 µs ± 670 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
]
}
],
"source": [
"%timeit groupby.get_group(40)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}