{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "59a21b23-d08f-4203-ae87-b51474a0bbaf",
   "metadata": {},
   "source": [
    "# GroupBy operation\n",
    "\n",
    "#### Germain Salvat Vallverdu\n",
    "\n",
    "This notebook compare the GroupBy efficiency using either a pandas data frame or\n",
    "the combination of numpy split and unique functions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e559702e-9fe1-4d8e-a0fa-168a9877597b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8aee5ccb-c5ae-4155-938c-31e7fbad027f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "numpy 1.21.2\n",
      "pandas 1.4.1\n"
     ]
    }
   ],
   "source": [
    "print(\"numpy\", np.__version__)\n",
    "print(\"pandas\", pd.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d8700930-5414-4451-b4c3-200a59838ea8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of integers\n",
    "N = 1_000_000"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09d4ab35-6996-4fa7-a4c0-68f6aa9d0f7c",
   "metadata": {},
   "source": [
    "## Case 1: a small number of groups\n",
    "\n",
    "10 groups over N rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b5a412a9-fd03-4187-a8df-03ee2f89af02",
   "metadata": {},
   "outputs": [],
   "source": [
    "group_min, group_max = 100, 110\n",
    "group_values = np.random.randint(group_min, group_max, N)\n",
    "compo = np.random.randint(1, 100, 4 * N).reshape(N, 4)\n",
    "values = np.concatenate((compo, group_values[:, np.newaxis]), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e8af07d7-475d-410d-b9cd-22fa41597254",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>M</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>46</td>\n",
       "      <td>2</td>\n",
       "      <td>91</td>\n",
       "      <td>90</td>\n",
       "      <td>109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>28</td>\n",
       "      <td>19</td>\n",
       "      <td>25</td>\n",
       "      <td>70</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>49</td>\n",
       "      <td>14</td>\n",
       "      <td>84</td>\n",
       "      <td>85</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>66</td>\n",
       "      <td>27</td>\n",
       "      <td>61</td>\n",
       "      <td>30</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>59</td>\n",
       "      <td>69</td>\n",
       "      <td>49</td>\n",
       "      <td>24</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A   B   C   D    M\n",
       "0  46   2  91  90  109\n",
       "1  28  19  25  70  104\n",
       "2  49  14  84  85  104\n",
       "3  66  27  61  30  102\n",
       "4  59  69  49  24  100"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bca5406c-e6d3-42fd-a76e-b7bb4b753497",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "219 µs ± 12.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
    "groups = df.groupby(\"M\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "295da442-f314-48d0-a437-3edc4a391cf4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "21.2 ms ± 365 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "sorted_M = np.sort(group_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f23602a1-9c47-47c7-b67f-39ce74ca3136",
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_M = np.sort(group_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "1df6fabf-c0b2-4fdd-88c7-4f09bfa25762",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.33 ms ± 222 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# use already sorted\n",
    "keys, sections = np.unique(sorted_M, return_index=True)\n",
    "group_list = np.split(\n",
    "    sorted_M, \n",
    "    sections[1:]\n",
    ")\n",
    "groups = {keys[i]: group_list[i] for i in range(len(keys))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9a15e95d-c721-4f76-bed5-5b1ce3d1a01f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    },
    {
     "data": {
      "text/plain": [
       "         121 function calls (109 primitive calls) in 0.034 seconds\n",
       "\n",
       "   Ordered by: cumulative time\n",
       "   List reduced from 33 to 20 due to restriction <20>\n",
       "\n",
       "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
       "        1    0.000    0.000    0.034    0.034 {built-in method builtins.exec}\n",
       "        1    0.001    0.001    0.034    0.034 <string>:1(<module>)\n",
       "     15/3    0.000    0.000    0.033    0.011 {built-in method numpy.core._multiarray_umath.implement_array_function}\n",
       "        1    0.000    0.000    0.024    0.024 <__array_function__ internals>:2(sort)\n",
       "        1    0.000    0.000    0.024    0.024 fromnumeric.py:846(sort)\n",
       "        1    0.023    0.023    0.023    0.023 {method 'sort' of 'numpy.ndarray' objects}\n",
       "        1    0.000    0.000    0.009    0.009 <__array_function__ internals>:2(unique)\n",
       "        1    0.002    0.002    0.009    0.009 arraysetops.py:138(unique)\n",
       "        1    0.003    0.003    0.007    0.007 arraysetops.py:320(_unique1d)\n",
       "        1    0.002    0.002    0.002    0.002 {method 'argsort' of 'numpy.ndarray' objects}\n",
       "        1    0.002    0.002    0.002    0.002 {method 'copy' of 'numpy.ndarray' objects}\n",
       "        1    0.001    0.001    0.001    0.001 {method 'flatten' of 'numpy.ndarray' objects}\n",
       "        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(split)\n",
       "        1    0.000    0.000    0.000    0.000 shape_base.py:799(split)\n",
       "        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(array_split)\n",
       "        1    0.000    0.000    0.000    0.000 shape_base.py:739(array_split)\n",
       "       11    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(swapaxes)\n",
       "       11    0.000    0.000    0.000    0.000 fromnumeric.py:550(swapaxes)\n",
       "       11    0.000    0.000    0.000    0.000 fromnumeric.py:51(_wrapfunc)\n",
       "        1    0.000    0.000    0.000    0.000 arraysetops.py:125(_unpack_tuple)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%prun -s cumulative -l 20\n",
    "sorted_M = np.sort(group_values)\n",
    "keys, sections = np.unique(sorted_M, return_index=True)\n",
    "group_list = np.split(\n",
    "    sorted_M, \n",
    "    sections[1:]\n",
    ")\n",
    "groups = {keys[i]: group_list[i] for i in range(len(keys))}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc6ec9f6-d22d-4a23-9618-83e0f6f2d117",
   "metadata": {},
   "source": [
    "Above, it is clear that `sort` is the most time consuming part."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4a3f964c-d4bf-4358-9b9d-fba234acb983",
   "metadata": {},
   "source": [
    "## Case 2: a large number of groups\n",
    "\n",
    "1000 groups over N rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "96d85df7-90b1-4bc4-ada0-0289240cb31d",
   "metadata": {},
   "outputs": [],
   "source": [
    "group_min, group_max = 100, 1200\n",
    "group_values = np.random.randint(group_min, group_max, N)\n",
    "compo = np.random.randint(1, 100, 4 * N).reshape(N, 4)\n",
    "values = np.concatenate((compo, group_values[:, np.newaxis]), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "059940fa-390e-40f8-b1f2-82caa35e115c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "      <th>M</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19</td>\n",
       "      <td>29</td>\n",
       "      <td>91</td>\n",
       "      <td>63</td>\n",
       "      <td>193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>73</td>\n",
       "      <td>26</td>\n",
       "      <td>72</td>\n",
       "      <td>78</td>\n",
       "      <td>300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>29</td>\n",
       "      <td>25</td>\n",
       "      <td>43</td>\n",
       "      <td>53</td>\n",
       "      <td>282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>24</td>\n",
       "      <td>13</td>\n",
       "      <td>95</td>\n",
       "      <td>8</td>\n",
       "      <td>1118</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>54</td>\n",
       "      <td>37</td>\n",
       "      <td>75</td>\n",
       "      <td>11</td>\n",
       "      <td>464</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A   B   C   D     M\n",
       "0  19  29  91  63   193\n",
       "1  73  26  72  78   300\n",
       "2  29  25  43  53   282\n",
       "3  24  13  95   8  1118\n",
       "4  54  37  75  11   464"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "578aebd1-8281-4889-96d1-d1e42b3f234c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "210 µs ± 6.12 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "df = pd.DataFrame(values, columns=list(\"ABCDM\"))\n",
    "groups = df.groupby(\"M\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "7c785ca6-6642-45b0-93f2-f050e73e3c37",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40.9 ms ± 946 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "sorted_M = np.sort(group_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "feb92c5f-9a94-44b2-9c3a-e9924ad30581",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "70.3 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "group_values[group_values.argsort()]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07be9d5a-e904-4327-8372-d385ed0f9f58",
   "metadata": {},
   "source": [
    "Here, in the case of 1D array, `np.sort` is more efficient than `argsort()`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c427854d-bf54-474f-8c45-5953d0178265",
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted_M = np.sort(group_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2f335f70-888d-4d6e-b1c9-63a88722203e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9.61 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# run here using already sorted\n",
    "keys, sections = np.unique(sorted_M, return_index=True)\n",
    "group_list = np.split(\n",
    "    sorted_M, \n",
    "    sections[1:]\n",
    ")\n",
    "groups = {keys[i]: group_list[i] for i in range(len(keys))}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "eeec3538-f545-4930-91ff-af60020d4e2e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    },
    {
     "data": {
      "text/plain": [
       "         8841 function calls (7739 primitive calls) in 0.063 seconds\n",
       "\n",
       "   Ordered by: cumulative time\n",
       "   List reduced from 33 to 20 due to restriction <20>\n",
       "\n",
       "   ncalls  tottime  percall  cumtime  percall filename:lineno(function)\n",
       "        1    0.000    0.000    0.063    0.063 {built-in method builtins.exec}\n",
       "        1    0.001    0.001    0.063    0.063 <string>:1(<module>)\n",
       "   1105/3    0.000    0.000    0.061    0.020 {built-in method numpy.core._multiarray_umath.implement_array_function}\n",
       "        1    0.000    0.000    0.046    0.046 <__array_function__ internals>:2(sort)\n",
       "        1    0.000    0.000    0.046    0.046 fromnumeric.py:846(sort)\n",
       "        1    0.045    0.045    0.045    0.045 {method 'sort' of 'numpy.ndarray' objects}\n",
       "        1    0.000    0.000    0.011    0.011 <__array_function__ internals>:2(unique)\n",
       "        1    0.003    0.003    0.011    0.011 arraysetops.py:138(unique)\n",
       "        1    0.004    0.004    0.008    0.008 arraysetops.py:320(_unique1d)\n",
       "        1    0.000    0.000    0.004    0.004 <__array_function__ internals>:2(split)\n",
       "        1    0.000    0.000    0.004    0.004 shape_base.py:799(split)\n",
       "        1    0.000    0.000    0.004    0.004 <__array_function__ internals>:2(array_split)\n",
       "        1    0.001    0.001    0.004    0.004 shape_base.py:739(array_split)\n",
       "        1    0.003    0.003    0.003    0.003 {method 'argsort' of 'numpy.ndarray' objects}\n",
       "     1101    0.001    0.000    0.003    0.000 <__array_function__ internals>:2(swapaxes)\n",
       "     1101    0.000    0.000    0.002    0.000 fromnumeric.py:550(swapaxes)\n",
       "        1    0.001    0.001    0.001    0.001 {method 'flatten' of 'numpy.ndarray' objects}\n",
       "        1    0.001    0.001    0.001    0.001 {method 'copy' of 'numpy.ndarray' objects}\n",
       "     1101    0.001    0.000    0.001    0.000 fromnumeric.py:51(_wrapfunc)\n",
       "     1101    0.000    0.000    0.000    0.000 {method 'swapaxes' of 'numpy.ndarray' objects}"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%prun -s cumulative -l 20\n",
    "sorted_M = np.sort(group_values)\n",
    "keys, sections = np.unique(sorted_M, return_index=True)\n",
    "group_list = np.split(\n",
    "    sorted_M, \n",
    "    sections[1:]\n",
    ")\n",
    "groups = {keys[i]: group_list[i] for i in range(len(keys))}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2fded82d-a24a-45f1-8089-4f82f32463ec",
   "metadata": {},
   "source": [
    "## Use StackOverflow example\n",
    "\n",
    "https://stackoverflow.com/questions/38013778/\n",
    "\n",
    "```python\n",
    ">>> a = np.random.randint(5, size=(10000, 2))  # 5 different \"groups\"\n",
    "\n",
    "# Only the sort\n",
    ">>> a = a[a[:, 0].argsort()]\n",
    "⏱ 116.9 ms\n",
    "\n",
    "# Group by on the already sorted table\n",
    ">>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])\n",
    "⏱ 35.5 ms\n",
    "\n",
    "# Total sort + groupby\n",
    ">>> a = a[a[:, 0].argsort()]\n",
    ">>> np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])\n",
    "⏱ 153.0 ms 👑\n",
    "\n",
    "# With pandas (cf Piotr answer)\n",
    ">>> df = pd.DataFrame(a, columns=[\"key\", \"val\"]) # no timer for this line\n",
    ">>> df.groupby(\"key\").val.apply(pd.Series.tolist) \n",
    "⏱ 362.3 ms\n",
    "```\n",
    "\n",
    "### 5 groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "e52f2d2a-aab2-4094-9055-a2657248fd07",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randint(5, size=(10000, 2))  # 5 different \"groups\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "cfe54057-a965-496c-8702-65e7c6097edd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "298 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit a[a[:, 0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "3f7b825d-51f9-4ea5-b17a-199a4d90645f",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = a[a[:, 0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "63cf27ef-62fc-476b-aa14-28738aa0e9c3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "54.2 µs ± 570 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "f33ea719-2f53-465f-8be7-213e8071fb0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randint(5, size=(10000, 2))  # 5 different \"groups\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "915fa73a-381c-42ce-a738-f5a64c8cce84",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.11 ms ± 67.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "df.groupby(\"key\").val.apply(pd.Series.tolist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "4162f778-6e73-4281-925a-e2ceb183ccea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "211 µs ± 9.29 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "df.groupby(\"key\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6aa6369-3713-4c19-ae33-6a4c724cbc3e",
   "metadata": {},
   "source": [
    "The `apply` and `tolist` operation is quite expensive. Removing this operation,\n",
    "sorting plus grouping operation in numpy is slower than pandas.\n",
    "\n",
    "* sorting: 298 µs\n",
    "* grouping: 53 µs\n",
    "* sorting + grouping: 351 µs\n",
    "* pandas only grouping: 213 µs\n",
    "\n",
    "Using the groupby object of Pandas, you can access to a given group using the `get_group`\n",
    "method which is fast."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "f13972ae-92e0-41af-bc6e-5d21850ed3bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "groupby = df.groupby(\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "2e498b44-4a5c-4a70-bef2-19db538e8555",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "99.7 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit groupby.get_group(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43a47781-b918-4506-9e31-da5e7152f787",
   "metadata": {},
   "source": [
    "The `get_group` is quite slow, but in the case of only 5 groups, you call it\n",
    "at most 5 times."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "368a96e1-c162-46d2-b2b7-b16d603cf95d",
   "metadata": {},
   "source": [
    "### 500 groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "1dd3ffe0-0ee4-41d0-8689-ab0b58ce25cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randint(500, size=(10000, 2))  # 500 different \"groups\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "07c484a5-c880-44dd-be15-4b337400917e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "544 µs ± 3.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit a[a[:, 0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "2b25db57-21d2-4277-b27d-77bb57e9a57c",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = a[a[:, 0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "3d3e5daa-5b14-4352-8e3a-644ed1a11ef5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "667 µs ± 2.33 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "82b328f7-c6c4-4692-9628-88d92ea26b9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randint(500, size=(10000, 2))  # 5 different \"groups\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "05dd78a5-8425-4868-b91b-20b5873cdd8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.79 ms ± 305 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "df.groupby(\"key\").val.apply(pd.Series.tolist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "8a7af8c0-eae5-4bf0-a439-24bbe419f6ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "225 µs ± 18.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "df.groupby(\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d3ab391b-a996-4263-b40d-d20fd6dafd55",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "groupby = df.groupby(\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "fa3091ff-82d4-4635-ad4a-b306c8eeb947",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "75.3 µs ± 1.33 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit groupby.get_group(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9d8a273-8cb3-44c4-a136-020f8e9f3f80",
   "metadata": {},
   "source": [
    "The `get_group` is quite slow, but in the case of only 5 groups, you call it\n",
    "at most 5 times."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "efeaa958-1160-47e1-9401-ee363d2e1324",
   "metadata": {},
   "source": [
    "### Increasing N"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "ca24ee47-b399-47f2-b222-40df9e22b971",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000000\n"
     ]
    }
   ],
   "source": [
    "print(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "0a26dc65-a734-4c96-8d73-95e6b7b0346d",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randint(500, size=(N, 2))  # 5àà different \"groups\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "e675fe80-3941-4cae-aec3-35c1764051bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "109 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit a[a[:, 0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "9dcd596d-db3b-4291-aab3-34e7bc4947d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = a[a[:, 0].argsort()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "0e6e6bcc-def8-4469-9210-e8c97efb54c9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.22 ms ± 98.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit np.split(a[:, 1], np.unique(a[:, 0], return_index=True)[1][1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "a948713c-ac3a-4baf-92ac-e8bfff436280",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = np.random.randint(500, size=(N, 2))  # 5 different \"groups\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "5c3ce16b-cfaf-4ec1-90b3-fdc982b49c69",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "128 ms ± 4.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "df.groupby(\"key\").val.apply(pd.Series.tolist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "e5ef38f5-6a74-4c8f-bad0-8ad38d424022",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "210 µs ± 5.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "df.groupby(\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "a8b08873-9caf-4675-b487-a49ddc90f6bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(a, columns=[\"key\", \"val\"])\n",
    "groupby = df.groupby(\"key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "5b655920-58db-4a8d-8fc4-5bfaaa8f62ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "114 µs ± 670 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%timeit groupby.get_group(40)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}