{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas parallelization" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:03:16.321164Z", "iopub.status.busy": "2020-08-17T17:03:16.320901Z", "iopub.status.idle": "2020-08-17T17:03:17.360884Z", "shell.execute_reply": "2020-08-17T17:03:17.360077Z", "shell.execute_reply.started": "2020-08-17T17:03:16.321124Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import swifter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-17T16:25:42.218487Z", "iopub.status.busy": "2020-08-17T16:25:42.218215Z", "iopub.status.idle": "2020-08-17T16:25:42.224742Z", "shell.execute_reply": "2020-08-17T16:25:42.222762Z", "shell.execute_reply.started": "2020-08-17T16:25:42.218445Z" } }, "source": [ "Shape: `(100.000 x 10.000)`" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:03:17.362349Z", "iopub.status.busy": "2020-08-17T17:03:17.362169Z", "iopub.status.idle": "2020-08-17T17:03:20.693553Z", "shell.execute_reply": "2020-08-17T17:03:20.692569Z", "shell.execute_reply.started": "2020-08-17T17:03:17.362321Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " <th>5</th>\n", " <th>6</th>\n", " <th>7</th>\n", " <th>8</th>\n", " <th>9</th>\n", " <th>...</th>\n", " <th>9990</th>\n", " 
<th>9991</th>\n", " <th>9992</th>\n", " <th>9993</th>\n", " <th>9994</th>\n", " <th>9995</th>\n", " <th>9996</th>\n", " <th>9997</th>\n", " <th>9998</th>\n", " <th>9999</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>...</td>\n", " <td>9990</td>\n", " <td>9991</td>\n", " <td>9992</td>\n", " <td>9993</td>\n", " <td>9994</td>\n", " <td>9995</td>\n", " <td>9996</td>\n", " <td>9997</td>\n", " <td>9998</td>\n", " <td>9999</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>10000</td>\n", " <td>10001</td>\n", " <td>10002</td>\n", " <td>10003</td>\n", " <td>10004</td>\n", " <td>10005</td>\n", " <td>10006</td>\n", " <td>10007</td>\n", " <td>10008</td>\n", " <td>10009</td>\n", " <td>...</td>\n", " <td>19990</td>\n", " <td>19991</td>\n", " <td>19992</td>\n", " <td>19993</td>\n", " <td>19994</td>\n", " <td>19995</td>\n", " <td>19996</td>\n", " <td>19997</td>\n", " <td>19998</td>\n", " <td>19999</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>20000</td>\n", " <td>20001</td>\n", " <td>20002</td>\n", " <td>20003</td>\n", " <td>20004</td>\n", " <td>20005</td>\n", " <td>20006</td>\n", " <td>20007</td>\n", " <td>20008</td>\n", " <td>20009</td>\n", " <td>...</td>\n", " <td>29990</td>\n", " <td>29991</td>\n", " <td>29992</td>\n", " <td>29993</td>\n", " <td>29994</td>\n", " <td>29995</td>\n", " <td>29996</td>\n", " <td>29997</td>\n", " <td>29998</td>\n", " <td>29999</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>30000</td>\n", " <td>30001</td>\n", " <td>30002</td>\n", " <td>30003</td>\n", " <td>30004</td>\n", " <td>30005</td>\n", " <td>30006</td>\n", " <td>30007</td>\n", " <td>30008</td>\n", " <td>30009</td>\n", " <td>...</td>\n", " <td>39990</td>\n", " <td>39991</td>\n", " <td>39992</td>\n", " <td>39993</td>\n", " <td>39994</td>\n", " <td>39995</td>\n", " 
<td>39996</td>\n", " <td>39997</td>\n", " <td>39998</td>\n", " <td>39999</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>40000</td>\n", " <td>40001</td>\n", " <td>40002</td>\n", " <td>40003</td>\n", " <td>40004</td>\n", " <td>40005</td>\n", " <td>40006</td>\n", " <td>40007</td>\n", " <td>40008</td>\n", " <td>40009</td>\n", " <td>...</td>\n", " <td>49990</td>\n", " <td>49991</td>\n", " <td>49992</td>\n", " <td>49993</td>\n", " <td>49994</td>\n", " <td>49995</td>\n", " <td>49996</td>\n", " <td>49997</td>\n", " <td>49998</td>\n", " <td>49999</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>5 rows × 10000 columns</p>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... \\\n", "0 0 1 2 3 4 5 6 7 8 9 ... \n", "1 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 ... \n", "2 20000 20001 20002 20003 20004 20005 20006 20007 20008 20009 ... \n", "3 30000 30001 30002 30003 30004 30005 30006 30007 30008 30009 ... \n", "4 40000 40001 40002 40003 40004 40005 40006 40007 40008 40009 ... 
\n", "\n", " 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 \n", "0 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 \n", "1 19990 19991 19992 19993 19994 19995 19996 19997 19998 19999 \n", "2 29990 29991 29992 29993 29994 29995 29996 29997 29998 29999 \n", "3 39990 39991 39992 39993 39994 39995 39996 39997 39998 39999 \n", "4 49990 49991 49992 49993 49994 49995 49996 49997 49998 49999 \n", "\n", "[5 rows x 10000 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.arange(10**9).reshape(10**5, 10**4))\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance test" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:03:20.694945Z", "iopub.status.busy": "2020-08-17T17:03:20.694721Z", "iopub.status.idle": "2020-08-17T17:04:27.208520Z", "shell.execute_reply": "2020-08-17T17:04:27.206714Z", "shell.execute_reply.started": "2020-08-17T17:03:20.694910Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1min 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" ] } ], "source": [ "%timeit -n1 -r1 df.apply(np.mean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`Swifter` - a package which efficiently applies any function to a pandas dataframe or series in the fastest available manner. Reference: https://github.com/jmcarpenter2/swifter" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:04:27.211567Z", "iopub.status.busy": "2020-08-17T17:04:27.211248Z", "iopub.status.idle": "2020-08-17T17:04:34.868285Z", "shell.execute_reply": "2020-08-17T17:04:34.867524Z", "shell.execute_reply.started": "2020-08-17T17:04:27.211518Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7.65 s ± 0 ns per loop (mean ± std. dev. 
of 1 run, 1 loop each)\n" ] } ], "source": [ "%timeit -n1 -r1 df.swifter.apply(np.mean)" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-17T16:31:41.792674Z", "iopub.status.busy": "2020-08-17T16:31:41.792382Z", "iopub.status.idle": "2020-08-17T16:31:41.799717Z", "shell.execute_reply": "2020-08-17T16:31:41.797948Z", "shell.execute_reply.started": "2020-08-17T16:31:41.792630Z" } }, "source": [ "For this specific case, `swifter` gives a speed improvement of `over 8x`. Here we use only 1 loop and 1 run because, when the same function is run more than once, `swifter` further optimizes its performance and becomes even faster; however, applying the same function repeatedly is a very unlikely use case." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }