{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas parallelization" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:03:16.321164Z", "iopub.status.busy": "2020-08-17T17:03:16.320901Z", "iopub.status.idle": "2020-08-17T17:03:17.360884Z", "shell.execute_reply": "2020-08-17T17:03:17.360077Z", "shell.execute_reply.started": "2020-08-17T17:03:16.321124Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import swifter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-17T16:25:42.218487Z", "iopub.status.busy": "2020-08-17T16:25:42.218215Z", "iopub.status.idle": "2020-08-17T16:25:42.224742Z", "shell.execute_reply": "2020-08-17T16:25:42.222762Z", "shell.execute_reply.started": "2020-08-17T16:25:42.218445Z" } }, "source": [ "Shape: `(100,000 x 10,000)`" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:03:17.362349Z", "iopub.status.busy": "2020-08-17T17:03:17.362169Z", "iopub.status.idle": "2020-08-17T17:03:20.693553Z", "shell.execute_reply": "2020-08-17T17:03:20.692569Z", "shell.execute_reply.started": "2020-08-17T17:03:17.362321Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...9990999199929993999499959996999799989999
00123456789...9990999199929993999499959996999799989999
110000100011000210003100041000510006100071000810009...19990199911999219993199941999519996199971999819999
220000200012000220003200042000520006200072000820009...29990299912999229993299942999529996299972999829999
330000300013000230003300043000530006300073000830009...39990399913999239993399943999539996399973999839999
440000400014000240003400044000540006400074000840009...49990499914999249993499944999549996499974999849999
\n", "

5 rows × 10000 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 ... \\\n", "0 0 1 2 3 4 5 6 7 8 9 ... \n", "1 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 ... \n", "2 20000 20001 20002 20003 20004 20005 20006 20007 20008 20009 ... \n", "3 30000 30001 30002 30003 30004 30005 30006 30007 30008 30009 ... \n", "4 40000 40001 40002 40003 40004 40005 40006 40007 40008 40009 ... \n", "\n", " 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 \n", "0 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 \n", "1 19990 19991 19992 19993 19994 19995 19996 19997 19998 19999 \n", "2 29990 29991 29992 29993 29994 29995 29996 29997 29998 29999 \n", "3 39990 39991 39992 39993 39994 39995 39996 39997 39998 39999 \n", "4 49990 49991 49992 49993 49994 49995 49996 49997 49998 49999 \n", "\n", "[5 rows x 10000 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.arange(10**9).reshape(10**5, 10**4))\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance test" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:03:20.694945Z", "iopub.status.busy": "2020-08-17T17:03:20.694721Z", "iopub.status.idle": "2020-08-17T17:04:27.208520Z", "shell.execute_reply": "2020-08-17T17:04:27.206714Z", "shell.execute_reply.started": "2020-08-17T17:03:20.694910Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1min 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" ] } ], "source": [ "%timeit -n1 -r1 df.apply(np.mean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`Swifter` - a package which efficiently applies any function to a pandas dataframe or series in the fastest available manner. 
Reference: https://github.com/jmcarpenter2/swifter" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2020-08-17T17:04:27.211567Z", "iopub.status.busy": "2020-08-17T17:04:27.211248Z", "iopub.status.idle": "2020-08-17T17:04:34.868285Z", "shell.execute_reply": "2020-08-17T17:04:34.867524Z", "shell.execute_reply.started": "2020-08-17T17:04:27.211518Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7.65 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" ] } ], "source": [ "%timeit -n1 -r1 df.swifter.apply(np.mean)" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-17T16:31:41.792674Z", "iopub.status.busy": "2020-08-17T16:31:41.792382Z", "iopub.status.idle": "2020-08-17T16:31:41.799717Z", "shell.execute_reply": "2020-08-17T16:31:41.797948Z", "shell.execute_reply.started": "2020-08-17T16:31:41.792630Z" } }, "source": [ "For this specific case, `swifter` gives an `over 8x` speed improvement (1min 6s with plain `apply` vs 7.65 s with `swifter`). We time only 1 loop and 1 run because, when the same function is run more than once, `swifter` further optimizes the performance and becomes even faster - however, repeating the identical call is a very unlikely use case." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }