{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas parallelization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-17T17:03:16.321164Z",
     "iopub.status.busy": "2020-08-17T17:03:16.320901Z",
     "iopub.status.idle": "2020-08-17T17:03:17.360884Z",
     "shell.execute_reply": "2020-08-17T17:03:17.360077Z",
     "shell.execute_reply.started": "2020-08-17T17:03:16.321124Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import swifter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-17T16:25:42.218487Z",
     "iopub.status.busy": "2020-08-17T16:25:42.218215Z",
     "iopub.status.idle": "2020-08-17T16:25:42.224742Z",
     "shell.execute_reply": "2020-08-17T16:25:42.222762Z",
     "shell.execute_reply.started": "2020-08-17T16:25:42.218445Z"
    }
   },
   "source": [
    "Shape: `(100.000 x 10.000)`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-17T17:03:17.362349Z",
     "iopub.status.busy": "2020-08-17T17:03:17.362169Z",
     "iopub.status.idle": "2020-08-17T17:03:20.693553Z",
     "shell.execute_reply": "2020-08-17T17:03:20.692569Z",
     "shell.execute_reply.started": "2020-08-17T17:03:17.362321Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>9990</th>\n",
       "      <th>9991</th>\n",
       "      <th>9992</th>\n",
       "      <th>9993</th>\n",
       "      <th>9994</th>\n",
       "      <th>9995</th>\n",
       "      <th>9996</th>\n",
       "      <th>9997</th>\n",
       "      <th>9998</th>\n",
       "      <th>9999</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>...</td>\n",
       "      <td>9990</td>\n",
       "      <td>9991</td>\n",
       "      <td>9992</td>\n",
       "      <td>9993</td>\n",
       "      <td>9994</td>\n",
       "      <td>9995</td>\n",
       "      <td>9996</td>\n",
       "      <td>9997</td>\n",
       "      <td>9998</td>\n",
       "      <td>9999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>10000</td>\n",
       "      <td>10001</td>\n",
       "      <td>10002</td>\n",
       "      <td>10003</td>\n",
       "      <td>10004</td>\n",
       "      <td>10005</td>\n",
       "      <td>10006</td>\n",
       "      <td>10007</td>\n",
       "      <td>10008</td>\n",
       "      <td>10009</td>\n",
       "      <td>...</td>\n",
       "      <td>19990</td>\n",
       "      <td>19991</td>\n",
       "      <td>19992</td>\n",
       "      <td>19993</td>\n",
       "      <td>19994</td>\n",
       "      <td>19995</td>\n",
       "      <td>19996</td>\n",
       "      <td>19997</td>\n",
       "      <td>19998</td>\n",
       "      <td>19999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>20000</td>\n",
       "      <td>20001</td>\n",
       "      <td>20002</td>\n",
       "      <td>20003</td>\n",
       "      <td>20004</td>\n",
       "      <td>20005</td>\n",
       "      <td>20006</td>\n",
       "      <td>20007</td>\n",
       "      <td>20008</td>\n",
       "      <td>20009</td>\n",
       "      <td>...</td>\n",
       "      <td>29990</td>\n",
       "      <td>29991</td>\n",
       "      <td>29992</td>\n",
       "      <td>29993</td>\n",
       "      <td>29994</td>\n",
       "      <td>29995</td>\n",
       "      <td>29996</td>\n",
       "      <td>29997</td>\n",
       "      <td>29998</td>\n",
       "      <td>29999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>30000</td>\n",
       "      <td>30001</td>\n",
       "      <td>30002</td>\n",
       "      <td>30003</td>\n",
       "      <td>30004</td>\n",
       "      <td>30005</td>\n",
       "      <td>30006</td>\n",
       "      <td>30007</td>\n",
       "      <td>30008</td>\n",
       "      <td>30009</td>\n",
       "      <td>...</td>\n",
       "      <td>39990</td>\n",
       "      <td>39991</td>\n",
       "      <td>39992</td>\n",
       "      <td>39993</td>\n",
       "      <td>39994</td>\n",
       "      <td>39995</td>\n",
       "      <td>39996</td>\n",
       "      <td>39997</td>\n",
       "      <td>39998</td>\n",
       "      <td>39999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>40000</td>\n",
       "      <td>40001</td>\n",
       "      <td>40002</td>\n",
       "      <td>40003</td>\n",
       "      <td>40004</td>\n",
       "      <td>40005</td>\n",
       "      <td>40006</td>\n",
       "      <td>40007</td>\n",
       "      <td>40008</td>\n",
       "      <td>40009</td>\n",
       "      <td>...</td>\n",
       "      <td>49990</td>\n",
       "      <td>49991</td>\n",
       "      <td>49992</td>\n",
       "      <td>49993</td>\n",
       "      <td>49994</td>\n",
       "      <td>49995</td>\n",
       "      <td>49996</td>\n",
       "      <td>49997</td>\n",
       "      <td>49998</td>\n",
       "      <td>49999</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 10000 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    0      1      2      3      4      5      6      7      8      9     ...  \\\n",
       "0      0      1      2      3      4      5      6      7      8      9  ...   \n",
       "1  10000  10001  10002  10003  10004  10005  10006  10007  10008  10009  ...   \n",
       "2  20000  20001  20002  20003  20004  20005  20006  20007  20008  20009  ...   \n",
       "3  30000  30001  30002  30003  30004  30005  30006  30007  30008  30009  ...   \n",
       "4  40000  40001  40002  40003  40004  40005  40006  40007  40008  40009  ...   \n",
       "\n",
       "    9990   9991   9992   9993   9994   9995   9996   9997   9998   9999  \n",
       "0   9990   9991   9992   9993   9994   9995   9996   9997   9998   9999  \n",
       "1  19990  19991  19992  19993  19994  19995  19996  19997  19998  19999  \n",
       "2  29990  29991  29992  29993  29994  29995  29996  29997  29998  29999  \n",
       "3  39990  39991  39992  39993  39994  39995  39996  39997  39998  39999  \n",
       "4  49990  49991  49992  49993  49994  49995  49996  49997  49998  49999  \n",
       "\n",
       "[5 rows x 10000 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(np.arange(10**9).reshape(10**5, 10**4))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Performance test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-17T17:03:20.694945Z",
     "iopub.status.busy": "2020-08-17T17:03:20.694721Z",
     "iopub.status.idle": "2020-08-17T17:04:27.208520Z",
     "shell.execute_reply": "2020-08-17T17:04:27.206714Z",
     "shell.execute_reply.started": "2020-08-17T17:03:20.694910Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1min 6s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit -n1 -r1 df.apply(np.mean)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`Swifter` - a package which efficiently applies any function to a pandas dataframe or series in the fastest available manner. Reference: https://github.com/jmcarpenter2/swifter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-17T17:04:27.211567Z",
     "iopub.status.busy": "2020-08-17T17:04:27.211248Z",
     "iopub.status.idle": "2020-08-17T17:04:34.868285Z",
     "shell.execute_reply": "2020-08-17T17:04:34.867524Z",
     "shell.execute_reply.started": "2020-08-17T17:04:27.211518Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.65 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit -n1 -r1 df.swifter.apply(np.mean)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-17T16:31:41.792674Z",
     "iopub.status.busy": "2020-08-17T16:31:41.792382Z",
     "iopub.status.idle": "2020-08-17T16:31:41.799717Z",
     "shell.execute_reply": "2020-08-17T16:31:41.797948Z",
     "shell.execute_reply.started": "2020-08-17T16:31:41.792630Z"
    }
   },
   "source": [
    "For this specific case - `over 8x` speed improvement. Here we use only 1 loop & 1 run since when running the same function more than once `swifter` further optimizes the performance and becomes even faster - however very unlikely use case."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}