{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.088764Z",
     "iopub.status.busy": "2020-08-11T15:51:56.088469Z",
     "iopub.status.idle": "2020-08-11T15:51:56.837524Z",
     "shell.execute_reply": "2020-08-11T15:51:56.836477Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.088716Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data & column memory statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.839417Z",
     "iopub.status.busy": "2020-08-11T15:51:56.839203Z",
     "iopub.status.idle": "2020-08-11T15:51:56.888382Z",
     "shell.execute_reply": "2020-08-11T15:51:56.887185Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.839382Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal_length</th>\n",
       "      <th>sepal_width</th>\n",
       "      <th>petal_length</th>\n",
       "      <th>petal_width</th>\n",
       "      <th>species</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "      <td>setosa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal_length  sepal_width  petal_length  petal_width species\n",
       "0           5.1          3.5           1.4          0.2  setosa\n",
       "1           4.9          3.0           1.4          0.2  setosa\n",
       "2           4.7          3.2           1.3          0.2  setosa\n",
       "3           4.6          3.1           1.5          0.2  setosa\n",
       "4           5.0          3.6           1.4          0.2  setosa"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris = sns.load_dataset('iris')\n",
    "iris_init = iris.copy() # will be used to get the reduction performance\n",
    "iris.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.889851Z",
     "iopub.status.busy": "2020-08-11T15:51:56.889567Z",
     "iopub.status.idle": "2020-08-11T15:51:56.896621Z",
     "shell.execute_reply": "2020-08-11T15:51:56.895716Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.889803Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sepal_length    float64\n",
       "sepal_width     float64\n",
       "petal_length    float64\n",
       "petal_width     float64\n",
       "species          object\n",
       "dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.898063Z",
     "iopub.status.busy": "2020-08-11T15:51:56.897839Z",
     "iopub.status.idle": "2020-08-11T15:51:56.906122Z",
     "shell.execute_reply": "2020-08-11T15:51:56.905172Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.898026Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index            128\n",
       "sepal_length    1200\n",
       "sepal_width     1200\n",
       "petal_length    1200\n",
       "petal_width     1200\n",
       "species         9800\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:22:46.192393Z",
     "iopub.status.busy": "2020-08-11T15:22:46.192141Z",
     "iopub.status.idle": "2020-08-11T15:22:46.196169Z",
     "shell.execute_reply": "2020-08-11T15:22:46.195015Z",
     "shell.execute_reply.started": "2020-08-11T15:22:46.192354Z"
    }
   },
   "source": [
    "### Applying optimizations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:22:57.876536Z",
     "iopub.status.busy": "2020-08-11T15:22:57.876286Z",
     "iopub.status.idle": "2020-08-11T15:22:57.880191Z",
     "shell.execute_reply": "2020-08-11T15:22:57.879111Z",
     "shell.execute_reply.started": "2020-08-11T15:22:57.876496Z"
    }
   },
   "source": [
    "#### Casting categorical columns as `category`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Usually gives biggest memory savings. Instead of storing whole strings/objects `pandas` tokenizes them and only stores indices which significantly reduces the memory usage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.908888Z",
     "iopub.status.busy": "2020-08-11T15:51:56.908622Z",
     "iopub.status.idle": "2020-08-11T15:51:56.914680Z",
     "shell.execute_reply": "2020-08-11T15:51:56.913797Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.908847Z"
    }
   },
   "outputs": [],
   "source": [
    "iris.species = iris.species.astype(\"category\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.916242Z",
     "iopub.status.busy": "2020-08-11T15:51:56.916011Z",
     "iopub.status.idle": "2020-08-11T15:51:56.923110Z",
     "shell.execute_reply": "2020-08-11T15:51:56.921794Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.916205Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Column memory usage is reduced by 94.4%'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"Column memory usage is reduced by {:.1%}\".format(1 - iris.species.memory_usage(deep=True) / iris_init.species.memory_usage(deep=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:31:05.555896Z",
     "iopub.status.busy": "2020-08-11T15:31:05.555631Z",
     "iopub.status.idle": "2020-08-11T15:31:05.561770Z",
     "shell.execute_reply": "2020-08-11T15:31:05.560011Z",
     "shell.execute_reply.started": "2020-08-11T15:31:05.555855Z"
    }
   },
   "source": [
    "#### Optimizing the numerical columns with `pd.to_numeric` and `downcast` argument"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:47:12.928593Z",
     "iopub.status.busy": "2020-08-11T15:47:12.928363Z",
     "iopub.status.idle": "2020-08-11T15:47:12.934125Z",
     "shell.execute_reply": "2020-08-11T15:47:12.932791Z",
     "shell.execute_reply.started": "2020-08-11T15:47:12.928556Z"
    }
   },
   "source": [
    "By default pandas are using `float64` numeric type which is one of the heaviest ones. Some optimization can be done here given that we do not care about such high precision in this case (and, in fact, in this dataset we have only 1 digit after comma/dot)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.924884Z",
     "iopub.status.busy": "2020-08-11T15:51:56.924425Z",
     "iopub.status.idle": "2020-08-11T15:51:56.936372Z",
     "shell.execute_reply": "2020-08-11T15:51:56.935309Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.924827Z"
    }
   },
   "outputs": [],
   "source": [
    "columns = iris.columns.drop(\"species\")\n",
    "iris[columns] = iris[columns].apply(pd.to_numeric, downcast=\"float\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:29:23.859559Z",
     "iopub.status.busy": "2020-08-11T15:29:23.859306Z",
     "iopub.status.idle": "2020-08-11T15:29:23.862930Z",
     "shell.execute_reply": "2020-08-11T15:29:23.861985Z",
     "shell.execute_reply.started": "2020-08-11T15:29:23.859519Z"
    }
   },
   "source": [
    "### Final memory usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.938099Z",
     "iopub.status.busy": "2020-08-11T15:51:56.937861Z",
     "iopub.status.idle": "2020-08-11T15:51:56.944942Z",
     "shell.execute_reply": "2020-08-11T15:51:56.944129Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.938062Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index           128\n",
       "sepal_length    600\n",
       "sepal_width     600\n",
       "petal_length    600\n",
       "petal_width     600\n",
       "species         426\n",
       "dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2020-08-11T15:51:56.946379Z",
     "iopub.status.busy": "2020-08-11T15:51:56.946137Z",
     "iopub.status.idle": "2020-08-11T15:51:56.954586Z",
     "shell.execute_reply": "2020-08-11T15:51:56.953734Z",
     "shell.execute_reply.started": "2020-08-11T15:51:56.946341Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'In total memory usage is reduced by 79.9%'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"In total memory usage is reduced by {:.1%}\".format(1 - iris.memory_usage(deep=True).sum() / iris_init.memory_usage(deep=True).sum())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}