{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.088764Z", "iopub.status.busy": "2020-08-11T15:51:56.088469Z", "iopub.status.idle": "2020-08-11T15:51:56.837524Z", "shell.execute_reply": "2020-08-11T15:51:56.836477Z", "shell.execute_reply.started": "2020-08-11T15:51:56.088716Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data & column memory statistics" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.839417Z", "iopub.status.busy": "2020-08-11T15:51:56.839203Z", "iopub.status.idle": "2020-08-11T15:51:56.888382Z", "shell.execute_reply": "2020-08-11T15:51:56.887185Z", "shell.execute_reply.started": "2020-08-11T15:51:56.839382Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>sepal_length</th>\n", " <th>sepal_width</th>\n", " <th>petal_length</th>\n", " <th>petal_width</th>\n", " <th>species</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <td>0</td>\n", " <td>5.1</td>\n", " <td>3.5</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>setosa</td>\n", " </tr>\n", " <tr>\n", " <td>1</td>\n", " <td>4.9</td>\n", " <td>3.0</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>setosa</td>\n", " </tr>\n", " <tr>\n", " <td>2</td>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.3</td>\n", " <td>0.2</td>\n", " <td>setosa</td>\n", " </tr>\n", " <tr>\n", " <td>3</td>\n", " <td>4.6</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>setosa</td>\n", " </tr>\n", " <tr>\n", " <td>4</td>\n", " <td>5.0</td>\n", " <td>3.6</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>setosa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris = sns.load_dataset('iris')\n", "iris_init = iris.copy() # will be used to get the reduction performance\n", "iris.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.889851Z", "iopub.status.busy": "2020-08-11T15:51:56.889567Z", "iopub.status.idle": "2020-08-11T15:51:56.896621Z", "shell.execute_reply": "2020-08-11T15:51:56.895716Z", "shell.execute_reply.started": "2020-08-11T15:51:56.889803Z" } }, "outputs": [ { "data": { "text/plain": [ "sepal_length float64\n", "sepal_width float64\n", "petal_length float64\n", "petal_width float64\n", "species object\n", "dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.dtypes" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.898063Z", "iopub.status.busy": "2020-08-11T15:51:56.897839Z", "iopub.status.idle": "2020-08-11T15:51:56.906122Z", "shell.execute_reply": "2020-08-11T15:51:56.905172Z", "shell.execute_reply.started": "2020-08-11T15:51:56.898026Z" } }, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "sepal_length 1200\n", "sepal_width 1200\n", "petal_length 1200\n", "petal_width 1200\n", "species 9800\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.memory_usage(deep=True)" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:22:46.192393Z", "iopub.status.busy": "2020-08-11T15:22:46.192141Z", "iopub.status.idle": "2020-08-11T15:22:46.196169Z", "shell.execute_reply": "2020-08-11T15:22:46.195015Z", "shell.execute_reply.started": "2020-08-11T15:22:46.192354Z" } }, "source": [ "### Applying optimizations" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:22:57.876536Z", "iopub.status.busy": "2020-08-11T15:22:57.876286Z", "iopub.status.idle": "2020-08-11T15:22:57.880191Z", "shell.execute_reply": "2020-08-11T15:22:57.879111Z", "shell.execute_reply.started": "2020-08-11T15:22:57.876496Z" } }, "source": [ "#### Casting categorical columns as `category`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Usually gives biggest memory savings. Instead of storing whole strings/objects `pandas` tokenizes them and only stores indices which significantly reduces the memory usage." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.908888Z", "iopub.status.busy": "2020-08-11T15:51:56.908622Z", "iopub.status.idle": "2020-08-11T15:51:56.914680Z", "shell.execute_reply": "2020-08-11T15:51:56.913797Z", "shell.execute_reply.started": "2020-08-11T15:51:56.908847Z" } }, "outputs": [], "source": [ "iris.species = iris.species.astype(\"category\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.916242Z", "iopub.status.busy": "2020-08-11T15:51:56.916011Z", "iopub.status.idle": "2020-08-11T15:51:56.923110Z", "shell.execute_reply": "2020-08-11T15:51:56.921794Z", "shell.execute_reply.started": "2020-08-11T15:51:56.916205Z" } }, "outputs": [ { "data": { "text/plain": [ "'Column memory usage is reduced by 94.4%'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"Column memory usage is reduced by {:.1%}\".format(1 - iris.species.memory_usage(deep=True) / iris_init.species.memory_usage(deep=True))" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:31:05.555896Z", "iopub.status.busy": "2020-08-11T15:31:05.555631Z", "iopub.status.idle": "2020-08-11T15:31:05.561770Z", "shell.execute_reply": "2020-08-11T15:31:05.560011Z", "shell.execute_reply.started": "2020-08-11T15:31:05.555855Z" } }, "source": [ "#### Optimizing the numerical columns with `pd.to_numeric` and `downcast` argument" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:47:12.928593Z", "iopub.status.busy": "2020-08-11T15:47:12.928363Z", "iopub.status.idle": "2020-08-11T15:47:12.934125Z", "shell.execute_reply": "2020-08-11T15:47:12.932791Z", "shell.execute_reply.started": "2020-08-11T15:47:12.928556Z" } }, "source": [ "By default pandas are using `float64` numeric type which is one of the heaviest ones. Some optimization can be done here given that we do not care about such high precision in this case (and, in fact, in this dataset we have only 1 digit after comma/dot)." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.924884Z", "iopub.status.busy": "2020-08-11T15:51:56.924425Z", "iopub.status.idle": "2020-08-11T15:51:56.936372Z", "shell.execute_reply": "2020-08-11T15:51:56.935309Z", "shell.execute_reply.started": "2020-08-11T15:51:56.924827Z" } }, "outputs": [], "source": [ "columns = iris.columns.drop(\"species\")\n", "iris[columns] = iris[columns].apply(pd.to_numeric, downcast=\"float\")" ] }, { "cell_type": "markdown", "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:29:23.859559Z", "iopub.status.busy": "2020-08-11T15:29:23.859306Z", "iopub.status.idle": "2020-08-11T15:29:23.862930Z", "shell.execute_reply": "2020-08-11T15:29:23.861985Z", "shell.execute_reply.started": "2020-08-11T15:29:23.859519Z" } }, "source": [ "### Final memory usage" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.938099Z", "iopub.status.busy": "2020-08-11T15:51:56.937861Z", "iopub.status.idle": "2020-08-11T15:51:56.944942Z", "shell.execute_reply": "2020-08-11T15:51:56.944129Z", "shell.execute_reply.started": "2020-08-11T15:51:56.938062Z" } }, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "sepal_length 600\n", "sepal_width 600\n", "petal_length 600\n", "petal_width 600\n", "species 426\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2020-08-11T15:51:56.946379Z", "iopub.status.busy": "2020-08-11T15:51:56.946137Z", "iopub.status.idle": "2020-08-11T15:51:56.954586Z", "shell.execute_reply": "2020-08-11T15:51:56.953734Z", "shell.execute_reply.started": "2020-08-11T15:51:56.946341Z" } }, "outputs": [ { "data": { "text/plain": [ "'In total memory usage is reduced by 79.9%'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"In total memory usage is reduced by {:.1%}\".format(1 - iris.memory_usage(deep=True).sum() / iris_init.memory_usage(deep=True).sum())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }