{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# code for loading the format for the notebook\n", "import os\n", "\n", "# path : store the current path to convert back to it later\n", "path = os.getcwd()\n", "os.chdir(os.path.join('..', '..', 'notebook_format'))\n", "\n", "from formats import load_style\n", "load_style(plot_style=False)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ethen 2019-08-08 15:15:37 \n", "\n", "CPython 3.6.4\n", "IPython 7.7.0\n", "\n", "numpy 1.17.0\n", "pandas 0.25.0\n" ] } ], "source": [ "os.chdir(path)\n", "\n", "# 1. magic to print version\n", "# 2. magic so that the notebook will reload external python modules\n", "%load_ext watermark\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "%watermark -a 'Ethen' -d -t -v -p numpy,pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Understanding Pandas Data Type" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When working using pandas with small data (under 100 megabytes), performance is rarely a problem. When we move to larger data (100 megabytes to multiple gigabytes), performance issues can make run times much longer, and cause code to fail entirely due to insufficient memory. While tools like Spark can handle large data sets (100 gigabytes to multiple terabytes), taking full advantage of their capabilities usually requires more expensive hardware. And unlike pandas, they lack rich feature sets for high quality data cleaning, exploration, and analysis. For medium-sized data, we're better off trying to get more out of pandas, rather than switching to a different tool.\n", "\n", "In this documentation, we'll learn about memory usage with pandas, how to make pandas DataFrame smaller and faster, simply by selecting the appropriate data types for columns." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
2Algeria250140.7Africa
3Andorra24513831212.4Europe
4Angola21757455.9Africa
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv('http://bit.ly/drinksbycountry')\n", "drinks.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We'll first look at the memory usage of each column, because we're interested in accuracy, we'll set the argument `deep` to `True` to get an accurate number." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "country 12588\n", "beer_servings 1544\n", "spirit_servings 1544\n", "wine_servings 1544\n", "total_litres_of_pure_alcohol 1544\n", "continent 12332\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drinks.info(memory_usage = 'deep')\n", "# also works, it gives slightly different information\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Under the hood, pandas groups the columns into blocks of values of the same type, because each data type is stored separately, we’re going to examine the memory usage by each data type." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average memory usage for float columns: 836.00 B\n", "Average memory usage for int columns: 1190.00 B\n", "Average memory usage for object columns: 8349.33 B\n" ] } ], "source": [ "for dtype in ('float', 'int', 'object'):\n", " selected_dtype = drinks.select_dtypes(include = [dtype])\n", " mean_usage_b = selected_dtype.memory_usage(deep=True).mean()\n", " # we can do / 1024 ** 2 to convert bytes to megabytes\n", " print(\"Average memory usage for {} columns: {:03.2f} B\".format(dtype, mean_usage_b))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Immediately we can see that most of our memory is used by our `object` columns. We'll look at those later, but first lets see if we can improve on the memory usage for our numeric columns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optimizing Numeric Columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For blocks representing numeric values like integers and floats, pandas combines the columns and stores them as a NumPy ndarray. The NumPy ndarray is built around a C array, and the values are stored in a contiguous block of memory. This storage model consumes less space and allows us to access the values themselves quickly.\n", "\n", "Many types in pandas have multiple subtypes that can use fewer bytes to represent each value. For example, the float type has the float16, float32, and float64 subtypes. The number portion of a type's name indicates the number of bits that type uses to represent values. For example, the subtypes we just listed use 2, 4, 8 and 16 bytes, respectively. 
The following table shows the subtypes for the most common pandas types:\n", "\n", "|memory usage|\tfloat|\tint\t|uint\t|datetime |bool |\n", "|------------|-------|------|-------|----------|------|\n", "|1 byte\t | \t | int8 |uint8 |\t \t |bool |\n", "|2 bytes\t |float16| int16|uint16 |\t \t | |\n", "|4 bytes\t |float32| int32|uint32 |\t \t | |\n", "|8 bytes\t |float64| int64|uint64 |datetime64|\t |\n", "\n", "An int8 value uses 1 byte (or 8 bits) to store a value, and can represent 256 values (2^8) in binary. This means that we can use this subtype to represent values ranging from -128 to 127 (including 0). A uint8, on the other hand, is an unsigned int, so it can only hold non-negative values; its 256 values range from 0 to 255.\n", "\n", "We can use the `numpy.iinfo` class to verify the minimum and maximum values for each integer subtype. Let's look at an example:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Machine parameters for uint8\n", "---------------------------------------------------------------\n", "min = 0\n", "max = 255\n", "---------------------------------------------------------------\n", "\n", "Machine parameters for int8\n", "---------------------------------------------------------------\n", "min = -128\n", "max = 127\n", "---------------------------------------------------------------\n", "\n", "Machine parameters for int16\n", "---------------------------------------------------------------\n", "min = -32768\n", "max = 32767\n", "---------------------------------------------------------------\n", "\n" ] } ], "source": [ "int_types = ['uint8', 'int8', 'int16']\n", "for int_type in int_types:\n", "    print(np.iinfo(int_type))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use the function [`pd.to_numeric()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html) to downcast our numeric types. We'll use `DataFrame.select_dtypes` to select only the integer columns, then we'll optimize the types and compare the memory usage." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def mem_usage(pandas_obj):\n", "    \"\"\"memory usage of a pandas DataFrame or Series\"\"\"\n", "    # we assume if not a DataFrame it's a Series\n", "    if isinstance(pandas_obj, pd.DataFrame):\n", "        usage_b = pandas_obj.memory_usage(deep=True).sum()\n", "    else:\n", "        usage_b = pandas_obj.memory_usage(deep=True)\n", "\n", "    return '{:03.2f} B'.format(usage_b)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4760.00 B\n", "1286.00 B\n" ] } ], "source": [ "drinks_int = drinks.select_dtypes(include=['int'])\n", "converted_int = drinks_int.apply(pd.to_numeric, downcast='unsigned')\n", "\n", "print(mem_usage(drinks_int))\n", "print(mem_usage(converted_int))" ] },
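 { "cell_type": "markdown", "metadata": {}, "source": [ "Analogous to `numpy.iinfo` for the integer subtypes, the `numpy.finfo` class reports the representable range (and precision) of each float subtype. As a quick check before downcasting the float columns, something along these lines works:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# check the representable range of the float subtypes,\n", "# analogous to the numpy.iinfo check for integers above\n", "float_types = ['float16', 'float32', 'float64']\n", "for float_type in float_types:\n", "    print(np.finfo(float_type))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's do the same thing with our float columns."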
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1672.00 B\n", "900.00 B\n" ] } ], "source": [ "drinks_float = drinks.select_dtypes(include=['float'])\n", "converted_float = drinks_float.apply(pd.to_numeric, downcast='float')\n", "\n", "print(mem_usage(drinks_float))\n", "print(mem_usage(converted_float))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optimizing object types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The object type represents values using Python string objects, partly due to the lack of support for missing string values in NumPy. Because Python is a high-level, interpreted language, it doesn't have fine grained-control over how values in memory are stored.\n", "\n", "We'll use `sys.getsizeof()` to prove this out, first by looking at individual strings, and then items in a pandas series." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "60\n", "65\n", "74\n" ] } ], "source": [ "from sys import getsizeof\n", "\n", "\n", "s1 = 'working out'\n", "s2 = 'memory usage for'\n", "s3 = 'strings in python is fun!'\n", "for s in [s1, s2, s3]:\n", " print(getsizeof(s))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 60\n", "1 65\n", "2 74\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj_series = pd.Series(['working out',\n", " 'memory usage for',\n", " 'strings in python is fun!'])\n", "obj_series.apply(getsizeof)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that the size of strings when stored in a pandas series are identical to their usage as separate strings in Python. This limitation causes strings to be stored in a fragmented way that consumes more memory and is slower to access. Each element in an object column is really a pointer that contains the \"address\" for the actual value's location in memory. For more information about this part consider referring to the following link. [Blog: Why Python is Slow: Looking Under the Hood](https://jakevdp.github.io/blog/2014/05/09/why-python-is-slow/)\n", "\n", "To overcome this problem, Pandas introduced [`Categoricals`](http://pandas.pydata.org/pandas-docs/stable/categorical.html) in version 0.15. The category type uses integer values under the hood to represent the values in a column, rather than the raw values. Pandas uses a separate mapping dictionary that maps the integer values to the raw ones. This arrangement is useful whenever a column contains a limited set of values. When we convert a column to the category dtype, pandas uses the most space efficient int subtype that can represent all of the unique values in a column.\n", "\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since the `country` and `continent` columns are strings, they are represented as object types in pandas. Now let's say, instead of storing strings, we want to store the `continent` column as integers to reduce the memory required to store them by converting it to categorical type. To apply this conversion, we simply have to convert the column type to `category` using the `.astype` method." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 Asia\n", "1 Europe\n", "2 Africa\n", "3 Europe\n", "4 Africa\n", "Name: continent, dtype: object\n", "0 Asia\n", "1 Europe\n", "2 Africa\n", "3 Europe\n", "4 Africa\n", "Name: continent, dtype: category\n", "Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]\n" ] } ], "source": [ "# convert and print the memory usage\n", "continent_col = 'continent'\n", "continent = drinks[continent_col]\n", "continent_cat = drinks[continent_col].astype('category')\n", "print(continent.head())\n", "print(continent_cat.head())\n", "# drinks.memory_usage(deep = True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we can see, apart from the fact that the type of the column has changed, the data looks exactly the same. Pandas internals will smooth out the user experience so we don’t notice that we’re actually using a compact array of integers. \n", "\n", "Let’s take a look at what's happening under the hood. In the following code chunk, we use the `Series.cat.codes` attribute to return the integer values the category type uses to represent each value." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1\n", "1 2\n", "2 0\n", "3 2\n", "4 0\n", "dtype: int8" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# this is simply showing the first 5 row)\n", "continent_cat.cat.codes[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This column doesn’t have any missing values, but if it did, the `category` subtype handles missing values by setting them to `-1`.\n", "\n", "We can also access the unique categories using the `Series.cat.categories` attribute. This information servers as the lookup table that stores the mappings of the integer representation to the original category." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Africa', 'Asia', 'Europe', 'North America', 'Oceania',\n", " 'South America'],\n", " dtype='object')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "continent_cat.cat.categories" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lastly, let’s look at the memory usage for this column before and after converting to the category type." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "original: 12460.00 B\n", "categorical: 872.00 B\n" ] } ], "source": [ "print('original: ', mem_usage(continent))\n", "print('categorical: ', mem_usage(continent_cat))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see that by converting the `continent` column to integers we're being more space-efficient. Apart from that it can actually speed up laters operations, e.g. sorting, groupby as we're storing the strings as compactly as integers. Let's apply this notion again to the `country` column." 
] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "original: 12716.00 B\n", "categorical: 18222.00 B\n" ] } ], "source": [ "country_col = 'country'\n", "country = drinks[country_col]\n", "country_cat = drinks[country_col].astype('category')\n", "print('original: ', mem_usage(country))\n", "print('categorical: ', mem_usage(country_cat))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This time, the memory usage for the country column is now larger. The reason is that the country column's value is unique. If all of the values in a column are unique, the category type will end up using more memory because the column is storing all of the raw string values in addition to the integer category codes.\n", "\n", "Thus we're actually creating 193 (shown below) unqiue categories, and we also have to store a lookup table for that." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "193" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "country_cat.cat.categories.size" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In summary, if we're working with an object column of strings, convert it to `category` type to make it for efficient. But this must be based on the assumption that the column takes a limited number of unique values, like in this case, the continent column only has 6 unique values." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting Types While Reading the Data In" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So far, we've explored ways to reduce the memory footprint of an existing dataframe. By reading the dataframe in first and then iterating on ways to save memory, we were able to understand the amount of memory we can expect to save from each optimization better. As we mentioned earlier in the mission, however, we often won't have enough memory to represent all the values in a data set. How can we apply memory-saving techniques when we can't even create the dataframe in the first place?\n", "\n", "Fortunately, we can specify the optimal column types when we read the data set in. The `pandas.read_csv()` function has a few different parameters that allow us to do this. The dtype parameter accepts a dictionary that has (string) column names as the keys and numpy type objects as the values." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "country 12588\n", "beer_servings 772\n", "spirit_servings 772\n", "wine_servings 772\n", "total_litres_of_pure_alcohol 772\n", "continent 744\n", "dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "col_types = {'beer_servings': 'uint32',\n", " 'continent': 'category',\n", " 'country': 'object',\n", " 'spirit_servings': 'uint32',\n", " 'total_litres_of_pure_alcohol': 'float32',\n", " 'wine_servings': 'uint32'}\n", "\n", "# we can see that the column's memory usage is significantly\n", "# smaller than before (the DataFrame at the beginning)\n", "df_drinks = pd.read_csv('http://bit.ly/drinksbycountry', dtype=col_types)\n", "df_drinks.memory_usage(deep=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or instead of manually specifying the type, we can leverage a function to automatically perform the memory reduction for us." 
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def reduce_mem_usage(df, blacklist_cols=None):\n", " \"\"\"\n", " Iterate through all the columns of the dataframe and downcast the\n", " data type to reduce memory usage.\n", "\n", " The logic is numeric type will be downcast to the smallest possible\n", " numeric type. e.g. if an int column's value ranges from 1 - 8, then it\n", " fits into an int8 type, and will be downcast to int8.\n", " And object type will be converted to categorical type.\n", "\n", " Parameters\n", " ----------\n", " df : pd.DataFrame\n", " Dataframe prior the memory reduction.\n", "\n", " blacklist_cols : collection[str], e.g. list[str], set[str]\n", " A collection of column names that won't go through the memory\n", " reduction process.\n", "\n", " Returns\n", " -------\n", " df : pd.DataFrame\n", " Dataframe post memory reduction.\n", "\n", " References\n", " ----------\n", " https://www.kaggle.com/gemartin/load-data-reduce-memory-usage\n", " \"\"\"\n", " start_mem = compute_df_total_mem(df)\n", " print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))\n", " \n", " blacklist_cols = blacklist_cols if blacklist_cols else set()\n", " for col in df.columns:\n", " if col in blacklist_cols:\n", " continue\n", " \n", " col_type = df[col].dtype\n", " \n", " if col_type != object:\n", " c_min = df[col].min()\n", " c_max = df[col].max()\n", " if str(col_type)[:3] == 'int':\n", " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", " df[col] = df[col].astype(np.int8)\n", " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", " df[col] = df[col].astype(np.int16)\n", " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", " df[col] = df[col].astype(np.int32)\n", " else:\n", " df[col] = df[col].astype(np.int64) \n", " else:\n", " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", " df[col] = df[col].astype(np.float16)\n", " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", " df[col] = df[col].astype(np.float32)\n", " else:\n", " df[col] = df[col].astype(np.float64)\n", " else:\n", " df[col] = df[col].astype('category')\n", "\n", " end_mem = compute_df_total_mem(df)\n", " print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n", " print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n", " return df\n", "\n", "\n", "def compute_df_total_mem(df):\n", " \"\"\"Returns a dataframe's total memory usage in MB.\"\"\"\n", " return df.memory_usage(deep=True).sum() / 1024 ** 2" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Memory usage of dataframe is 0.03 MB\n", "Memory usage after optimization is: 0.01 MB\n", "Decreased by 51.9%\n" ] }, { "data": { "text/plain": [ "Index 128\n", "country 12588\n", "beer_servings 386\n", "spirit_servings 386\n", "wine_servings 386\n", "total_litres_of_pure_alcohol 386\n", "continent 744\n", "dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_drinks = pd.read_csv('http://bit.ly/drinksbycountry')\n", "df_drinks = reduce_mem_usage(df_drinks, blacklist_cols=['country'])\n", "df_drinks.memory_usage(deep=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The idea is that after performing the memory reduction, we should save this dataframe back to disk so in the future, we won't have 
to go through this process every time (assuming we'll be reading this data again and again). When saving, prefer a format that preserves column types, such as parquet or pickle; writing back to csv would lose the downcast types, and we would have to specify them again on the next read." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ordered Categorical" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another use of the `category` type is to specify an ordering for the categories, which we can then use for sorting and comparisons." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
0100good
1101very good
2102good
3103excellent
\n", "
" ], "text/plain": [ " ID quality\n", "0 100 good\n", "1 101 very good\n", "2 102 good\n", "3 103 excellent" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# toy dataset that contains\n", "# the id for a product and\n", "# its corresponding customer review\n", "df = pd.DataFrame({\n", " 'ID': [100, 101, 102, 103],\n", " 'quality': ['good', 'very good', 'good', 'excellent']\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
3103excellent
0100good
2102good
1101very good
\n", "
" ], "text/plain": [ " ID quality\n", "3 103 excellent\n", "0 100 good\n", "2 102 good\n", "1 101 very good" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if we do a sort on quality, it will be\n", "# sorted alphabetically (default sorting for strings)\n", "df.sort_values('quality')" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
0100good
2102good
1101very good
3103excellent
\n", "
" ], "text/plain": [ " ID quality\n", "0 100 good\n", "2 102 good\n", "1 101 very good\n", "3 103 excellent" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can use an instance of CategoricalDtype to specify our own ordering\n", "cat_dtype = pd.api.types.CategoricalDtype(\n", " categories=['good', 'very good', 'excellent'], ordered=True)\n", "df['quality'] = df['quality'].astype(cat_dtype)\n", "\n", "df.sort_values('quality')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDquality
1101very good
3103excellent
\n", "
" ], "text/plain": [ " ID quality\n", "1 101 very good\n", "3 103 excellent" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can even use boolean method on this new ordering\n", "df.loc[df['quality'] > 'good']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reference" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- [Blog: Pandas Categoricals](https://www.continuum.io/content/pandas-categoricals)\n", "- [Blog: Using pandas with large data](https://www.dataquest.io/blog/pandas-big-data/)\n", "- [Youtube: How do I make my pandas DataFrame smaller and faster?](https://www.youtube.com/watch?v=wDYDYGyN_cw)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" }, "toc": { "nav_menu": { "height": "81px", "width": "252px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": "block", "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }