{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Wrangling\n",
    "\n",
    "This notebook presents different data wrangling techniques used commonly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import required libraries\n",
    "import random\n",
    "import datetime \n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from random import randrange\n",
    "from sklearn import preprocessing\n",
    "\n",
    "from IPython.display import display\n",
    "\n",
    "pd.options.mode.chained_assignment = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _random_date(start,date_count):\n",
    "    \"\"\"This function generates a random date based on params\n",
    "    Args:\n",
    "        start (date object): the base date\n",
    "        date_count (int): number of dates to be generated\n",
    "    Returns:\n",
    "        list of random dates\n",
    "\n",
    "    \"\"\"\n",
    "    current = start\n",
    "    while date_count > 0:\n",
    "        curr = current + datetime.timedelta(days=randrange(42))\n",
    "        yield curr\n",
    "        date_count-=1\n",
    "\n",
    "\n",
    "def generate_sample_data(row_count=100):\n",
    "    \"\"\"This function generates a random transaction dataset\n",
    "    Args:\n",
    "        row_count (int): number of rows for the dataframe\n",
    "    Returns:\n",
    "        a pandas dataframe\n",
    "\n",
    "    \"\"\"\n",
    "    \n",
    "    # sentinels\n",
    "    startDate = datetime.datetime(2016, 1, 1,13)\n",
    "    serial_number_sentinel = 1000\n",
    "    user_id_sentinel = 5001\n",
    "    product_id_sentinel = 101\n",
    "    price_sentinel = 2000\n",
    "    \n",
    "    \n",
    "    # base list of attributes\n",
    "    data_dict = {\n",
    "    'Serial No': np.arange(row_count)+serial_number_sentinel,\n",
    "    'Date': np.random.permutation(pd.to_datetime([x.strftime(\"%d-%m-%Y\") \n",
    "                                                    for x in _random_date(startDate,\n",
    "                                                                          row_count)]).date\n",
    "                                  ),\n",
    "    'User ID': np.random.permutation(np.random.randint(0,\n",
    "                                                       row_count,\n",
    "                                                       size=int(row_count/10)) + user_id_sentinel).tolist()*10,\n",
    "    'Product ID': np.random.permutation(np.random.randint(0,\n",
    "                                                          row_count,\n",
    "                                                          size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,\n",
    "    'Quantity Purchased': np.random.permutation(np.random.randint(1,\n",
    "                                                                  42,\n",
    "                                                                  size=row_count)),\n",
    "    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel,\n",
    "                      decimals=2),\n",
    "    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) \n",
    "                                            for i in range(row_count)])\n",
    "    }\n",
    "    \n",
    "    # introduce missing values\n",
    "    for index in range(int(np.sqrt(row_count))): \n",
    "        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan\n",
    "        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan\n",
    "        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan\n",
    "        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0\n",
    "        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1\n",
    "        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101\n",
    "        \n",
    "    \n",
    "    # create data frame\n",
    "    df = pd.DataFrame(data_dict)\n",
    "    \n",
    "    return df\n",
    "    \n",
    "\n",
    "def describe_dataframe(df=pd.DataFrame()):\n",
    "    \"\"\"This function generates descriptive stats of a dataframe\n",
    "    Args:\n",
    "        df (dataframe): the dataframe to be analyzed\n",
    "    Returns:\n",
    "        None\n",
    "\n",
    "    \"\"\"\n",
    "    print(\"\\n\\n\")\n",
    "    print(\"*\"*30)\n",
    "    print(\"About the Data\")\n",
    "    print(\"*\"*30)\n",
    "    \n",
    "    print(\"Number of rows::\",df.shape[0])\n",
    "    print(\"Number of columns::\",df.shape[1])\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Column Names::\",df.columns.values.tolist())\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Column Data Types::\\n\",df.dtypes)\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Columns with Missing Values::\",df.columns[df.isnull().any()].tolist())\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Number of rows with Missing Values::\",len(pd.isnull(df).any(1).nonzero()[0].tolist()))\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Sample Indices with missing data::\",pd.isnull(df).any(1).nonzero()[0].tolist()[0:5])\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"General Stats::\")\n",
    "    print(df.info())\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Summary Stats::\")\n",
    "    print(df.describe())\n",
    "    print(\"\\n\")\n",
    "    \n",
    "    print(\"Dataframe Sample Rows::\")\n",
    "    display(df.head(5))\n",
    "    \n",
    "def cleanup_column_names(df,rename_dict={},do_inplace=True):\n",
    "    \"\"\"This function renames columns of a pandas dataframe\n",
    "       It converts column names to snake case if rename_dict is not passed. \n",
    "    Args:\n",
    "        rename_dict (dict): keys represent old column names and values point to \n",
    "                            newer ones\n",
    "        do_inplace (bool): flag to update existing dataframe or return a new one\n",
    "    Returns:\n",
    "        pandas dataframe if do_inplace is set to False, None otherwise\n",
    "\n",
    "    \"\"\"\n",
    "    if not rename_dict:\n",
    "        return df.rename(columns={col: col.lower().replace(' ','_') \n",
    "                    for col in df.columns.values.tolist()}, \n",
    "                  inplace=do_inplace)\n",
    "    else:\n",
    "        return df.rename(columns=rename_dict,inplace=do_inplace)\n",
    "\n",
    "def expand_user_type(u_type):\n",
    "    \"\"\"This function maps user types to user classes\n",
    "    Args:\n",
    "        u_type (str): user type value\n",
    "    Returns:\n",
    "        (str) user_class value\n",
    "\n",
    "    \"\"\"\n",
    "    if u_type in ['a','b']:\n",
    "        return 'new'\n",
    "    elif u_type == 'c':\n",
    "        return 'existing'\n",
    "    elif u_type == 'd':\n",
    "        return 'loyal_existing'\n",
    "    else:\n",
    "        return 'error'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate a Sample Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df = generate_sample_data(row_count=1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Describe the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "******************************\n",
      "About the Data\n",
      "******************************\n",
      "Number of rows:: 1000\n",
      "Number of columns:: 7\n",
      "\n",
      "\n",
      "Column Names:: ['Date', 'Price', 'Product ID', 'Quantity Purchased', 'Serial No', 'User ID', 'User Type']\n",
      "\n",
      "\n",
      "Column Data Types::\n",
      " Date                   object\n",
      "Price                 float64\n",
      "Product ID              int64\n",
      "Quantity Purchased      int32\n",
      "Serial No               int32\n",
      "User ID                 int64\n",
      "User Type              object\n",
      "dtype: object\n",
      "\n",
      "\n",
      "Columns with Missing Values:: ['Date', 'Price']\n",
      "\n",
      "\n",
      "Number of rows with Missing Values:: 61\n",
      "\n",
      "\n",
      "Sample Indices with missing data:: [1, 2, 3, 9, 11]\n",
      "\n",
      "\n",
      "General Stats::\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000 entries, 0 to 999\n",
      "Data columns (total 7 columns):\n",
      "Date                  969 non-null object\n",
      "Price                 969 non-null float64\n",
      "Product ID            1000 non-null int64\n",
      "Quantity Purchased    1000 non-null int32\n",
      "Serial No             1000 non-null int32\n",
      "User ID               1000 non-null int64\n",
      "User Type             1000 non-null object\n",
      "dtypes: float64(1), int32(2), int64(2), object(2)\n",
      "memory usage: 47.0+ KB\n",
      "None\n",
      "\n",
      "\n",
      "Summary Stats::\n",
      "             Price  Product ID  Quantity Purchased    Serial No      User ID\n",
      "count   969.000000  1000.00000         1000.000000  1000.000000  1000.000000\n",
      "mean   2468.147967   618.90100           21.063000  1454.554000  5478.014000\n",
      "std    1657.607501   274.46151           12.170092   385.901616   340.779522\n",
      "min       2.600000     0.00000            1.000000    -1.000000  -101.000000\n",
      "25%    1086.990000   382.75000           10.000000  1227.750000  5245.000000\n",
      "50%    2248.270000   628.00000           21.000000  1483.500000  5412.000000\n",
      "75%    3543.000000   838.75000           32.000000  1744.250000  5744.500000\n",
      "max    8493.210000  1099.00000           41.000000  1999.000000  5992.000000\n",
      "\n",
      "\n",
      "Dataframe Sample Rows::\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Price</th>\n",
       "      <th>Product ID</th>\n",
       "      <th>Quantity Purchased</th>\n",
       "      <th>Serial No</th>\n",
       "      <th>User ID</th>\n",
       "      <th>User Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>1395.65</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>-101</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1352.99</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1001</td>\n",
       "      <td>5632</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3997.32</td>\n",
       "      <td>625</td>\n",
       "      <td>21</td>\n",
       "      <td>1002</td>\n",
       "      <td>5240</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3681.48</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1003</td>\n",
       "      <td>5557</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Date    Price  Product ID  Quantity Purchased  Serial No  User ID  \\\n",
       "0  2016-01-23  1395.65           0                   3       1000     -101   \n",
       "1         NaN  1352.99         906                  19       1001     5632   \n",
       "2         NaN  3997.32         625                  21       1002     5240   \n",
       "3         NaN  3681.48         865                  35       1003     5557   \n",
       "4  2016-01-27  3850.22         929                   3       1004     5489   \n",
       "\n",
       "  User Type  \n",
       "0         n  \n",
       "1         n  \n",
       "2         n  \n",
       "3         n  \n",
       "4         n  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "describe_dataframe(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Rename Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataframe columns:\n",
      "['Date', 'Price', 'Product ID', 'Quantity Purchased', 'Serial No', 'User ID', 'User Type']\n"
     ]
    }
   ],
   "source": [
    "print(\"Dataframe columns:\\n{}\".format(df.columns.tolist()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cleanup_column_names(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataframe columns:\n",
      "['date', 'price', 'product_id', 'quantity_purchased', 'serial_no', 'user_id', 'user_type']\n"
     ]
    }
   ],
   "source": [
    "print(\"Dataframe columns:\\n{}\".format(df.columns.tolist()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort Rows on defined attributes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>502</th>\n",
       "      <td>2016-03-02</td>\n",
       "      <td>6168.66</td>\n",
       "      <td>625</td>\n",
       "      <td>22</td>\n",
       "      <td>-1</td>\n",
       "      <td>5240</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>727</th>\n",
       "      <td>2016-01-28</td>\n",
       "      <td>5483.55</td>\n",
       "      <td>445</td>\n",
       "      <td>11</td>\n",
       "      <td>-1</td>\n",
       "      <td>5016</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>680</th>\n",
       "      <td>2016-01-13</td>\n",
       "      <td>5163.72</td>\n",
       "      <td>185</td>\n",
       "      <td>41</td>\n",
       "      <td>-1</td>\n",
       "      <td>5679</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>599</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>4903.91</td>\n",
       "      <td>551</td>\n",
       "      <td>2</td>\n",
       "      <td>-1</td>\n",
       "      <td>5688</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>2016-07-02</td>\n",
       "      <td>4584.97</td>\n",
       "      <td>534</td>\n",
       "      <td>12</td>\n",
       "      <td>-1</td>\n",
       "      <td>5351</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "502  2016-03-02  6168.66         625                  22         -1     5240   \n",
       "727  2016-01-28  5483.55         445                  11         -1     5016   \n",
       "680  2016-01-13  5163.72         185                  41         -1     5679   \n",
       "599  2016-01-01  4903.91         551                   2         -1     5688   \n",
       "75   2016-07-02  4584.97         534                  12         -1     5351   \n",
       "\n",
       "    user_type  \n",
       "502         b  \n",
       "727         c  \n",
       "680         d  \n",
       "599         d  \n",
       "75          a  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.sort_values(['serial_no', 'price'], \n",
    "                         ascending=[True, False]).head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Rearrange Columns in a Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>serial_no</th>\n",
       "      <th>date</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000</td>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>-101</td>\n",
       "      <td>n</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1395.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1001</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5632</td>\n",
       "      <td>n</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1352.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1002</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5240</td>\n",
       "      <td>n</td>\n",
       "      <td>625</td>\n",
       "      <td>21</td>\n",
       "      <td>3997.32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1003</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5557</td>\n",
       "      <td>n</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>3681.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1004</td>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>3850.22</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   serial_no        date  user_id user_type  product_id  quantity_purchased  \\\n",
       "0       1000  2016-01-23     -101         n           0                   3   \n",
       "1       1001         NaN     5632         n         906                  19   \n",
       "2       1002         NaN     5240         n         625                  21   \n",
       "3       1003         NaN     5557         n         865                  35   \n",
       "4       1004  2016-01-27     5489         n         929                   3   \n",
       "\n",
       "     price  \n",
       "0  1395.65  \n",
       "1  1352.99  \n",
       "2  3997.32  \n",
       "3  3681.48  \n",
       "4  3850.22  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df[['serial_no','date','user_id','user_type',\n",
    "              'product_id','quantity_purchased','price']].head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filtering Columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using Column Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 3 19 21 35  3  6 36 14 25 32]\n"
     ]
    }
   ],
   "source": [
    "# print 10 values from column at index 3\n",
    "print(df.iloc[:,3].values[0:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using Column Name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 3 19 21 35  3  6 36 14 25 32]\n"
     ]
    }
   ],
   "source": [
    "# print 10 values of quantity purchased\n",
    "print(df.quantity_purchased.values[0:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using Column Datatype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 1395.65  1352.99  3997.32  3681.48  3850.22   786.27  2725.81  4857.7\n",
      "  2884.57  3138.58]\n"
     ]
    }
   ],
   "source": [
    "# print 10 values of columns with data type float\n",
    "print(df.select_dtypes(include=['float64']).values[:10,0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filtering Rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Select specific rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2016-04-02</td>\n",
       "      <td>3027.86</td>\n",
       "      <td>856</td>\n",
       "      <td>24</td>\n",
       "      <td>1010</td>\n",
       "      <td>5381</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>501</th>\n",
       "      <td>2016-01-21</td>\n",
       "      <td>2017.56</td>\n",
       "      <td>906</td>\n",
       "      <td>5</td>\n",
       "      <td>1501</td>\n",
       "      <td>5632</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2016-03-02</td>\n",
       "      <td>1920.21</td>\n",
       "      <td>172</td>\n",
       "      <td>30</td>\n",
       "      <td>1020</td>\n",
       "      <td>5865</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "10   2016-04-02  3027.86         856                  24       1010     5381   \n",
       "501  2016-01-21  2017.56         906                   5       1501     5632   \n",
       "20   2016-03-02  1920.21         172                  30       1020     5865   \n",
       "\n",
       "    user_type  \n",
       "10          n  \n",
       "501         d  \n",
       "20          n  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.iloc[[10,501,20]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exclude Specific Row indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>1352.99</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1001</td>\n",
       "      <td>5632</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3997.32</td>\n",
       "      <td>625</td>\n",
       "      <td>21</td>\n",
       "      <td>1002</td>\n",
       "      <td>5240</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3681.48</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1003</td>\n",
       "      <td>5557</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2016-01-28</td>\n",
       "      <td>786.27</td>\n",
       "      <td>300</td>\n",
       "      <td>6</td>\n",
       "      <td>1005</td>\n",
       "      <td>5262</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "1         NaN  1352.99         906                  19       1001     5632   \n",
       "2         NaN  3997.32         625                  21       1002     5240   \n",
       "3         NaN  3681.48         865                  35       1003     5557   \n",
       "4  2016-01-27  3850.22         929                   3       1004     5489   \n",
       "5  2016-01-28   786.27         300                   6       1005     5262   \n",
       "\n",
       "  user_type  \n",
       "1         n  \n",
       "2         n  \n",
       "3         n  \n",
       "4         n  \n",
       "5         n  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.drop([0,24,51], axis=0).head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Conditional Filtering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3681.48</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1003</td>\n",
       "      <td>5557</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2016-01-22</td>\n",
       "      <td>2725.81</td>\n",
       "      <td>572</td>\n",
       "      <td>36</td>\n",
       "      <td>1006</td>\n",
       "      <td>5661</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>NaN</td>\n",
       "      <td>3138.58</td>\n",
       "      <td>556</td>\n",
       "      <td>32</td>\n",
       "      <td>1009</td>\n",
       "      <td>5332</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2780.03</td>\n",
       "      <td>829</td>\n",
       "      <td>37</td>\n",
       "      <td>-1</td>\n",
       "      <td>5307</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2016-01-18</td>\n",
       "      <td>4192.76</td>\n",
       "      <td>1099</td>\n",
       "      <td>27</td>\n",
       "      <td>1012</td>\n",
       "      <td>5824</td>\n",
       "      <td>n</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "3          NaN  3681.48         865                  35       1003     5557   \n",
       "6   2016-01-22  2725.81         572                  36       1006     5661   \n",
       "9          NaN  3138.58         556                  32       1009     5332   \n",
       "11         NaN  2780.03         829                  37         -1     5307   \n",
       "12  2016-01-18  4192.76        1099                  27       1012     5824   \n",
       "\n",
       "   user_type  \n",
       "3          n  \n",
       "6          n  \n",
       "9          n  \n",
       "11         n  \n",
       "12         n  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df[df.quantity_purchased>25].head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Offset from top of the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>2016-07-01</td>\n",
       "      <td>3151.10</td>\n",
       "      <td>379</td>\n",
       "      <td>38</td>\n",
       "      <td>-1</td>\n",
       "      <td>5405</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>2016-01-25</td>\n",
       "      <td>235.77</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1101</td>\n",
       "      <td>5632</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>2016-01-28</td>\n",
       "      <td>429.29</td>\n",
       "      <td>625</td>\n",
       "      <td>33</td>\n",
       "      <td>1102</td>\n",
       "      <td>5240</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>2016-01-31</td>\n",
       "      <td>6877.38</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1103</td>\n",
       "      <td>5557</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>2016-10-01</td>\n",
       "      <td>1895.89</td>\n",
       "      <td>929</td>\n",
       "      <td>29</td>\n",
       "      <td>1104</td>\n",
       "      <td>5489</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "100  2016-07-01  3151.10         379                  38         -1     5405   \n",
       "101  2016-01-25   235.77         906                  19       1101     5632   \n",
       "102  2016-01-28   429.29         625                  33       1102     5240   \n",
       "103  2016-01-31  6877.38         865                  35       1103     5557   \n",
       "104  2016-10-01  1895.89         929                  29       1104     5489   \n",
       "\n",
       "    user_type  \n",
       "100         c  \n",
       "101         d  \n",
       "102         d  \n",
       "103         c  \n",
       "104         c  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df[100:].head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Offset from bottom of the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>990</th>\n",
       "      <td>2016-01-13</td>\n",
       "      <td>3366.48</td>\n",
       "      <td>611</td>\n",
       "      <td>16</td>\n",
       "      <td>1990</td>\n",
       "      <td>5039</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>991</th>\n",
       "      <td>2016-10-02</td>\n",
       "      <td>398.64</td>\n",
       "      <td>775</td>\n",
       "      <td>24</td>\n",
       "      <td>1991</td>\n",
       "      <td>5496</td>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>992</th>\n",
       "      <td>2016-07-01</td>\n",
       "      <td>4910.83</td>\n",
       "      <td>743</td>\n",
       "      <td>24</td>\n",
       "      <td>1992</td>\n",
       "      <td>5245</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>993</th>\n",
       "      <td>2016-11-02</td>\n",
       "      <td>1172.38</td>\n",
       "      <td>300</td>\n",
       "      <td>12</td>\n",
       "      <td>1993</td>\n",
       "      <td>5233</td>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>994</th>\n",
       "      <td>2016-01-17</td>\n",
       "      <td>1528.26</td>\n",
       "      <td>754</td>\n",
       "      <td>13</td>\n",
       "      <td>1994</td>\n",
       "      <td>5112</td>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "990  2016-01-13  3366.48         611                  16       1990     5039   \n",
       "991  2016-10-02   398.64         775                  24       1991     5496   \n",
       "992  2016-07-01  4910.83         743                  24       1992     5245   \n",
       "993  2016-11-02  1172.38         300                  12       1993     5233   \n",
       "994  2016-01-17  1528.26         754                  13       1994     5112   \n",
       "\n",
       "    user_type  \n",
       "990         d  \n",
       "991         a  \n",
       "992         d  \n",
       "993         b  \n",
       "994         c  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df[-10:].head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### TypeCasting/Data Type Conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "date                  datetime64[ns]\n",
      "price                        float64\n",
      "product_id                     int64\n",
      "quantity_purchased             int32\n",
      "serial_no                      int32\n",
      "user_id                        int64\n",
      "user_type                     object\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "df['date'] = pd.to_datetime(df.date)\n",
    "# compare dtypes of the original df with this one\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply/Map Usage"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Map : Create a derived attribute using map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>2016-01-19</td>\n",
       "      <td>404.66</td>\n",
       "      <td>713</td>\n",
       "      <td>40</td>\n",
       "      <td>1995</td>\n",
       "      <td>5976</td>\n",
       "      <td>c</td>\n",
       "      <td>existing</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>2016-02-01</td>\n",
       "      <td>236.63</td>\n",
       "      <td>808</td>\n",
       "      <td>40</td>\n",
       "      <td>1996</td>\n",
       "      <td>5950</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>2016-08-01</td>\n",
       "      <td>3413.14</td>\n",
       "      <td>965</td>\n",
       "      <td>33</td>\n",
       "      <td>1997</td>\n",
       "      <td>5264</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>2016-10-02</td>\n",
       "      <td>4820.05</td>\n",
       "      <td>678</td>\n",
       "      <td>36</td>\n",
       "      <td>1998</td>\n",
       "      <td>5955</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>2016-01-15</td>\n",
       "      <td>3906.33</td>\n",
       "      <td>551</td>\n",
       "      <td>3</td>\n",
       "      <td>1999</td>\n",
       "      <td>5688</td>\n",
       "      <td>a</td>\n",
       "      <td>new</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "995 2016-01-19   404.66         713                  40       1995     5976   \n",
       "996 2016-02-01   236.63         808                  40       1996     5950   \n",
       "997 2016-08-01  3413.14         965                  33       1997     5264   \n",
       "998 2016-10-02  4820.05         678                  36       1998     5955   \n",
       "999 2016-01-15  3906.33         551                   3       1999     5688   \n",
       "\n",
       "    user_type user_class  \n",
       "995         c   existing  \n",
       "996         b        new  \n",
       "997         b        new  \n",
       "998         b        new  \n",
       "999         a        new  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df['user_class'] = df['user_type'].map(expand_user_type)\n",
    "display(df.tail())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply: Using apply to get attribute ranges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "price                 8490.61\n",
       "product_id            1099.00\n",
       "quantity_purchased      40.00\n",
       "serial_no             2000.00\n",
       "user_id               6093.00\n",
       "dtype: float64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.select_dtypes(include=[np.number]).apply(lambda x: \n",
    "                                                        x.max()- x.min()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Applymap: Extract week from date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week \n",
    "                                                if not pd.isnull(dt.week) \n",
    "                                                else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>1395.65</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>-101</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaT</td>\n",
       "      <td>1352.99</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1001</td>\n",
       "      <td>5632</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaT</td>\n",
       "      <td>3997.32</td>\n",
       "      <td>625</td>\n",
       "      <td>21</td>\n",
       "      <td>1002</td>\n",
       "      <td>5240</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaT</td>\n",
       "      <td>3681.48</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1003</td>\n",
       "      <td>5557</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "0 2016-01-23  1395.65           0                   3       1000     -101   \n",
       "1        NaT  1352.99         906                  19       1001     5632   \n",
       "2        NaT  3997.32         625                  21       1002     5240   \n",
       "3        NaT  3681.48         865                  35       1003     5557   \n",
       "4 2016-01-27  3850.22         929                   3       1004     5489   \n",
       "\n",
       "  user_type user_class  purchase_week  \n",
       "0         n      error              3  \n",
       "1         n      error              0  \n",
       "2         n      error              0  \n",
       "3         n      error              0  \n",
       "4         n      error              4  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Missing Values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop Rows with missing dates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>1395.65</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>-101</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2016-01-28</td>\n",
       "      <td>786.27</td>\n",
       "      <td>300</td>\n",
       "      <td>6</td>\n",
       "      <td>1005</td>\n",
       "      <td>5262</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2016-01-22</td>\n",
       "      <td>2725.81</td>\n",
       "      <td>572</td>\n",
       "      <td>36</td>\n",
       "      <td>1006</td>\n",
       "      <td>5661</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2016-10-01</td>\n",
       "      <td>4857.70</td>\n",
       "      <td>1011</td>\n",
       "      <td>14</td>\n",
       "      <td>1007</td>\n",
       "      <td>5412</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "0 2016-01-23  1395.65           0                   3       1000     -101   \n",
       "4 2016-01-27  3850.22         929                   3       1004     5489   \n",
       "5 2016-01-28   786.27         300                   6       1005     5262   \n",
       "6 2016-01-22  2725.81         572                  36       1006     5661   \n",
       "7 2016-10-01  4857.70        1011                  14       1007     5412   \n",
       "\n",
       "  user_type user_class  purchase_week  \n",
       "0         n      error              3  \n",
       "4         n      error              4  \n",
       "5         n      error              4  \n",
       "6         n      error              3  \n",
       "7         n      error             39  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_dropped = df.dropna(subset=['date'])\n",
    "display(df_dropped.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fill Missing Price values with mean price"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_dropped['price'].fillna(value=np.round(df.price.mean(),decimals=2),\n",
    "                                inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fill Missing user_type values with value from previous row (forward fill) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_dropped['user_type'].fillna(method='ffill',inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fill Missing user_type values with value from next row (backward fill)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_dropped['user_type'].fillna(method='bfill',inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Duplicates"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop Duplicate serial_no rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>2016-07-02</td>\n",
       "      <td>4584.97</td>\n",
       "      <td>534</td>\n",
       "      <td>12</td>\n",
       "      <td>-1</td>\n",
       "      <td>5351</td>\n",
       "      <td>a</td>\n",
       "      <td>new</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>97</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>743.37</td>\n",
       "      <td>965</td>\n",
       "      <td>3</td>\n",
       "      <td>-1</td>\n",
       "      <td>5264</td>\n",
       "      <td>c</td>\n",
       "      <td>existing</td>\n",
       "      <td>53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>2016-07-01</td>\n",
       "      <td>3151.10</td>\n",
       "      <td>379</td>\n",
       "      <td>38</td>\n",
       "      <td>-1</td>\n",
       "      <td>5405</td>\n",
       "      <td>c</td>\n",
       "      <td>existing</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>2016-05-01</td>\n",
       "      <td>337.44</td>\n",
       "      <td>736</td>\n",
       "      <td>6</td>\n",
       "      <td>-1</td>\n",
       "      <td>5443</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>2016-04-02</td>\n",
       "      <td>682.72</td>\n",
       "      <td>994</td>\n",
       "      <td>26</td>\n",
       "      <td>-1</td>\n",
       "      <td>5412</td>\n",
       "      <td>a</td>\n",
       "      <td>new</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "75  2016-07-02  4584.97         534                  12         -1     5351   \n",
       "97  2016-01-01   743.37         965                   3         -1     5264   \n",
       "100 2016-07-01  3151.10         379                  38         -1     5405   \n",
       "114 2016-05-01   337.44         736                   6         -1     5443   \n",
       "145 2016-04-02   682.72         994                  26         -1     5412   \n",
       "\n",
       "    user_type      user_class  purchase_week  \n",
       "75          a             new             26  \n",
       "97          c        existing             53  \n",
       "100         c        existing             26  \n",
       "114         d  loyal_existing             17  \n",
       "145         a             new             13  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of df=(969, 9)\n"
     ]
    }
   ],
   "source": [
    "# sample duplicates\n",
    "display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())\n",
    "print(\"Shape of df={}\".format(df_dropped.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_dropped.drop_duplicates(subset=['serial_no'],inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>1395.65</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>-101</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2016-01-28</td>\n",
       "      <td>786.27</td>\n",
       "      <td>300</td>\n",
       "      <td>6</td>\n",
       "      <td>1005</td>\n",
       "      <td>5262</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2016-01-22</td>\n",
       "      <td>2725.81</td>\n",
       "      <td>572</td>\n",
       "      <td>36</td>\n",
       "      <td>1006</td>\n",
       "      <td>5661</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2016-10-01</td>\n",
       "      <td>4857.70</td>\n",
       "      <td>1011</td>\n",
       "      <td>14</td>\n",
       "      <td>1007</td>\n",
       "      <td>5412</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "0 2016-01-23  1395.65           0                   3       1000     -101   \n",
       "4 2016-01-27  3850.22         929                   3       1004     5489   \n",
       "5 2016-01-28   786.27         300                   6       1005     5262   \n",
       "6 2016-01-22  2725.81         572                  36       1006     5661   \n",
       "7 2016-10-01  4857.70        1011                  14       1007     5412   \n",
       "\n",
       "  user_type user_class  purchase_week  \n",
       "0         n      error              3  \n",
       "4         n      error              4  \n",
       "5         n      error              4  \n",
       "6         n      error              3  \n",
       "7         n      error             39  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of df=(940, 9)\n"
     ]
    }
   ],
   "source": [
    "# updated dataframe\n",
    "display(df_dropped.head())\n",
    "print(\"Shape of df={}\".format(df_dropped.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove rows which have less than 3 attributes with non-missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>1395.65</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>-101</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaT</td>\n",
       "      <td>1352.99</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1001</td>\n",
       "      <td>5632</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaT</td>\n",
       "      <td>3997.32</td>\n",
       "      <td>625</td>\n",
       "      <td>21</td>\n",
       "      <td>1002</td>\n",
       "      <td>5240</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaT</td>\n",
       "      <td>3681.48</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1003</td>\n",
       "      <td>5557</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>n</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "0 2016-01-23  1395.65           0                   3       1000     -101   \n",
       "1        NaT  1352.99         906                  19       1001     5632   \n",
       "2        NaT  3997.32         625                  21       1002     5240   \n",
       "3        NaT  3681.48         865                  35       1003     5557   \n",
       "4 2016-01-27  3850.22         929                   3       1004     5489   \n",
       "\n",
       "  user_type user_class  purchase_week  \n",
       "0         n      error              3  \n",
       "1         n      error              0  \n",
       "2         n      error              0  \n",
       "3         n      error              0  \n",
       "4         n      error              4  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of df=(1000, 9)\n"
     ]
    }
   ],
   "source": [
    "display(df.dropna(thresh=3).head())\n",
    "print(\"Shape of df={}\".format(df.dropna(thresh=3).shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode Categoricals"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One Hot Encoding using get_dummies()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "      <th>user_type_a</th>\n",
       "      <th>user_type_b</th>\n",
       "      <th>user_type_c</th>\n",
       "      <th>user_type_d</th>\n",
       "      <th>user_type_n</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>1395.65</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1000</td>\n",
       "      <td>-101</td>\n",
       "      <td>error</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaT</td>\n",
       "      <td>1352.99</td>\n",
       "      <td>906</td>\n",
       "      <td>19</td>\n",
       "      <td>1001</td>\n",
       "      <td>5632</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaT</td>\n",
       "      <td>3997.32</td>\n",
       "      <td>625</td>\n",
       "      <td>21</td>\n",
       "      <td>1002</td>\n",
       "      <td>5240</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaT</td>\n",
       "      <td>3681.48</td>\n",
       "      <td>865</td>\n",
       "      <td>35</td>\n",
       "      <td>1003</td>\n",
       "      <td>5557</td>\n",
       "      <td>error</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-01-27</td>\n",
       "      <td>3850.22</td>\n",
       "      <td>929</td>\n",
       "      <td>3</td>\n",
       "      <td>1004</td>\n",
       "      <td>5489</td>\n",
       "      <td>error</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "0 2016-01-23  1395.65           0                   3       1000     -101   \n",
       "1        NaT  1352.99         906                  19       1001     5632   \n",
       "2        NaT  3997.32         625                  21       1002     5240   \n",
       "3        NaT  3681.48         865                  35       1003     5557   \n",
       "4 2016-01-27  3850.22         929                   3       1004     5489   \n",
       "\n",
       "  user_class  purchase_week  user_type_a  user_type_b  user_type_c  \\\n",
       "0      error              3            0            0            0   \n",
       "1      error              0            0            0            0   \n",
       "2      error              0            0            0            0   \n",
       "3      error              0            0            0            0   \n",
       "4      error              4            0            0            0   \n",
       "\n",
       "   user_type_d  user_type_n  \n",
       "0            0            1  \n",
       "1            0            1  \n",
       "2            0            1  \n",
       "3            0            1  \n",
       "4            0            1  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(pd.get_dummies(df,columns=['user_type']).head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Label Mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "      <th>encoded_user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>2016-01-19</td>\n",
       "      <td>404.66</td>\n",
       "      <td>713</td>\n",
       "      <td>40</td>\n",
       "      <td>1995</td>\n",
       "      <td>5976</td>\n",
       "      <td>c</td>\n",
       "      <td>existing</td>\n",
       "      <td>3</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>2016-02-01</td>\n",
       "      <td>236.63</td>\n",
       "      <td>808</td>\n",
       "      <td>40</td>\n",
       "      <td>1996</td>\n",
       "      <td>5950</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>5</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>2016-08-01</td>\n",
       "      <td>3413.14</td>\n",
       "      <td>965</td>\n",
       "      <td>33</td>\n",
       "      <td>1997</td>\n",
       "      <td>5264</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>31</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>2016-10-02</td>\n",
       "      <td>4820.05</td>\n",
       "      <td>678</td>\n",
       "      <td>36</td>\n",
       "      <td>1998</td>\n",
       "      <td>5955</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>39</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>2016-01-15</td>\n",
       "      <td>3906.33</td>\n",
       "      <td>551</td>\n",
       "      <td>3</td>\n",
       "      <td>1999</td>\n",
       "      <td>5688</td>\n",
       "      <td>a</td>\n",
       "      <td>new</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "995 2016-01-19   404.66         713                  40       1995     5976   \n",
       "996 2016-02-01   236.63         808                  40       1996     5950   \n",
       "997 2016-08-01  3413.14         965                  33       1997     5264   \n",
       "998 2016-10-02  4820.05         678                  36       1998     5955   \n",
       "999 2016-01-15  3906.33         551                   3       1999     5688   \n",
       "\n",
       "    user_type user_class  purchase_week  encoded_user_type  \n",
       "995         c   existing              3                2.0  \n",
       "996         b        new              5                1.0  \n",
       "997         b        new             31                1.0  \n",
       "998         b        new             39                1.0  \n",
       "999         a        new              2                0.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "type_map={'a':0,'b':1,'c':2,'d':3,np.NAN:-1}\n",
    "df['encoded_user_type'] = df.user_type.map(type_map)\n",
    "display((df.tail()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random Sampling data from DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "      <th>encoded_user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>2016-01-28</td>\n",
       "      <td>429.29</td>\n",
       "      <td>625</td>\n",
       "      <td>33</td>\n",
       "      <td>1102</td>\n",
       "      <td>5240</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>4</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>435</th>\n",
       "      <td>2016-01-25</td>\n",
       "      <td>1068.73</td>\n",
       "      <td>1067</td>\n",
       "      <td>41</td>\n",
       "      <td>1435</td>\n",
       "      <td>5943</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>860</th>\n",
       "      <td>2016-08-02</td>\n",
       "      <td>5952.59</td>\n",
       "      <td>320</td>\n",
       "      <td>39</td>\n",
       "      <td>1860</td>\n",
       "      <td>5024</td>\n",
       "      <td>c</td>\n",
       "      <td>existing</td>\n",
       "      <td>31</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>2016-12-01</td>\n",
       "      <td>1158.08</td>\n",
       "      <td>405</td>\n",
       "      <td>25</td>\n",
       "      <td>1270</td>\n",
       "      <td>5759</td>\n",
       "      <td>c</td>\n",
       "      <td>existing</td>\n",
       "      <td>48</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>2016-08-02</td>\n",
       "      <td>2207.99</td>\n",
       "      <td>572</td>\n",
       "      <td>41</td>\n",
       "      <td>1106</td>\n",
       "      <td>5661</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>31</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "102 2016-01-28   429.29         625                  33       1102     5240   \n",
       "435 2016-01-25  1068.73        1067                  41       1435     5943   \n",
       "860 2016-08-02  5952.59         320                  39       1860     5024   \n",
       "270 2016-12-01  1158.08         405                  25       1270     5759   \n",
       "106 2016-08-02  2207.99         572                  41       1106     5661   \n",
       "\n",
       "    user_type      user_class  purchase_week  encoded_user_type  \n",
       "102         d  loyal_existing              4                3.0  \n",
       "435         b             new              4                1.0  \n",
       "860         c        existing             31                2.0  \n",
       "270         c        existing             48                2.0  \n",
       "106         b             new             31                1.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.sample(frac=0.2, replace=True, random_state=42).head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Normalizing Numeric Values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Normalize price values using  **Min-Max Scaler**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_normalized = df.dropna().copy()\n",
    "min_max_scaler = preprocessing.MinMaxScaler()\n",
    "np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))\n",
    "df_normalized['price'] = np_scaled.reshape(-1,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "      <th>encoded_user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>2016-01-31</td>\n",
       "      <td>0.080591</td>\n",
       "      <td>805</td>\n",
       "      <td>12</td>\n",
       "      <td>1023</td>\n",
       "      <td>5042</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>0.092868</td>\n",
       "      <td>800</td>\n",
       "      <td>36</td>\n",
       "      <td>1032</td>\n",
       "      <td>5946</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>2016-05-01</td>\n",
       "      <td>0.102266</td>\n",
       "      <td>538</td>\n",
       "      <td>5</td>\n",
       "      <td>1033</td>\n",
       "      <td>5078</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>17</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>2016-08-02</td>\n",
       "      <td>0.080187</td>\n",
       "      <td>1069</td>\n",
       "      <td>31</td>\n",
       "      <td>1034</td>\n",
       "      <td>5202</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>31</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>0.171362</td>\n",
       "      <td>1067</td>\n",
       "      <td>38</td>\n",
       "      <td>1035</td>\n",
       "      <td>5943</td>\n",
       "      <td>a</td>\n",
       "      <td>new</td>\n",
       "      <td>53</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date     price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "23 2016-01-31  0.080591         805                  12       1023     5042   \n",
       "32 2016-01-23  0.092868         800                  36       1032     5946   \n",
       "33 2016-05-01  0.102266         538                   5       1033     5078   \n",
       "34 2016-08-02  0.080187        1069                  31       1034     5202   \n",
       "35 2016-01-01  0.171362        1067                  38       1035     5943   \n",
       "\n",
       "   user_type      user_class  purchase_week  encoded_user_type  \n",
       "23         b             new              4                1.0  \n",
       "32         d  loyal_existing              3                3.0  \n",
       "33         d  loyal_existing             17                3.0  \n",
       "34         d  loyal_existing             31                3.0  \n",
       "35         a             new             53                0.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df_normalized.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Normalize quantity purchased values using  **Robust Scaler**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_normalized = df.dropna().copy()\n",
    "robust_scaler = preprocessing.RobustScaler()\n",
    "rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))\n",
    "df_normalized['quantity_purchased'] = rs_scaled.reshape(-1,1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>price</th>\n",
       "      <th>product_id</th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>serial_no</th>\n",
       "      <th>user_id</th>\n",
       "      <th>user_type</th>\n",
       "      <th>user_class</th>\n",
       "      <th>purchase_week</th>\n",
       "      <th>encoded_user_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>2016-01-31</td>\n",
       "      <td>686.87</td>\n",
       "      <td>805</td>\n",
       "      <td>-0.428571</td>\n",
       "      <td>1023</td>\n",
       "      <td>5042</td>\n",
       "      <td>b</td>\n",
       "      <td>new</td>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>2016-01-23</td>\n",
       "      <td>791.11</td>\n",
       "      <td>800</td>\n",
       "      <td>0.714286</td>\n",
       "      <td>1032</td>\n",
       "      <td>5946</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>2016-05-01</td>\n",
       "      <td>870.90</td>\n",
       "      <td>538</td>\n",
       "      <td>-0.761905</td>\n",
       "      <td>1033</td>\n",
       "      <td>5078</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>17</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>2016-08-02</td>\n",
       "      <td>683.44</td>\n",
       "      <td>1069</td>\n",
       "      <td>0.476190</td>\n",
       "      <td>1034</td>\n",
       "      <td>5202</td>\n",
       "      <td>d</td>\n",
       "      <td>loyal_existing</td>\n",
       "      <td>31</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>2016-01-01</td>\n",
       "      <td>1457.57</td>\n",
       "      <td>1067</td>\n",
       "      <td>0.809524</td>\n",
       "      <td>1035</td>\n",
       "      <td>5943</td>\n",
       "      <td>a</td>\n",
       "      <td>new</td>\n",
       "      <td>53</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date    price  product_id  quantity_purchased  serial_no  user_id  \\\n",
       "23 2016-01-31   686.87         805           -0.428571       1023     5042   \n",
       "32 2016-01-23   791.11         800            0.714286       1032     5946   \n",
       "33 2016-05-01   870.90         538           -0.761905       1033     5078   \n",
       "34 2016-08-02   683.44        1069            0.476190       1034     5202   \n",
       "35 2016-01-01  1457.57        1067            0.809524       1035     5943   \n",
       "\n",
       "   user_type      user_class  purchase_week  encoded_user_type  \n",
       "23         b             new              4                1.0  \n",
       "32         d  loyal_existing              3                3.0  \n",
       "33         d  loyal_existing             17                3.0  \n",
       "34         d  loyal_existing             31                3.0  \n",
       "35         a             new             53                0.0  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df_normalized.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Data Summarization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Condition based aggregation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean price of items purchased by user_type=a :: 2441.0280995475105\n"
     ]
    }
   ],
   "source": [
    "print(\"Mean price of items purchased by user_type=a :: {}\".format(df['price'][df['user_type']=='a'].mean()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Condtion based counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3     172\n",
      "4     166\n",
      "2     101\n",
      "39     53\n",
      "35     50\n",
      "26     50\n",
      "22     46\n",
      "9      46\n",
      "53     45\n",
      "13     45\n",
      "31     41\n",
      "44     39\n",
      "5      39\n",
      "0      31\n",
      "17     27\n",
      "48     27\n",
      "18     22\n",
      "Name: purchase_week, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(df['purchase_week'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Group By"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Group By certain attributes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "user_class\n",
      "error              565\n",
      "existing          5299\n",
      "loyal_existing    5211\n",
      "new               9988\n",
      "Name: quantity_purchased, dtype: int32\n"
     ]
    }
   ],
   "source": [
    "print(df.groupby(['user_class'])['quantity_purchased'].sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Group By with different aggregate functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>mean</th>\n",
       "      <th>count_nonzero</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_class</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>error</th>\n",
       "      <td>565</td>\n",
       "      <td>18.225806</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>existing</th>\n",
       "      <td>5299</td>\n",
       "      <td>20.699219</td>\n",
       "      <td>256</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>loyal_existing</th>\n",
       "      <td>5211</td>\n",
       "      <td>21.533058</td>\n",
       "      <td>242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>new</th>\n",
       "      <td>9988</td>\n",
       "      <td>21.205945</td>\n",
       "      <td>471</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 sum       mean  count_nonzero\n",
       "user_class                                    \n",
       "error            565  18.225806             31\n",
       "existing        5299  20.699219            256\n",
       "loyal_existing  5211  21.533058            242\n",
       "new             9988  21.205945            471"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.groupby(['user_class'])['quantity_purchased'].agg([np.sum,\n",
    "                                                                np.mean,\n",
    "                                                                np.count_nonzero]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Group by specific aggregate functions for each attribute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th>price</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_class</th>\n",
       "      <th>user_type</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>error</th>\n",
       "      <th>n</th>\n",
       "      <td>40</td>\n",
       "      <td>2355.328710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>existing</th>\n",
       "      <th>c</th>\n",
       "      <td>41</td>\n",
       "      <td>2502.277358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>loyal_existing</th>\n",
       "      <th>d</th>\n",
       "      <td>41</td>\n",
       "      <td>2349.236695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">new</th>\n",
       "      <th>a</th>\n",
       "      <td>41</td>\n",
       "      <td>2441.028100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>41</td>\n",
       "      <td>2592.225064</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          quantity_purchased        price\n",
       "user_class     user_type                                 \n",
       "error          n                          40  2355.328710\n",
       "existing       c                          41  2502.277358\n",
       "loyal_existing d                          41  2349.236695\n",
       "new            a                          41  2441.028100\n",
       "               b                          41  2592.225064"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.groupby(['user_class','user_type']).agg({'price':np.mean,\n",
    "                                                        'quantity_purchased':np.max}))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Group by with multiple agg for each attribute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Anaconda2\\envs\\python3\\lib\\site-packages\\pandas\\core\\groupby.py:4036: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version\n",
      "  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>quantity_purchased</th>\n",
       "      <th colspan=\"4\" halign=\"left\">price</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>sum</th>\n",
       "      <th>count</th>\n",
       "      <th>total_price</th>\n",
       "      <th>variance_price</th>\n",
       "      <th>mean_price</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_class</th>\n",
       "      <th>user_type</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>error</th>\n",
       "      <th>n</th>\n",
       "      <td>565</td>\n",
       "      <td>31.0</td>\n",
       "      <td>73015.19</td>\n",
       "      <td>1286.420840</td>\n",
       "      <td>2355.328710</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>existing</th>\n",
       "      <th>c</th>\n",
       "      <td>5299</td>\n",
       "      <td>256.0</td>\n",
       "      <td>615560.23</td>\n",
       "      <td>1692.473899</td>\n",
       "      <td>2502.277358</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>loyal_existing</th>\n",
       "      <th>d</th>\n",
       "      <td>5211</td>\n",
       "      <td>242.0</td>\n",
       "      <td>554419.86</td>\n",
       "      <td>1689.976272</td>\n",
       "      <td>2349.236695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">new</th>\n",
       "      <th>a</th>\n",
       "      <td>4752</td>\n",
       "      <td>229.0</td>\n",
       "      <td>539467.21</td>\n",
       "      <td>1530.875030</td>\n",
       "      <td>2441.028100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>b</th>\n",
       "      <td>5236</td>\n",
       "      <td>242.0</td>\n",
       "      <td>609172.89</td>\n",
       "      <td>1746.481212</td>\n",
       "      <td>2592.225064</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         quantity_purchased  price                             \\\n",
       "                                        sum  count total_price variance_price   \n",
       "user_class     user_type                                                        \n",
       "error          n                        565   31.0    73015.19    1286.420840   \n",
       "existing       c                       5299  256.0   615560.23    1692.473899   \n",
       "loyal_existing d                       5211  242.0   554419.86    1689.976272   \n",
       "new            a                       4752  229.0   539467.21    1530.875030   \n",
       "               b                       5236  242.0   609172.89    1746.481212   \n",
       "\n",
       "                                       \n",
       "                           mean_price  \n",
       "user_class     user_type               \n",
       "error          n          2355.328710  \n",
       "existing       c          2502.277358  \n",
       "loyal_existing d          2349.236695  \n",
       "new            a          2441.028100  \n",
       "               b          2592.225064  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.groupby(['user_class','user_type']).agg({'price':{\n",
    "                                                                'total_price':np.sum,\n",
    "                                                                'mean_price':np.mean,\n",
    "                                                                'variance_price':np.std,\n",
    "                                                                'count':np.count_nonzero},\n",
    "                                                   'quantity_purchased':np.sum}))  "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pivot Tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>user_type</th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>n</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2016-01-01</th>\n",
       "      <td>1764.427143</td>\n",
       "      <td>2783.268889</td>\n",
       "      <td>1372.035556</td>\n",
       "      <td>2394.892000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-02</th>\n",
       "      <td>2783.710000</td>\n",
       "      <td>1978.700000</td>\n",
       "      <td>2249.120000</td>\n",
       "      <td>2899.566667</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-13</th>\n",
       "      <td>2693.746000</td>\n",
       "      <td>2589.390000</td>\n",
       "      <td>3011.610000</td>\n",
       "      <td>2391.913750</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-14</th>\n",
       "      <td>2456.080000</td>\n",
       "      <td>1274.623333</td>\n",
       "      <td>2336.740000</td>\n",
       "      <td>2341.664286</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-15</th>\n",
       "      <td>2784.635000</td>\n",
       "      <td>4452.075000</td>\n",
       "      <td>2389.252500</td>\n",
       "      <td>2019.024000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-16</th>\n",
       "      <td>1871.776667</td>\n",
       "      <td>4390.910000</td>\n",
       "      <td>1474.106000</td>\n",
       "      <td>3959.580000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-17</th>\n",
       "      <td>1838.467500</td>\n",
       "      <td>2715.795000</td>\n",
       "      <td>2396.167500</td>\n",
       "      <td>1604.277500</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-18</th>\n",
       "      <td>2287.111667</td>\n",
       "      <td>4001.793750</td>\n",
       "      <td>2302.727500</td>\n",
       "      <td>2558.880000</td>\n",
       "      <td>4192.760000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-19</th>\n",
       "      <td>2666.602500</td>\n",
       "      <td>1599.908333</td>\n",
       "      <td>1752.883333</td>\n",
       "      <td>1199.605556</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-20</th>\n",
       "      <td>2931.550000</td>\n",
       "      <td>3002.483333</td>\n",
       "      <td>1593.802857</td>\n",
       "      <td>2323.136000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-21</th>\n",
       "      <td>2294.050000</td>\n",
       "      <td>2676.760000</td>\n",
       "      <td>1100.090000</td>\n",
       "      <td>2637.204167</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-22</th>\n",
       "      <td>1982.850000</td>\n",
       "      <td>1257.962857</td>\n",
       "      <td>3056.168750</td>\n",
       "      <td>2241.121667</td>\n",
       "      <td>2267.086667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-23</th>\n",
       "      <td>2074.580000</td>\n",
       "      <td>2288.303333</td>\n",
       "      <td>3044.710000</td>\n",
       "      <td>2620.335000</td>\n",
       "      <td>1395.650000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-24</th>\n",
       "      <td>2643.150000</td>\n",
       "      <td>2974.468571</td>\n",
       "      <td>1296.088000</td>\n",
       "      <td>2374.154000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-25</th>\n",
       "      <td>2280.662857</td>\n",
       "      <td>2126.724000</td>\n",
       "      <td>2518.366000</td>\n",
       "      <td>1378.290000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-26</th>\n",
       "      <td>2856.388333</td>\n",
       "      <td>2502.004000</td>\n",
       "      <td>2032.230000</td>\n",
       "      <td>856.983333</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-27</th>\n",
       "      <td>1861.821667</td>\n",
       "      <td>2942.900000</td>\n",
       "      <td>1988.678000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2465.666667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-28</th>\n",
       "      <td>2218.346667</td>\n",
       "      <td>4202.705000</td>\n",
       "      <td>2899.943333</td>\n",
       "      <td>2481.795000</td>\n",
       "      <td>786.270000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-29</th>\n",
       "      <td>2249.405000</td>\n",
       "      <td>2308.105714</td>\n",
       "      <td>3600.764444</td>\n",
       "      <td>2495.716667</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-30</th>\n",
       "      <td>2340.668889</td>\n",
       "      <td>2407.977143</td>\n",
       "      <td>772.580000</td>\n",
       "      <td>2442.558571</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-01-31</th>\n",
       "      <td>1747.980000</td>\n",
       "      <td>2723.865714</td>\n",
       "      <td>2928.438000</td>\n",
       "      <td>2564.803333</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-02-01</th>\n",
       "      <td>4419.866667</td>\n",
       "      <td>2494.590000</td>\n",
       "      <td>3073.573750</td>\n",
       "      <td>2686.650000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-02-02</th>\n",
       "      <td>1486.748000</td>\n",
       "      <td>2444.455000</td>\n",
       "      <td>3577.707778</td>\n",
       "      <td>1073.430000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-03-01</th>\n",
       "      <td>2279.590000</td>\n",
       "      <td>4372.235714</td>\n",
       "      <td>2468.440000</td>\n",
       "      <td>2574.437143</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-03-02</th>\n",
       "      <td>1490.204000</td>\n",
       "      <td>3080.872000</td>\n",
       "      <td>2099.906667</td>\n",
       "      <td>2161.867143</td>\n",
       "      <td>1920.210000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-04-01</th>\n",
       "      <td>2951.452857</td>\n",
       "      <td>1038.900000</td>\n",
       "      <td>2455.441429</td>\n",
       "      <td>2187.444286</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-04-02</th>\n",
       "      <td>1507.856000</td>\n",
       "      <td>2675.234000</td>\n",
       "      <td>3138.163333</td>\n",
       "      <td>2199.732000</td>\n",
       "      <td>3027.860000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-05-01</th>\n",
       "      <td>3483.363333</td>\n",
       "      <td>2249.283333</td>\n",
       "      <td>2710.720000</td>\n",
       "      <td>2948.008333</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-05-02</th>\n",
       "      <td>1992.621667</td>\n",
       "      <td>2404.948000</td>\n",
       "      <td>2845.672500</td>\n",
       "      <td>2339.355000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-06-01</th>\n",
       "      <td>2154.700000</td>\n",
       "      <td>2748.571250</td>\n",
       "      <td>1891.890000</td>\n",
       "      <td>2777.006667</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-06-02</th>\n",
       "      <td>1924.160000</td>\n",
       "      <td>1321.549000</td>\n",
       "      <td>2408.695556</td>\n",
       "      <td>2147.680000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-07-01</th>\n",
       "      <td>2718.868333</td>\n",
       "      <td>2836.845714</td>\n",
       "      <td>3742.868333</td>\n",
       "      <td>3811.535556</td>\n",
       "      <td>538.690000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-07-02</th>\n",
       "      <td>3776.060000</td>\n",
       "      <td>4228.195000</td>\n",
       "      <td>2421.845000</td>\n",
       "      <td>2798.548000</td>\n",
       "      <td>3965.180000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-08-01</th>\n",
       "      <td>3618.547500</td>\n",
       "      <td>3413.140000</td>\n",
       "      <td>1708.800000</td>\n",
       "      <td>2434.700000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-08-02</th>\n",
       "      <td>2846.970000</td>\n",
       "      <td>2320.396250</td>\n",
       "      <td>3671.488182</td>\n",
       "      <td>1544.168000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-09-01</th>\n",
       "      <td>3104.240000</td>\n",
       "      <td>3417.488571</td>\n",
       "      <td>4045.373333</td>\n",
       "      <td>2343.676667</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-09-02</th>\n",
       "      <td>1455.944000</td>\n",
       "      <td>2181.548889</td>\n",
       "      <td>2336.960000</td>\n",
       "      <td>2016.385000</td>\n",
       "      <td>3851.250000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-01</th>\n",
       "      <td>3133.875000</td>\n",
       "      <td>1264.810000</td>\n",
       "      <td>2009.400000</td>\n",
       "      <td>2793.730000</td>\n",
       "      <td>4184.955000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-02</th>\n",
       "      <td>1684.794000</td>\n",
       "      <td>1824.796250</td>\n",
       "      <td>2691.698000</td>\n",
       "      <td>1785.662000</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-11-01</th>\n",
       "      <td>2104.942500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2520.680000</td>\n",
       "      <td>2366.991429</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-11-02</th>\n",
       "      <td>4805.934000</td>\n",
       "      <td>2694.271667</td>\n",
       "      <td>1942.766667</td>\n",
       "      <td>1376.770000</td>\n",
       "      <td>2545.870000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-01</th>\n",
       "      <td>2400.422000</td>\n",
       "      <td>3680.301250</td>\n",
       "      <td>1640.072857</td>\n",
       "      <td>2402.953333</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "user_type             a            b            c            d            n\n",
       "date                                                                       \n",
       "2016-01-01  1764.427143  2783.268889  1372.035556  2394.892000          NaN\n",
       "2016-01-02  2783.710000  1978.700000  2249.120000  2899.566667          NaN\n",
       "2016-01-13  2693.746000  2589.390000  3011.610000  2391.913750          NaN\n",
       "2016-01-14  2456.080000  1274.623333  2336.740000  2341.664286          NaN\n",
       "2016-01-15  2784.635000  4452.075000  2389.252500  2019.024000          NaN\n",
       "2016-01-16  1871.776667  4390.910000  1474.106000  3959.580000          NaN\n",
       "2016-01-17  1838.467500  2715.795000  2396.167500  1604.277500          NaN\n",
       "2016-01-18  2287.111667  4001.793750  2302.727500  2558.880000  4192.760000\n",
       "2016-01-19  2666.602500  1599.908333  1752.883333  1199.605556          NaN\n",
       "2016-01-20  2931.550000  3002.483333  1593.802857  2323.136000          NaN\n",
       "2016-01-21  2294.050000  2676.760000  1100.090000  2637.204167          NaN\n",
       "2016-01-22  1982.850000  1257.962857  3056.168750  2241.121667  2267.086667\n",
       "2016-01-23  2074.580000  2288.303333  3044.710000  2620.335000  1395.650000\n",
       "2016-01-24  2643.150000  2974.468571  1296.088000  2374.154000          NaN\n",
       "2016-01-25  2280.662857  2126.724000  2518.366000  1378.290000          NaN\n",
       "2016-01-26  2856.388333  2502.004000  2032.230000   856.983333          NaN\n",
       "2016-01-27  1861.821667  2942.900000  1988.678000          NaN  2465.666667\n",
       "2016-01-28  2218.346667  4202.705000  2899.943333  2481.795000   786.270000\n",
       "2016-01-29  2249.405000  2308.105714  3600.764444  2495.716667          NaN\n",
       "2016-01-30  2340.668889  2407.977143   772.580000  2442.558571          NaN\n",
       "2016-01-31  1747.980000  2723.865714  2928.438000  2564.803333          NaN\n",
       "2016-02-01  4419.866667  2494.590000  3073.573750  2686.650000          NaN\n",
       "2016-02-02  1486.748000  2444.455000  3577.707778  1073.430000          NaN\n",
       "2016-03-01  2279.590000  4372.235714  2468.440000  2574.437143          NaN\n",
       "2016-03-02  1490.204000  3080.872000  2099.906667  2161.867143  1920.210000\n",
       "2016-04-01  2951.452857  1038.900000  2455.441429  2187.444286          NaN\n",
       "2016-04-02  1507.856000  2675.234000  3138.163333  2199.732000  3027.860000\n",
       "2016-05-01  3483.363333  2249.283333  2710.720000  2948.008333          NaN\n",
       "2016-05-02  1992.621667  2404.948000  2845.672500  2339.355000          NaN\n",
       "2016-06-01  2154.700000  2748.571250  1891.890000  2777.006667          NaN\n",
       "2016-06-02  1924.160000  1321.549000  2408.695556  2147.680000          NaN\n",
       "2016-07-01  2718.868333  2836.845714  3742.868333  3811.535556   538.690000\n",
       "2016-07-02  3776.060000  4228.195000  2421.845000  2798.548000  3965.180000\n",
       "2016-08-01  3618.547500  3413.140000  1708.800000  2434.700000          NaN\n",
       "2016-08-02  2846.970000  2320.396250  3671.488182  1544.168000          NaN\n",
       "2016-09-01  3104.240000  3417.488571  4045.373333  2343.676667          NaN\n",
       "2016-09-02  1455.944000  2181.548889  2336.960000  2016.385000  3851.250000\n",
       "2016-10-01  3133.875000  1264.810000  2009.400000  2793.730000  4184.955000\n",
       "2016-10-02  1684.794000  1824.796250  2691.698000  1785.662000          NaN\n",
       "2016-11-01  2104.942500          NaN  2520.680000  2366.991429          NaN\n",
       "2016-11-02  4805.934000  2694.271667  1942.766667  1376.770000  2545.870000\n",
       "2016-12-01  2400.422000  3680.301250  1640.072857  2402.953333          NaN"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "display(df.pivot_table(index='date', columns='user_type', \n",
    "                         values='price',aggfunc=np.mean))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stack a Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    date                  2016-01-23 00:00:00\n",
      "     price                             1395.65\n",
      "     product_id                              0\n",
      "     quantity_purchased                      3\n",
      "     serial_no                            1000\n",
      "     user_id                              -101\n",
      "     user_type                               n\n",
      "     user_class                          error\n",
      "     purchase_week                           3\n",
      "1    price                             1352.99\n",
      "     product_id                            906\n",
      "     quantity_purchased                     19\n",
      "     serial_no                            1001\n",
      "     user_id                              5632\n",
      "     user_type                               n\n",
      "     user_class                          error\n",
      "     purchase_week                           0\n",
      "2    price                             3997.32\n",
      "     product_id                            625\n",
      "     quantity_purchased                     21\n",
      "     serial_no                            1002\n",
      "     user_id                              5240\n",
      "     user_type                               n\n",
      "     user_class                          error\n",
      "     purchase_week                           0\n",
      "3    price                             3681.48\n",
      "     product_id                            865\n",
      "     quantity_purchased                     35\n",
      "     serial_no                            1003\n",
      "     user_id                              5557\n",
      "                                  ...         \n",
      "997  date                  2016-08-01 00:00:00\n",
      "     price                             3413.14\n",
      "     product_id                            965\n",
      "     quantity_purchased                     33\n",
      "     serial_no                            1997\n",
      "     user_id                              5264\n",
      "     user_type                               b\n",
      "     user_class                            new\n",
      "     purchase_week                          31\n",
      "     encoded_user_type                       1\n",
      "998  date                  2016-10-02 00:00:00\n",
      "     price                             4820.05\n",
      "     product_id                            678\n",
      "     quantity_purchased                     36\n",
      "     serial_no                            1998\n",
      "     user_id                              5955\n",
      "     user_type                               b\n",
      "     user_class                            new\n",
      "     purchase_week                          39\n",
      "     encoded_user_type                       1\n",
      "999  date                  2016-01-15 00:00:00\n",
      "     price                             3906.33\n",
      "     product_id                            551\n",
      "     quantity_purchased                      3\n",
      "     serial_no                            1999\n",
      "     user_id                              5688\n",
      "     user_type                               a\n",
      "     user_class                            new\n",
      "     purchase_week                           2\n",
      "     encoded_user_type                       0\n",
      "Length: 9907, dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(df.stack())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}