{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "

Introduction to Python for Data Sciences

Franck Iutzeler
\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "

\n", "\n", "
Chap. 3 - Data Handling with Pandas
\n", "\n", "

\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "# 2- Dataframes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Operations" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Numpy operations\n", "\n", "\n", "If we apply a NumPy function on a Pandas datframe, the result will be another Pandas dataframe with the indices preserved." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
09815
12912
26025
\n", "
" ], "text/plain": [ " A B C D\n", "0 9 8 1 5\n", "1 2 9 1 2\n", "2 6 0 2 5" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
0-1.00.0-1.0-1.0
1-2.0-1.0-1.0-2.0
2-2.00.0-2.0-1.0
\n", "
" ], "text/plain": [ " A B C D\n", "0 -1.0 0.0 -1.0 -1.0\n", "1 -2.0 -1.0 -1.0 -2.0\n", "2 -2.0 0.0 -2.0 -1.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.cos(df * np.pi/2 ) - 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Arithmetic operations\n", "\n", "Arithmetic operations can also be performed either with + - / * or with dedicated add multiply etc methods" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
0410
122
\n", "
" ], "text/plain": [ " A B\n", "0 4 10\n", "1 2 2" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A = pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=list('AB'))\n", "A" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BAC
0256
1336
2740
\n", "
" ], "text/plain": [ " B A C\n", "0 2 5 6\n", "1 3 3 6\n", "2 7 4 0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "B = pd.DataFrame(np.random.randint(0, 10, (3, 3)), columns=list('BAC'))\n", "B" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
09.012.0NaN
15.05.0NaN
2NaNNaNNaN
\n", "
" ], "text/plain": [ " A B C\n", "0 9.0 12.0 NaN\n", "1 5.0 5.0 NaN\n", "2 NaN NaN NaN" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A+B" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The pandas arithmetic functions also have an option to fill missing values by replacing the missing one in either of the dataframes by some value." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
09.012.06.0
15.05.06.0
24.07.00.0
\n", "
" ], "text/plain": [ " A B C\n", "0 9.0 12.0 6.0\n", "1 5.0 5.0 6.0\n", "2 4.0 7.0 0.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.add(B, fill_value=0.0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Appending, Concatenating, and Merging\n", "\n", "Thanks to naming, dataframes can be easily added, merged, etc. However, if some entries are missing (columns or indices), the operations may get complicated. Here the most standard situations are covered, take a look at the documentation (notably [this one on merging, appending, and concatenating](https://pandas.pydata.org/pandas-docs/stable/merging.html) )\n", "\n", "* **Appending** is for adding the lines of one dataframe with another one with the same columns." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A:\n", " A B\n", "0 11 15\n", "1 2 19 \n", "A2:\n", " A B\n", "0 17 8\n", "1 18 8\n", "2 3 12\n" ] } ], "source": [ "A = pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=list('AB'))\n", "A2 = pd.DataFrame(np.random.randint(0, 20, (3, 2)), columns=list('AB'))\n", "print(\"A:\\n\",A,\"\\nA2:\\n\",A2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
01115
1219
0178
1188
2312
\n", "
" ], "text/plain": [ " A B\n", "0 11 15\n", "1 2 19\n", "0 17 8\n", "1 18 8\n", "2 3 12" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.append(A2) # this does not \"append to A\" but creates a new dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sometimes, indexes do not matter, they can be resetted using ignore_index=True. " ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
01115
1219
2178
3188
4312
\n", "
" ], "text/plain": [ " A B\n", "0 11 15\n", "1 2 19\n", "2 17 8\n", "3 18 8\n", "4 3 12" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.append(A2,ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* **Concatenating** is for adding lines and/or columns of multiples datasets (it is a generalization of appending)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A:\n", " A B\n", "0 18 16\n", "1 7 13 \n", "A2:\n", " A B\n", "0 7 16\n", "1 4 5\n", "2 3 7 \n", "A3:\n", " C A D\n", "0 13 17 1\n" ] } ], "source": [ "A = pd.DataFrame(np.random.randint(0, 20, (2, 2)), columns=list('AB'))\n", "A2 = pd.DataFrame(np.random.randint(0, 20, (3, 2)), columns=list('AB'))\n", "A3 = pd.DataFrame(np.random.randint(0, 20, (1, 3)), columns=list('CAD'))\n", "print(\"A:\\n\",A,\"\\nA2:\\n\",A2,\"\\nA3:\\n\",A3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The most important settings of the concat function are pd.concat(objs, axis=0, join='outer',ignore_index=False) where
\n", ". *objs* is the list of dataframes to concatenate
\n", ". *axis* is the axis on which to concatenate 0 (default) for the lines and 1 for the columns
\n", ". *join* is to decide if we keep all columns/indices on the other axis ('outer' ,default), or the intersection ( 'inner')
\n", ". *ignore_index* is to decide is we keep the previous names (False, default) or give new ones (True)\n", "\n", "\n", "For a detailed view see [this doc on merging, appending, and concatenating](https://pandas.pydata.org/pandas-docs/stable/merging.html)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
01816.0NaNNaN
1713.0NaNNaN
2716.0NaNNaN
345.0NaNNaN
437.0NaNNaN
517NaN13.01.0
\n", "
" ], "text/plain": [ " A B C D\n", "0 18 16.0 NaN NaN\n", "1 7 13.0 NaN NaN\n", "2 7 16.0 NaN NaN\n", "3 4 5.0 NaN NaN\n", "4 3 7.0 NaN NaN\n", "5 17 NaN 13.0 1.0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([A,A2,A3],ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABABCAD
018.016.071613.017.01.0
17.013.045NaNNaNNaN
2NaNNaN37NaNNaNNaN
\n", "
" ], "text/plain": [ " A B A B C A D\n", "0 18.0 16.0 7 16 13.0 17.0 1.0\n", "1 7.0 13.0 4 5 NaN NaN NaN\n", "2 NaN NaN 3 7 NaN NaN NaN" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([A,A2,A3],axis=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
0181671613171
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "0 18 16 7 16 13 17 1" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([A,A2,A3],axis=1,ignore_index=True,join='inner')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* **Merging** is for putting together two dataframes with *hopefully* common data\n", "\n", "\n", "For a detailed view see [this doc on merging, appending, and concatenating](https://pandas.pydata.org/pandas-docs/stable/merging.html)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
employeegroup
0BobAccounting
1JakeEngineering
2LisaEngineering
3SueHR
\n", "
" ], "text/plain": [ " employee group\n", "0 Bob Accounting\n", "1 Jake Engineering\n", "2 Lisa Engineering\n", "3 Sue HR" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],\n", " 'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})\n", "df1" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
employeehire_date
0Lisa2004
1Bob2008
2Jake2012
3Sue2014
\n", "
" ], "text/plain": [ " employee hire_date\n", "0 Lisa 2004\n", "1 Bob 2008\n", "2 Jake 2012\n", "3 Sue 2014" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],\n", " 'hire_date': [2004, 2008, 2012, 2014]})\n", "df2" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
employeegrouphire_date
0BobAccounting2008
1JakeEngineering2012
2LisaEngineering2004
3SueHR2014
\n", "
" ], "text/plain": [ " employee group hire_date\n", "0 Bob Accounting 2008\n", "1 Jake Engineering 2012\n", "2 Lisa Engineering 2004\n", "3 Sue HR 2014" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = pd.merge(df1,df2)\n", "df3" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
groupsupervisor
0AccountingCarly
1EngineeringGuido
2HRSteve
\n", "
" ], "text/plain": [ " group supervisor\n", "0 Accounting Carly\n", "1 Engineering Guido\n", "2 HR Steve" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],\n", " 'supervisor': ['Carly', 'Guido', 'Steve']})\n", "df4" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
employeegrouphire_datesupervisor
0BobAccounting2008Carly
1JakeEngineering2012Guido
2LisaEngineering2004Guido
3SueHR2014Steve
\n", "
" ], "text/plain": [ " employee group hire_date supervisor\n", "0 Bob Accounting 2008 Carly\n", "1 Jake Engineering 2012 Guido\n", "2 Lisa Engineering 2004 Guido\n", "3 Sue HR 2014 Steve" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df3,df4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparing the Data\n", "\n", "\n", "Before exploring the data, it is primordial to verify its soundness, indeed if it has missing or replicated data, the results of our test may not be accurate. Pandas provides a collection of methodes to verify the sanity of the data (recall that when data is missing for an entry, it is noted as `NaN`, and thus any further operation including this will be `NaN`). \n", "\n", "\n", "To explore some typical problems in a dataset, I messed with a small part of the [*MovieLens*](https://grouplens.org/datasets/movielens/) dataset. The `ratings_mess.csv` file contains 4 columns:\n", "* `userId` id of the user, integer greater than 1 \n", "* `movieId` id of the user, integer greater than 1 \n", "* `rating` rating of the user to the movie, float between 0.0 and 5.0 \n", "* `timestamp` timestamp, integer \n", "\n", "and features (man-made!) errors, some of them minor some of them major.\n", "\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01.031.02.51.260759e+09
11.01029.03.01.260759e+09
22.010.04.0NaN
32.052.083.08.353560e+08
4NaN62.03.08.353557e+08
52.0144.0NaN8.353560e+08
60.0616.03.08.353559e+08
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1.0 31.0 2.5 1.260759e+09\n", "1 1.0 1029.0 3.0 1.260759e+09\n", "2 2.0 10.0 4.0 NaN\n", "3 2.0 52.0 83.0 8.353560e+08\n", "4 NaN 62.0 3.0 8.353557e+08\n", "5 2.0 144.0 NaN 8.353560e+08\n", "6 0.0 616.0 3.0 8.353559e+08" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings = pd.read_csv('data/ml-small/ratings_mess.csv')\n", "ratings.head(7) # enables to display the top n lines of a dataframe, 5 by default" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Missing values\n", "\n", "\n", "Pandas provides functions that check if the values are missing:\n", "\n", "* ``isnull()``: Generate a boolean mask indicating missing values\n", "* ``notnull()``: Opposite of ``isnull()``\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
0FalseFalseFalseFalse
1FalseFalseFalseFalse
2FalseFalseFalseTrue
3FalseFalseFalseFalse
4TrueFalseFalseFalse
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 False False False False\n", "1 False False False False\n", "2 False False False True\n", "3 False False False False\n", "4 True False False False" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.isnull().head(5)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Carefully pruning data\n", "\n", "Now that we have to prune lines of our data, this will be done using `dropna()` through `dataframe.dropna(subset=[\"col_1\",\"col_2\"],inplace=True)` which drops all rows with at least one missing value in the columns `col1, col2` of `dataframe` *in place* that is *without copy*.\n", "\n", "\n", "**Warning:** this function deletes any line with at least **one** missing data, which is not always wishable. Also, with *inplace=True*, it is applied in place, meaning that they modify the dataframe it is applied to, it is thus an **irreversible operation**; drop `inplace=True` to create a copy or see the result before apllying it.\n", "\n", "For instance here, `userId,movieId,rating` are essential whereas the `timestamp` is not (it can be dropped for the prediciton process). Thus, we will delete the lines where one of `userId,movieId,rating` is missing and fill the `timestamp` with 0 when it is missing." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01.031.02.51.260759e+09
11.01029.03.01.260759e+09
22.010.04.0NaN
32.052.083.08.353560e+08
60.0616.03.08.353559e+08
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1.0 31.0 2.5 1.260759e+09\n", "1 1.0 1029.0 3.0 1.260759e+09\n", "2 2.0 10.0 4.0 NaN\n", "3 2.0 52.0 83.0 8.353560e+08\n", "6 0.0 616.0 3.0 8.353559e+08" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.dropna(subset=[\"userId\",\"movieId\",\"rating\"],inplace=True)\n", "ratings.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To fill missing data (from a certain column), the recommended way is to use `fillna()` through `dataframe[\"col\"].fillna(value,inplace=True)` which replace all missing values in the column `col` of `dataframe` by `value` *in place* that is without copy (again this is irreversible, to use the copy version use `inplace=False`).\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01.031.02.51.260759e+09
11.01029.03.01.260759e+09
22.010.04.00.000000e+00
32.052.083.08.353560e+08
60.0616.03.08.353559e+08
82.0720.04.08.353560e+08
93.060.03.01.298862e+09
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1.0 31.0 2.5 1.260759e+09\n", "1 1.0 1029.0 3.0 1.260759e+09\n", "2 2.0 10.0 4.0 0.000000e+00\n", "3 2.0 52.0 83.0 8.353560e+08\n", "6 0.0 616.0 3.0 8.353559e+08\n", "8 2.0 720.0 4.0 8.353560e+08\n", "9 3.0 60.0 3.0 1.298862e+09" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings[\"timestamp\"].fillna(0,inplace=True)\n", "ratings.head(7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This indeed gives the correct result, however, the line indexing presents missing number. The indexes can be resetted with `reset_index(inplace=True,drop=True)`" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01.031.02.51.260759e+09
11.01029.03.01.260759e+09
22.010.04.00.000000e+00
32.052.083.08.353560e+08
40.0616.03.08.353559e+08
52.0720.04.08.353560e+08
63.060.03.01.298862e+09
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1.0 31.0 2.5 1.260759e+09\n", "1 1.0 1029.0 3.0 1.260759e+09\n", "2 2.0 10.0 4.0 0.000000e+00\n", "3 2.0 52.0 83.0 8.353560e+08\n", "4 0.0 616.0 3.0 8.353559e+08\n", "5 2.0 720.0 4.0 8.353560e+08\n", "6 3.0 60.0 3.0 1.298862e+09" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.reset_index(inplace=True,drop=True)\n", "ratings.head(7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Improper values\n", "\n", "Even without the missing values, some lines are problematic as they feature values outside of prescribed range (`userId` id of the user, integer greater than 1; `movieId` id of the user, integer greater than 1; `rating` rating of the user to the movie, float between 0.0 and 5.0; `imestamp` timestamp, integer ) \n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
40.0616.03.0835355932.0
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "4 0.0 616.0 3.0 835355932.0" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings[ratings[\"userId\"]<1] # Identifying a problem" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we drop the corresponding line, with `drop` by `drop(problematic_row.index, inplace=True)`. \n", "\n", "**Warning:** Do not forget `.index` and `inplace=True`" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01.031.02.51.260759e+09
11.01029.03.01.260759e+09
22.010.04.00.000000e+00
32.052.083.08.353560e+08
52.0720.04.08.353560e+08
63.060.03.01.298862e+09
73.0110.04.01.298922e+09
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1.0 31.0 2.5 1.260759e+09\n", "1 1.0 1029.0 3.0 1.260759e+09\n", "2 2.0 10.0 4.0 0.000000e+00\n", "3 2.0 52.0 83.0 8.353560e+08\n", "5 2.0 720.0 4.0 8.353560e+08\n", "6 3.0 60.0 3.0 1.298862e+09\n", "7 3.0 110.0 4.0 1.298922e+09" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.drop(ratings[ratings[\"userId\"]<1].index, inplace=True)\n", "ratings.head(7)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
123.00.03.01.298924e+09
244.0-1.02.09.499822e+08
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "12 3.0 0.0 3.0 1.298924e+09\n", "24 4.0 -1.0 2.0 9.499822e+08" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pb_rows = ratings[ratings[\"movieId\"]<1]\n", "pb_rows" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "ratings.drop(pb_rows.index, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And finally the ratings." ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
153.02858.0-4.01.298922e+09
32.052.083.08.353560e+08
255.03176.0123.51.163374e+09
296.0111.06.01.109258e+09
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "15 3.0 2858.0 -4.0 1.298922e+09\n", "3 2.0 52.0 83.0 8.353560e+08\n", "25 5.0 3176.0 123.5 1.163374e+09\n", "29 6.0 111.0 6.0 1.109258e+09" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pb_rows = ratings[ratings[\"rating\"]<0]\n", "pb_rows2 = ratings[ratings[\"rating\"]>5]\n", "tot_pb_rows = pb_rows.append(pb_rows2 )\n", "tot_pb_rows" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "ratings.drop(tot_pb_rows.index, inplace=True)\n", "ratings.reset_index(inplace=True,drop=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We finally have our dataset cured! Let us save it for further use.\n", "\n", "`to_csv` saves as CSV into some file, `index=False` drops the index names as we did not specify it." ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "ratings.to_csv(\"data/ml-small/ratings_cured.csv\",index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Basic Statistics \n", "\n", "With our cured dataset, we can begin exploring." ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01.031.02.51.260759e+09
11.01029.03.01.260759e+09
22.010.04.00.000000e+00
32.0720.04.08.353560e+08
43.060.03.01.298862e+09
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1.0 31.0 2.5 1.260759e+09\n", "1 1.0 1029.0 3.0 1.260759e+09\n", "2 2.0 10.0 4.0 0.000000e+00\n", "3 2.0 720.0 4.0 8.353560e+08\n", "4 3.0 60.0 3.0 1.298862e+09" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings = pd.read_csv('data/ml-small/ratings_cured.csv')\n", "ratings.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following table summarizes some other built-in Pandas aggregations:\n", "\n", "| Aggregation | Description |\n", "|--------------------------|---------------------------------|\n", "| ``count()`` | Total number of items |\n", "| ``first()``, ``last()`` | First and last item |\n", "| ``mean()``, ``median()`` | Mean and median |\n", "| ``min()``, ``max()`` | Minimum and maximum |\n", "| ``std()``, ``var()`` | Standard deviation and variance |\n", "| ``mad()`` | Mean absolute deviation |\n", "| ``prod()`` | Product of all items |\n", "| ``sum()`` | Sum of all items |\n", "\n", "These are all methods of ``DataFrame`` and ``Series`` objects, and ``description`` also provides a quick overview." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
count24.00000024.00000024.0000002.400000e+01
mean3.4166672465.9166674.0208331.056019e+09
std1.2128546171.5076860.9264143.610777e+08
min1.00000010.0000002.0000000.000000e+00
25%3.000000213.2500003.3750009.498962e+08
50%3.0000001028.5000004.0000001.163375e+09
75%4.0000001628.0000005.0000001.298862e+09
max6.00000030749.0000005.0000001.298923e+09
\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "count 24.000000 24.000000 24.000000 2.400000e+01\n", "mean 3.416667 2465.916667 4.020833 1.056019e+09\n", "std 1.212854 6171.507686 0.926414 3.610777e+08\n", "min 1.000000 10.000000 2.000000 0.000000e+00\n", "25% 3.000000 213.250000 3.375000 9.498962e+08\n", "50% 3.000000 1028.500000 4.000000 1.163375e+09\n", "75% 4.000000 1628.000000 5.000000 1.298862e+09\n", "max 6.000000 30749.000000 5.000000 1.298923e+09" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We see that these statistics do not make sense for all rows. Let us drop the timestamp and examine the ratings." ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdrating
01.031.02.5
11.01029.03.0
22.010.04.0
32.0720.04.0
43.060.03.0
\n", "
" ], "text/plain": [ " userId movieId rating\n", "0 1.0 31.0 2.5\n", "1 1.0 1029.0 3.0\n", "2 2.0 10.0 4.0\n", "3 2.0 720.0 4.0\n", "4 3.0 60.0 3.0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.drop(\"timestamp\",axis=1,inplace=True)\n", "ratings.head()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 24.000000\n", "mean 4.020833\n", "std 0.926414\n", "min 2.000000\n", "25% 3.375000\n", "50% 4.000000\n", "75% 5.000000\n", "max 5.000000\n", "Name: rating, dtype: float64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings[\"rating\"].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## GroupBy \n", "These ratings are linked to users and movies, in order to have a separate view per user/movie, *grouping* has to be used." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The ``GroupBy`` operation (that comes from SQL) accomplishes:\n", "\n", "- The *split* step involves breaking up and grouping a ``DataFrame`` depending on the value of the specified key.\n", "- The *apply* step involves computing some function, usually an sum, median, means etc *within the individual groups*.\n", "- The *combine* step merges the results of these operations into an output array.\n", "\n", "\n", "

Source: [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do) by Jake VanderPlas

" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdrating
01.031.02.5
11.01029.03.0
22.010.04.0
32.0720.04.0
43.060.03.0
\n", "
" ], "text/plain": [ " userId movieId rating\n", "0 1.0 31.0 2.5\n", "1 1.0 1029.0 3.0\n", "2 2.0 10.0 4.0\n", "3 2.0 720.0 4.0\n", "4 3.0 60.0 3.0" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So to get the mean of the ratings per user, the command is" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "1.0 2.750000\n", "2.0 4.000000\n", "3.0 3.444444\n", "4.0 5.000000\n", "5.0 4.166667\n", "6.0 4.500000\n", "Name: rating, dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.groupby(\"userId\")[\"rating\"].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering\n", "\n", "Filtering is the action of deleting rows depending on a boolean function. For instance, the following removes the user with a rating of only one movie." ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "1.0 2\n", "2.0 2\n", "3.0 9\n", "4.0 7\n", "5.0 3\n", "6.0 1\n", "Name: rating, dtype: int64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.groupby(\"userId\")[\"rating\"].count()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdrating
01.031.02.5
11.01029.03.0
22.010.04.0
32.0720.04.0
43.060.03.0
53.0110.04.0
63.0247.03.5
73.0592.03.0
83.0593.03.0
93.0595.02.0
103.02762.03.5
113.02841.04.0
123.02959.05.0
134.034.05.0
144.0112.05.0
154.01028.05.0
164.01030.05.0
174.01079.05.0
184.01089.05.0
194.01097.05.0
205.04447.04.5
215.04718.03.5
225.030749.04.5
\n", "
" ], "text/plain": [ " userId movieId rating\n", "0 1.0 31.0 2.5\n", "1 1.0 1029.0 3.0\n", "2 2.0 10.0 4.0\n", "3 2.0 720.0 4.0\n", "4 3.0 60.0 3.0\n", "5 3.0 110.0 4.0\n", "6 3.0 247.0 3.5\n", "7 3.0 592.0 3.0\n", "8 3.0 593.0 3.0\n", "9 3.0 595.0 2.0\n", "10 3.0 2762.0 3.5\n", "11 3.0 2841.0 4.0\n", "12 3.0 2959.0 5.0\n", "13 4.0 34.0 5.0\n", "14 4.0 112.0 5.0\n", "15 4.0 1028.0 5.0\n", "16 4.0 1030.0 5.0\n", "17 4.0 1079.0 5.0\n", "18 4.0 1089.0 5.0\n", "19 4.0 1097.0 5.0\n", "20 5.0 4447.0 4.5\n", "21 5.0 4718.0 3.5\n", "22 5.0 30749.0 4.5" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def filter_func(x):\n", " return x[\"rating\"].count() >= 2\n", "\n", "filtered = ratings.groupby(\"userId\").filter(filter_func)\n", "filtered" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "1.0 2\n", "2.0 2\n", "3.0 9\n", "4.0 7\n", "5.0 3\n", "Name: rating, dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "filtered.groupby(\"userId\")[\"rating\"].count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transformations\n", "\n", "Transforming is the actions of applying a transformation (sic).\n", "\n", "For instance, let us normalize the ratings so that they have zero mean for each user." 
] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "1.0 2.750000\n", "2.0 4.000000\n", "3.0 3.444444\n", "4.0 5.000000\n", "5.0 4.166667\n", "6.0 4.500000\n", "Name: rating, dtype: float64" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.groupby(\"userId\")[\"rating\"].mean()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "def center_ratings(x):\n", " x[\"rating\"] = x[\"rating\"] - x[\"rating\"].mean()\n", " return x\n", "\n", "centered = ratings.groupby(\"userId\").apply(center_ratings)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "userId\n", "1.0 0.000000e+00\n", "2.0 0.000000e+00\n", "3.0 -1.973730e-16\n", "4.0 0.000000e+00\n", "5.0 -2.960595e-16\n", "6.0 0.000000e+00\n", "Name: rating, dtype: float64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "centered.groupby(\"userId\")[\"rating\"].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Aggregations [*]\n", "\n", "Aggregations let you aggreagate several operations." ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
minmaxmeanmedianlen
userId
1.02.53.02.7500002.752.0
2.04.04.04.0000004.002.0
3.02.05.03.4444443.509.0
4.05.05.05.0000005.007.0
5.03.54.54.1666674.503.0
6.04.54.54.5000004.501.0
\n", "
" ], "text/plain": [ " min max mean median len\n", "userId \n", "1.0 2.5 3.0 2.750000 2.75 2.0\n", "2.0 4.0 4.0 4.000000 4.00 2.0\n", "3.0 2.0 5.0 3.444444 3.50 9.0\n", "4.0 5.0 5.0 5.000000 5.00 7.0\n", "5.0 3.5 4.5 4.166667 4.50 3.0\n", "6.0 4.5 4.5 4.500000 4.50 1.0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.groupby(\"userId\")[\"rating\"].aggregate([min,max,np.mean,np.median,len])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Exercises " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> **Exercise:** Bots Discovery\n", ">\n", "> In the dataset `ratings_bots.csv`, some users may be bots. To help a movie sucess they add ratings (favorable ones often). To get a better recommendation, we try to remove them.\n", ">\n", "> * Count the users with a mean rating above 4.7/5 and delete them \n", ">\n", "> *hint:* the [nunique](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.nunique.html) function may be helpful to count\n", ">\n", ">\n", "> * Delete multiples reviews of a movie by a single user by replacing them with only the first one. What is the proportion of potential bots among the users?\n", ">\n", "> *hint:* the `groupby` function can be applied to several columns, also `reset_index(drop=True)` removes the grouby indexing. \n", ">\n", "> *hint:* remember the `loc` function, e.g. `df.loc[df['userId'] == 128]` returns a dataframe of the rows where the userId is 128; and `df.loc[df['userId'] == 128].loc[samerev['movieId'] == 3825]` returns a dataframe of the rows where the userId is 128 **and** the movieID is 3825.\n", ">\n", "> In total , 17 ratings have to be removed. 
For instance, user 128 has 3 ratings of the movie 3825.\n", "> This dataset has around 100 000 ratings, so hand-picking won't do!\n", "\n" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "ratings_bots = pd.read_csv('data/ml-small/ratings_bots.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> **Exercise:** Planets discovery\n", "> \n", "> We will use the Planets dataset, available via the [Seaborn package](http://seaborn.pydata.org/). It provides information on how astronomers found new planets around other stars, called *exoplanets*.\n", ">\n", "> * Display median, mean and quantile information for these planets' orbital periods, masses, and distances.\n", "> * For each method, display statistics on the years planets were discovered using this technique.\n" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1035, 6)\n" ] }, { "data": { "text/html": [ "<div>
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
methodnumberorbital_periodmassdistanceyear
0Radial Velocity1269.3007.1077.402006
1Radial Velocity1874.7742.2156.952008
2Radial Velocity1763.0002.6019.842011
3Radial Velocity1326.03019.40110.622007
4Radial Velocity1516.22010.50119.472009
\n", "
" ], "text/plain": [ " method number orbital_period mass distance year\n", "0 Radial Velocity 1 269.300 7.10 77.40 2006\n", "1 Radial Velocity 1 874.774 2.21 56.95 2008\n", "2 Radial Velocity 1 763.000 2.60 19.84 2011\n", "3 Radial Velocity 1 326.030 19.40 110.62 2007\n", "4 Radial Velocity 1 516.220 10.50 119.47 2009" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "planets = pd.read_csv('data/planets.csv')\n", "print(planets.shape)\n", "planets.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }