{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read and Info"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read tabular data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read a dataset of movie reviewers\n",
    "user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n",
    "users = pd.read_table('./data/movie.user',\n",
    "                      sep='|',\n",
    "                      header=None,\n",
    "                      names=user_cols)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>85711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>53</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>94043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>23</td>\n",
       "      <td>M</td>\n",
       "      <td>writer</td>\n",
       "      <td>32067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>43537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>33</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>15213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  age gender  occupation zip_code\n",
       "0        1   24      M  technician    85711\n",
       "1        2   53      F       other    94043\n",
       "2        3   23      M      writer    32067\n",
       "3        4   24      M  technician    43537\n",
       "4        5   33      F       other    15213"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the first 5 rows\n",
    "users.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read_csv is equivalent to read_table, except it assumes a comma separator\n",
    "ufo = pd.read_csv('./data/ufo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the first 5 rows\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<bound method NDFrame.head of                        City State\n",
       "0                    Ithaca    NY\n",
       "1               Willingboro    NJ\n",
       "2                   Holyoke    CO\n",
       "3                   Abilene    KS\n",
       "4      New York Worlds Fair    NY\n",
       "...                     ...   ...\n",
       "18236            Grant Park    IL\n",
       "18237           Spirit Lake    IA\n",
       "18238           Eagle River    WI\n",
       "18239           Eagle River    WI\n",
       "18240                  Ybor    FL\n",
       "\n",
       "[18241 rows x 2 columns]>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# specify which columns to include by name\n",
    "pd.read_csv('./data/ufo.csv', usecols=['City', 'State']).head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'ufo.csv'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39m# specify columns by position\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m pd\u001b[39m.\u001b[39;49mread_csv(\u001b[39m'\u001b[39;49m\u001b[39mufo.csv\u001b[39;49m\u001b[39m'\u001b[39;49m, usecols\u001b[39m=\u001b[39;49m[\u001b[39m0\u001b[39;49m, \u001b[39m4\u001b[39;49m])\u001b[39m.\u001b[39mhead\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    209\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    210\u001b[0m         kwargs[new_arg_name] \u001b[39m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/util/_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[1;32m    326\u001b[0m     warnings\u001b[39m.\u001b[39mwarn(\n\u001b[1;32m    327\u001b[0m         msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[1;32m    328\u001b[0m         \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[1;32m    329\u001b[0m         stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[1;32m    330\u001b[0m     )\n\u001b[0;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)\u001b[0m\n\u001b[1;32m    935\u001b[0m kwds_defaults \u001b[39m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m    936\u001b[0m     dialect,\n\u001b[1;32m    937\u001b[0m     delimiter,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    946\u001b[0m     defaults\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mdelimiter\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m},\n\u001b[1;32m    947\u001b[0m )\n\u001b[1;32m    948\u001b[0m kwds\u001b[39m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 950\u001b[0m \u001b[39mreturn\u001b[39;00m _read(filepath_or_buffer, kwds)\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    602\u001b[0m _validate_names(kwds\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mnames\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[1;32m    604\u001b[0m \u001b[39m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 605\u001b[0m parser \u001b[39m=\u001b[39m TextFileReader(filepath_or_buffer, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwds)\n\u001b[1;32m    607\u001b[0m \u001b[39mif\u001b[39;00m chunksize \u001b[39mor\u001b[39;00m iterator:\n\u001b[1;32m    608\u001b[0m     \u001b[39mreturn\u001b[39;00m parser\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m   1439\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39moptions[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m kwds[\u001b[39m\"\u001b[39m\u001b[39mhas_index_names\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m   1441\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles: IOHandles \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m-> 1442\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_engine \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_make_engine(f, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mengine)\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m   1733\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m mode:\n\u001b[1;32m   1734\u001b[0m         mode \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m-> 1735\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39m=\u001b[39m get_handle(\n\u001b[1;32m   1736\u001b[0m     f,\n\u001b[1;32m   1737\u001b[0m     mode,\n\u001b[1;32m   1738\u001b[0m     encoding\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   1739\u001b[0m     compression\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mcompression\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   1740\u001b[0m     memory_map\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mmemory_map\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mFalse\u001b[39;49;00m),\n\u001b[1;32m   1741\u001b[0m     is_text\u001b[39m=\u001b[39;49mis_text,\n\u001b[1;32m   1742\u001b[0m     errors\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mencoding_errors\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mstrict\u001b[39;49m\u001b[39m\"\u001b[39;49m),\n\u001b[1;32m   1743\u001b[0m     storage_options\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49moptions\u001b[39m.\u001b[39;49mget(\u001b[39m\"\u001b[39;49m\u001b[39mstorage_options\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39mNone\u001b[39;49;00m),\n\u001b[1;32m   1744\u001b[0m )\n\u001b[1;32m   1745\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m   1746\u001b[0m f \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mhandles\u001b[39m.\u001b[39mhandle\n",
      "File \u001b[0;32m/usr/local/Caskroom/mambaforge/base/envs/p39/lib/python3.10/site-packages/pandas/io/common.py:856\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m    851\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(handle, \u001b[39mstr\u001b[39m):\n\u001b[1;32m    852\u001b[0m     \u001b[39m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m    853\u001b[0m     \u001b[39m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m    854\u001b[0m     \u001b[39mif\u001b[39;00m ioargs\u001b[39m.\u001b[39mencoding \u001b[39mand\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m ioargs\u001b[39m.\u001b[39mmode:\n\u001b[1;32m    855\u001b[0m         \u001b[39m# Encoding\u001b[39;00m\n\u001b[0;32m--> 856\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39;49m(\n\u001b[1;32m    857\u001b[0m             handle,\n\u001b[1;32m    858\u001b[0m             ioargs\u001b[39m.\u001b[39;49mmode,\n\u001b[1;32m    859\u001b[0m             encoding\u001b[39m=\u001b[39;49mioargs\u001b[39m.\u001b[39;49mencoding,\n\u001b[1;32m    860\u001b[0m             errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m    861\u001b[0m             newline\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m    862\u001b[0m         )\n\u001b[1;32m    863\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    864\u001b[0m         \u001b[39m# Binary mode\u001b[39;00m\n\u001b[1;32m    865\u001b[0m         handle \u001b[39m=\u001b[39m \u001b[39mopen\u001b[39m(handle, ioargs\u001b[39m.\u001b[39mmode)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'ufo.csv'"
     ]
    }
   ],
   "source": [
    "# specify columns by position\n",
    "pd.read_csv('ufo.csv', usecols=[0, 4]).head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City  Colors Reported Shape Reported State             Time\n",
       "0       Ithaca              NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1  Willingboro              NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2      Holyoke              NaN           OVAL    CO  2/15/1931 14:00"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# specify how many rows to read\n",
    "pd.read_csv('ufo.csv', nrows=3).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Describe data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read a dataset of top-rated IMDb movies into a DataFrame\n",
    "movies = pd.read_csv('imdb_1000.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>The Shawshank Redemption</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.2</td>\n",
       "      <td>The Godfather</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>175</td>\n",
       "      <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.1</td>\n",
       "      <td>The Godfather: Part II</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>200</td>\n",
       "      <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>The Dark Knight</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Action</td>\n",
       "      <td>152</td>\n",
       "      <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Pulp Fiction</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>154</td>\n",
       "      <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   star_rating                     title content_rating   genre  duration  \\\n",
       "0          9.3  The Shawshank Redemption              R   Crime       142   \n",
       "1          9.2             The Godfather              R   Crime       175   \n",
       "2          9.1    The Godfather: Part II              R   Crime       200   \n",
       "3          9.0           The Dark Knight          PG-13  Action       152   \n",
       "4          8.9              Pulp Fiction              R   Crime       154   \n",
       "\n",
       "                                         actors_list  \n",
       "0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  \n",
       "1    [u'Marlon Brando', u'Al Pacino', u'James Caan']  \n",
       "2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  \n",
       "3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...  \n",
       "4  [u'John Travolta', u'Uma Thurman', u'Samuel L....  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example method: show the first 5 rows\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(979, 6)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example attribute: number of rows and columns\n",
    "movies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "star_rating       float64\n",
       "title              object\n",
       "content_rating     object\n",
       "genre              object\n",
       "duration            int64\n",
       "actors_list        object\n",
       "dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example attribute: data type of each column\n",
    "movies.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>duration</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>979.000000</td>\n",
       "      <td>979.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>7.889785</td>\n",
       "      <td>120.979571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.336069</td>\n",
       "      <td>26.218010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>7.400000</td>\n",
       "      <td>64.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>7.600000</td>\n",
       "      <td>102.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>7.800000</td>\n",
       "      <td>117.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>8.100000</td>\n",
       "      <td>134.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>9.300000</td>\n",
       "      <td>242.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       star_rating    duration\n",
       "count   979.000000  979.000000\n",
       "mean      7.889785  120.979571\n",
       "std       0.336069   26.218010\n",
       "min       7.400000   64.000000\n",
       "25%       7.600000  102.000000\n",
       "50%       7.800000  117.000000\n",
       "75%       8.100000  134.000000\n",
       "max       9.300000  242.000000"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# example method: calculate summary statistics\n",
    "movies.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>979</td>\n",
       "      <td>976</td>\n",
       "      <td>979</td>\n",
       "      <td>979</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>975</td>\n",
       "      <td>12</td>\n",
       "      <td>16</td>\n",
       "      <td>969</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>Les Miserables</td>\n",
       "      <td>R</td>\n",
       "      <td>Drama</td>\n",
       "      <td>[u'Daniel Radcliffe', u'Emma Watson', u'Rupert...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>2</td>\n",
       "      <td>460</td>\n",
       "      <td>278</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 title content_rating  genre  \\\n",
       "count              979            976    979   \n",
       "unique             975             12     16   \n",
       "top     Les Miserables              R  Drama   \n",
       "freq                 2            460    278   \n",
       "\n",
       "                                              actors_list  \n",
       "count                                                 979  \n",
       "unique                                                969  \n",
       "top     [u'Daniel Radcliffe', u'Emma Watson', u'Rupert...  \n",
       "freq                                                    6  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use an optional parameter to the describe method to summarize only 'object' columns\n",
    "movies.describe(include=['object'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create, Rename & Remove"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a new column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "      <th>Location</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "      <td>Ithaca, NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "      <td>Willingboro, NJ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "      <td>Holyoke, CO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "      <td>Abilene, KS</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "      <td>New York Worlds Fair, NY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time  \\\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00   \n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00   \n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00   \n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00   \n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00   \n",
       "\n",
       "                   Location  \n",
       "0                Ithaca, NY  \n",
       "1           Willingboro, NJ  \n",
       "2               Holyoke, CO  \n",
       "3               Abilene, KS  \n",
       "4  New York Worlds Fair, NY  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a new 'Location' Series (must use bracket notation to define the Series name)\n",
    "ufo['Location'] = ufo.City + ', ' + ufo.State\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Rename columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read a dataset of UFO reports into a DataFrame\n",
    "ufo = pd.read_csv('ufo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the column names\n",
    "ufo.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rename two of the columns by using the 'rename' method\n",
    "ufo.rename(columns={\n",
    "    'Colors Reported': 'Colors_Reported',\n",
    "    'Shape Reported': 'Shape_Reported'\n",
    "},\n",
    "           inplace=True)\n",
    "ufo.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace all of the column names by overwriting the 'columns' attribute\n",
    "ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']\n",
    "ufo.columns = ufo_cols\n",
    "ufo.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace the column names during the file reading process by using the 'names' parameter\n",
    "ufo = pd.read_csv('ufo.csv', header=0, names=ufo_cols)\n",
    "ufo.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace all spaces with underscores in the column names by using the 'str.replace' method\n",
    "ufo.columns = ufo.columns.str.replace(' ', '_')\n",
    "ufo.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo = pd.read_csv('ufo.csv')\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Shape Reported State             Time\n",
       "0                Ithaca       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove a single column (axis=1 refers to columns)\n",
    "ufo.drop('Colors Reported', axis=1, inplace=True)\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>OTHER</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>OVAL</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DISK</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LIGHT</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Shape Reported             Time\n",
       "0       TRIANGLE   6/1/1930 22:00\n",
       "1          OTHER  6/30/1930 20:00\n",
       "2           OVAL  2/15/1931 14:00\n",
       "3           DISK   6/1/1931 13:00\n",
       "4          LIGHT  4/18/1933 19:00"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# remove multiple columns at once\n",
    "ufo.drop(columns=['City', 'State']).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Valley City</td>\n",
       "      <td>DISK</td>\n",
       "      <td>ND</td>\n",
       "      <td>9/15/1934 15:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Crater Lake</td>\n",
       "      <td>CIRCLE</td>\n",
       "      <td>CA</td>\n",
       "      <td>6/15/1935 0:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Shape Reported State             Time\n",
       "2               Holyoke           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair          LIGHT    NY  4/18/1933 19:00\n",
       "5           Valley City           DISK    ND  9/15/1934 15:30\n",
       "6           Crater Lake         CIRCLE    CA   6/15/1935 0:00"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# new way to drop rows: specify index\n",
    "ufo.drop(index=[0, 1]).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select and Filter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Iterate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo = pd.read_csv('ufo.csv')\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ithaca\n",
      "Willingboro\n",
      "Holyoke\n",
      "Abilene\n",
      "New York Worlds Fair\n",
      "Valley City\n",
      "Crater Lake\n",
      "Alma\n",
      "Eklutna\n",
      "Hubbard\n"
     ]
    }
   ],
   "source": [
    "# Series are directly iterable (like a list)\n",
    "for c in ufo.City[:10]:\n",
    "    print(c)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 Ithaca NY\n",
      "1 Willingboro NJ\n",
      "2 Holyoke CO\n",
      "3 Abilene KS\n",
      "4 New York Worlds Fair NY\n",
      "5 Valley City ND\n",
      "6 Crater Lake CA\n",
      "7 Alma MI\n",
      "8 Eklutna AK\n",
      "9 Hubbard OR\n"
     ]
    }
   ],
   "source": [
    "# various methods are available to iterate through a DataFrame\n",
    "for index, row in ufo[:10].iterrows():\n",
    "    print(index, row.City, row.State)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Know about the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>85711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>53</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>94043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>23</td>\n",
       "      <td>M</td>\n",
       "      <td>writer</td>\n",
       "      <td>32067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>43537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>33</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>15213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0   1  2           3      4\n",
       "0  1  24  M  technician  85711\n",
       "1  2  53  F       other  94043\n",
       "2  3  23  M      writer  32067\n",
       "3  4  24  M  technician  43537\n",
       "4  5  33  F       other  15213"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# index and columns both default to integers if you don't define them\n",
    "pd.read_table('movie.user', header=None, sep='|').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# read a dataset of alcohol consumption into a DataFrame\n",
    "drinks = pd.read_csv('drinks.csv')\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=193, step=1)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# every DataFrame has an index (sometimes called the \"row labels\")\n",
    "drinks.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n",
       "       'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',\n",
       "       ...\n",
       "       'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',\n",
       "       'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],\n",
       "      dtype='object', name='country', length=193)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# set an existing column as the index\n",
    "drinks.set_index('country', inplace=True)\n",
    "drinks.head()\n",
    "# 'country' is now the index\n",
    "drinks.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# restore the index name, and move the index back to a column\n",
    "drinks.index.name = 'country'\n",
    "drinks.reset_index(inplace=True)\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select a column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                      Ithaca\n",
       "1                 Willingboro\n",
       "2                     Holyoke\n",
       "3                     Abilene\n",
       "4        New York Worlds Fair\n",
       "                 ...         \n",
       "18236              Grant Park\n",
       "18237             Spirit Lake\n",
       "18238             Eagle River\n",
       "18239             Eagle River\n",
       "18240                    Ybor\n",
       "Name: City, Length: 18241, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# select the 'City' Series using bracket notation\n",
    "ufo['City']\n",
    "# or equivalently, use dot notation\n",
    "ufo.City"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select only numeric columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                          object\n",
       "beer_servings                     int64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "continent                        object\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# read a dataset of alcohol consumption into a DataFrame, and check the data types\n",
    "drinks = pd.read_csv('drinks.csv')\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "beer_servings                     int64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "drinks.select_dtypes(include=[np.number]).dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select multiple rows and columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City Colors Reported Shape Reported State             Time\n",
       "0       Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1  Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2      Holyoke             NaN           OVAL    CO  2/15/1931 14:00"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# read a dataset of UFO reports into a DataFrame\n",
    "ufo = pd.read_csv('ufo.csv')\n",
    "ufo.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City                       Ithaca\n",
       "Colors Reported               NaN\n",
       "Shape Reported           TRIANGLE\n",
       "State                          NY\n",
       "Time               6/1/1930 22:00\n",
       "Name: 0, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# row 0, all columns\n",
    "ufo.loc[0, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City Colors Reported Shape Reported State             Time\n",
       "0       Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1  Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2      Holyoke             NaN           OVAL    CO  2/15/1931 14:00"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows 0 and 1 and 2, all columns\n",
    "ufo.loc[[0, 1, 2], :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City Colors Reported Shape Reported State             Time\n",
       "0       Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1  Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2      Holyoke             NaN           OVAL    CO  2/15/1931 14:00"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows 0 through 2 (inclusive), all columns\n",
    "ufo.loc[0:2, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         Ithaca\n",
       "1    Willingboro\n",
       "2        Holyoke\n",
       "Name: City, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows 0 through 2 (inclusive), column 'City'\n",
    "ufo.loc[0:2, 'City']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>CO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City State\n",
       "0       Ithaca    NY\n",
       "1  Willingboro    NJ\n",
       "2      Holyoke    CO"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows 0 through 2 (inclusive), columns 'City' and 'State'\n",
    "ufo.loc[0:2, ['City', 'State']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City Colors Reported Shape Reported State\n",
       "0       Ithaca             NaN       TRIANGLE    NY\n",
       "1  Willingboro             NaN          OTHER    NJ\n",
       "2      Holyoke             NaN           OVAL    CO"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)\n",
    "ufo.loc[0:2, 'City':'State']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City State\n",
       "0       Ithaca    NY\n",
       "1  Willingboro    NJ"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows in positions 0 and 1, columns in positions 0 and 3\n",
    "ufo.iloc[[0, 1], [0, 3]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City Colors Reported Shape Reported State\n",
       "0       Ithaca             NaN       TRIANGLE    NY\n",
       "1  Willingboro             NaN          OTHER    NJ"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)\n",
    "ufo.iloc[0:2, 0:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          City Colors Reported Shape Reported State             Time\n",
       "0       Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1  Willingboro             NaN          OTHER    NJ  6/30/1930 20:00"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# rows in positions 0 through 2 (exclusive), all columns\n",
    "ufo.iloc[0:2, :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Argentina</th>\n",
       "      <td>193</td>\n",
       "      <td>25</td>\n",
       "      <td>221</td>\n",
       "      <td>8.3</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bolivia</th>\n",
       "      <td>167</td>\n",
       "      <td>41</td>\n",
       "      <td>8</td>\n",
       "      <td>3.8</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Brazil</th>\n",
       "      <td>245</td>\n",
       "      <td>145</td>\n",
       "      <td>16</td>\n",
       "      <td>7.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chile</th>\n",
       "      <td>130</td>\n",
       "      <td>124</td>\n",
       "      <td>172</td>\n",
       "      <td>7.6</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Colombia</th>\n",
       "      <td>159</td>\n",
       "      <td>76</td>\n",
       "      <td>3</td>\n",
       "      <td>4.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ecuador</th>\n",
       "      <td>162</td>\n",
       "      <td>74</td>\n",
       "      <td>3</td>\n",
       "      <td>4.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Guyana</th>\n",
       "      <td>93</td>\n",
       "      <td>302</td>\n",
       "      <td>1</td>\n",
       "      <td>7.1</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Paraguay</th>\n",
       "      <td>213</td>\n",
       "      <td>117</td>\n",
       "      <td>74</td>\n",
       "      <td>7.3</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peru</th>\n",
       "      <td>163</td>\n",
       "      <td>160</td>\n",
       "      <td>21</td>\n",
       "      <td>6.1</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Suriname</th>\n",
       "      <td>128</td>\n",
       "      <td>178</td>\n",
       "      <td>7</td>\n",
       "      <td>5.6</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Uruguay</th>\n",
       "      <td>115</td>\n",
       "      <td>35</td>\n",
       "      <td>220</td>\n",
       "      <td>6.6</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Venezuela</th>\n",
       "      <td>333</td>\n",
       "      <td>100</td>\n",
       "      <td>3</td>\n",
       "      <td>7.7</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           beer_servings  spirit_servings  wine_servings  \\\n",
       "country                                                    \n",
       "Argentina            193               25            221   \n",
       "Bolivia              167               41              8   \n",
       "Brazil               245              145             16   \n",
       "Chile                130              124            172   \n",
       "Colombia             159               76              3   \n",
       "Ecuador              162               74              3   \n",
       "Guyana                93              302              1   \n",
       "Paraguay             213              117             74   \n",
       "Peru                 163              160             21   \n",
       "Suriname             128              178              7   \n",
       "Uruguay              115               35            220   \n",
       "Venezuela            333              100              3   \n",
       "\n",
       "           total_litres_of_pure_alcohol      continent  \n",
       "country                                                 \n",
       "Argentina                           8.3  South America  \n",
       "Bolivia                             3.8  South America  \n",
       "Brazil                              7.2  South America  \n",
       "Chile                               7.6  South America  \n",
       "Colombia                            4.2  South America  \n",
       "Ecuador                             4.2  South America  \n",
       "Guyana                              7.1  South America  \n",
       "Paraguay                            7.3  South America  \n",
       "Peru                                6.1  South America  \n",
       "Suriname                            5.6  South America  \n",
       "Uruguay                             6.6  South America  \n",
       "Venezuela                           7.7  South America  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.loc[drinks.continent=='South America']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6      221\n",
       "20       8\n",
       "23      16\n",
       "35     172\n",
       "37       3\n",
       "52       3\n",
       "72       1\n",
       "132     74\n",
       "133     21\n",
       "163      7\n",
       "185    220\n",
       "188      3\n",
       "Name: wine_servings, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.loc[drinks.continent=='South America', 'wine_servings']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Argentina</th>\n",
       "      <td>193</td>\n",
       "      <td>25</td>\n",
       "      <td>221</td>\n",
       "      <td>8.3</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bolivia</th>\n",
       "      <td>167</td>\n",
       "      <td>41</td>\n",
       "      <td>8</td>\n",
       "      <td>3.8</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Brazil</th>\n",
       "      <td>245</td>\n",
       "      <td>145</td>\n",
       "      <td>16</td>\n",
       "      <td>7.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Chile</th>\n",
       "      <td>130</td>\n",
       "      <td>124</td>\n",
       "      <td>172</td>\n",
       "      <td>7.6</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Colombia</th>\n",
       "      <td>159</td>\n",
       "      <td>76</td>\n",
       "      <td>3</td>\n",
       "      <td>4.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ecuador</th>\n",
       "      <td>162</td>\n",
       "      <td>74</td>\n",
       "      <td>3</td>\n",
       "      <td>4.2</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Guyana</th>\n",
       "      <td>93</td>\n",
       "      <td>302</td>\n",
       "      <td>1</td>\n",
       "      <td>7.1</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Paraguay</th>\n",
       "      <td>213</td>\n",
       "      <td>117</td>\n",
       "      <td>74</td>\n",
       "      <td>7.3</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Peru</th>\n",
       "      <td>163</td>\n",
       "      <td>160</td>\n",
       "      <td>21</td>\n",
       "      <td>6.1</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Suriname</th>\n",
       "      <td>128</td>\n",
       "      <td>178</td>\n",
       "      <td>7</td>\n",
       "      <td>5.6</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Uruguay</th>\n",
       "      <td>115</td>\n",
       "      <td>35</td>\n",
       "      <td>220</td>\n",
       "      <td>6.6</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Venezuela</th>\n",
       "      <td>333</td>\n",
       "      <td>100</td>\n",
       "      <td>3</td>\n",
       "      <td>7.7</td>\n",
       "      <td>South America</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           beer_servings  spirit_servings  wine_servings  \\\n",
       "country                                                    \n",
       "Argentina            193               25            221   \n",
       "Bolivia              167               41              8   \n",
       "Brazil               245              145             16   \n",
       "Chile                130              124            172   \n",
       "Colombia             159               76              3   \n",
       "Ecuador              162               74              3   \n",
       "Guyana                93              302              1   \n",
       "Paraguay             213              117             74   \n",
       "Peru                 163              160             21   \n",
       "Suriname             128              178              7   \n",
       "Uruguay              115               35            220   \n",
       "Venezuela            333              100              3   \n",
       "\n",
       "           total_litres_of_pure_alcohol      continent  \n",
       "country                                                 \n",
       "Argentina                           8.3  South America  \n",
       "Bolivia                             3.8  South America  \n",
       "Brazil                              7.2  South America  \n",
       "Chile                               7.6  South America  \n",
       "Colombia                            4.2  South America  \n",
       "Ecuador                             4.2  South America  \n",
       "Guyana                              7.1  South America  \n",
       "Paraguay                            7.3  South America  \n",
       "Peru                                6.1  South America  \n",
       "Suriname                            5.6  South America  \n",
       "Uruguay                             6.6  South America  \n",
       "Venezuela                           7.7  South America  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.query(\"continent=='South America'\")['wine_servings']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multiple filter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>The Shawshank Redemption</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.2</td>\n",
       "      <td>The Godfather</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>175</td>\n",
       "      <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.1</td>\n",
       "      <td>The Godfather: Part II</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>200</td>\n",
       "      <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>The Dark Knight</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Action</td>\n",
       "      <td>152</td>\n",
       "      <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Pulp Fiction</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>154</td>\n",
       "      <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   star_rating                     title content_rating   genre  duration  \\\n",
       "0          9.3  The Shawshank Redemption              R   Crime       142   \n",
       "1          9.2             The Godfather              R   Crime       175   \n",
       "2          9.1    The Godfather: Part II              R   Crime       200   \n",
       "3          9.0           The Dark Knight          PG-13  Action       152   \n",
       "4          8.9              Pulp Fiction              R   Crime       154   \n",
       "\n",
       "                                         actors_list  \n",
       "0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  \n",
       "1    [u'Marlon Brando', u'Al Pacino', u'James Caan']  \n",
       "2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  \n",
       "3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...  \n",
       "4  [u'John Travolta', u'Uma Thurman', u'Samuel L....  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "movies = pd.read_csv('pandas/imdb_1000.csv')\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>8.7</td>\n",
       "      <td>Seven Samurai</td>\n",
       "      <td>UNRATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>207</td>\n",
       "      <td>[u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>157</th>\n",
       "      <td>8.2</td>\n",
       "      <td>Gone with the Wind</td>\n",
       "      <td>G</td>\n",
       "      <td>Drama</td>\n",
       "      <td>238</td>\n",
       "      <td>[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>476</th>\n",
       "      <td>7.8</td>\n",
       "      <td>Hamlet</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Drama</td>\n",
       "      <td>242</td>\n",
       "      <td>[u'Kenneth Branagh', u'Julie Christie', u'Dere...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating               title content_rating  genre  duration  \\\n",
       "17           8.7       Seven Samurai        UNRATED  Drama       207   \n",
       "157          8.2  Gone with the Wind              G  Drama       238   \n",
       "476          7.8              Hamlet          PG-13  Drama       242   \n",
       "\n",
       "                                           actors_list  \n",
       "17   [u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K...  \n",
       "157  [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit...  \n",
       "476  [u'Kenneth Branagh', u'Julie Christie', u'Dere...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# use the '&' operator to specify that both conditions are required\n",
    "movies.loc[(movies.duration >=200) & (movies.genre == 'Drama')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.1</td>\n",
       "      <td>The Godfather: Part II</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>200</td>\n",
       "      <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>8.9</td>\n",
       "      <td>12 Angry Men</td>\n",
       "      <td>NOT RATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>96</td>\n",
       "      <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8.9</td>\n",
       "      <td>The Lord of the Rings: The Return of the King</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Adventure</td>\n",
       "      <td>201</td>\n",
       "      <td>[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Fight Club</td>\n",
       "      <td>R</td>\n",
       "      <td>Drama</td>\n",
       "      <td>139</td>\n",
       "      <td>[u'Brad Pitt', u'Edward Norton', u'Helena Bonh...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>8.8</td>\n",
       "      <td>Forrest Gump</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Drama</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    star_rating                                          title content_rating  \\\n",
       "2           9.1                         The Godfather: Part II              R   \n",
       "5           8.9                                   12 Angry Men      NOT RATED   \n",
       "7           8.9  The Lord of the Rings: The Return of the King          PG-13   \n",
       "9           8.9                                     Fight Club              R   \n",
       "13          8.8                                   Forrest Gump          PG-13   \n",
       "\n",
       "        genre  duration                                        actors_list  \n",
       "2       Crime       200  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  \n",
       "5       Drama        96  [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...  \n",
       "7   Adventure       201  [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...  \n",
       "9       Drama       139  [u'Brad Pitt', u'Edward Norton', u'Helena Bonh...  \n",
       "13      Drama       142    [u'Tom Hanks', u'Robin Wright', u'Gary Sinise']  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# using the '|' operator would have shown movies that are either long or dramas (or both)\n",
    "movies.loc[(movies.duration >=200) | (movies.genre == 'Drama')].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>The Shawshank Redemption</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.2</td>\n",
       "      <td>The Godfather</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>175</td>\n",
       "      <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.1</td>\n",
       "      <td>The Godfather: Part II</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>200</td>\n",
       "      <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>The Dark Knight</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Action</td>\n",
       "      <td>152</td>\n",
       "      <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Pulp Fiction</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>154</td>\n",
       "      <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>8.9</td>\n",
       "      <td>12 Angry Men</td>\n",
       "      <td>NOT RATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>96</td>\n",
       "      <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Fight Club</td>\n",
       "      <td>R</td>\n",
       "      <td>Drama</td>\n",
       "      <td>139</td>\n",
       "      <td>[u'Brad Pitt', u'Edward Norton', u'Helena Bonh...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>8.8</td>\n",
       "      <td>Inception</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Action</td>\n",
       "      <td>148</td>\n",
       "      <td>[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>8.8</td>\n",
       "      <td>Star Wars: Episode V - The Empire Strikes Back</td>\n",
       "      <td>PG</td>\n",
       "      <td>Action</td>\n",
       "      <td>124</td>\n",
       "      <td>[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>8.8</td>\n",
       "      <td>Forrest Gump</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Drama</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    star_rating                                           title  \\\n",
       "0           9.3                        The Shawshank Redemption   \n",
       "1           9.2                                   The Godfather   \n",
       "2           9.1                          The Godfather: Part II   \n",
       "3           9.0                                 The Dark Knight   \n",
       "4           8.9                                    Pulp Fiction   \n",
       "5           8.9                                    12 Angry Men   \n",
       "9           8.9                                      Fight Club   \n",
       "11          8.8                                       Inception   \n",
       "12          8.8  Star Wars: Episode V - The Empire Strikes Back   \n",
       "13          8.8                                    Forrest Gump   \n",
       "\n",
       "   content_rating   genre  duration  \\\n",
       "0               R   Crime       142   \n",
       "1               R   Crime       175   \n",
       "2               R   Crime       200   \n",
       "3           PG-13  Action       152   \n",
       "4               R   Crime       154   \n",
       "5       NOT RATED   Drama        96   \n",
       "9               R   Drama       139   \n",
       "11          PG-13  Action       148   \n",
       "12             PG  Action       124   \n",
       "13          PG-13   Drama       142   \n",
       "\n",
       "                                          actors_list  \n",
       "0   [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  \n",
       "1     [u'Marlon Brando', u'Al Pacino', u'James Caan']  \n",
       "2   [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  \n",
       "3   [u'Christian Bale', u'Heath Ledger', u'Aaron E...  \n",
       "4   [u'John Travolta', u'Uma Thurman', u'Samuel L....  \n",
       "5   [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...  \n",
       "9   [u'Brad Pitt', u'Edward Norton', u'Helena Bonh...  \n",
       "11  [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'...  \n",
       "12  [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi...  \n",
       "13    [u'Tom Hanks', u'Robin Wright', u'Gary Sinise']  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "movies.loc[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks = pd.read_csv('pandas/drinks.csv')\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Africa           53\n",
       "Europe           45\n",
       "Asia             44\n",
       "North America    23\n",
       "Oceania          16\n",
       "South America    12\n",
       "Name: continent, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.continent.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Africa           0.274611\n",
       "Europe           0.233161\n",
       "Asia             0.227979\n",
       "North America    0.119171\n",
       "Oceania          0.082902\n",
       "South America    0.062176\n",
       "Name: continent, dtype: float64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.continent.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',\n",
       "       'South America'],\n",
       "      dtype='object')"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# access the Series index\n",
    "drinks.continent.value_counts().index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([53, 45, 44, 23, 16, 12])"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# access the Series values\n",
    "drinks.continent.value_counts().values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "South America    12\n",
       "Oceania          16\n",
       "North America    23\n",
       "Asia             44\n",
       "Europe           45\n",
       "Africa           53\n",
       "Name: continent, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# any Series can be sorted by its values\n",
    "drinks.continent.value_counts().sort_values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Africa           53\n",
       "Asia             44\n",
       "Europe           45\n",
       "North America    23\n",
       "Oceania          16\n",
       "South America    12\n",
       "Name: continent, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# any Series can also be sorted by its index\n",
    "drinks.continent.value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Asia', 'Europe', 'Africa', 'North America', 'South America',\n",
       "       'Oceania'], dtype=object)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.continent.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks.continent.nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sort"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>The Shawshank Redemption</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>142</td>\n",
       "      <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.2</td>\n",
       "      <td>The Godfather</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>175</td>\n",
       "      <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.1</td>\n",
       "      <td>The Godfather: Part II</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>200</td>\n",
       "      <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.0</td>\n",
       "      <td>The Dark Knight</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Action</td>\n",
       "      <td>152</td>\n",
       "      <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8.9</td>\n",
       "      <td>Pulp Fiction</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>154</td>\n",
       "      <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   star_rating                     title content_rating   genre  duration  \\\n",
       "0          9.3  The Shawshank Redemption              R   Crime       142   \n",
       "1          9.2             The Godfather              R   Crime       175   \n",
       "2          9.1    The Godfather: Part II              R   Crime       200   \n",
       "3          9.0           The Dark Knight          PG-13  Action       152   \n",
       "4          8.9              Pulp Fiction              R   Crime       154   \n",
       "\n",
       "                                         actors_list  \n",
       "0  [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...  \n",
       "1    [u'Marlon Brando', u'Al Pacino', u'James Caan']  \n",
       "2  [u'Al Pacino', u'Robert De Niro', u'Robert Duv...  \n",
       "3  [u'Christian Bale', u'Heath Ledger', u'Aaron E...  \n",
       "4  [u'John Travolta', u'Uma Thurman', u'Samuel L....  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "movies = pd.read_csv('pandas/imdb_1000.csv')\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "542     (500) Days of Summer\n",
       "5               12 Angry Men\n",
       "201         12 Years a Slave\n",
       "698                127 Hours\n",
       "110    2001: A Space Odyssey\n",
       "Name: title, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# sort the 'title' Series in ascending order (returns a Series)\n",
    "movies.title.sort_values().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "864               [Rec]\n",
       "526                Zulu\n",
       "615          Zombieland\n",
       "677              Zodiac\n",
       "955    Zero Dark Thirty\n",
       "Name: title, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# sort in descending order instead\n",
    "movies.title.sort_values(ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>542</th>\n",
       "      <td>7.8</td>\n",
       "      <td>(500) Days of Summer</td>\n",
       "      <td>PG-13</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>95</td>\n",
       "      <td>[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>8.9</td>\n",
       "      <td>12 Angry Men</td>\n",
       "      <td>NOT RATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>96</td>\n",
       "      <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201</th>\n",
       "      <td>8.1</td>\n",
       "      <td>12 Years a Slave</td>\n",
       "      <td>R</td>\n",
       "      <td>Biography</td>\n",
       "      <td>134</td>\n",
       "      <td>[u'Chiwetel Ejiofor', u'Michael Kenneth Willia...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>7.6</td>\n",
       "      <td>127 Hours</td>\n",
       "      <td>R</td>\n",
       "      <td>Adventure</td>\n",
       "      <td>94</td>\n",
       "      <td>[u'James Franco', u'Amber Tamblyn', u'Kate Mara']</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>8.3</td>\n",
       "      <td>2001: A Space Odyssey</td>\n",
       "      <td>G</td>\n",
       "      <td>Mystery</td>\n",
       "      <td>160</td>\n",
       "      <td>[u'Keir Dullea', u'Gary Lockwood', u'William S...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating                  title content_rating      genre  duration  \\\n",
       "542          7.8   (500) Days of Summer          PG-13     Comedy        95   \n",
       "5            8.9           12 Angry Men      NOT RATED      Drama        96   \n",
       "201          8.1       12 Years a Slave              R  Biography       134   \n",
       "698          7.6              127 Hours              R  Adventure        94   \n",
       "110          8.3  2001: A Space Odyssey              G    Mystery       160   \n",
       "\n",
       "                                           actors_list  \n",
       "542  [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...  \n",
       "5    [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...  \n",
       "201  [u'Chiwetel Ejiofor', u'Michael Kenneth Willia...  \n",
       "698  [u'James Franco', u'Amber Tamblyn', u'Kate Mara']  \n",
       "110  [u'Keir Dullea', u'Gary Lockwood', u'William S...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# sort the entire DataFrame by the 'title' Series (returns a DataFrame)\n",
    "movies.sort_values('title').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>864</th>\n",
       "      <td>7.5</td>\n",
       "      <td>[Rec]</td>\n",
       "      <td>R</td>\n",
       "      <td>Horror</td>\n",
       "      <td>78</td>\n",
       "      <td>[u'Manuela Velasco', u'Ferran Terraza', u'Jorg...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>526</th>\n",
       "      <td>7.8</td>\n",
       "      <td>Zulu</td>\n",
       "      <td>UNRATED</td>\n",
       "      <td>Drama</td>\n",
       "      <td>138</td>\n",
       "      <td>[u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>615</th>\n",
       "      <td>7.7</td>\n",
       "      <td>Zombieland</td>\n",
       "      <td>R</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>88</td>\n",
       "      <td>[u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>677</th>\n",
       "      <td>7.7</td>\n",
       "      <td>Zodiac</td>\n",
       "      <td>R</td>\n",
       "      <td>Crime</td>\n",
       "      <td>157</td>\n",
       "      <td>[u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>955</th>\n",
       "      <td>7.4</td>\n",
       "      <td>Zero Dark Thirty</td>\n",
       "      <td>R</td>\n",
       "      <td>Drama</td>\n",
       "      <td>157</td>\n",
       "      <td>[u'Jessica Chastain', u'Joel Edgerton', u'Chri...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating             title content_rating   genre  duration  \\\n",
       "864          7.5             [Rec]              R  Horror        78   \n",
       "526          7.8              Zulu        UNRATED   Drama       138   \n",
       "615          7.7        Zombieland              R  Comedy        88   \n",
       "677          7.7            Zodiac              R   Crime       157   \n",
       "955          7.4  Zero Dark Thirty              R   Drama       157   \n",
       "\n",
       "                                           actors_list  \n",
       "864  [u'Manuela Velasco', u'Ferran Terraza', u'Jorg...  \n",
       "526  [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...  \n",
       "615  [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...  \n",
       "677  [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...  \n",
       "955  [u'Jessica Chastain', u'Joel Edgerton', u'Chri...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# sort in descending order instead\n",
    "movies.sort_values('title', ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>star_rating</th>\n",
       "      <th>title</th>\n",
       "      <th>content_rating</th>\n",
       "      <th>genre</th>\n",
       "      <th>duration</th>\n",
       "      <th>actors_list</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>713</th>\n",
       "      <td>7.6</td>\n",
       "      <td>The Jungle Book</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Animation</td>\n",
       "      <td>78</td>\n",
       "      <td>[u'Phil Harris', u'Sebastian Cabot', u'Louis P...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>513</th>\n",
       "      <td>7.8</td>\n",
       "      <td>Invasion of the Body Snatchers</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Horror</td>\n",
       "      <td>80</td>\n",
       "      <td>[u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>272</th>\n",
       "      <td>8.1</td>\n",
       "      <td>The Killing</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Crime</td>\n",
       "      <td>85</td>\n",
       "      <td>[u'Sterling Hayden', u'Coleen Gray', u'Vince E...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>703</th>\n",
       "      <td>7.6</td>\n",
       "      <td>Dracula</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Horror</td>\n",
       "      <td>85</td>\n",
       "      <td>[u'Bela Lugosi', u'Helen Chandler', u'David Ma...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>612</th>\n",
       "      <td>7.7</td>\n",
       "      <td>A Hard Day's Night</td>\n",
       "      <td>APPROVED</td>\n",
       "      <td>Comedy</td>\n",
       "      <td>87</td>\n",
       "      <td>[u'John Lennon', u'Paul McCartney', u'George H...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     star_rating                           title content_rating      genre  \\\n",
       "713          7.6                 The Jungle Book       APPROVED  Animation   \n",
       "513          7.8  Invasion of the Body Snatchers       APPROVED     Horror   \n",
       "272          8.1                     The Killing       APPROVED      Crime   \n",
       "703          7.6                         Dracula       APPROVED     Horror   \n",
       "612          7.7              A Hard Day's Night       APPROVED     Comedy   \n",
       "\n",
       "     duration                                        actors_list  \n",
       "713        78  [u'Phil Harris', u'Sebastian Cabot', u'Louis P...  \n",
       "513        80  [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...  \n",
       "272        85  [u'Sterling Hayden', u'Coleen Gray', u'Vince E...  \n",
       "703        85  [u'Bela Lugosi', u'Helen Chandler', u'David Ma...  \n",
       "612        87  [u'John Lennon', u'Paul McCartney', u'George H...  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# sort the DataFrame first by 'content_rating', then by 'duration'\n",
    "movies.sort_values(['content_rating', 'duration']).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Group"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "drinks = pd.read_csv('pandas/drinks.csv')\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa            61.471698\n",
       "Asia              37.045455\n",
       "Europe           193.777778\n",
       "North America    145.434783\n",
       "Oceania           89.687500\n",
       "South America    175.083333\n",
       "Name: beer_servings, dtype: float64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# calculate the mean beer servings for each continent\n",
    "drinks.groupby('continent').beer_servings.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "continent\n",
       "Africa           376\n",
       "Asia             247\n",
       "Europe           361\n",
       "North America    285\n",
       "Oceania          306\n",
       "South America    333\n",
       "Name: beer_servings, dtype: int64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# other aggregation functions (such as 'max') can also be used with groupby\n",
    "drinks.groupby('continent').beer_servings.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>min</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>continent</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Africa</th>\n",
       "      <td>53</td>\n",
       "      <td>61.471698</td>\n",
       "      <td>0</td>\n",
       "      <td>376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Asia</th>\n",
       "      <td>44</td>\n",
       "      <td>37.045455</td>\n",
       "      <td>0</td>\n",
       "      <td>247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Europe</th>\n",
       "      <td>45</td>\n",
       "      <td>193.777778</td>\n",
       "      <td>0</td>\n",
       "      <td>361</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>North America</th>\n",
       "      <td>23</td>\n",
       "      <td>145.434783</td>\n",
       "      <td>1</td>\n",
       "      <td>285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Oceania</th>\n",
       "      <td>16</td>\n",
       "      <td>89.687500</td>\n",
       "      <td>0</td>\n",
       "      <td>306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>South America</th>\n",
       "      <td>12</td>\n",
       "      <td>175.083333</td>\n",
       "      <td>93</td>\n",
       "      <td>333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               count        mean  min  max\n",
       "continent                                 \n",
       "Africa            53   61.471698    0  376\n",
       "Asia              44   37.045455    0  247\n",
       "Europe            45  193.777778    0  361\n",
       "North America     23  145.434783    1  285\n",
       "Oceania           16   89.687500    0  306\n",
       "South America     12  175.083333   93  333"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# multiple aggregation functions can be applied simultaneously\n",
    "drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>continent</th>\n",
       "      <th>Africa</th>\n",
       "      <th>Asia</th>\n",
       "      <th>Europe</th>\n",
       "      <th>North America</th>\n",
       "      <th>Oceania</th>\n",
       "      <th>South America</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Afghanistan</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Albania</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Algeria</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Andorra</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Angola</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Venezuela</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Vietnam</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Yemen</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zambia</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Zimbabwe</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>193 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "continent    Africa  Asia  Europe  North America  Oceania  South America\n",
       "country                                                                 \n",
       "Afghanistan       0     1       0              0        0              0\n",
       "Albania           0     0       1              0        0              0\n",
       "Algeria           1     0       0              0        0              0\n",
       "Andorra           0     0       1              0        0              0\n",
       "Angola            1     0       0              0        0              0\n",
       "...             ...   ...     ...            ...      ...            ...\n",
       "Venezuela         0     0       0              0        0              1\n",
       "Vietnam           0     1       0              0        0              0\n",
       "Yemen             0     1       0              0        0              0\n",
       "Zambia            1     0       0              0        0              0\n",
       "Zimbabwe          1     0       0              0        0              0\n",
       "\n",
       "[193 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pd.crosstab(drinks.country, drinks.continent)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": true,
    "toc-nb-collapsed": true
   },
   "source": [
    "### MultiIndex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Date</th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "      <th>Symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "      <td>CSCO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "      <td>AAPL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "      <td>MSFT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "      <td>AAPL</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "      <td>MSFT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "      <td>CSCO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "      <td>MSFT</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "      <td>CSCO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "      <td>AAPL</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         Date   Close    Volume Symbol\n",
       "0  2016-10-03   31.50  14070500   CSCO\n",
       "1  2016-10-03  112.52  21701800   AAPL\n",
       "2  2016-10-03   57.42  19189500   MSFT\n",
       "3  2016-10-04  113.00  29736800   AAPL\n",
       "4  2016-10-04   57.24  20085900   MSFT\n",
       "5  2016-10-04   31.35  18460400   CSCO\n",
       "6  2016-10-05   57.64  16726400   MSFT\n",
       "7  2016-10-05   31.59  11808600   CSCO\n",
       "8  2016-10-05  113.05  21453100   AAPL"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks = pd.read_csv('pandas/stocks.csv')\n",
    "stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=9, step=1)"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Symbol  Date      \n",
       "AAPL    2016-10-03    112.52\n",
       "        2016-10-04    113.00\n",
       "        2016-10-05    113.05\n",
       "CSCO    2016-10-03     31.50\n",
       "        2016-10-04     31.35\n",
       "        2016-10-05     31.59\n",
       "MSFT    2016-10-03     57.42\n",
       "        2016-10-04     57.24\n",
       "        2016-10-05     57.64\n",
       "Name: Close, dtype: float64"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser = stocks.groupby(['Symbol', 'Date'])['Close'].mean()\n",
    "ser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultiIndex([('AAPL', '2016-10-03'),\n",
       "            ('AAPL', '2016-10-04'),\n",
       "            ('AAPL', '2016-10-05'),\n",
       "            ('CSCO', '2016-10-03'),\n",
       "            ('CSCO', '2016-10-04'),\n",
       "            ('CSCO', '2016-10-05'),\n",
       "            ('MSFT', '2016-10-03'),\n",
       "            ('MSFT', '2016-10-04'),\n",
       "            ('MSFT', '2016-10-05')],\n",
       "           names=['Symbol', 'Date'])"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Date</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <th>2016-10-05</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <td>112.52</td>\n",
       "      <td>113.00</td>\n",
       "      <td>113.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <td>31.50</td>\n",
       "      <td>31.35</td>\n",
       "      <td>31.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <td>57.42</td>\n",
       "      <td>57.24</td>\n",
       "      <td>57.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Date    2016-10-03  2016-10-04  2016-10-05\n",
       "Symbol                                    \n",
       "AAPL        112.52      113.00      113.05\n",
       "CSCO         31.50       31.35       31.59\n",
       "MSFT         57.42       57.24       57.64"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ser.unstack()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "AAPL   2016-10-04  113.00  29736800\n",
       "MSFT   2016-10-04   57.24  20085900\n",
       "CSCO   2016-10-04   31.35  18460400\n",
       "MSFT   2016-10-05   57.64  16726400\n",
       "CSCO   2016-10-05   31.59  11808600\n",
       "AAPL   2016-10-05  113.05  21453100"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.set_index(['Symbol', 'Date'], inplace=True)\n",
    "stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultiIndex([('CSCO', '2016-10-03'),\n",
       "            ('AAPL', '2016-10-03'),\n",
       "            ('MSFT', '2016-10-03'),\n",
       "            ('AAPL', '2016-10-04'),\n",
       "            ('MSFT', '2016-10-04'),\n",
       "            ('CSCO', '2016-10-04'),\n",
       "            ('MSFT', '2016-10-05'),\n",
       "            ('CSCO', '2016-10-05'),\n",
       "            ('AAPL', '2016-10-05')],\n",
       "           names=['Symbol', 'Date'])"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "       2016-10-05  113.05  21453100\n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "       2016-10-04   31.35  18460400\n",
       "       2016-10-05   31.59  11808600\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900\n",
       "       2016-10-05   57.64  16726400"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.sort_index(inplace=True)\n",
    "stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Close          112.52\n",
       "Volume    21701800.00\n",
       "Name: (AAPL, 2016-10-03), dtype: float64"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[('AAPL', '2016-10-03'), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "112.52"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[('AAPL', '2016-10-03'), 'Close']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "       2016-10-05  113.05  21453100\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900\n",
       "       2016-10-05   57.64  16726400"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[['AAPL', 'MSFT'], :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "MSFT   2016-10-03   57.42  19189500"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Symbol  Date      \n",
       "AAPL    2016-10-03    112.52\n",
       "MSFT    2016-10-03     57.42\n",
       "Name: Close, dtype: float64"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), 'Close']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "       2016-10-04   31.35  18460400\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stocks.loc[(slice(None), ['2016-10-03', '2016-10-04']), :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pivot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>Date</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <th>2016-10-04</th>\n",
       "      <th>2016-10-05</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AAPL</th>\n",
       "      <td>112.52</td>\n",
       "      <td>113.00</td>\n",
       "      <td>113.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CSCO</th>\n",
       "      <td>31.50</td>\n",
       "      <td>31.35</td>\n",
       "      <td>31.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MSFT</th>\n",
       "      <td>57.42</td>\n",
       "      <td>57.24</td>\n",
       "      <td>57.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Date    2016-10-03  2016-10-04  2016-10-05\n",
       "Symbol                                    \n",
       "AAPL        112.52      113.00      113.05\n",
       "CSCO         31.50       31.35       31.59\n",
       "MSFT         57.42       57.24       57.64"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = stocks.pivot_table(values='Close', index='Symbol', columns='Date')\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remove duplicate rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>85711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>53</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>94043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>23</td>\n",
       "      <td>M</td>\n",
       "      <td>writer</td>\n",
       "      <td>32067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>43537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>33</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>15213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender  occupation zip_code\n",
       "user_id                                 \n",
       "1         24      M  technician    85711\n",
       "2         53      F       other    94043\n",
       "3         23      M      writer    32067\n",
       "4         24      M  technician    43537\n",
       "5         33      F       other    15213"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read a dataset of movie reviewers into a DataFrame\n",
    "user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n",
    "users = pd.read_table('pandas/movie.user', sep='|', header=None, names=user_cols, index_col='user_id')\n",
    "users.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(943, 4)"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "user_id\n",
       "939    False\n",
       "940     True\n",
       "941    False\n",
       "942    False\n",
       "943    False\n",
       "Name: zip_code, dtype: bool"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# detect duplicate zip codes: True if an item is identical to a previous item\n",
    "users['zip_code'].duplicated().tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "148"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the duplicate items (True becomes 1, False becomes 0)\n",
    "users['zip_code'].duplicated().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>621</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>805</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zip_code\n",
       "user_id                                \n",
       "496       21      F    student    55414\n",
       "572       51      M   educator    20003\n",
       "621       17      M    student    60402\n",
       "684       28      M    student    55414\n",
       "733       44      F      other    60630\n",
       "805       27      F      other    20009\n",
       "890       32      M    student    97301"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the duplicate rows (ignoring the first occurrence)\n",
    "users.loc[users.duplicated(keep='first'), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>428</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>437</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>460</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zip_code\n",
       "user_id                                \n",
       "67        17      M    student    60402\n",
       "85        51      M   educator    20003\n",
       "198       21      F    student    55414\n",
       "350       32      M    student    97301\n",
       "428       28      M    student    55414\n",
       "437       27      F      other    20009\n",
       "460       44      F      other    60630"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the duplicate rows (ignoring the last occurrence)\n",
    "users.loc[users.duplicated(keep='last'), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zip_code</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>428</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>437</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>460</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>621</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>805</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zip_code\n",
       "user_id                                \n",
       "67        17      M    student    60402\n",
       "85        51      M   educator    20003\n",
       "198       21      F    student    55414\n",
       "350       32      M    student    97301\n",
       "428       28      M    student    55414\n",
       "437       27      F      other    20009\n",
       "460       44      F      other    60630\n",
       "496       21      F    student    55414\n",
       "572       51      M   educator    20003\n",
       "621       17      M    student    60402\n",
       "684       28      M    student    55414\n",
       "733       44      F      other    60630\n",
       "805       27      F      other    20009\n",
       "890       32      M    student    97301"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the duplicate rows (including all duplicates)\n",
    "users.loc[users.duplicated(keep=False), :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(936, 4)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop the duplicate rows (inplace=False by default)\n",
    "users.drop_duplicates(keep='first').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(936, 4)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.drop_duplicates(keep='last').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(929, 4)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.drop_duplicates(keep=False).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "16"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# only consider a subset of columns when identifying duplicates\n",
    "users.duplicated(subset=['age', 'zip_code']).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(927, 4)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "users.drop_duplicates(subset=['age', 'zip_code']).shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Detect missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>Grant Park</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>Spirit Lake</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IA</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>Ybor</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>FL</td>\n",
       "      <td>12/31/2000 23:59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              City Colors Reported Shape Reported State              Time\n",
       "18236   Grant Park             NaN       TRIANGLE    IL  12/31/2000 23:00\n",
       "18237  Spirit Lake             NaN           DISK    IA  12/31/2000 23:00\n",
       "18238  Eagle River             NaN            NaN    WI  12/31/2000 23:45\n",
       "18239  Eagle River             RED          LIGHT    WI  12/31/2000 23:45\n",
       "18240         Ybor             NaN           OVAL    FL  12/31/2000 23:59"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo = pd.read_csv('pandas/ufo.csv')\n",
    "ufo.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        City  Colors Reported  Shape Reported  State   Time\n",
       "18236  False             True           False  False  False\n",
       "18237  False             True           False  False  False\n",
       "18238  False             True           False  False  False\n",
       "18239  False            False           False  False  False\n",
       "18240  False             True           False  False  False"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.isna().tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       City  Colors Reported  Shape Reported  State  Time\n",
       "18236  True            False            True   True  True\n",
       "18237  True            False            True   True  True\n",
       "18238  True            False            True   True  True\n",
       "18239  True             True            True   True  True\n",
       "18240  True            False            True   True  True"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ufo.notna().tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City                  25\n",
       "Colors Reported    15359\n",
       "Shape Reported         0\n",
       "State                  0\n",
       "Time                   0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the number of missing values in each Series\n",
    "ufo.isna().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LA</td>\n",
       "      <td>8/15/1943 0:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>LA</td>\n",
       "      <td>8/15/1943 0:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>CA</td>\n",
       "      <td>7/15/1952 12:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241</th>\n",
       "      <td>NaN</td>\n",
       "      <td>BLUE</td>\n",
       "      <td>DISK</td>\n",
       "      <td>MT</td>\n",
       "      <td>7/4/1953 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>613</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>NV</td>\n",
       "      <td>7/1/1960 12:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    City Colors Reported Shape Reported State             Time\n",
       "21   NaN             NaN            NaN    LA   8/15/1943 0:00\n",
       "22   NaN             NaN          LIGHT    LA   8/15/1943 0:00\n",
       "204  NaN             NaN           DISK    CA  7/15/1952 12:30\n",
       "241  NaN            BLUE           DISK    MT   7/4/1953 14:00\n",
       "613  NaN             NaN           DISK    NV   7/1/1960 12:00"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'isnull' Series method to filter the DataFrame rows\n",
    "ufo.loc[ufo.City.isnull()].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18241, 5)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# examine the number of rows and columns\n",
    "ufo.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2486, 5)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if 'any' values are missing in a row, then drop that row\n",
    "ufo.dropna(how='any').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18241, 5)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily\n",
    "ufo.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(18241, 5)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if 'all' values are missing in a row, then drop that row (none are dropped in this case)\n",
    "ufo.dropna(how='all').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15576, 5)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row\n",
    "ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LIGHT       2803\n",
       "DISK        2122\n",
       "TRIANGLE    1889\n",
       "OTHER       1402\n",
       "CIRCLE      1365\n",
       "Name: Shape Reported, dtype: int64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'value_counts' does not include missing values by default\n",
    "ufo['Shape Reported'].value_counts().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LIGHT       2803\n",
       "NaN         2644\n",
       "DISK        2122\n",
       "TRIANGLE    1889\n",
       "OTHER       1402\n",
       "Name: Shape Reported, dtype: int64"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# explicitly include missing values\n",
    "ufo['Shape Reported'].value_counts(dropna=False).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fill missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "# fill in missing values with a specified value\n",
    "ufo['Shape Reported'].fillna(value='VARIOUS', inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "VARIOUS     2977\n",
       "LIGHT       2803\n",
       "DISK        2122\n",
       "TRIANGLE    1889\n",
       "OTHER       1402\n",
       "Name: Shape Reported, dtype: int64"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# confirm that the missing values were filled in\n",
    "ufo['Shape Reported'].value_counts().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>UNKNOWN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca         UNKNOWN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro         UNKNOWN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke         UNKNOWN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene         UNKNOWN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair         UNKNOWN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fill in missing values\n",
    "ufo.fillna(value='UNKNOWN').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>Grant Park</td>\n",
       "      <td>RED</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>Spirit Lake</td>\n",
       "      <td>RED</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IA</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>VARIOUS</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>Ybor</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>FL</td>\n",
       "      <td>12/31/2000 23:59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              City Colors Reported Shape Reported State              Time\n",
       "18236   Grant Park             RED       TRIANGLE    IL  12/31/2000 23:00\n",
       "18237  Spirit Lake             RED           DISK    IA  12/31/2000 23:00\n",
       "18238  Eagle River             RED        VARIOUS    WI  12/31/2000 23:45\n",
       "18239  Eagle River             RED          LIGHT    WI  12/31/2000 23:45\n",
       "18240         Ybor             NaN           OVAL    FL  12/31/2000 23:59"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fill missing values using \"backward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n",
    "ufo.fillna(method='bfill').tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>18236</th>\n",
       "      <td>Grant Park</td>\n",
       "      <td>RED</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18237</th>\n",
       "      <td>Spirit Lake</td>\n",
       "      <td>RED</td>\n",
       "      <td>DISK</td>\n",
       "      <td>IA</td>\n",
       "      <td>12/31/2000 23:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18238</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>VARIOUS</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18239</th>\n",
       "      <td>Eagle River</td>\n",
       "      <td>RED</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>WI</td>\n",
       "      <td>12/31/2000 23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18240</th>\n",
       "      <td>Ybor</td>\n",
       "      <td>RED</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>FL</td>\n",
       "      <td>12/31/2000 23:59</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              City Colors Reported Shape Reported State              Time\n",
       "18236   Grant Park             RED       TRIANGLE    IL  12/31/2000 23:00\n",
       "18237  Spirit Lake             RED           DISK    IA  12/31/2000 23:00\n",
       "18238  Eagle River             RED        VARIOUS    WI  12/31/2000 23:45\n",
       "18239  Eagle River             RED          LIGHT    WI  12/31/2000 23:45\n",
       "18240         Ybor             RED           OVAL    FL  12/31/2000 23:59"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# compare with \"forward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n",
    "ufo.fillna(method='ffill').tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Apply a function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "train = pd.read_csv('pandas/titanic_train.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex</th>\n",
       "      <th>Sex_num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>female</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>female</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>female</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>male</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Sex  Sex_num\n",
       "0    male        1\n",
       "1  female        0\n",
       "2  female        0\n",
       "3  female        0\n",
       "4    male        1"
      ]
     },
     "execution_count": 137,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# map 'female' to 0 and 'male' to 1\n",
    "train['Sex_num'] = train.Sex.map({'female': 0, 'male': 1})\n",
    "train.loc[0:4, ['Sex', 'Sex_num']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Name</th>\n",
       "      <th>Name_length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Name  Name_length\n",
       "0                            Braund, Mr. Owen Harris           23\n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...           51\n",
       "2                             Heikkinen, Miss. Laina           22\n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)           44\n",
       "4                           Allen, Mr. William Henry           24"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the length of each string in the 'Name' Series\n",
    "train['Name_length'] = train['Name'].apply(len)\n",
    "train.loc[0:4, ['Name', 'Name_length']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                           [Braund,  Mr. Owen Harris]\n",
       "1    [Cumings,  Mrs. John Bradley (Florence Briggs ...\n",
       "2                            [Heikkinen,  Miss. Laina]\n",
       "3      [Futrelle,  Mrs. Jacques Heath (Lily May Peel)]\n",
       "4                          [Allen,  Mr. William Henry]\n",
       "Name: Name, dtype: object"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use a string method to split the 'Name' Series at commas (returns a Series of lists)\n",
    "train['Name'].str.split(',').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       Braund\n",
       "1      Cumings\n",
       "2    Heikkinen\n",
       "3     Futrelle\n",
       "4        Allen\n",
       "Name: Name, dtype: object"
      ]
     },
     "execution_count": 143,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#  use a lambda function\n",
    "train['Name'].str.split(',').apply(lambda x: x[0]).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drinks = pd.read_csv('pandas/drinks.csv')\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "beer_servings      376\n",
       "spirit_servings    438\n",
       "wine_servings      370\n",
       "dtype: int64"
      ]
     },
     "execution_count": 146,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# apply the 'max' function along axis 0 to calculate the maximum value in each column\n",
    "drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>89.0</td>\n",
       "      <td>132.0</td>\n",
       "      <td>54.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>25.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>245.0</td>\n",
       "      <td>138.0</td>\n",
       "      <td>312.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>217.0</td>\n",
       "      <td>57.0</td>\n",
       "      <td>45.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   beer_servings  spirit_servings  wine_servings\n",
       "0            0.0              0.0            0.0\n",
       "1           89.0            132.0           54.0\n",
       "2           25.0              0.0           14.0\n",
       "3          245.0            138.0          312.0\n",
       "4          217.0             57.0           45.0"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# convert every DataFrame element into a float\n",
    "drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89.0</td>\n",
       "      <td>132.0</td>\n",
       "      <td>54.0</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245.0</td>\n",
       "      <td>138.0</td>\n",
       "      <td>312.0</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217.0</td>\n",
       "      <td>57.0</td>\n",
       "      <td>45.0</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan            0.0              0.0            0.0   \n",
       "1      Albania           89.0            132.0           54.0   \n",
       "2      Algeria           25.0              0.0           14.0   \n",
       "3      Andorra          245.0            138.0          312.0   \n",
       "4       Angola          217.0             57.0           45.0   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# overwrite the existing DataFrame columns\n",
    "drinks.loc[:, 'beer_servings':\n",
    "           'wine_servings'] = drinks.loc[:, 'beer_servings':\n",
    "                                         'wine_servings'].applymap(float)\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reshape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Concat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)\n",
    "pd.concat([drinks, people], axis=1).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "toc-hr-collapsed": true,
    "toc-nb-collapsed": true
   },
   "source": [
    "### Merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>GoldenEye (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Four Rooms (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Get Shorty (1995)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Copycat (1995)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id              title\n",
       "0         1   Toy Story (1995)\n",
       "1         2   GoldenEye (1995)\n",
       "2         3  Four Rooms (1995)\n",
       "3         4  Get Shorty (1995)\n",
       "4         5     Copycat (1995)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_cols = ['movie_id', 'title']\n",
    "movies = pd.read_table('pandas/u.item', sep='|', header=None, names=movie_cols, usecols=[0, 1])\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1682, 2)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "movies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>196</td>\n",
       "      <td>242</td>\n",
       "      <td>3</td>\n",
       "      <td>881250949</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>186</td>\n",
       "      <td>302</td>\n",
       "      <td>3</td>\n",
       "      <td>891717742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>22</td>\n",
       "      <td>377</td>\n",
       "      <td>1</td>\n",
       "      <td>878887116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>244</td>\n",
       "      <td>51</td>\n",
       "      <td>2</td>\n",
       "      <td>880606923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>166</td>\n",
       "      <td>346</td>\n",
       "      <td>1</td>\n",
       "      <td>886397596</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   user_id  movie_id  rating  timestamp\n",
       "0      196       242       3  881250949\n",
       "1      186       302       3  891717742\n",
       "2       22       377       1  878887116\n",
       "3      244        51       2  880606923\n",
       "4      166       346       1  886397596"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
    "ratings = pd.read_table('pandas/u.data', sep='\\t', header=None, names=rating_cols)\n",
    "ratings.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100000, 4)"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>308</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>887736532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>454</th>\n",
       "      <td>287</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>875334088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>957</th>\n",
       "      <td>148</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>877019411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>971</th>\n",
       "      <td>280</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>891700426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1324</th>\n",
       "      <td>66</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>883601324</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      user_id  movie_id  rating  timestamp\n",
       "24        308         1       4  887736532\n",
       "454       287         1       5  875334088\n",
       "957       148         1       4  877019411\n",
       "971       280         1       4  891700426\n",
       "1324       66         1       3  883601324"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.query(\"movie_id == 1\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['movie_id', 'title', 'user_id', 'rating', 'timestamp'], dtype='object')"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_ratings = pd.merge(movies, ratings)\n",
    "movie_ratings.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie_id</th>\n",
       "      <th>title</th>\n",
       "      <th>user_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>308</td>\n",
       "      <td>4</td>\n",
       "      <td>887736532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>287</td>\n",
       "      <td>5</td>\n",
       "      <td>875334088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>148</td>\n",
       "      <td>4</td>\n",
       "      <td>877019411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>280</td>\n",
       "      <td>4</td>\n",
       "      <td>891700426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>66</td>\n",
       "      <td>3</td>\n",
       "      <td>883601324</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movie_id             title  user_id  rating  timestamp\n",
       "0         1  Toy Story (1995)      308       4  887736532\n",
       "1         1  Toy Story (1995)      287       5  875334088\n",
       "2         1  Toy Story (1995)      148       4  877019411\n",
       "3         1  Toy Story (1995)      280       4  891700426\n",
       "4         1  Toy Story (1995)       66       3  883601324"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_ratings.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1682, 2)\n",
      "(100000, 4)\n",
      "(100000, 5)\n"
     ]
    }
   ],
   "source": [
    "print(movies.shape)\n",
    "print(ratings.shape)\n",
    "print(movie_ratings.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['m_id', 'title'], dtype='object')"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies.columns = ['m_id', 'title']\n",
    "movies.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>m_id</th>\n",
       "      <th>title</th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>308</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>887736532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>287</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>875334088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>148</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>877019411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>280</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>891700426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>66</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>883601324</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   m_id             title  user_id  movie_id  rating  timestamp\n",
       "0     1  Toy Story (1995)      308         1       4  887736532\n",
       "1     1  Toy Story (1995)      287         1       5  875334088\n",
       "2     1  Toy Story (1995)      148         1       4  877019411\n",
       "3     1  Toy Story (1995)      280         1       4  891700426\n",
       "4     1  Toy Story (1995)       66         1       3  883601324"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(movies, ratings, left_on='m_id', right_on='movie_id').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>308</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>887736532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>454</th>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>287</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>875334088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>957</th>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>148</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>877019411</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>971</th>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>280</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>891700426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1324</th>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>66</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>883601324</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 title  user_id  movie_id  rating  timestamp\n",
       "24    Toy Story (1995)      308         1       4  887736532\n",
       "454   Toy Story (1995)      287         1       5  875334088\n",
       "957   Toy Story (1995)      148         1       4  877019411\n",
       "971   Toy Story (1995)      280         1       4  891700426\n",
       "1324  Toy Story (1995)       66         1       3  883601324"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(movies, ratings, left_index=True, right_on='movie_id').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>user_id</th>\n",
       "      <th>movie_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Toy Story (1995)</td>\n",
       "      <td>186</td>\n",
       "      <td>302</td>\n",
       "      <td>3</td>\n",
       "      <td>891717742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>GoldenEye (1995)</td>\n",
       "      <td>22</td>\n",
       "      <td>377</td>\n",
       "      <td>1</td>\n",
       "      <td>878887116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Four Rooms (1995)</td>\n",
       "      <td>244</td>\n",
       "      <td>51</td>\n",
       "      <td>2</td>\n",
       "      <td>880606923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Get Shorty (1995)</td>\n",
       "      <td>166</td>\n",
       "      <td>346</td>\n",
       "      <td>1</td>\n",
       "      <td>886397596</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Copycat (1995)</td>\n",
       "      <td>298</td>\n",
       "      <td>474</td>\n",
       "      <td>4</td>\n",
       "      <td>884182806</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               title  user_id  movie_id  rating  timestamp\n",
       "1   Toy Story (1995)      186       302       3  891717742\n",
       "2   GoldenEye (1995)       22       377       1  878887116\n",
       "3  Four Rooms (1995)      244        51       2  880606923\n",
       "4  Get Shorty (1995)      166       346       1  886397596\n",
       "5     Copycat (1995)      298       474       4  884182806"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(movies, ratings, left_index=True, right_index=True).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Merging with MultiIndexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>31.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close\n",
       "Symbol Date              \n",
       "AAPL   2016-10-03  112.52\n",
       "       2016-10-04  113.00\n",
       "       2016-10-05  113.05\n",
       "CSCO   2016-10-03   31.50\n",
       "       2016-10-04   31.35\n",
       "       2016-10-05   31.59\n",
       "MSFT   2016-10-03   57.42\n",
       "       2016-10-04   57.24\n",
       "       2016-10-05   57.64"
      ]
     },
     "execution_count": 130,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "close = pd.read_csv('pandas/stocks.csv', usecols=[0, 1, 3], index_col=['Symbol', 'Date']).sort_index()\n",
    "close"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Volume\n",
       "Symbol Date                \n",
       "AAPL   2016-10-03  21701800\n",
       "       2016-10-04  29736800\n",
       "       2016-10-05  21453100\n",
       "CSCO   2016-10-03  14070500\n",
       "       2016-10-04  18460400\n",
       "       2016-10-05  11808600\n",
       "MSFT   2016-10-03  19189500\n",
       "       2016-10-04  20085900\n",
       "       2016-10-05  16726400"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "volume = pd.read_csv('pandas/stocks.csv', usecols=[0, 2, 3], index_col=['Symbol', 'Date']).sort_index()\n",
    "volume"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">AAPL</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">CSCO</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">MSFT</th>\n",
       "      <th>2016-10-03</th>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-04</th>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-10-05</th>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    Close    Volume\n",
       "Symbol Date                        \n",
       "AAPL   2016-10-03  112.52  21701800\n",
       "       2016-10-04  113.00  29736800\n",
       "       2016-10-05  113.05  21453100\n",
       "CSCO   2016-10-03   31.50  14070500\n",
       "       2016-10-04   31.35  18460400\n",
       "       2016-10-05   31.59  11808600\n",
       "MSFT   2016-10-03   57.42  19189500\n",
       "       2016-10-04   57.24  20085900\n",
       "       2016-10-05   57.64  16726400"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "both = pd.merge(close, volume, left_index=True, right_index=True)\n",
    "both"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Symbol</th>\n",
       "      <th>Date</th>\n",
       "      <th>Close</th>\n",
       "      <th>Volume</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AAPL</td>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>112.52</td>\n",
       "      <td>21701800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AAPL</td>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>113.00</td>\n",
       "      <td>29736800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AAPL</td>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>113.05</td>\n",
       "      <td>21453100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CSCO</td>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>31.50</td>\n",
       "      <td>14070500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CSCO</td>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>31.35</td>\n",
       "      <td>18460400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>CSCO</td>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>31.59</td>\n",
       "      <td>11808600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>MSFT</td>\n",
       "      <td>2016-10-03</td>\n",
       "      <td>57.42</td>\n",
       "      <td>19189500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>MSFT</td>\n",
       "      <td>2016-10-04</td>\n",
       "      <td>57.24</td>\n",
       "      <td>20085900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>MSFT</td>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>57.64</td>\n",
       "      <td>16726400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Symbol        Date   Close    Volume\n",
       "0   AAPL  2016-10-03  112.52  21701800\n",
       "1   AAPL  2016-10-04  113.00  29736800\n",
       "2   AAPL  2016-10-05  113.05  21453100\n",
       "3   CSCO  2016-10-03   31.50  14070500\n",
       "4   CSCO  2016-10-04   31.35  18460400\n",
       "5   CSCO  2016-10-05   31.59  11808600\n",
       "6   MSFT  2016-10-03   57.42  19189500\n",
       "7   MSFT  2016-10-04   57.24  20085900\n",
       "8   MSFT  2016-10-05   57.64  16726400"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "both.reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>num</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>green</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yellow</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    color  num\n",
       "0   green    1\n",
       "1  yellow    2\n",
       "2     red    3"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "A = pd.DataFrame({'color': ['green', 'yellow', 'red'], 'num': [1, 2, 3]})\n",
    "A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>green</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yellow</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>pink</td>\n",
       "      <td>L</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    color size\n",
       "0   green    S\n",
       "1  yellow    M\n",
       "2    pink    L"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "B = pd.DataFrame({'color': ['green', 'yellow', 'pink'], 'size':['S', 'M', 'L']})\n",
    "B"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Inner join: Only include observations found in both A and B:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>num</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>green</td>\n",
       "      <td>1</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yellow</td>\n",
       "      <td>2</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    color  num size\n",
       "0   green    1    S\n",
       "1  yellow    2    M"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(A, B, how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Outer join: Include observations found in either A or B:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>num</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>green</td>\n",
       "      <td>1.0</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yellow</td>\n",
       "      <td>2.0</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>pink</td>\n",
       "      <td>NaN</td>\n",
       "      <td>L</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    color  num size\n",
       "0   green  1.0    S\n",
       "1  yellow  2.0    M\n",
       "2     red  3.0  NaN\n",
       "3    pink  NaN    L"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(A, B, how='outer')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Left join: Include all observations found in A:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>num</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>green</td>\n",
       "      <td>1</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yellow</td>\n",
       "      <td>2</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>red</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    color  num size\n",
       "0   green    1    S\n",
       "1  yellow    2    M\n",
       "2     red    3  NaN"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(A, B, how='left')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Right join: Include all observations found in B:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>color</th>\n",
       "      <th>num</th>\n",
       "      <th>size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>green</td>\n",
       "      <td>1.0</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>yellow</td>\n",
       "      <td>2.0</td>\n",
       "      <td>M</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>pink</td>\n",
       "      <td>NaN</td>\n",
       "      <td>L</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    color  num size\n",
       "0   green  1.0    S\n",
       "1  yellow  2.0    M\n",
       "2    pink  NaN    L"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(A, B, how='right')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## String & Time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Change the data type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# read a dataset of alcohol consumption into a DataFrame\n",
    "drinks = pd.read_csv('drinks.csv')\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                          object\n",
       "beer_servings                     int64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "continent                        object\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# examine the data type of each Series\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                          object\n",
       "beer_servings                   float64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "continent                        object\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# change the data type of an existing Series\n",
    "drinks['beer_servings'] = drinks.beer_servings.astype(float)\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                          object\n",
       "beer_servings                   float64\n",
       "spirit_servings                   int64\n",
       "wine_servings                     int64\n",
       "total_litres_of_pure_alcohol    float64\n",
       "continent                        object\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# alternatively, change the data type of a Series while reading in a file\n",
    "drinks = pd.read_csv('drinks.csv', dtype={'beer_servings': float})\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use string methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>order_id</th>\n",
       "      <th>quantity</th>\n",
       "      <th>item_name</th>\n",
       "      <th>choice_description</th>\n",
       "      <th>item_price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Chips and Fresh Tomato Salsa</td>\n",
       "      <td>NaN</td>\n",
       "      <td>$2.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Izze</td>\n",
       "      <td>[Clementine]</td>\n",
       "      <td>$3.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Nantucket Nectar</td>\n",
       "      <td>[Apple]</td>\n",
       "      <td>$3.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Chips and Tomatillo-Green Chili Salsa</td>\n",
       "      <td>NaN</td>\n",
       "      <td>$2.39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Tomatillo-Red Chili Salsa (Hot), [Black Beans...</td>\n",
       "      <td>$16.98</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   order_id  quantity                              item_name  \\\n",
       "0         1         1           Chips and Fresh Tomato Salsa   \n",
       "1         1         1                                   Izze   \n",
       "2         1         1                       Nantucket Nectar   \n",
       "3         1         1  Chips and Tomatillo-Green Chili Salsa   \n",
       "4         2         2                           Chicken Bowl   \n",
       "\n",
       "                                  choice_description item_price  \n",
       "0                                                NaN     $2.39   \n",
       "1                                       [Clementine]     $3.39   \n",
       "2                                            [Apple]     $3.39   \n",
       "3                                                NaN     $2.39   \n",
       "4  [Tomatillo-Red Chili Salsa (Hot), [Black Beans...    $16.98   "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# read a dataset of Chipotle orders into a DataFrame\n",
    "orders = pd.read_table('chipotle.tsv')\n",
    "orders.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0             CHIPS AND FRESH TOMATO SALSA\n",
       "1                                     IZZE\n",
       "2                         NANTUCKET NECTAR\n",
       "3    CHIPS AND TOMATILLO-GREEN CHILI SALSA\n",
       "4                             CHICKEN BOWL\n",
       "Name: item_name, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# string methods for pandas Series are accessed via 'str'\n",
    "orders.item_name.str.upper().head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3    False\n",
       "4     True\n",
       "Name: item_name, dtype: bool"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# string method 'contains' checks for a substring and returns a boolean Series\n",
    "orders.item_name.str.contains('Chicken').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>order_id</th>\n",
       "      <th>quantity</th>\n",
       "      <th>item_name</th>\n",
       "      <th>choice_description</th>\n",
       "      <th>item_price</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Tomatillo-Red Chili Salsa (Hot), [Black Beans...</td>\n",
       "      <td>$16.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...</td>\n",
       "      <td>$10.98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Crispy Tacos</td>\n",
       "      <td>[Roasted Chili Corn Salsa, [Fajita Vegetables,...</td>\n",
       "      <td>$8.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Soft Tacos</td>\n",
       "      <td>[Roasted Chili Corn Salsa, [Rice, Black Beans,...</td>\n",
       "      <td>$8.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>Chicken Bowl</td>\n",
       "      <td>[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...</td>\n",
       "      <td>$11.25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    order_id  quantity             item_name  \\\n",
       "4          2         2          Chicken Bowl   \n",
       "5          3         1          Chicken Bowl   \n",
       "11         6         1  Chicken Crispy Tacos   \n",
       "12         6         1    Chicken Soft Tacos   \n",
       "13         7         1          Chicken Bowl   \n",
       "\n",
       "                                   choice_description item_price  \n",
       "4   [Tomatillo-Red Chili Salsa (Hot), [Black Beans...    $16.98   \n",
       "5   [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...    $10.98   \n",
       "11  [Roasted Chili Corn Salsa, [Fajita Vegetables,...     $8.75   \n",
       "12  [Roasted Chili Corn Salsa, [Rice, Black Beans,...     $8.75   \n",
       "13  [Fresh Tomato Salsa, [Fajita Vegetables, Rice,...    $11.25   "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# use the boolean Series to filter the DataFrame\n",
    "orders[orders.item_name.str.contains('Chicken')].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/93/795zm8c93m16_92qkk86t0_r0000gn/T/ipykernel_64526/7676934.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will*not* be treated as literal strings when regex=True.\n",
      "  orders.choice_description.str.replace('[', '').str.replace(']', '').head()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0                                                  NaN\n",
       "1                                           Clementine\n",
       "2                                                Apple\n",
       "3                                                  NaN\n",
       "4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...\n",
       "Name: choice_description, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# string methods can be chained together\n",
    "orders.choice_description.str.replace('[', '').str.replace(']', '').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/93/795zm8c93m16_92qkk86t0_r0000gn/T/ipykernel_64526/1086997848.py:2: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  orders.choice_description.str.replace('[\\[\\]]', '').head()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0                                                  NaN\n",
       "1                                           Clementine\n",
       "2                                                Apple\n",
       "3                                                  NaN\n",
       "4    Tomatillo-Red Chili Salsa (Hot), Black Beans, ...\n",
       "Name: choice_description, dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# many pandas string methods support regular expressions (regex)\n",
    "orders.choice_description.str.replace('[\\[\\]]', '').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dates & Times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# read a dataset of UFO reports into a DataFrame\n",
    "ufo = pd.read_csv('pandas/ufo.csv')\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City               object\n",
       "Colors Reported    object\n",
       "Shape Reported     object\n",
       "State              object\n",
       "Time               object\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 'Time' is currently stored as a string\n",
    "ufo.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>1930-06-01 22:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>1930-06-30 20:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>1931-02-15 14:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>1931-06-01 13:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>1933-04-18 19:00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State  \\\n",
       "0                Ithaca             NaN       TRIANGLE    NY   \n",
       "1           Willingboro             NaN          OTHER    NJ   \n",
       "2               Holyoke             NaN           OVAL    CO   \n",
       "3               Abilene             NaN           DISK    KS   \n",
       "4  New York Worlds Fair             NaN          LIGHT    NY   \n",
       "\n",
       "                 Time  \n",
       "0 1930-06-01 22:00:00  \n",
       "1 1930-06-30 20:00:00  \n",
       "2 1931-02-15 14:00:00  \n",
       "3 1931-06-01 13:00:00  \n",
       "4 1933-04-18 19:00:00  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# convert 'Time' to datetime format\n",
    "ufo['Time'] = pd.to_datetime(ufo.Time)\n",
    "ufo.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City                       object\n",
       "Colors Reported            object\n",
       "Shape Reported             object\n",
       "State                      object\n",
       "Time               datetime64[ns]\n",
       "dtype: object"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ufo.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Category data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>country</th>\n",
       "      <th>beer_servings</th>\n",
       "      <th>spirit_servings</th>\n",
       "      <th>wine_servings</th>\n",
       "      <th>total_litres_of_pure_alcohol</th>\n",
       "      <th>continent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Afghanistan</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Asia</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Albania</td>\n",
       "      <td>89</td>\n",
       "      <td>132</td>\n",
       "      <td>54</td>\n",
       "      <td>4.9</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Algeria</td>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>14</td>\n",
       "      <td>0.7</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Andorra</td>\n",
       "      <td>245</td>\n",
       "      <td>138</td>\n",
       "      <td>312</td>\n",
       "      <td>12.4</td>\n",
       "      <td>Europe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Angola</td>\n",
       "      <td>217</td>\n",
       "      <td>57</td>\n",
       "      <td>45</td>\n",
       "      <td>5.9</td>\n",
       "      <td>Africa</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       country  beer_servings  spirit_servings  wine_servings  \\\n",
       "0  Afghanistan              0                0              0   \n",
       "1      Albania             89              132             54   \n",
       "2      Algeria             25                0             14   \n",
       "3      Andorra            245              138            312   \n",
       "4       Angola            217               57             45   \n",
       "\n",
       "   total_litres_of_pure_alcohol continent  \n",
       "0                           0.0      Asia  \n",
       "1                           4.9    Europe  \n",
       "2                           0.7    Africa  \n",
       "3                          12.4    Europe  \n",
       "4                           5.9    Africa  "
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# read a dataset of alcohol consumption into a DataFrame\n",
    "drinks = pd.read_csv('drinks.csv')\n",
    "drinks.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index                             128\n",
       "country                         12588\n",
       "beer_servings                    1544\n",
       "spirit_servings                  1544\n",
       "wine_servings                    1544\n",
       "total_litres_of_pure_alcohol     1544\n",
       "continent                       12332\n",
       "dtype: int64"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate the memory usage for each Series (in bytes)\n",
    "drinks.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "country                           object\n",
       "beer_servings                      int64\n",
       "spirit_servings                    int64\n",
       "wine_servings                      int64\n",
       "total_litres_of_pure_alcohol     float64\n",
       "continent                       category\n",
       "dtype: object"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'category' data type to store the 'continent' strings as integers\n",
    "drinks['continent'] = drinks.continent.astype('category')\n",
    "drinks.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      Asia\n",
       "1    Europe\n",
       "2    Africa\n",
       "3    Europe\n",
       "4    Africa\n",
       "Name: continent, dtype: category\n",
       "Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 'continent' Series appears to be unchanged\n",
    "drinks.continent.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    2\n",
       "2    0\n",
       "3    2\n",
       "4    0\n",
       "dtype: int8"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)\n",
    "drinks.continent.cat.codes.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index                             128\n",
       "country                         12588\n",
       "beer_servings                    1544\n",
       "spirit_servings                  1544\n",
       "wine_servings                    1544\n",
       "total_litres_of_pure_alcohol     1544\n",
       "continent                         756\n",
       "dtype: int64"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# memory usage has been drastically reduced\n",
    "drinks.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index                             128\n",
       "country                         17142\n",
       "beer_servings                    1544\n",
       "spirit_servings                  1544\n",
       "wine_servings                    1544\n",
       "total_litres_of_pure_alcohol     1544\n",
       "continent                         756\n",
       "dtype: int64"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# repeat this process for the 'country' Series\n",
    "drinks['country'] = drinks.country.astype('category')\n",
    "drinks.memory_usage(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# memory usage increased because we created 193 categories\n",
    "drinks.country.cat.categories"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dummy variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "train = pd.read_csv('pandas/titanic_train.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>female</th>\n",
       "      <th>male</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   female  male\n",
       "0       0     1\n",
       "1       1     0\n",
       "2       1     0\n",
       "3       1     0\n",
       "4       0     1"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use 'get_dummies' to create one column for every possible value\n",
    "pd.get_dummies(train.Sex).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex_male</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sex_male\n",
       "0         1\n",
       "1         0\n",
       "2         0\n",
       "3         0\n",
       "4         1"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop the first dummy variable ('female') using the 'iloc' method\n",
    "# add a prefix to identify the source of the dummy variables\n",
    "pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Embarked_C  Embarked_Q  Embarked_S\n",
       "0           0           0           1\n",
       "1           1           0           0\n",
       "2           0           0           1\n",
       "3           0           0           1\n",
       "4           0           0           1\n",
       "5           0           1           0\n",
       "6           0           0           1\n",
       "7           0           0           1\n",
       "8           0           0           1\n",
       "9           1           0           0"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use 'get_dummies' with a feature that has 3 possible values\n",
    "pd.get_dummies(train.Embarked, prefix='Embarked').head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Embarked_Q  Embarked_S\n",
       "0           0           1\n",
       "1           0           0\n",
       "2           0           1\n",
       "3           0           1\n",
       "4           0           1\n",
       "5           1           0\n",
       "6           0           1\n",
       "7           0           1\n",
       "8           0           1\n",
       "9           0           0"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop the first dummy variable ('C')\n",
    "pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:].head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  Embarked_Q  Embarked_S  \n",
       "0      0         A/5 21171   7.2500   NaN        S           0           1  \n",
       "1      0          PC 17599  71.2833   C85        C           0           0  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S           0           1  \n",
       "3      0            113803  53.1000  C123        S           0           1  \n",
       "4      0            373450   8.0500   NaN        S           0           1  "
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# save the DataFrame of dummy variables and concatenate them to the original DataFrame\n",
    "embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]\n",
    "train = pd.concat([train, embarked_dummies], axis=1)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name     Sex   Age  SibSp  \\\n",
       "0                            Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                             Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                           Allen, Mr. William Henry    male  35.0      0   \n",
       "\n",
       "   Parch            Ticket     Fare Cabin Embarked  \n",
       "0      0         A/5 21171   7.2500   NaN        S  \n",
       "1      0          PC 17599  71.2833   C85        C  \n",
       "2      0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3      0            113803  53.1000  C123        S  \n",
       "4      0            373450   8.0500   NaN        S  "
      ]
     },
     "execution_count": 126,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reset the DataFrame\n",
    "train = pd.read_csv('pandas/titanic_train.csv')\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Sex_female</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name   Age  SibSp  Parch  \\\n",
       "0                            Braund, Mr. Owen Harris  22.0      1      0   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   \n",
       "2                             Heikkinen, Miss. Laina  26.0      0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   \n",
       "4                           Allen, Mr. William Henry  35.0      0      0   \n",
       "\n",
       "             Ticket     Fare Cabin  Sex_female  Sex_male  Embarked_C  \\\n",
       "0         A/5 21171   7.2500   NaN           0         1           0   \n",
       "1          PC 17599  71.2833   C85           1         0           1   \n",
       "2  STON/O2. 3101282   7.9250   NaN           1         0           0   \n",
       "3            113803  53.1000  C123           1         0           0   \n",
       "4            373450   8.0500   NaN           0         1           0   \n",
       "\n",
       "   Embarked_Q  Embarked_S  \n",
       "0           0           1  \n",
       "1           0           0  \n",
       "2           0           1  \n",
       "3           0           1  \n",
       "4           0           1  "
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)\n",
    "pd.get_dummies(train, columns=['Sex', 'Embarked']).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Sex_male</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  \\\n",
       "0            1         0       3   \n",
       "1            2         1       1   \n",
       "2            3         1       3   \n",
       "3            4         1       1   \n",
       "4            5         0       3   \n",
       "\n",
       "                                                Name   Age  SibSp  Parch  \\\n",
       "0                            Braund, Mr. Owen Harris  22.0      1      0   \n",
       "1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   \n",
       "2                             Heikkinen, Miss. Laina  26.0      0      0   \n",
       "3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   \n",
       "4                           Allen, Mr. William Henry  35.0      0      0   \n",
       "\n",
       "             Ticket     Fare Cabin  Sex_male  Embarked_Q  Embarked_S  \n",
       "0         A/5 21171   7.2500   NaN         1           0           1  \n",
       "1          PC 17599  71.2833   C85         0           0           0  \n",
       "2  STON/O2. 3101282   7.9250   NaN         0           0           1  \n",
       "3            113803  53.1000  C123         0           0           1  \n",
       "4            373450   8.0500   NaN         1           0           1  "
      ]
     },
     "execution_count": 128,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature\n",
    "pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cardinal data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "0  100       good\n",
       "1  101  very good\n",
       "2  102       good\n",
       "3  103  excellent"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a small DataFrame from a dictionary\n",
    "df = pd.DataFrame({\n",
    "    'ID': [100, 101, 102, 103],\n",
    "    'quality': ['good', 'very good', 'good', 'excellent']\n",
    "})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "3  103  excellent\n",
       "0  100       good\n",
       "2  102       good\n",
       "1  101  very good"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the DataFrame by the 'quality' Series (alphabetical order)\n",
    "df.sort_values('quality')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "from pandas.api.types import CategoricalDtype"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         good\n",
       "1    very good\n",
       "2         good\n",
       "3    excellent\n",
       "Name: quality, dtype: category\n",
       "Categories (3, object): ['good' < 'very good' < 'excellent']"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# define a logical ordering for the categories\n",
    "cats = ['good', 'very good', 'excellent']\n",
    "cat_type = CategoricalDtype(categories=cats, ordered=True)\n",
    "df['quality'] = df.quality.astype(cat_type)\n",
    "df.quality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>102</td>\n",
       "      <td>good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "0  100       good\n",
       "2  102       good\n",
       "1  101  very good\n",
       "3  103  excellent"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sort the DataFrame by the 'quality' Series (logical order)\n",
    "df.sort_values('quality')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>quality</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>101</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>103</td>\n",
       "      <td>excellent</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    ID    quality\n",
       "1  101  very good\n",
       "3  103  excellent"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# comparison operators work with ordered categories\n",
    "df.loc[df.quality > 'good', :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>217</th>\n",
       "      <td>Norridgewock</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>ME</td>\n",
       "      <td>9/15/1952 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12282</th>\n",
       "      <td>Ipava</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>IL</td>\n",
       "      <td>10/1/1998 21:15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17933</th>\n",
       "      <td>Ellinwood</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FIREBALL</td>\n",
       "      <td>KS</td>\n",
       "      <td>11/13/2000 22:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               City Colors Reported Shape Reported State              Time\n",
       "217    Norridgewock             NaN           DISK    ME   9/15/1952 14:00\n",
       "12282         Ipava             NaN       TRIANGLE    IL   10/1/1998 21:15\n",
       "17933     Ellinwood             NaN       FIREBALL    KS  11/13/2000 22:00"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use the 'random_state' parameter for reproducibility\n",
    "ufo.sample(n=3, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# sample 75% of the DataFrame's rows without replacement\n",
    "train = ufo.sample(frac=0.75, random_state=99)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Valley City</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>ND</td>\n",
       "      <td>9/15/1934 15:30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Eklutna</td>\n",
       "      <td>NaN</td>\n",
       "      <td>CIGAR</td>\n",
       "      <td>AK</td>\n",
       "      <td>10/15/1936 17:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Waterloo</td>\n",
       "      <td>NaN</td>\n",
       "      <td>FIREBALL</td>\n",
       "      <td>AL</td>\n",
       "      <td>6/1/1939 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Keokuk</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>IA</td>\n",
       "      <td>7/7/1939 2:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                    City Colors Reported Shape Reported State  \\\n",
       "4   New York Worlds Fair             NaN          LIGHT    NY   \n",
       "5            Valley City             NaN           DISK    ND   \n",
       "8                Eklutna             NaN          CIGAR    AK   \n",
       "11              Waterloo             NaN       FIREBALL    AL   \n",
       "13                Keokuk             NaN           OVAL    IA   \n",
       "\n",
       "                Time  \n",
       "4    4/18/1933 19:00  \n",
       "5    9/15/1934 15:30  \n",
       "8   10/15/1936 17:00  \n",
       "11    6/1/1939 20:00  \n",
       "13     7/7/1939 2:00  "
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# store the remaining 25% of the rows in another DataFrame\n",
    "test = ufo.loc[~ufo.index.isin(train.index), :]\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## List the name of Dateframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mRunning cells with '/usr/local/bin/python3.11' requires the ipykernel package.\n",
      "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n",
      "\u001b[1;31mCommand: '/usr/local/bin/python3.11 -m pip install ipykernel -U --user --force-reinstall'"
     ]
    }
   ],
   "source": [
    "# https://stackoverflow.com/questions/44835358/pandas-list-of-dataframe-names\n",
    "g = globals()\n",
    "\n",
    "df_names = [k for k, v in g.items() if isinstance(v, pd.DataFrame)]\n",
    "\n",
    "df_list = list(filter(lambda x: 'df' in x, df_names))\n",
    "\n",
    "df_list\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "p39",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "metadata": {
   "interpreter": {
    "hash": "ff2e645ef952f1284ebef59edbc19ad3cff03e6406671d2dab1ca7ad9588368d"
   }
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "toc-showcode": false,
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "c3a1109010e257dd9f1e593983c951231fd94a0c98fc2b081b8760e1222f1725"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}