{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read and Info" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read tabular data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# read a dataset of movie reviewers\n", "user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n", "users = pd.read_table('./data/movie.user',\n", " sep='|',\n", " header=None,\n", " names=user_cols)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>user_id</th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zip_code</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>85711</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>53</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>94043</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>23</td>\n", " <td>M</td>\n", " <td>writer</td>\n", " <td>32067</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>43537</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>33</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>15213</td>\n", " </tr>\n", " </tbody>\n", 
"</table>\n", "</div>" ], "text/plain": [ " user_id age gender occupation zip_code\n", "0 1 24 M technician 85711\n", "1 2 53 F other 94043\n", "2 3 23 M writer 32067\n", "3 4 24 M technician 43537\n", "4 5 33 F other 15213" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the first 5 rows\n", "users.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# read_csv is equivalent to read_table, except it assumes a comma separator\n", "ufo = pd.read_csv('./data/ufo.csv')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " 
<td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the first 5 rows\n", "ufo.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# specify which columns to include by name\n", "# (head must be called with parentheses; a bare .head only displays the bound method)\n", "pd.read_csv('./data/ufo.csv', usecols=['City', 'State']).head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# specify which columns to include by position (0 is 'City', 4 is 'Time')\n", "pd.read_csv('./data/ufo.csv', usecols=[0, 4]).head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# specify how many rows to read\n", "pd.read_csv('./data/ufo.csv', nrows=3).head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Describe data" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read a dataset of top-rated IMDb movies into a DataFrame\n", "# NOTE: assumes imdb_1000.csv lives in ./data alongside the other datasets -- adjust if not\n", "movies = pd.read_csv('./data/imdb_1000.csv')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", "
<td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... 
" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example method: show the first 5 rows\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(979, 6)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example attribute: number of rows and columns\n", "movies.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "star_rating float64\n", "title object\n", "content_rating object\n", "genre object\n", "duration int64\n", "actors_list object\n", "dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example attribute: data type of each column\n", "movies.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>duration</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>979.000000</td>\n", " <td>979.000000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>7.889785</td>\n", " <td>120.979571</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>0.336069</td>\n", " <td>26.218010</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>7.400000</td>\n", " <td>64.000000</td>\n", " 
</tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>7.600000</td>\n", " <td>102.000000</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>7.800000</td>\n", " <td>117.000000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>8.100000</td>\n", " <td>134.000000</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>9.300000</td>\n", " <td>242.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating duration\n", "count 979.000000 979.000000\n", "mean 7.889785 120.979571\n", "std 0.336069 26.218010\n", "min 7.400000 64.000000\n", "25% 7.600000 102.000000\n", "50% 7.800000 117.000000\n", "75% 8.100000 134.000000\n", "max 9.300000 242.000000" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example method: calculate summary statistics\n", "movies.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>979</td>\n", " <td>976</td>\n", " <td>979</td>\n", " <td>979</td>\n", " </tr>\n", " <tr>\n", " <th>unique</th>\n", " <td>975</td>\n", " <td>12</td>\n", " <td>16</td>\n", " <td>969</td>\n", " </tr>\n", " <tr>\n", " <th>top</th>\n", " <td>Les Miserables</td>\n", " <td>R</td>\n", " <td>Drama</td>\n", " <td>[u'Daniel Radcliffe', u'Emma Watson', u'Rupert...</td>\n", " </tr>\n", " <tr>\n", " <th>freq</th>\n", " 
<td>2</td>\n", " <td>460</td>\n", " <td>278</td>\n", " <td>6</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " title content_rating genre \\\n", "count 979 976 979 \n", "unique 975 12 16 \n", "top Les Miserables R Drama \n", "freq 2 460 278 \n", "\n", " actors_list \n", "count 979 \n", "unique 969 \n", "top [u'Daniel Radcliffe', u'Emma Watson', u'Rupert... \n", "freq 6 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use an optional parameter to the describe method to summarize only 'object' columns\n", "movies.describe(include=['object'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create, Rename & Remove" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a new column" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " <th>Location</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " <td>Ithaca, NY</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " <td>Willingboro, NJ</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " 
<td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " <td>Holyoke, CO</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " <td>Abilene, KS</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " <td>New York Worlds Fair, NY</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time \\\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00 \n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00 \n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00 \n", "3 Abilene NaN DISK KS 6/1/1931 13:00 \n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00 \n", "\n", " Location \n", "0 Ithaca, NY \n", "1 Willingboro, NJ \n", "2 Holyoke, CO \n", "3 Abilene, KS \n", "4 New York Worlds Fair, NY " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a new 'Location' Series (must use bracket notation to define the Series name)\n", "ufo['Location'] = ufo.City + ', ' + ufo.State\n", "ufo.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Rename columns" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read a dataset of UFO reports into a DataFrame\n", "# (same ./data/ path as the earlier cells; a bare 'ufo.csv' is not found from the notebook directory)\n", "ufo = pd.read_csv('./data/ufo.csv')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['City', 'Colors Reported', 'Shape Reported', 'State', 'Time'], dtype='object')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the column names\n", "ufo.columns" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['City', 'Colors_Reported', 'Shape_Reported', 'State', 'Time'], dtype='object')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rename two of the columns by using the 'rename' method\n", "ufo.rename(columns={\n", " 'Colors Reported': 'Colors_Reported',\n", " 'Shape Reported': 'Shape_Reported'\n", "},\n", " inplace=True)\n", "ufo.columns" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# replace all of the column names by overwriting the 'columns' attribute\n", "ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']\n", "ufo.columns = ufo_cols\n", "ufo.columns" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['city', 'colors reported', 'shape reported', 'state', 'time'], dtype='object')" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# replace the column names during the file reading process by using the 'names' parameter\n", "# (header=0 marks the file's first row as the existing header, so it is replaced rather than read as data)\n", "ufo = pd.read_csv('./data/ufo.csv', header=0, names=ufo_cols)\n", "ufo.columns" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['city', 'colors_reported', 'shape_reported', 'state', 'time'], dtype='object')" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# replace all spaces with underscores in the column names by using the 'str.replace' method\n", "ufo.columns = ufo.columns.str.replace(' ',
'_')\n", "ufo.columns" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Remove columns" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# re-read the UFO reports so the original column names are restored after the renaming examples\n", "ufo = pd.read_csv('./data/ufo.csv')\n", "ufo.head()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Shape Reported State Time\n", "0 Ithaca TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro OTHER NJ 6/30/1930 20:00\n", "2 Holyoke OVAL CO 2/15/1931 14:00\n", "3 Abilene DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove a single column (axis=1 refers to columns)\n", "ufo.drop('Colors Reported', axis=1, inplace=True)\n", "ufo.head()" ] },
{ "cell_type": "code",
"execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Shape Reported</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>TRIANGLE</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>OTHER</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>OVAL</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>DISK</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>LIGHT</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Shape Reported Time\n", "0 TRIANGLE 6/1/1930 22:00\n", "1 OTHER 6/30/1930 20:00\n", "2 OVAL 2/15/1931 14:00\n", "3 DISK 6/1/1931 13:00\n", "4 LIGHT 4/18/1933 19:00" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove multiple columns at once\n", "ufo.drop(columns=['City', 'State']).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr 
style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Valley City</td>\n", " <td>DISK</td>\n", " <td>ND</td>\n", " <td>9/15/1934 15:30</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>Crater Lake</td>\n", " <td>CIRCLE</td>\n", " <td>CA</td>\n", " <td>6/15/1935 0:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Shape Reported State Time\n", "2 Holyoke OVAL CO 2/15/1931 14:00\n", "3 Abilene DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair LIGHT NY 4/18/1933 19:00\n", "5 Valley City DISK ND 9/15/1934 15:30\n", "6 Crater Lake CIRCLE CA 6/15/1935 0:00" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# new way to drop rows: specify index\n", "ufo.drop(index=[0, 1]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Select and Filter" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Iterate" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " 
<th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo = pd.read_csv('./data/ufo.csv')\n", "ufo.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ithaca\n", "Willingboro\n", "Holyoke\n", "Abilene\n", "New York Worlds Fair\n", "Valley City\n", "Crater Lake\n", "Alma\n", "Eklutna\n", "Hubbard\n" ] } ], "source": [ "# Series are directly iterable (like a list)\n", "for c in ufo.City[:10]:\n", " print(c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false
} }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 Ithaca NY\n", "1 Willingboro NJ\n", "2 Holyoke CO\n", "3 Abilene KS\n", "4 New York Worlds Fair NY\n", "5 Valley City ND\n", "6 Crater Lake CA\n", "7 Alma MI\n", "8 Eklutna AK\n", "9 Hubbard OR\n" ] } ], "source": [ "# various methods are available to iterate through a DataFrame\n", "for index, row in ufo[:10].iterrows():\n", " print(index, row.City, row.State)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Know about the index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>85711</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>53</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>94043</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>23</td>\n", " <td>M</td>\n", " <td>writer</td>\n", " <td>32067</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>43537</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>33</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>15213</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4\n", "0 1 24 M technician 85711\n", "1 2 53 F 
other 94043\n", "2 3 23 M writer 32067\n", "3 4 24 M technician 43537\n", "4 5 33 F other 15213" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# index and columns both default to integers if you don't define them\n", "pd.read_table('./data/movie.user', header=None, sep='|').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0
0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# read a dataset of alcohol consumption into a DataFrame\n", "drinks = pd.read_csv('./data/drinks.csv')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "RangeIndex(start=0, stop=193, step=1)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# every DataFrame has an index (sometimes called the \"row labels\")\n", "drinks.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',\n", " 'Antigua & Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria',\n", " ...\n", " 'Tanzania', 'USA', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela',\n", " 'Vietnam', 'Yemen', 'Zambia', 'Zimbabwe'],\n", " dtype='object', name='country', length=193)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# set an existing column as the index\n", "drinks.set_index('country', inplace=True)\n", "# 'country' is now the index\n", "drinks.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\"
class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# restore the index name, and move the index back to a column\n", "drinks.index.name = 'country'\n", "drinks.reset_index(inplace=True)\n", "drinks.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select a column" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 Ithaca\n", "1 
Willingboro\n", "2 Holyoke\n", "3 Abilene\n", "4 New York Worlds Fair\n", " ... \n", "18236 Grant Park\n", "18237 Spirit Lake\n", "18238 Eagle River\n", "18239 Eagle River\n", "18240 Ybor\n", "Name: City, Length: 18241, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# select the 'City' Series using bracket notation\n", "ufo['City']\n", "# or equivalently, use dot notation\n", "ufo.City" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select only numeric columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings int64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent object\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# read a dataset of alcohol consumption into a DataFrame, and check the data types\n", "drinks = pd.read_csv('./data/drinks.csv')\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "beer_servings int64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import numpy as np\n", "\n", "drinks.select_dtypes(include=[np.number]).dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select multiple rows and columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", "
.dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# read a dataset of UFO reports into a DataFrame\n", "ufo = pd.read_csv('./data/ufo.csv')\n", "ufo.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "City Ithaca\n", "Colors Reported NaN\n", "Shape Reported TRIANGLE\n", "State NY\n", "Time 6/1/1930 22:00\n", "Name: 0, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# row 0, all columns\n", "ufo.loc[0, :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n",
" .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows 0 and 1 and 2, all columns\n", "ufo.loc[[0, 1, 2], :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " 
<td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows 0 through 2 (inclusive), all columns\n", "ufo.loc[0:2, :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 Ithaca\n", "1 Willingboro\n", "2 Holyoke\n", "Name: City, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows 0 through 2 (inclusive), column 'City'\n", "ufo.loc[0:2, 'City']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>State</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NY</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NJ</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>CO</td>\n", " 
</tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City State\n", "0 Ithaca NY\n", "1 Willingboro NJ\n", "2 Holyoke CO" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows 0 through 2 (inclusive), columns 'City' and 'State'\n", "ufo.loc[0:2, ['City', 'State']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "0 Ithaca NaN TRIANGLE NY\n", "1 Willingboro NaN OTHER NJ\n", "2 Holyoke NaN OVAL CO" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)\n", "ufo.loc[0:2, 'City':'State']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: 
middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>State</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NY</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NJ</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City State\n", "0 Ithaca NY\n", "1 Willingboro NJ" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows in positions 0 and 1, columns in positions 0 and 3\n", "ufo.iloc[[0, 1], [0, 3]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State\n", "0 Ithaca NaN TRIANGLE NY\n", "1 Willingboro NaN OTHER NJ" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows in positions 
0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)\n", "ufo.iloc[0:2, 0:4]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# rows in positions 0 through 2 (exclusive), all columns\n", "ufo.iloc[0:2, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filter" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", 
"<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " <tr>\n", " <th>country</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Argentina</th>\n", " <td>193</td>\n", " <td>25</td>\n", " <td>221</td>\n", " <td>8.3</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Bolivia</th>\n", " <td>167</td>\n", " <td>41</td>\n", " <td>8</td>\n", " <td>3.8</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Brazil</th>\n", " <td>245</td>\n", " <td>145</td>\n", " <td>16</td>\n", " <td>7.2</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Chile</th>\n", " <td>130</td>\n", " <td>124</td>\n", " <td>172</td>\n", " <td>7.6</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Colombia</th>\n", " <td>159</td>\n", " <td>76</td>\n", " <td>3</td>\n", " <td>4.2</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Ecuador</th>\n", " <td>162</td>\n", " <td>74</td>\n", " <td>3</td>\n", " <td>4.2</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Guyana</th>\n", " <td>93</td>\n", " <td>302</td>\n", " <td>1</td>\n", " <td>7.1</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Paraguay</th>\n", " <td>213</td>\n", " <td>117</td>\n", " <td>74</td>\n", " <td>7.3</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Peru</th>\n", " <td>163</td>\n", " <td>160</td>\n", " <td>21</td>\n", " <td>6.1</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Suriname</th>\n", " <td>128</td>\n", " <td>178</td>\n", " <td>7</td>\n", " <td>5.6</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Uruguay</th>\n", " <td>115</td>\n", " 
<td>35</td>\n", " <td>220</td>\n", " <td>6.6</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Venezuela</th>\n", " <td>333</td>\n", " <td>100</td>\n", " <td>3</td>\n", " <td>7.7</td>\n", " <td>South America</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " beer_servings spirit_servings wine_servings \\\n", "country \n", "Argentina 193 25 221 \n", "Bolivia 167 41 8 \n", "Brazil 245 145 16 \n", "Chile 130 124 172 \n", "Colombia 159 76 3 \n", "Ecuador 162 74 3 \n", "Guyana 93 302 1 \n", "Paraguay 213 117 74 \n", "Peru 163 160 21 \n", "Suriname 128 178 7 \n", "Uruguay 115 35 220 \n", "Venezuela 333 100 3 \n", "\n", " total_litres_of_pure_alcohol continent \n", "country \n", "Argentina 8.3 South America \n", "Bolivia 3.8 South America \n", "Brazil 7.2 South America \n", "Chile 7.6 South America \n", "Colombia 4.2 South America \n", "Ecuador 4.2 South America \n", "Guyana 7.1 South America \n", "Paraguay 7.3 South America \n", "Peru 6.1 South America \n", "Suriname 5.6 South America \n", "Uruguay 6.6 South America \n", "Venezuela 7.7 South America " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.loc[drinks.continent=='South America']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6 221\n", "20 8\n", "23 16\n", "35 172\n", "37 3\n", "52 3\n", "72 1\n", "132 74\n", "133 21\n", "163 7\n", "185 220\n", "188 3\n", "Name: wine_servings, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.loc[drinks.continent=='South America', 'wine_servings']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", 
"</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " <tr>\n", " <th>country</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Argentina</th>\n", " <td>193</td>\n", " <td>25</td>\n", " <td>221</td>\n", " <td>8.3</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Bolivia</th>\n", " <td>167</td>\n", " <td>41</td>\n", " <td>8</td>\n", " <td>3.8</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Brazil</th>\n", " <td>245</td>\n", " <td>145</td>\n", " <td>16</td>\n", " <td>7.2</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Chile</th>\n", " <td>130</td>\n", " <td>124</td>\n", " <td>172</td>\n", " <td>7.6</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Colombia</th>\n", " <td>159</td>\n", " <td>76</td>\n", " <td>3</td>\n", " <td>4.2</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Ecuador</th>\n", " <td>162</td>\n", " <td>74</td>\n", " <td>3</td>\n", " <td>4.2</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Guyana</th>\n", " <td>93</td>\n", " <td>302</td>\n", " <td>1</td>\n", " <td>7.1</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Paraguay</th>\n", " <td>213</td>\n", " <td>117</td>\n", " <td>74</td>\n", " <td>7.3</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Peru</th>\n", " <td>163</td>\n", " <td>160</td>\n", " <td>21</td>\n", " <td>6.1</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Suriname</th>\n", " <td>128</td>\n", " <td>178</td>\n", " <td>7</td>\n", " <td>5.6</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Uruguay</th>\n", " <td>115</td>\n", " 
<td>35</td>\n", " <td>220</td>\n", " <td>6.6</td>\n", " <td>South America</td>\n", " </tr>\n", " <tr>\n", " <th>Venezuela</th>\n", " <td>333</td>\n", " <td>100</td>\n", " <td>3</td>\n", " <td>7.7</td>\n", " <td>South America</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " beer_servings spirit_servings wine_servings \\\n", "country \n", "Argentina 193 25 221 \n", "Bolivia 167 41 8 \n", "Brazil 245 145 16 \n", "Chile 130 124 172 \n", "Colombia 159 76 3 \n", "Ecuador 162 74 3 \n", "Guyana 93 302 1 \n", "Paraguay 213 117 74 \n", "Peru 163 160 21 \n", "Suriname 128 178 7 \n", "Uruguay 115 35 220 \n", "Venezuela 333 100 3 \n", "\n", " total_litres_of_pure_alcohol continent \n", "country \n", "Argentina 8.3 South America \n", "Bolivia 3.8 South America \n", "Brazil 7.2 South America \n", "Chile 7.6 South America \n", "Colombia 4.2 South America \n", "Ecuador 4.2 South America \n", "Guyana 7.1 South America \n", "Paraguay 7.3 South America \n", "Peru 6.1 South America \n", "Suriname 5.6 South America \n", "Uruguay 6.6 South America \n", "Venezuela 7.7 South America " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.query(\"continent=='South America'\")['wine_servings']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multiple filter" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " 
<th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... 
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "movies = pd.read_csv('pandas/imdb_1000.csv')\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>17</th>\n", " <td>8.7</td>\n", " <td>Seven Samurai</td>\n", " <td>UNRATED</td>\n", " <td>Drama</td>\n", " <td>207</td>\n", " <td>[u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K...</td>\n", " </tr>\n", " <tr>\n", " <th>157</th>\n", " <td>8.2</td>\n", " <td>Gone with the Wind</td>\n", " <td>G</td>\n", " <td>Drama</td>\n", " <td>238</td>\n", " <td>[u'Clark Gable', u'Vivien Leigh', u'Thomas Mit...</td>\n", " </tr>\n", " <tr>\n", " <th>476</th>\n", " <td>7.8</td>\n", " <td>Hamlet</td>\n", " <td>PG-13</td>\n", " <td>Drama</td>\n", " <td>242</td>\n", " <td>[u'Kenneth Branagh', u'Julie Christie', u'Dere...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "17 8.7 Seven Samurai UNRATED Drama 207 \n", "157 8.2 Gone with the Wind G Drama 238 \n", "476 7.8 Hamlet PG-13 Drama 242 \n", "\n", " actors_list \n", "17 [u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K... \n", "157 [u'Clark Gable', u'Vivien Leigh', u'Thomas Mit... \n", "476 [u'Kenneth Branagh', u'Julie Christie', u'Dere... 
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# use the '&' operator to specify that both conditions are required\n", "movies.loc[(movies.duration >=200) & (movies.genre == 'Drama')]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>8.9</td>\n", " <td>12 Angry Men</td>\n", " <td>NOT RATED</td>\n", " <td>Drama</td>\n", " <td>96</td>\n", " <td>[u'Henry Fonda', u'Lee J. 
Cobb', u'Martin Bals...</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>8.9</td>\n", " <td>The Lord of the Rings: The Return of the King</td>\n", " <td>PG-13</td>\n", " <td>Adventure</td>\n", " <td>201</td>\n", " <td>[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>8.9</td>\n", " <td>Fight Club</td>\n", " <td>R</td>\n", " <td>Drama</td>\n", " <td>139</td>\n", " <td>[u'Brad Pitt', u'Edward Norton', u'Helena Bonh...</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>8.8</td>\n", " <td>Forrest Gump</td>\n", " <td>PG-13</td>\n", " <td>Drama</td>\n", " <td>142</td>\n", " <td>[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating \\\n", "2 9.1 The Godfather: Part II R \n", "5 8.9 12 Angry Men NOT RATED \n", "7 8.9 The Lord of the Rings: The Return of the King PG-13 \n", "9 8.9 Fight Club R \n", "13 8.8 Forrest Gump PG-13 \n", "\n", " genre duration actors_list \n", "2 Crime 200 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "5 Drama 96 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... \n", "7 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... \n", "9 Drama 139 [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... 
\n", "13 Drama 142 [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# using the '|' operator would have shown movies that are either long or dramas (or both)\n", "movies.loc[(movies.duration >=200) | (movies.genre == 'Drama')].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " 
<td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>8.9</td>\n", " <td>12 Angry Men</td>\n", " <td>NOT RATED</td>\n", " <td>Drama</td>\n", " <td>96</td>\n", " <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>8.9</td>\n", " <td>Fight Club</td>\n", " <td>R</td>\n", " <td>Drama</td>\n", " <td>139</td>\n", " <td>[u'Brad Pitt', u'Edward Norton', u'Helena Bonh...</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>8.8</td>\n", " <td>Inception</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>148</td>\n", " <td>[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'...</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>8.8</td>\n", " <td>Star Wars: Episode V - The Empire Strikes Back</td>\n", " <td>PG</td>\n", " <td>Action</td>\n", " <td>124</td>\n", " <td>[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi...</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>8.8</td>\n", " <td>Forrest Gump</td>\n", " <td>PG-13</td>\n", " <td>Drama</td>\n", " <td>142</td>\n", " <td>[u'Tom Hanks', u'Robin Wright', u'Gary Sinise']</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title \\\n", "0 9.3 The Shawshank Redemption \n", "1 9.2 The Godfather \n", "2 9.1 The Godfather: Part II \n", "3 9.0 The Dark Knight \n", "4 8.9 Pulp Fiction \n", "5 8.9 12 Angry Men \n", "9 8.9 Fight Club \n", "11 8.8 Inception \n", "12 8.8 Star Wars: Episode V - The Empire Strikes Back \n", "13 8.8 Forrest Gump \n", "\n", " content_rating genre duration \\\n", "0 R Crime 142 \n", "1 R Crime 175 \n", "2 R Crime 200 \n", "3 PG-13 Action 152 \n", "4 R Crime 154 \n", "5 NOT RATED Drama 96 \n", "9 R Drama 139 \n", "11 PG-13 Action 148 \n", "12 PG Action 124 \n", "13 PG-13 Drama 142 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... 
\n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... \n", "5 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... \n", "9 [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... \n", "11 [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'... \n", "12 [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi... \n", "13 [u'Tom Hanks', u'Robin Wright', u'Gary Sinise'] " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "movies.loc[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explore" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " 
<tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks = pd.read_csv('pandas/drinks.csv')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Africa 53\n", "Europe 45\n", "Asia 44\n", "North America 23\n", "Oceania 16\n", "South America 12\n", "Name: continent, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.continent.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Africa 0.274611\n", "Europe 0.233161\n", "Asia 0.227979\n", "North America 0.119171\n", "Oceania 0.082902\n", "South America 0.062176\n", "Name: continent, dtype: float64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.continent.value_counts(normalize=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index(['Africa', 'Europe', 'Asia', 'North America', 'Oceania',\n", " 'South America'],\n", " dtype='object')" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 
access the Series index\n", "drinks.continent.value_counts().index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "array([53, 45, 44, 23, 16, 12])" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# access the Series values\n", "drinks.continent.value_counts().values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "South America 12\n", "Oceania 16\n", "North America 23\n", "Asia 44\n", "Europe 45\n", "Africa 53\n", "Name: continent, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# any Series can be sorted by its values\n", "drinks.continent.value_counts().sort_values()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Africa 53\n", "Asia 44\n", "Europe 45\n", "North America 23\n", "Oceania 16\n", "South America 12\n", "Name: continent, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# any Series can also be sorted by its index\n", "drinks.continent.value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['Asia', 'Europe', 'Africa', 'North America', 'South America',\n", " 'Oceania'], dtype=object)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.continent.unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks.continent.nunique()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sort" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 
8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "movies = pd.read_csv('pandas/imdb_1000.csv')\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "542 (500) Days of Summer\n", "5 12 Angry Men\n", "201 12 Years a Slave\n", "698 127 Hours\n", "110 2001: A Space Odyssey\n", "Name: title, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sort the 'title' Series in ascending order (returns a Series)\n", "movies.title.sort_values().head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "864 [Rec]\n", "526 Zulu\n", "615 Zombieland\n", "677 Zodiac\n", "955 Zero Dark Thirty\n", "Name: title, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sort in descending order instead\n", "movies.title.sort_values(ascending=False).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " 
<th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>542</th>\n", " <td>7.8</td>\n", " <td>(500) Days of Summer</td>\n", " <td>PG-13</td>\n", " <td>Comedy</td>\n", " <td>95</td>\n", " <td>[u'Zooey Deschanel', u'Joseph Gordon-Levitt', ...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>8.9</td>\n", " <td>12 Angry Men</td>\n", " <td>NOT RATED</td>\n", " <td>Drama</td>\n", " <td>96</td>\n", " <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n", " </tr>\n", " <tr>\n", " <th>201</th>\n", " <td>8.1</td>\n", " <td>12 Years a Slave</td>\n", " <td>R</td>\n", " <td>Biography</td>\n", " <td>134</td>\n", " <td>[u'Chiwetel Ejiofor', u'Michael Kenneth Willia...</td>\n", " </tr>\n", " <tr>\n", " <th>698</th>\n", " <td>7.6</td>\n", " <td>127 Hours</td>\n", " <td>R</td>\n", " <td>Adventure</td>\n", " <td>94</td>\n", " <td>[u'James Franco', u'Amber Tamblyn', u'Kate Mara']</td>\n", " </tr>\n", " <tr>\n", " <th>110</th>\n", " <td>8.3</td>\n", " <td>2001: A Space Odyssey</td>\n", " <td>G</td>\n", " <td>Mystery</td>\n", " <td>160</td>\n", " <td>[u'Keir Dullea', u'Gary Lockwood', u'William S...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "542 7.8 (500) Days of Summer PG-13 Comedy 95 \n", "5 8.9 12 Angry Men NOT RATED Drama 96 \n", "201 8.1 12 Years a Slave R Biography 134 \n", "698 7.6 127 Hours R Adventure 94 \n", "110 8.3 2001: A Space Odyssey G Mystery 160 \n", "\n", " actors_list \n", "542 [u'Zooey Deschanel', u'Joseph Gordon-Levitt', ... \n", "5 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... \n", "201 [u'Chiwetel Ejiofor', u'Michael Kenneth Willia... \n", "698 [u'James Franco', u'Amber Tamblyn', u'Kate Mara'] \n", "110 [u'Keir Dullea', u'Gary Lockwood', u'William S... 
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sort the entire DataFrame by the 'title' Series (returns a DataFrame)\n", "movies.sort_values('title').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>864</th>\n", " <td>7.5</td>\n", " <td>[Rec]</td>\n", " <td>R</td>\n", " <td>Horror</td>\n", " <td>78</td>\n", " <td>[u'Manuela Velasco', u'Ferran Terraza', u'Jorg...</td>\n", " </tr>\n", " <tr>\n", " <th>526</th>\n", " <td>7.8</td>\n", " <td>Zulu</td>\n", " <td>UNRATED</td>\n", " <td>Drama</td>\n", " <td>138</td>\n", " <td>[u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac...</td>\n", " </tr>\n", " <tr>\n", " <th>615</th>\n", " <td>7.7</td>\n", " <td>Zombieland</td>\n", " <td>R</td>\n", " <td>Comedy</td>\n", " <td>88</td>\n", " <td>[u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha...</td>\n", " </tr>\n", " <tr>\n", " <th>677</th>\n", " <td>7.7</td>\n", " <td>Zodiac</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>157</td>\n", " <td>[u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M...</td>\n", " </tr>\n", " <tr>\n", " <th>955</th>\n", " <td>7.4</td>\n", " <td>Zero Dark Thirty</td>\n", " <td>R</td>\n", " <td>Drama</td>\n", " <td>157</td>\n", " <td>[u'Jessica Chastain', u'Joel Edgerton', u'Chri...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", 
"</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "864 7.5 [Rec] R Horror 78 \n", "526 7.8 Zulu UNRATED Drama 138 \n", "615 7.7 Zombieland R Comedy 88 \n", "677 7.7 Zodiac R Crime 157 \n", "955 7.4 Zero Dark Thirty R Drama 157 \n", "\n", " actors_list \n", "864 [u'Manuela Velasco', u'Ferran Terraza', u'Jorg... \n", "526 [u'Stanley Baker', u'Jack Hawkins', u'Ulla Jac... \n", "615 [u'Jesse Eisenberg', u'Emma Stone', u'Woody Ha... \n", "677 [u'Jake Gyllenhaal', u'Robert Downey Jr.', u'M... \n", "955 [u'Jessica Chastain', u'Joel Edgerton', u'Chri... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sort in descending order instead\n", "movies.sort_values('title', ascending=False).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>713</th>\n", " <td>7.6</td>\n", " <td>The Jungle Book</td>\n", " <td>APPROVED</td>\n", " <td>Animation</td>\n", " <td>78</td>\n", " <td>[u'Phil Harris', u'Sebastian Cabot', u'Louis P...</td>\n", " </tr>\n", " <tr>\n", " <th>513</th>\n", " <td>7.8</td>\n", " <td>Invasion of the Body Snatchers</td>\n", " <td>APPROVED</td>\n", " <td>Horror</td>\n", " <td>80</td>\n", " <td>[u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga...</td>\n", " </tr>\n", " <tr>\n", " <th>272</th>\n", " 
<td>8.1</td>\n", " <td>The Killing</td>\n", " <td>APPROVED</td>\n", " <td>Crime</td>\n", " <td>85</td>\n", " <td>[u'Sterling Hayden', u'Coleen Gray', u'Vince E...</td>\n", " </tr>\n", " <tr>\n", " <th>703</th>\n", " <td>7.6</td>\n", " <td>Dracula</td>\n", " <td>APPROVED</td>\n", " <td>Horror</td>\n", " <td>85</td>\n", " <td>[u'Bela Lugosi', u'Helen Chandler', u'David Ma...</td>\n", " </tr>\n", " <tr>\n", " <th>612</th>\n", " <td>7.7</td>\n", " <td>A Hard Day's Night</td>\n", " <td>APPROVED</td>\n", " <td>Comedy</td>\n", " <td>87</td>\n", " <td>[u'John Lennon', u'Paul McCartney', u'George H...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre \\\n", "713 7.6 The Jungle Book APPROVED Animation \n", "513 7.8 Invasion of the Body Snatchers APPROVED Horror \n", "272 8.1 The Killing APPROVED Crime \n", "703 7.6 Dracula APPROVED Horror \n", "612 7.7 A Hard Day's Night APPROVED Comedy \n", "\n", " duration actors_list \n", "713 78 [u'Phil Harris', u'Sebastian Cabot', u'Louis P... \n", "513 80 [u'Kevin McCarthy', u'Dana Wynter', u'Larry Ga... \n", "272 85 [u'Sterling Hayden', u'Coleen Gray', u'Vince E... \n", "703 85 [u'Bela Lugosi', u'Helen Chandler', u'David Ma... \n", "612 87 [u'John Lennon', u'Paul McCartney', u'George H... 
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# sort the DataFrame first by 'content_rating', then by 'duration'\n", "movies.sort_values(['content_rating', 'duration']).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Group" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 
132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "drinks = pd.read_csv('pandas/drinks.csv')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 61.471698\n", "Asia 37.045455\n", "Europe 193.777778\n", "North America 145.434783\n", "Oceania 89.687500\n", "South America 175.083333\n", "Name: beer_servings, dtype: float64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# calculate the mean beer servings for each continent\n", "drinks.groupby('continent').beer_servings.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "continent\n", "Africa 376\n", "Asia 247\n", "Europe 361\n", "North America 285\n", "Oceania 306\n", "South America 333\n", "Name: beer_servings, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# other aggregation functions (such as 'max') can also be used with groupby\n", "drinks.groupby('continent').beer_servings.max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " 
<th>count</th>\n", " <th>mean</th>\n", " <th>min</th>\n", " <th>max</th>\n", " </tr>\n", " <tr>\n", " <th>continent</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Africa</th>\n", " <td>53</td>\n", " <td>61.471698</td>\n", " <td>0</td>\n", " <td>376</td>\n", " </tr>\n", " <tr>\n", " <th>Asia</th>\n", " <td>44</td>\n", " <td>37.045455</td>\n", " <td>0</td>\n", " <td>247</td>\n", " </tr>\n", " <tr>\n", " <th>Europe</th>\n", " <td>45</td>\n", " <td>193.777778</td>\n", " <td>0</td>\n", " <td>361</td>\n", " </tr>\n", " <tr>\n", " <th>North America</th>\n", " <td>23</td>\n", " <td>145.434783</td>\n", " <td>1</td>\n", " <td>285</td>\n", " </tr>\n", " <tr>\n", " <th>Oceania</th>\n", " <td>16</td>\n", " <td>89.687500</td>\n", " <td>0</td>\n", " <td>306</td>\n", " </tr>\n", " <tr>\n", " <th>South America</th>\n", " <td>12</td>\n", " <td>175.083333</td>\n", " <td>93</td>\n", " <td>333</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " count mean min max\n", "continent \n", "Africa 53 61.471698 0 376\n", "Asia 44 37.045455 0 247\n", "Europe 45 193.777778 0 361\n", "North America 23 145.434783 1 285\n", "Oceania 16 89.687500 0 306\n", "South America 12 175.083333 93 333" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# multiple aggregation functions can be applied simultaneously\n", "drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cross-tabulation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", 
}\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>continent</th>\n", " <th>Africa</th>\n", " <th>Asia</th>\n", " <th>Europe</th>\n", " <th>North America</th>\n", " <th>Oceania</th>\n", " <th>South America</th>\n", " </tr>\n", " <tr>\n", " <th>country</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Afghanistan</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Albania</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Algeria</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Andorra</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Angola</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>Venezuela</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>Vietnam</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Yemen</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Zambia</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Zimbabwe</th>\n", " 
<td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>193 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ "continent Africa Asia Europe North America Oceania South America\n", "country \n", "Afghanistan 0 1 0 0 0 0\n", "Albania 0 0 1 0 0 0\n", "Algeria 1 0 0 0 0 0\n", "Andorra 0 0 1 0 0 0\n", "Angola 1 0 0 0 0 0\n", "... ... ... ... ... ... ...\n", "Venezuela 0 0 0 0 0 1\n", "Vietnam 0 1 0 0 0 0\n", "Yemen 0 1 0 0 0 0\n", "Zambia 1 0 0 0 0 0\n", "Zimbabwe 1 0 0 0 0 0\n", "\n", "[193 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pd.crosstab(drinks.country, drinks.continent)" ] }, { "cell_type": "markdown", "metadata": { "toc-hr-collapsed": true, "toc-nb-collapsed": true }, "source": [ "### MultiIndex" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Date</th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " <th>Symbol</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2016-10-03</td>\n", " <td>31.50</td>\n", " <td>14070500</td>\n", " <td>CSCO</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2016-10-03</td>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " <td>AAPL</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2016-10-03</td>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " <td>MSFT</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2016-10-04</td>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " <td>AAPL</td>\n", " </tr>\n", " 
<tr>\n", " <th>4</th>\n", " <td>2016-10-04</td>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " <td>MSFT</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2016-10-04</td>\n", " <td>31.35</td>\n", " <td>18460400</td>\n", " <td>CSCO</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>2016-10-05</td>\n", " <td>57.64</td>\n", " <td>16726400</td>\n", " <td>MSFT</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>2016-10-05</td>\n", " <td>31.59</td>\n", " <td>11808600</td>\n", " <td>CSCO</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>2016-10-05</td>\n", " <td>113.05</td>\n", " <td>21453100</td>\n", " <td>AAPL</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Date Close Volume Symbol\n", "0 2016-10-03 31.50 14070500 CSCO\n", "1 2016-10-03 112.52 21701800 AAPL\n", "2 2016-10-03 57.42 19189500 MSFT\n", "3 2016-10-04 113.00 29736800 AAPL\n", "4 2016-10-04 57.24 20085900 MSFT\n", "5 2016-10-04 31.35 18460400 CSCO\n", "6 2016-10-05 57.64 16726400 MSFT\n", "7 2016-10-05 31.59 11808600 CSCO\n", "8 2016-10-05 113.05 21453100 AAPL" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read a dataset of stock prices (one row per symbol per date)\n", "stocks = pd.read_csv('./data/stocks.csv')\n", "stocks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RangeIndex(start=0, stop=9, step=1)" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Symbol Date \n", "AAPL 2016-10-03 112.52\n", " 2016-10-04 113.00\n", " 2016-10-05 113.05\n", "CSCO 2016-10-03 31.50\n", " 2016-10-04 31.35\n", " 2016-10-05 31.59\n", "MSFT 2016-10-03 57.42\n", " 2016-10-04 57.24\n", " 2016-10-05 57.64\n", "Name: Close, dtype: float64" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser = stocks.groupby(['Symbol', 
'Date'])['Close'].mean()\n", "ser" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('AAPL', '2016-10-03'),\n", " ('AAPL', '2016-10-04'),\n", " ('AAPL', '2016-10-05'),\n", " ('CSCO', '2016-10-03'),\n", " ('CSCO', '2016-10-04'),\n", " ('CSCO', '2016-10-05'),\n", " ('MSFT', '2016-10-03'),\n", " ('MSFT', '2016-10-04'),\n", " ('MSFT', '2016-10-05')],\n", " names=['Symbol', 'Date'])" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>Date</th>\n", " <th>2016-10-03</th>\n", " <th>2016-10-04</th>\n", " <th>2016-10-05</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>AAPL</th>\n", " <td>112.52</td>\n", " <td>113.00</td>\n", " <td>113.05</td>\n", " </tr>\n", " <tr>\n", " <th>CSCO</th>\n", " <td>31.50</td>\n", " <td>31.35</td>\n", " <td>31.59</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <td>57.42</td>\n", " <td>57.24</td>\n", " <td>57.64</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "Date 2016-10-03 2016-10-04 2016-10-05\n", "Symbol \n", "AAPL 112.52 113.00 113.05\n", "CSCO 31.50 31.35 31.59\n", "MSFT 57.42 57.24 57.64" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser.unstack()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { 
"text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>CSCO</th>\n", " <th>2016-10-03</th>\n", " <td>31.50</td>\n", " <td>14070500</td>\n", " </tr>\n", " <tr>\n", " <th>AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>AAPL</th>\n", " <th>2016-10-04</th>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <th>2016-10-04</th>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " </tr>\n", " <tr>\n", " <th>CSCO</th>\n", " <th>2016-10-04</th>\n", " <td>31.35</td>\n", " <td>18460400</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <th>2016-10-05</th>\n", " <td>57.64</td>\n", " <td>16726400</td>\n", " </tr>\n", " <tr>\n", " <th>CSCO</th>\n", " <th>2016-10-05</th>\n", " <td>31.59</td>\n", " <td>11808600</td>\n", " </tr>\n", " <tr>\n", " <th>AAPL</th>\n", " <th>2016-10-05</th>\n", " <td>113.05</td>\n", " <td>21453100</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "CSCO 2016-10-03 31.50 14070500\n", "AAPL 2016-10-03 112.52 21701800\n", "MSFT 2016-10-03 57.42 19189500\n", "AAPL 2016-10-04 113.00 29736800\n", "MSFT 2016-10-04 57.24 20085900\n", "CSCO 2016-10-04 31.35 18460400\n", "MSFT 2016-10-05 57.64 16726400\n", "CSCO 2016-10-05 31.59 
11808600\n", "AAPL 2016-10-05 113.05 21453100" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.set_index(['Symbol', 'Date'], inplace=True)\n", "stocks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('CSCO', '2016-10-03'),\n", " ('AAPL', '2016-10-03'),\n", " ('MSFT', '2016-10-03'),\n", " ('AAPL', '2016-10-04'),\n", " ('MSFT', '2016-10-04'),\n", " ('CSCO', '2016-10-04'),\n", " ('MSFT', '2016-10-05'),\n", " ('CSCO', '2016-10-05'),\n", " ('AAPL', '2016-10-05')],\n", " names=['Symbol', 'Date'])" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>113.05</td>\n", " <td>21453100</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">CSCO</th>\n", " <th>2016-10-03</th>\n", " <td>31.50</td>\n", " <td>14070500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>31.35</td>\n", " <td>18460400</td>\n", " </tr>\n", " 
<tr>\n", " <th>2016-10-05</th>\n", " <td>31.59</td>\n", " <td>11808600</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>57.64</td>\n", " <td>16726400</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", " 2016-10-05 113.05 21453100\n", "CSCO 2016-10-03 31.50 14070500\n", " 2016-10-04 31.35 18460400\n", " 2016-10-05 31.59 11808600\n", "MSFT 2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900\n", " 2016-10-05 57.64 16726400" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.sort_index(inplace=True)\n", "stocks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Close 112.52\n", "Volume 21701800.00\n", "Name: (AAPL, 2016-10-03), dtype: float64" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[('AAPL', '2016-10-03'), :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "112.52" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[('AAPL', '2016-10-03'), 'Close']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr 
style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>113.05</td>\n", " <td>21453100</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>57.64</td>\n", " <td>16726400</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", " 2016-10-05 113.05 21453100\n", "MSFT 2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900\n", " 2016-10-05 57.64 16726400" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[['AAPL', 'MSFT'], :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " 
</thead>\n", " <tbody>\n", " <tr>\n", " <th>AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", "MSFT 2016-10-03 57.42 19189500" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Symbol Date \n", "AAPL 2016-10-03 112.52\n", "MSFT 2016-10-03 57.42\n", "Name: Close, dtype: float64" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[(['AAPL', 'MSFT'], '2016-10-03'), 'Close']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">CSCO</th>\n", " <th>2016-10-03</th>\n", " <td>31.50</td>\n", " <td>14070500</td>\n", " </tr>\n", " 
<tr>\n", " <th>2016-10-04</th>\n", " <td>31.35</td>\n", " <td>18460400</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"2\" valign=\"top\">MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", "CSCO 2016-10-03 31.50 14070500\n", " 2016-10-04 31.35 18460400\n", "MSFT 2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900" ] }, "execution_count": 135, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.loc[(slice(None), ['2016-10-03', '2016-10-04']), :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pivot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th>Date</th>\n", " <th>2016-10-03</th>\n", " <th>2016-10-04</th>\n", " <th>2016-10-05</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>AAPL</th>\n", " <td>112.52</td>\n", " <td>113.00</td>\n", " <td>113.05</td>\n", " </tr>\n", " <tr>\n", " <th>CSCO</th>\n", " <td>31.50</td>\n", " <td>31.35</td>\n", " <td>31.59</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <td>57.42</td>\n", " <td>57.24</td>\n", " <td>57.64</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ "Date 2016-10-03 2016-10-04 2016-10-05\n", "Symbol \n", "AAPL 
112.52 113.00 113.05\n", "CSCO 31.50 31.35 31.59\n", "MSFT 57.42 57.24 57.64" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = stocks.pivot_table(values='Close', index='Symbol', columns='Date')\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Remove duplicate rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zip_code</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>85711</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>53</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>94043</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>23</td>\n", " <td>M</td>\n", " <td>writer</td>\n", " <td>32067</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>43537</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>33</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>15213</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "1 24 M technician 85711\n", "2 53 F other 94043\n", "3 23 M writer 32067\n", "4 24 M 
technician 43537\n", "5 33 F other 15213" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read a dataset of movie reviewers into a DataFrame\n", "user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']\n", "users = pd.read_table('./data/movie.user', sep='|', header=None, names=user_cols, index_col='user_id')\n", "users.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(943, 4)" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "user_id\n", "939 False\n", "940 True\n", "941 False\n", "942 False\n", "943 False\n", "Name: zip_code, dtype: bool" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# detect duplicate zip codes: True if an item is identical to a previous item\n", "users['zip_code'].duplicated().tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "148" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count the duplicate items (True becomes 1, False becomes 0)\n", "users['zip_code'].duplicated().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table 
border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zip_code</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>496</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>572</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>621</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>684</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>733</th>\n", " <td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " <tr>\n", " <th>805</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>890</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the duplicate rows (ignoring the first occurrence)\n", "users.loc[users.duplicated(keep='first'), :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe 
tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zip_code</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>67</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>85</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>198</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>350</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " <tr>\n", " <th>428</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>437</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>460</th>\n", " <td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "67 17 M student 60402\n", "85 51 M educator 20003\n", "198 21 F student 55414\n", "350 32 M student 97301\n", "428 28 M student 55414\n", "437 27 F other 20009\n", "460 44 F other 60630" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the duplicate rows (ignoring the last occurrence)\n", "users.loc[users.duplicated(keep='last'), :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { 
"text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zip_code</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>67</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>85</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>198</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>350</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " <tr>\n", " <th>428</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>437</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>460</th>\n", " <td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " <tr>\n", " <th>496</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>572</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>621</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>684</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>733</th>\n", " 
<td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " <tr>\n", " <th>805</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>890</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zip_code\n", "user_id \n", "67 17 M student 60402\n", "85 51 M educator 20003\n", "198 21 F student 55414\n", "350 32 M student 97301\n", "428 28 M student 55414\n", "437 27 F other 20009\n", "460 44 F other 60630\n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the duplicate rows (including all duplicates)\n", "users.loc[users.duplicated(keep=False), :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(936, 4)" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop the duplicate rows (inplace=False by default)\n", "users.drop_duplicates(keep='first').shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(936, 4)" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.drop_duplicates(keep='last').shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(929, 4)" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"users.drop_duplicates(keep=False).shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "16" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# only consider a subset of columns when identifying duplicates\n", "users.duplicated(subset=['age', 'zip_code']).sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(927, 4)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "users.drop_duplicates(subset=['age', 'zip_code']).shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Detect missing values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>18236</th>\n", " <td>Grant Park</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>IL</td>\n", " <td>12/31/2000 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>18237</th>\n", " <td>Spirit Lake</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>IA</td>\n", " <td>12/31/2000 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>18238</th>\n", " <td>Eagle River</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " <th>18239</th>\n", " <td>Eagle River</td>\n", " <td>RED</td>\n", " <td>LIGHT</td>\n", " <td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " <th>18240</th>\n", " <td>Ybor</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>FL</td>\n", " <td>12/31/2000 23:59</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park NaN TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake NaN DISK IA 12/31/2000 23:00\n", "18238 Eagle River NaN NaN WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor NaN OVAL FL 12/31/2000 23:59" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo = pd.read_csv('pandas/ufo.csv')\n", "ufo.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>18236</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>18237</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>18238</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " 
<td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>18239</th>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>18240</th>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 False True False False False\n", "18237 False True False False False\n", "18238 False True False False False\n", "18239 False False False False False\n", "18240 False True False False False" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.isna().tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>18236</th>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>18237</th>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>18238</th>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>18239</th>\n", " <td>True</td>\n", " <td>True</td>\n", " 
<td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>18240</th>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 True False True True True\n", "18237 True False True True True\n", "18238 True False True True True\n", "18239 True True True True True\n", "18240 True False True True True" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ufo.notna().tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "City 25\n", "Colors Reported 15359\n", "Shape Reported 0\n", "State 0\n", "Time 0\n", "dtype: int64" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# count the number of missing values in each Series\n", "ufo.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>21</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>LA</td>\n", " <td>8/15/1943 0:00</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>LIGHT</td>\n", " <td>LA</td>\n", " <td>8/15/1943 0:00</td>\n", " </tr>\n", " <tr>\n", " <th>204</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>CA</td>\n", " <td>7/15/1952 12:30</td>\n", " </tr>\n", " <tr>\n", " <th>241</th>\n", " <td>NaN</td>\n", " <td>BLUE</td>\n", " <td>DISK</td>\n", " <td>MT</td>\n", " <td>7/4/1953 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>613</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>NV</td>\n", " <td>7/1/1960 12:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "21 NaN NaN NaN LA 8/15/1943 0:00\n", "22 NaN NaN LIGHT LA 8/15/1943 0:00\n", "204 NaN NaN DISK CA 7/15/1952 12:30\n", "241 NaN BLUE DISK MT 7/4/1953 14:00\n", "613 NaN NaN DISK NV 7/1/1960 12:00" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'isnull' Series method to filter the DataFrame rows\n", "ufo.loc[ufo.City.isnull()].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# examine the number of rows and columns\n", "ufo.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(2486, 5)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if 'any' values are missing in a row, then drop that row\n", "ufo.dropna(how='any').shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily\n", "ufo.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(18241, 5)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if 'all' values are missing in a row, then drop that row (none are dropped in this case)\n", "ufo.dropna(how='all').shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "(15576, 5)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row\n", "ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "LIGHT 2803\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "CIRCLE 1365\n", "Name: Shape Reported, dtype: int64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'value_counts' does not include missing values by default\n", "ufo['Shape Reported'].value_counts().head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "LIGHT 2803\n", "NaN 2644\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "Name: Shape Reported, dtype: int64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# explicitly include missing values\n", "ufo['Shape Reported'].value_counts(dropna=False).head()" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "### Fill missing values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true } }, "outputs": [], "source": [ "# fill in missing values with a specified value (assign back; chained inplace=True is unreliable under copy-on-write)\n", "ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='VARIOUS')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "VARIOUS 2977\n", "LIGHT 2803\n", "DISK 2122\n", "TRIANGLE 1889\n", "OTHER 1402\n", "Name: Shape Reported, dtype: int64" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# confirm that the missing values were filled in\n", "ufo['Shape Reported'].value_counts().head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>UNKNOWN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>UNKNOWN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>UNKNOWN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>UNKNOWN</td>\n", " 
<td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>UNKNOWN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca UNKNOWN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro UNKNOWN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke UNKNOWN OVAL CO 2/15/1931 14:00\n", "3 Abilene UNKNOWN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair UNKNOWN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill in missing values\n", "ufo.fillna(value='UNKNOWN').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>18236</th>\n", " <td>Grant Park</td>\n", " <td>RED</td>\n", " <td>TRIANGLE</td>\n", " <td>IL</td>\n", " <td>12/31/2000 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>18237</th>\n", " <td>Spirit Lake</td>\n", " <td>RED</td>\n", " <td>DISK</td>\n", " <td>IA</td>\n", " <td>12/31/2000 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>18238</th>\n", " <td>Eagle River</td>\n", " <td>RED</td>\n", " <td>VARIOUS</td>\n", " <td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " 
<th>18239</th>\n", " <td>Eagle River</td>\n", " <td>RED</td>\n", " <td>LIGHT</td>\n", " <td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " <th>18240</th>\n", " <td>Ybor</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>FL</td>\n", " <td>12/31/2000 23:59</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park RED TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake RED DISK IA 12/31/2000 23:00\n", "18238 Eagle River RED VARIOUS WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor NaN OVAL FL 12/31/2000 23:59" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# fill missing values using \"backward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n", "ufo.bfill().tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>18236</th>\n", " <td>Grant Park</td>\n", " <td>RED</td>\n", " <td>TRIANGLE</td>\n", " <td>IL</td>\n", " <td>12/31/2000 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>18237</th>\n", " <td>Spirit Lake</td>\n", " <td>RED</td>\n", " <td>DISK</td>\n", " <td>IA</td>\n", " <td>12/31/2000 23:00</td>\n", " </tr>\n", " <tr>\n", " <th>18238</th>\n", " <td>Eagle 
River</td>\n", " <td>RED</td>\n", " <td>VARIOUS</td>\n", " <td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " <th>18239</th>\n", " <td>Eagle River</td>\n", " <td>RED</td>\n", " <td>LIGHT</td>\n", " <td>WI</td>\n", " <td>12/31/2000 23:45</td>\n", " </tr>\n", " <tr>\n", " <th>18240</th>\n", " <td>Ybor</td>\n", " <td>RED</td>\n", " <td>OVAL</td>\n", " <td>FL</td>\n", " <td>12/31/2000 23:59</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "18236 Grant Park RED TRIANGLE IL 12/31/2000 23:00\n", "18237 Spirit Lake RED DISK IA 12/31/2000 23:00\n", "18238 Eagle River RED VARIOUS WI 12/31/2000 23:45\n", "18239 Eagle River RED LIGHT WI 12/31/2000 23:45\n", "18240 Ybor RED OVAL FL 12/31/2000 23:59" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# compare with \"forward fill\" strategy (doesn't affect the DataFrame since inplace=False)\n", "ufo.ffill().tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Apply a function" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>PassengerId</th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Embarked</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " 
<td>1</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. Owen Harris</td>\n", " <td>male</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", " <td>female</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>C</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>female</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>female</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>male</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. 
William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "train = pd.read_csv('pandas/titanic_train.csv')\n", "train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Sex</th>\n", " <th>Sex_num</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>male</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>female</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>female</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>female</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>male</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Sex Sex_num\n", "0 male 1\n", "1 female 0\n", "2 female 0\n", "3 female 0\n", "4 male 1" ] }, "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# map 'female' to 0 and 'male' to 1\n", "train['Sex_num'] = train.Sex.map({'female': 0, 'male': 1})\n", "train.loc[0:4, ['Sex', 'Sex_num']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe 
tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Name</th>\n", " <th>Name_length</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Braund, Mr. Owen Harris</td>\n", " <td>23</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", " <td>51</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>22</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>44</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>24</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Name Name_length\n", "0 Braund, Mr. Owen Harris 23\n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 51\n", "2 Heikkinen, Miss. Laina 22\n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 44\n", "4 Allen, Mr. William Henry 24" ] }, "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate the length of each string in the 'Name' Series\n", "train['Name_length'] = train['Name'].apply(len)\n", "train.loc[0:4, ['Name', 'Name_length']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 [Braund, Mr. Owen Harris]\n", "1 [Cumings, Mrs. John Bradley (Florence Briggs ...\n", "2 [Heikkinen, Miss. Laina]\n", "3 [Futrelle, Mrs. Jacques Heath (Lily May Peel)]\n", "4 [Allen, Mr. 
William Henry]\n", "Name: Name, dtype: object" ] }, "execution_count": 140, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use a string method to split the 'Name' Series at commas (returns a Series of lists)\n", "train['Name'].str.split(',').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 Braund\n", "1 Cumings\n", "2 Heikkinen\n", "3 Futrelle\n", "4 Allen\n", "Name: Name, dtype: object" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use a lambda function\n", "train['Name'].str.split(',').apply(lambda x: x[0]).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " 
<th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 144, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv('pandas/drinks.csv')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "beer_servings 376\n", "spirit_servings 438\n", "wine_servings 370\n", "dtype: int64" ] }, "execution_count": 146, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# apply the 'max' function along axis 0 to calculate the maximum value in each column\n", "drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " 
<th>wine_servings</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>89.0</td>\n", " <td>132.0</td>\n", " <td>54.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>25.0</td>\n", " <td>0.0</td>\n", " <td>14.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>245.0</td>\n", " <td>138.0</td>\n", " <td>312.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>217.0</td>\n", " <td>57.0</td>\n", " <td>45.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " beer_servings spirit_servings wine_servings\n", "0 0.0 0.0 0.0\n", "1 89.0 132.0 54.0\n", "2 25.0 0.0 14.0\n", "3 245.0 138.0 312.0\n", "4 217.0 57.0 45.0" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# convert every DataFrame element into a float\n", "drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " 
<td>89.0</td>\n", " <td>132.0</td>\n", " <td>54.0</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25.0</td>\n", " <td>0.0</td>\n", " <td>14.0</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245.0</td>\n", " <td>138.0</td>\n", " <td>312.0</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217.0</td>\n", " <td>57.0</td>\n", " <td>45.0</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0.0 0.0 0.0 \n", "1 Albania 89.0 132.0 54.0 \n", "2 Algeria 25.0 0.0 14.0 \n", "3 Andorra 245.0 138.0 312.0 \n", "4 Angola 217.0 57.0 45.0 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# overwrite the existing DataFrame columns (column-list assignment replaces the columns, so the float dtype is kept)\n", "serving_cols = ['beer_servings', 'spirit_servings', 'wine_servings']\n", "drinks[serving_cols] = drinks[serving_cols].applymap(float)\n", "drinks.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reshape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concat" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)\n", "pd.concat([drinks, people], axis=1).head()" ] }, { "cell_type": "markdown", "metadata": { "toc-hr-collapsed": true, "toc-nb-collapsed": true }, "source": [ "### Merge" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", 
"<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>movie_id</th>\n", " <th>title</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>GoldenEye (1995)</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>Four Rooms (1995)</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>Get Shorty (1995)</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>Copycat (1995)</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " movie_id title\n", "0 1 Toy Story (1995)\n", "1 2 GoldenEye (1995)\n", "2 3 Four Rooms (1995)\n", "3 4 Get Shorty (1995)\n", "4 5 Copycat (1995)" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie_cols = ['movie_id', 'title']\n", "movies = pd.read_table('pandas/u.item', sep='|', header=None, names=movie_cols, usecols=[0, 1])\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1682, 2)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "movies.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " 
<tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>user_id</th>\n", " <th>movie_id</th>\n", " <th>rating</th>\n", " <th>timestamp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>196</td>\n", " <td>242</td>\n", " <td>3</td>\n", " <td>881250949</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>186</td>\n", " <td>302</td>\n", " <td>3</td>\n", " <td>891717742</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>22</td>\n", " <td>377</td>\n", " <td>1</td>\n", " <td>878887116</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>244</td>\n", " <td>51</td>\n", " <td>2</td>\n", " <td>880606923</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>166</td>\n", " <td>346</td>\n", " <td>1</td>\n", " <td>886397596</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " user_id movie_id rating timestamp\n", "0 196 242 3 881250949\n", "1 186 302 3 891717742\n", "2 22 377 1 878887116\n", "3 244 51 2 880606923\n", "4 166 346 1 886397596" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']\n", "ratings = pd.read_table('pandas/u.data', sep='\\t', header=None, names=rating_cols)\n", "ratings.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100000, 4)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " 
<th></th>\n", " <th>user_id</th>\n", " <th>movie_id</th>\n", " <th>rating</th>\n", " <th>timestamp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>24</th>\n", " <td>308</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>887736532</td>\n", " </tr>\n", " <tr>\n", " <th>454</th>\n", " <td>287</td>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>875334088</td>\n", " </tr>\n", " <tr>\n", " <th>957</th>\n", " <td>148</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>877019411</td>\n", " </tr>\n", " <tr>\n", " <th>971</th>\n", " <td>280</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>891700426</td>\n", " </tr>\n", " <tr>\n", " <th>1324</th>\n", " <td>66</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>883601324</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " user_id movie_id rating timestamp\n", "24 308 1 4 887736532\n", "454 287 1 5 875334088\n", "957 148 1 4 877019411\n", "971 280 1 4 891700426\n", "1324 66 1 3 883601324" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.query(\"movie_id == 1\").head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['movie_id', 'title', 'user_id', 'rating', 'timestamp'], dtype='object')" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie_ratings = pd.merge(movies, ratings)\n", "movie_ratings.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>movie_id</th>\n", " 
<th>title</th>\n", " <th>user_id</th>\n", " <th>rating</th>\n", " <th>timestamp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>308</td>\n", " <td>4</td>\n", " <td>887736532</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>287</td>\n", " <td>5</td>\n", " <td>875334088</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>148</td>\n", " <td>4</td>\n", " <td>877019411</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>280</td>\n", " <td>4</td>\n", " <td>891700426</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>66</td>\n", " <td>3</td>\n", " <td>883601324</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " movie_id title user_id rating timestamp\n", "0 1 Toy Story (1995) 308 4 887736532\n", "1 1 Toy Story (1995) 287 5 875334088\n", "2 1 Toy Story (1995) 148 4 877019411\n", "3 1 Toy Story (1995) 280 4 891700426\n", "4 1 Toy Story (1995) 66 3 883601324" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie_ratings.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1682, 2)\n", "(100000, 4)\n", "(100000, 5)\n" ] } ], "source": [ "print(movies.shape)\n", "print(ratings.shape)\n", "print(movie_ratings.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['m_id', 'title'], dtype='object')" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.columns = ['m_id', 'title']\n", "movies.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": 
[ "Index(['user_id', 'movie_id', 'rating', 'timestamp'], dtype='object')" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>m_id</th>\n", " <th>title</th>\n", " <th>user_id</th>\n", " <th>movie_id</th>\n", " <th>rating</th>\n", " <th>timestamp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>308</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>887736532</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>287</td>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>875334088</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>148</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>877019411</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>280</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>891700426</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1</td>\n", " <td>Toy Story (1995)</td>\n", " <td>66</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>883601324</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " m_id title user_id movie_id rating timestamp\n", "0 1 Toy Story (1995) 308 1 4 887736532\n", "1 1 Toy Story (1995) 287 1 5 875334088\n", "2 1 Toy Story (1995) 148 1 4 877019411\n", "3 1 Toy Story (1995) 280 1 4 891700426\n", "4 1 Toy Story (1995) 66 1 3 
883601324" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(movies, ratings, left_on='m_id', right_on='movie_id').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>title</th>\n", " <th>user_id</th>\n", " <th>movie_id</th>\n", " <th>rating</th>\n", " <th>timestamp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>24</th>\n", " <td>Toy Story (1995)</td>\n", " <td>308</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>887736532</td>\n", " </tr>\n", " <tr>\n", " <th>454</th>\n", " <td>Toy Story (1995)</td>\n", " <td>287</td>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>875334088</td>\n", " </tr>\n", " <tr>\n", " <th>957</th>\n", " <td>Toy Story (1995)</td>\n", " <td>148</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>877019411</td>\n", " </tr>\n", " <tr>\n", " <th>971</th>\n", " <td>Toy Story (1995)</td>\n", " <td>280</td>\n", " <td>1</td>\n", " <td>4</td>\n", " <td>891700426</td>\n", " </tr>\n", " <tr>\n", " <th>1324</th>\n", " <td>Toy Story (1995)</td>\n", " <td>66</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>883601324</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " title user_id movie_id rating timestamp\n", "24 Toy Story (1995) 308 1 4 887736532\n", "454 Toy Story (1995) 287 1 5 875334088\n", "957 Toy Story (1995) 148 1 4 877019411\n", "971 Toy Story (1995) 280 1 4 891700426\n", "1324 Toy Story (1995) 66 1 3 883601324" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"# index movies by 'm_id' so that its index lines up with ratings['movie_id']\n", "pd.merge(movies.set_index('m_id'), ratings, left_index=True, right_on='movie_id').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>title</th>\n", " <th>user_id</th>\n", " <th>movie_id</th>\n", " <th>rating</th>\n", " <th>timestamp</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>Toy Story (1995)</td>\n", " <td>186</td>\n", " <td>302</td>\n", " <td>3</td>\n", " <td>891717742</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>GoldenEye (1995)</td>\n", " <td>22</td>\n", " <td>377</td>\n", " <td>1</td>\n", " <td>878887116</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Four Rooms (1995)</td>\n", " <td>244</td>\n", " <td>51</td>\n", " <td>2</td>\n", " <td>880606923</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Get Shorty (1995)</td>\n", " <td>166</td>\n", " <td>346</td>\n", " <td>1</td>\n", " <td>886397596</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Copycat (1995)</td>\n", " <td>298</td>\n", " <td>474</td>\n", " <td>4</td>\n", " <td>884182806</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " title user_id movie_id rating timestamp\n", "1 Toy Story (1995) 186 302 3 891717742\n", "2 GoldenEye (1995) 22 377 1 878887116\n", "3 Four Rooms (1995) 244 51 2 880606923\n", "4 Get Shorty (1995) 166 346 1 886397596\n", "5 Copycat (1995) 298 474 4 884182806" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# index-on-index merge: movies indexed by 'm_id', ratings keeps its default row index\n", "pd.merge(movies.set_index('m_id'), ratings, left_index=True, right_index=True).head()" ] }, { "cell_type": "markdown",
"metadata": {}, "source": [ "### Merging with MultiIndexes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>113.00</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>113.05</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">CSCO</th>\n", " <th>2016-10-03</th>\n", " <td>31.50</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>31.35</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>31.59</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>57.24</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>57.64</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close\n", "Symbol Date \n", "AAPL 2016-10-03 112.52\n", " 2016-10-04 113.00\n", " 2016-10-05 113.05\n", "CSCO 2016-10-03 31.50\n", " 2016-10-04 31.35\n", " 2016-10-05 31.59\n", "MSFT 2016-10-03 57.42\n", " 2016-10-04 57.24\n", " 2016-10-05 57.64" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "close = pd.read_csv('pandas/stocks.csv', usecols=[0, 1, 3], index_col=['Symbol', 'Date']).sort_index()\n", 
"close" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>21453100</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">CSCO</th>\n", " <th>2016-10-03</th>\n", " <td>14070500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>18460400</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>11808600</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>20085900</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>16726400</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Volume\n", "Symbol Date \n", "AAPL 2016-10-03 21701800\n", " 2016-10-04 29736800\n", " 2016-10-05 21453100\n", "CSCO 2016-10-03 14070500\n", " 2016-10-04 18460400\n", " 2016-10-05 11808600\n", "MSFT 2016-10-03 19189500\n", " 2016-10-04 20085900\n", " 2016-10-05 16726400" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "volume = pd.read_csv('pandas/stocks.csv', usecols=[0, 2, 3], index_col=['Symbol', 'Date']).sort_index()\n", 
"volume" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th></th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " <tr>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">AAPL</th>\n", " <th>2016-10-03</th>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>113.05</td>\n", " <td>21453100</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">CSCO</th>\n", " <th>2016-10-03</th>\n", " <td>31.50</td>\n", " <td>14070500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>31.35</td>\n", " <td>18460400</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>31.59</td>\n", " <td>11808600</td>\n", " </tr>\n", " <tr>\n", " <th rowspan=\"3\" valign=\"top\">MSFT</th>\n", " <th>2016-10-03</th>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-04</th>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " </tr>\n", " <tr>\n", " <th>2016-10-05</th>\n", " <td>57.64</td>\n", " <td>16726400</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Close Volume\n", "Symbol Date \n", "AAPL 2016-10-03 112.52 21701800\n", " 2016-10-04 113.00 29736800\n", " 2016-10-05 113.05 21453100\n", "CSCO 2016-10-03 31.50 14070500\n", " 2016-10-04 31.35 18460400\n", " 2016-10-05 31.59 11808600\n", "MSFT 
2016-10-03 57.42 19189500\n", " 2016-10-04 57.24 20085900\n", " 2016-10-05 57.64 16726400" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "both = pd.merge(close, volume, left_index=True, right_index=True)\n", "both" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Symbol</th>\n", " <th>Date</th>\n", " <th>Close</th>\n", " <th>Volume</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>AAPL</td>\n", " <td>2016-10-03</td>\n", " <td>112.52</td>\n", " <td>21701800</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>AAPL</td>\n", " <td>2016-10-04</td>\n", " <td>113.00</td>\n", " <td>29736800</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>AAPL</td>\n", " <td>2016-10-05</td>\n", " <td>113.05</td>\n", " <td>21453100</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>CSCO</td>\n", " <td>2016-10-03</td>\n", " <td>31.50</td>\n", " <td>14070500</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>CSCO</td>\n", " <td>2016-10-04</td>\n", " <td>31.35</td>\n", " <td>18460400</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>CSCO</td>\n", " <td>2016-10-05</td>\n", " <td>31.59</td>\n", " <td>11808600</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>MSFT</td>\n", " <td>2016-10-03</td>\n", " <td>57.42</td>\n", " <td>19189500</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>MSFT</td>\n", " <td>2016-10-04</td>\n", " <td>57.24</td>\n", " <td>20085900</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>MSFT</td>\n", " 
<td>2016-10-05</td>\n", " <td>57.64</td>\n", " <td>16726400</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Symbol Date Close Volume\n", "0 AAPL 2016-10-03 112.52 21701800\n", "1 AAPL 2016-10-04 113.00 29736800\n", "2 AAPL 2016-10-05 113.05 21453100\n", "3 CSCO 2016-10-03 31.50 14070500\n", "4 CSCO 2016-10-04 31.35 18460400\n", "5 CSCO 2016-10-05 31.59 11808600\n", "6 MSFT 2016-10-03 57.42 19189500\n", "7 MSFT 2016-10-04 57.24 20085900\n", "8 MSFT 2016-10-05 57.64 16726400" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "both.reset_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Join" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>color</th>\n", " <th>num</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>green</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>yellow</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>red</td>\n", " <td>3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " color num\n", "0 green 1\n", "1 yellow 2\n", "2 red 3" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A = pd.DataFrame({'color': ['green', 'yellow', 'red'], 'num': [1, 2, 3]})\n", "A" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: 
middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>color</th>\n", " <th>size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>green</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>yellow</td>\n", " <td>M</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>pink</td>\n", " <td>L</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " color size\n", "0 green S\n", "1 yellow M\n", "2 pink L" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "B = pd.DataFrame({'color': ['green', 'yellow', 'pink'], 'size':['S', 'M', 'L']})\n", "B" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inner join: Only include observations found in both A and B:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>color</th>\n", " <th>num</th>\n", " <th>size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>green</td>\n", " <td>1</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>yellow</td>\n", " <td>2</td>\n", " <td>M</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " color num size\n", "0 green 1 S\n", "1 yellow 2 M" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } 
], "source": [ "pd.merge(A, B, how='inner')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Outer join: Include observations found in either A or B:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>color</th>\n", " <th>num</th>\n", " <th>size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>green</td>\n", " <td>1.0</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>yellow</td>\n", " <td>2.0</td>\n", " <td>M</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>red</td>\n", " <td>3.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>pink</td>\n", " <td>NaN</td>\n", " <td>L</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " color num size\n", "0 green 1.0 S\n", "1 yellow 2.0 M\n", "2 red 3.0 NaN\n", "3 pink NaN L" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(A, B, how='outer')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Left join: Include all observations found in A:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: 
right;\">\n", " <th></th>\n", " <th>color</th>\n", " <th>num</th>\n", " <th>size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>green</td>\n", " <td>1</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>yellow</td>\n", " <td>2</td>\n", " <td>M</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>red</td>\n", " <td>3</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " color num size\n", "0 green 1 S\n", "1 yellow 2 M\n", "2 red 3 NaN" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(A, B, how='left')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Right join: Include all observations found in B:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>color</th>\n", " <th>num</th>\n", " <th>size</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>green</td>\n", " <td>1.0</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>yellow</td>\n", " <td>2.0</td>\n", " <td>M</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>pink</td>\n", " <td>NaN</td>\n", " <td>L</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " color num size\n", "0 green 1.0 S\n", "1 yellow 2.0 M\n", "2 pink NaN L" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(A, B, how='right')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## String & Time" ] }, 
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Change the data type" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa 
\n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# read a dataset of alcohol consumption into a DataFrame\n", "drinks = pd.read_csv('drinks.csv')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings int64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent object\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# examine the data type of each Series\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings float64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent object\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# change the data type of an existing Series\n", "drinks['beer_servings'] = drinks.beer_servings.astype(float)\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings float64\n", "spirit_servings int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent object\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# alternatively, change the data type of a Series while reading in a file\n", "drinks = pd.read_csv('drinks.csv', dtype={'beer_servings': float})\n", "drinks.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Use string methods" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": 
false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>order_id</th>\n", " <th>quantity</th>\n", " <th>item_name</th>\n", " <th>choice_description</th>\n", " <th>item_price</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Chips and Fresh Tomato Salsa</td>\n", " <td>NaN</td>\n", " <td>$2.39</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Izze</td>\n", " <td>[Clementine]</td>\n", " <td>$3.39</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Nantucket Nectar</td>\n", " <td>[Apple]</td>\n", " <td>$3.39</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Chips and Tomatillo-Green Chili Salsa</td>\n", " <td>NaN</td>\n", " <td>$2.39</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>Chicken Bowl</td>\n", " <td>[Tomatillo-Red Chili Salsa (Hot), [Black Beans...</td>\n", " <td>$16.98</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " order_id quantity item_name \\\n", "0 1 1 Chips and Fresh Tomato Salsa \n", "1 1 1 Izze \n", "2 1 1 Nantucket Nectar \n", "3 1 1 Chips and Tomatillo-Green Chili Salsa \n", "4 2 2 Chicken Bowl \n", "\n", " choice_description item_price \n", "0 NaN $2.39 \n", "1 [Clementine] $3.39 \n", "2 [Apple] $3.39 \n", "3 NaN $2.39 \n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... 
$16.98 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# read a dataset of Chipotle orders into a DataFrame\n", "orders = pd.read_table('chipotle.tsv')\n", "orders.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 CHIPS AND FRESH TOMATO SALSA\n", "1 IZZE\n", "2 NANTUCKET NECTAR\n", "3 CHIPS AND TOMATILLO-GREEN CHILI SALSA\n", "4 CHICKEN BOWL\n", "Name: item_name, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# string methods for pandas Series are accessed via 'str'\n", "orders.item_name.str.upper().head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 True\n", "Name: item_name, dtype: bool" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# string method 'contains' checks for a substring and returns a boolean Series\n", "orders.item_name.str.contains('Chicken').head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>order_id</th>\n", " <th>quantity</th>\n", " <th>item_name</th>\n", " <th>choice_description</th>\n", " <th>item_price</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>4</th>\n", " <td>2</td>\n", " <td>2</td>\n", " <td>Chicken Bowl</td>\n", " 
<td>[Tomatillo-Red Chili Salsa (Hot), [Black Beans...</td>\n", " <td>$16.98</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>Chicken Bowl</td>\n", " <td>[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...</td>\n", " <td>$10.98</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>6</td>\n", " <td>1</td>\n", " <td>Chicken Crispy Tacos</td>\n", " <td>[Roasted Chili Corn Salsa, [Fajita Vegetables,...</td>\n", " <td>$8.75</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>6</td>\n", " <td>1</td>\n", " <td>Chicken Soft Tacos</td>\n", " <td>[Roasted Chili Corn Salsa, [Rice, Black Beans,...</td>\n", " <td>$8.75</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>7</td>\n", " <td>1</td>\n", " <td>Chicken Bowl</td>\n", " <td>[Fresh Tomato Salsa, [Fajita Vegetables, Rice,...</td>\n", " <td>$11.25</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " order_id quantity item_name \\\n", "4 2 2 Chicken Bowl \n", "5 3 1 Chicken Bowl \n", "11 6 1 Chicken Crispy Tacos \n", "12 6 1 Chicken Soft Tacos \n", "13 7 1 Chicken Bowl \n", "\n", " choice_description item_price \n", "4 [Tomatillo-Red Chili Salsa (Hot), [Black Beans... $16.98 \n", "5 [Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou... $10.98 \n", "11 [Roasted Chili Corn Salsa, [Fajita Vegetables,... $8.75 \n", "12 [Roasted Chili Corn Salsa, [Rice, Black Beans,... $8.75 \n", "13 [Fresh Tomato Salsa, [Fajita Vegetables, Rice,... 
$11.25 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# use the boolean Series to filter the DataFrame\n", "orders[orders.item_name.str.contains('Chicken')].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 NaN\n", "1 Clementine\n", "2 Apple\n", "3 NaN\n", "4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ...\n", "Name: choice_description, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# string methods can be chained together\n", "# regex=False: '[' and ']' are plain characters to strip, not patterns\n", "# (a lone '[' is not a valid regular expression), so state it explicitly\n", "# instead of depending on the version-specific default of 'regex'\n", "orders.choice_description.str.replace('[', '', regex=False).str.replace(']', '', regex=False).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 NaN\n", "1 Clementine\n", "2 Apple\n", "3 NaN\n", "4 Tomatillo-Red Chili Salsa (Hot), Black Beans, ...\n", "Name: choice_description, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# many pandas string methods support regular expressions (regex)\n", "# regex=True is required: when 'regex' is False the pattern is matched as a\n", "# literal substring, so the character class would remove nothing\n", "# the raw string (r'...') keeps the backslash escapes intact for the regex engine\n", "orders.choice_description.str.replace(r'[\\[\\]]', '', regex=True).head()" ] }, { "cell_type": "markdown",
"metadata": {}, "source": [ "### Dates & Times" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# read a dataset of UFO reports into a 
DataFrame\n", "# same file location as the first ufo read near the top of this notebook\n", "ufo = pd.read_csv('./data/ufo.csv')\n", "ufo.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time object\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 'Time' is currently stored as a string\n", "ufo.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>1930-06-01 22:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>1930-06-30 20:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>1931-02-15 14:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>1931-06-01 13:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>1933-04-18 19:00:00</td>\n", " </tr>\n",
" </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State \\\n", "0 Ithaca NaN TRIANGLE NY \n", "1 Willingboro NaN OTHER NJ \n", "2 Holyoke NaN OVAL CO \n", "3 Abilene NaN DISK KS \n", "4 New York Worlds Fair NaN LIGHT NY \n", "\n", " Time \n", "0 1930-06-01 22:00:00 \n", "1 1930-06-30 20:00:00 \n", "2 1931-02-15 14:00:00 \n", "3 1931-06-01 13:00:00 \n", "4 1933-04-18 19:00:00 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# convert 'Time' to datetime format\n", "ufo['Time'] = pd.to_datetime(ufo.Time)\n", "ufo.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time datetime64[ns]\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "ufo.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Category data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " 
<th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read a dataset of alcohol consumption into a DataFrame\n", "drinks = pd.read_csv('drinks.csv')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "country 12588\n", "beer_servings 1544\n", "spirit_servings 1544\n", "wine_servings 1544\n", "total_litres_of_pure_alcohol 1544\n", "continent 12332\n", "dtype: int64" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# calculate the memory usage for each Series (in bytes)\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "country object\n", "beer_servings int64\n", "spirit_servings 
int64\n", "wine_servings int64\n", "total_litres_of_pure_alcohol float64\n", "continent category\n", "dtype: object" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'category' data type to store the 'continent' strings as integers\n", "drinks['continent'] = drinks.continent.astype('category')\n", "drinks.dtypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 Asia\n", "1 Europe\n", "2 Africa\n", "3 Europe\n", "4 Africa\n", "Name: continent, dtype: category\n", "Categories (6, object): ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'continent' Series appears to be unchanged\n", "drinks.continent.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "0 1\n", "1 2\n", "2 0\n", "3 2\n", "4 0\n", "dtype: int8" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)\n", "drinks.continent.cat.codes.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "country 12588\n", "beer_servings 1544\n", "spirit_servings 1544\n", "wine_servings 1544\n", "total_litres_of_pure_alcohol 1544\n", "continent 756\n", "dtype: int64" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# memory usage has been drastically reduced\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { 
"outputs_hidden": false } }, "outputs": [ { "data": { "text/plain": [ "Index 128\n", "country 17142\n", "beer_servings 1544\n", "spirit_servings 1544\n", "wine_servings 1544\n", "total_litres_of_pure_alcohol 1544\n", "continent 756\n", "dtype: int64" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# repeat this process for the 'country' Series\n", "drinks['country'] = drinks.country.astype('category')\n", "drinks.memory_usage(deep=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "# memory usage increased because we created 193 categories\n", "drinks.country.cat.categories" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dummy variables" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>PassengerId</th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Embarked</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. 
Owen Harris</td>\n", " <td>male</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", " <td>female</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>C</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>female</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>female</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>male</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 
3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "train = pd.read_csv('pandas/titanic_train.csv')\n", "train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>female</th>\n", " <th>male</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " female male\n", "0 0 1\n", "1 1 0\n", "2 1 0\n", "3 1 0\n", "4 0 1" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use 'get_dummies' to create one column for every possible value\n", "pd.get_dummies(train.Sex).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table 
border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Sex_male</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Sex_male\n", "0 1\n", "1 0\n", "2 0\n", "3 0\n", "4 1" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop the first dummy variable ('female') using the 'iloc' method\n", "# add a prefix to identify the source of the dummy variables\n", "pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:].head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Embarked_C</th>\n", " <th>Embarked_Q</th>\n", " <th>Embarked_S</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " 
<tr>\n", " <th>5</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Embarked_C Embarked_Q Embarked_S\n", "0 0 0 1\n", "1 1 0 0\n", "2 0 0 1\n", "3 0 0 1\n", "4 0 0 1\n", "5 0 1 0\n", "6 0 0 1\n", "7 0 0 1\n", "8 0 0 1\n", "9 1 0 0" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use 'get_dummies' with a feature that has 3 possible values\n", "pd.get_dummies(train.Embarked, prefix='Embarked').head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Embarked_Q</th>\n", " <th>Embarked_S</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " 
<th>6</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Embarked_Q Embarked_S\n", "0 0 1\n", "1 0 0\n", "2 0 1\n", "3 0 1\n", "4 0 1\n", "5 1 0\n", "6 0 1\n", "7 0 1\n", "8 0 1\n", "9 0 0" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop the first dummy variable ('C')\n", "pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:].head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>PassengerId</th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Embarked</th>\n", " <th>Embarked_Q</th>\n", " <th>Embarked_S</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. 
Owen Harris</td>\n", " <td>male</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", " <td>female</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>C</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>female</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>female</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>S</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>male</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. 
Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked Embarked_Q Embarked_S \n", "0 0 A/5 21171 7.2500 NaN S 0 1 \n", "1 0 PC 17599 71.2833 C85 C 0 0 \n", "2 0 STON/O2. 3101282 7.9250 NaN S 0 1 \n", "3 0 113803 53.1000 C123 S 0 1 \n", "4 0 373450 8.0500 NaN S 0 1 " ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# save the DataFrame of dummy variables and concatenate them to the original DataFrame\n", "embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]\n", "train = pd.concat([train, embarked_dummies], axis=1)\n", "train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>PassengerId</th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Embarked</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. Owen Harris</td>\n", " <td>male</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. 
John Bradley (Florence Briggs Th...</td>\n", " <td>female</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>C</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>female</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>female</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>male</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 
3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reset the DataFrame\n", "train = pd.read_csv('pandas/titanic_train.csv')\n", "train.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>PassengerId</th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Sex_female</th>\n", " <th>Sex_male</th>\n", " <th>Embarked_C</th>\n", " <th>Embarked_Q</th>\n", " <th>Embarked_S</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. Owen Harris</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. 
John Bradley (Florence Briggs Th...</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Age SibSp Parch \\\n", "0 Braund, Mr. Owen Harris 22.0 1 0 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 \n", "2 Heikkinen, Miss. Laina 26.0 0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 \n", "4 Allen, Mr. William Henry 35.0 0 0 \n", "\n", " Ticket Fare Cabin Sex_female Sex_male Embarked_C \\\n", "0 A/5 21171 7.2500 NaN 0 1 0 \n", "1 PC 17599 71.2833 C85 1 0 1 \n", "2 STON/O2. 
3101282 7.9250 NaN 1 0 0 \n", "3 113803 53.1000 C123 1 0 0 \n", "4 373450 8.0500 NaN 0 1 0 \n", "\n", " Embarked_Q Embarked_S \n", "0 0 1 \n", "1 0 0 \n", "2 0 1 \n", "3 0 1 \n", "4 0 1 " ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)\n", "pd.get_dummies(train, columns=['Sex', 'Embarked']).head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>PassengerId</th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Sex_male</th>\n", " <th>Embarked_Q</th>\n", " <th>Embarked_S</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. Owen Harris</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. 
John Bradley (Florence Briggs Th...</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Age SibSp Parch \\\n", "0 Braund, Mr. Owen Harris 22.0 1 0 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 38.0 1 0 \n", "2 Heikkinen, Miss. Laina 26.0 0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0 1 0 \n", "4 Allen, Mr. William Henry 35.0 0 0 \n", "\n", " Ticket Fare Cabin Sex_male Embarked_Q Embarked_S \n", "0 A/5 21171 7.2500 NaN 1 0 1 \n", "1 PC 17599 71.2833 C85 0 0 0 \n", "2 STON/O2. 
3101282 7.9250 NaN 0 0 1 \n", "3 113803 53.1000 C123 0 0 1 \n", "4 373450 8.0500 NaN 1 0 1 " ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature\n", "pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ordinal data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>quality</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>100</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>101</td>\n", " <td>very good</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>102</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>103</td>\n", " <td>excellent</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ID quality\n", "0 100 good\n", "1 101 very good\n", "2 102 good\n", "3 103 excellent" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a small DataFrame from a dictionary\n", "df = pd.DataFrame({\n", " 'ID': [100, 101, 102, 103],\n", " 'quality': ['good', 'very good', 'good', 'excellent']\n", "})\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": 
[ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>quality</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>103</td>\n", " <td>excellent</td>\n", " </tr>\n", " <tr>\n", " <th>0</th>\n", " <td>100</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>102</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>101</td>\n", " <td>very good</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ID quality\n", "3 103 excellent\n", "0 100 good\n", "2 102 good\n", "1 101 very good" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the DataFrame by the 'quality' Series (alphabetical order)\n", "df.sort_values('quality')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "from pandas.api.types import CategoricalDtype" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 good\n", "1 very good\n", "2 good\n", "3 excellent\n", "Name: quality, dtype: category\n", "Categories (3, object): ['good' < 'very good' < 'excellent']" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# define a logical ordering for the categories\n", "cats = ['good', 'very good', 'excellent']\n", "cat_type = CategoricalDtype(categories=cats, ordered=True)\n", "df['quality'] = df.quality.astype(cat_type)\n", "df.quality" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": 
false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>quality</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>100</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>102</td>\n", " <td>good</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>101</td>\n", " <td>very good</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>103</td>\n", " <td>excellent</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ID quality\n", "0 100 good\n", "2 102 good\n", "1 101 very good\n", "3 103 excellent" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# sort the DataFrame by the 'quality' Series (logical order)\n", "df.sort_values('quality')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>quality</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>101</td>\n", " <td>very good</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>103</td>\n", " 
<td>excellent</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ID quality\n", "1 101 very good\n", "3 103 excellent" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# comparison operators work with ordered categories\n", "df.loc[df.quality > 'good', :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sample rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>217</th>\n", " <td>Norridgewock</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>ME</td>\n", " <td>9/15/1952 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>12282</th>\n", " <td>Ipava</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>IL</td>\n", " <td>10/1/1998 21:15</td>\n", " </tr>\n", " <tr>\n", " <th>17933</th>\n", " <td>Ellinwood</td>\n", " <td>NaN</td>\n", " <td>FIREBALL</td>\n", " <td>KS</td>\n", " <td>11/13/2000 22:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "217 Norridgewock NaN DISK ME 9/15/1952 14:00\n", "12282 Ipava NaN TRIANGLE IL 10/1/1998 21:15\n", "17933 Ellinwood NaN FIREBALL KS 11/13/2000 22:00" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use the 'random_state' parameter for 
reproducibility\n", "ufo.sample(n=3, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "# sample 75% of the DataFrame's rows without replacement\n", "train = ufo.sample(frac=0.75, random_state=99)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Valley City</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>ND</td>\n", " <td>9/15/1934 15:30</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>Eklutna</td>\n", " <td>NaN</td>\n", " <td>CIGAR</td>\n", " <td>AK</td>\n", " <td>10/15/1936 17:00</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>Waterloo</td>\n", " <td>NaN</td>\n", " <td>FIREBALL</td>\n", " <td>AL</td>\n", " <td>6/1/1939 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>Keokuk</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>IA</td>\n", " <td>7/7/1939 2:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State \\\n", "4 New York Worlds Fair NaN LIGHT NY \n", "5 Valley City NaN DISK ND \n", "8 Eklutna NaN CIGAR AK \n", "11 Waterloo NaN FIREBALL AL \n", 
"13 Keokuk NaN OVAL IA \n", "\n", " Time \n", "4 4/18/1933 19:00 \n", "5 9/15/1934 15:30 \n", "8 10/15/1936 17:00 \n", "11 6/1/1939 20:00 \n", "13 7/7/1939 2:00 " ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# store the remaining 25% of the rows in another DataFrame\n", "test = ufo.loc[~ufo.index.isin(train.index), :]\n", "test.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## List the names of DataFrames" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mRunning cells with '/usr/local/bin/python3.11' requires the ipykernel package.\n", "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", "\u001b[1;31mCommand: '/usr/local/bin/python3.11 -m pip install ipykernel -U --user --force-reinstall'" ] } ], "source": [ "# https://stackoverflow.com/questions/44835358/pandas-list-of-dataframe-names\n", "g = globals()\n", "\n", "df_names = [k for k, v in g.items() if isinstance(v, pd.DataFrame)]\n", "\n", "df_list = list(filter(lambda x: 'df' in x, df_names))\n", "\n", "df_list\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "p39", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" }, "latex_envs": { "LaTeX_envs_menu_present": true, "autoclose": false, "autocomplete": true, "bibliofile": "biblio.bib", "cite_by": "apalike", "current_citInitial": 1, "eqLabelWithNumbers": true, "eqNumInitial": 1, "hotkeys": { "equation": "Ctrl-E", "itemize": "Ctrl-I" }, 
"labels_anchors": false, "latex_user_defs": false, "report_style_numbering": false, "user_envs_cfg": false }, "metadata": { "interpreter": { "hash": "ff2e645ef952f1284ebef59edbc19ad3cff03e6406671d2dab1ca7ad9588368d" } }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false }, "toc-showcode": false, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false }, "vscode": { "interpreter": { "hash": "c3a1109010e257dd9f1e593983c951231fd94a0c98fc2b081b8760e1222f1725" } } }, "nbformat": 4, "nbformat_minor": 4 }