{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n", "<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Series\" data-toc-modified-id=\"Series-1\"><span class=\"toc-item-num\">1 </span>Series</a></span></li><li><span><a href=\"#DataFrame\" data-toc-modified-id=\"DataFrame-2\"><span class=\"toc-item-num\">2 </span>DataFrame</a></span></li><li><span><a href=\"#Using-the-axis-parameter\" data-toc-modified-id=\"Using-the-axis-parameter-3\"><span class=\"toc-item-num\">3 </span>Using the axis parameter</a></span></li><li><span><a href=\"#Essesntial-Functionality\" data-toc-modified-id=\"Essesntial-Functionality-4\"><span class=\"toc-item-num\">4 </span>Essesntial Functionality</a></span><ul class=\"toc-item\"><li><span><a href=\"#Reindexing\" data-toc-modified-id=\"Reindexing-4.1\"><span class=\"toc-item-num\">4.1 </span>Reindexing</a></span></li><li><span><a href=\"#Dropping-Entries-from-an-axis\" data-toc-modified-id=\"Dropping-Entries-from-an-axis-4.2\"><span class=\"toc-item-num\">4.2 </span>Dropping Entries from an axis</a></span></li><li><span><a href=\"#Indexing,-Selection,-and-Filtering\" data-toc-modified-id=\"Indexing,-Selection,-and-Filtering-4.3\"><span class=\"toc-item-num\">4.3 </span>Indexing, Selection, and Filtering</a></span><ul class=\"toc-item\"><li><span><a href=\"#indexing\" data-toc-modified-id=\"indexing-4.3.1\"><span class=\"toc-item-num\">4.3.1 </span>indexing</a></span></li><li><span><a href=\"#setting\" data-toc-modified-id=\"setting-4.3.2\"><span class=\"toc-item-num\">4.3.2 </span>setting</a></span></li><li><span><a href=\"#Selection-with-loc,-iloc\" data-toc-modified-id=\"Selection-with-loc,-iloc-4.3.3\"><span class=\"toc-item-num\">4.3.3 </span>Selection with loc, iloc</a></span></li></ul></li><li><span><a href=\"#Arithmetic-and-Data-Alignment\" data-toc-modified-id=\"Arithmetic-and-Data-Alignment-4.4\"><span class=\"toc-item-num\">4.4 </span>Arithmetic and Data Alignment</a></span></li><li><span><a href=\"#Function-Application-and-Mapping\" data-toc-modified-id=\"Function-Application-and-Mapping-4.5\"><span class=\"toc-item-num\">4.5 </span>Function Application and Mapping</a></span></li><li><span><a href=\"#Sorting-and-Ranking\" data-toc-modified-id=\"Sorting-and-Ranking-4.6\"><span class=\"toc-item-num\">4.6 </span>Sorting and Ranking</a></span><ul class=\"toc-item\"><li><span><a href=\"#Sorting\" data-toc-modified-id=\"Sorting-4.6.1\"><span class=\"toc-item-num\">4.6.1 </span>Sorting</a></span></li><li><span><a href=\"#Ranking\" data-toc-modified-id=\"Ranking-4.6.2\"><span class=\"toc-item-num\">4.6.2 </span>Ranking</a></span></li></ul></li><li><span><a href=\"#Axis-Indexes-with-Duplicate-Labels\" data-toc-modified-id=\"Axis-Indexes-with-Duplicate-Labels-4.7\"><span class=\"toc-item-num\">4.7 </span>Axis Indexes with Duplicate Labels</a></span></li></ul></li><li><span><a href=\"#Summarizing-and-Computing-Descriptive-Statistics\" data-toc-modified-id=\"Summarizing-and-Computing-Descriptive-Statistics-5\"><span class=\"toc-item-num\">5 </span>Summarizing and Computing Descriptive Statistics</a></span><ul class=\"toc-item\"><li><span><a href=\"#Correlation-and-Covariance\" data-toc-modified-id=\"Correlation-and-Covariance-5.1\"><span class=\"toc-item-num\">5.1 </span>Correlation and Covariance</a></span></li><li><span><a href=\"#Unique-Values,-Value-Counts,-and-Membership\" data-toc-modified-id=\"Unique-Values,-Value-Counts,-and-Membership-5.2\"><span class=\"toc-item-num\">5.2 </span>Unique Values, Value Counts, and Membership</a></span></li></ul></li></ul></div>" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:01.227037Z", "start_time": "2019-12-24T11:54:00.439377Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:01.232635Z", "start_time": "2019-12-24T11:54:01.227037Z" } }, "outputs": [], "source": [ "import builtins\n", "\n", "def print(*args, **kwargs):\n", " builtins.print(*args, **kwargs, end='\\n\\n')" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T12:01:19.037222Z", "start_time": "2019-12-22T12:01:19.029951Z" } }, "source": [ "## Series\n", "\n", "A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.268631Z", "start_time": "2019-12-24T11:54:01.233953Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 4\n", "1 7\n", "2 -5\n", "3 3\n", "dtype: int64\n", "\n", "RangeIndex(start=0, stop=4, step=1)\n", "\n", "[ 4 7 -5 3]\n", "\n" ] } ], "source": [ "obj = pd.Series([4, 7, -5, 3])\n", "print(obj) \n", "print(obj.index)\n", "print(obj.values)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.430806Z", "start_time": "2019-12-24T11:54:02.271623Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "d 4\n", "a 7\n", "c -5\n", "b 3\n", "dtype: int64\n", "\n", "Index(['d', 'a', 'c', 'b'], dtype='object')\n", "\n" ] } ], "source": [ "obj2 = pd.Series([4, 7, -5, 3], index=['d', 'a', 'c', 'b'])\n", "print(obj2)\n", "print(obj2.index)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.643902Z", "start_time": "2019-12-24T11:54:02.432505Z" } }, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2['a']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.812307Z", "start_time": "2019-12-24T11:54:02.645897Z" } }, "outputs": [ { "data": { "text/plain": [ "c -5\n", "a 7\n", "d 4\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2[['c', 'a', 'd']]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.962361Z", "start_time": "2019-12-24T11:54:02.814622Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ohio 35000\n", "Texas 71000\n", "Oregon 16000\n", "Utah 5000\n", "dtype: int64\n", "\n" ] } ], "source": [ "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n", "obj3 = pd.Series(sdata)\n", "print(obj3)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.145871Z", "start_time": "2019-12-24T11:54:02.964783Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "states = ['California', 'Ohio', 'Oregon', 'Texas']\n", "obj4 = pd.Series(sdata, index=states)\n", "print(obj4)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.282293Z", "start_time": "2019-12-24T11:54:03.148390Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "California True\n", "Ohio False\n", "Oregon False\n", "Texas False\n", "dtype: bool\n", "\n", "California False\n", "Ohio True\n", "Oregon True\n", "Texas True\n", "dtype: bool\n", "\n" ] } ], "source": [ "print(obj4.isnull())\n", "print(obj4.notnull())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.477576Z", "start_time": "2019-12-24T11:54:03.282293Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ohio 35000\n", "Texas 71000\n", "Oregon 16000\n", "Utah 5000\n", "dtype: int64\n", "\n", "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "dtype: float64\n", "\n", "California NaN\n", "Ohio 70000.0\n", "Oregon 32000.0\n", "Texas 142000.0\n", "Utah NaN\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj3)\n", "print(obj4)\n", "print(obj3 + obj4)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.586203Z", "start_time": "2019-12-24T11:54:03.480359Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "state\n", "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "Name: population, dtype: float64\n", "\n" ] } ], "source": [ "obj4.name = 'population'\n", "obj4.index.name = 'state'\n", "print(obj4)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.710465Z", "start_time": "2019-12-24T11:54:03.588187Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 4\n", "1 7\n", "2 -5\n", "3 3\n", "dtype: int64\n", "\n", "Bob 4\n", "Steve 7\n", "Jeff -5\n", "Ryan 3\n", "dtype: int64\n", "\n" ] } ], "source": [ "print(obj)\n", "obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']\n", "print(obj)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame\n", "\n", "A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.)." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.884935Z", "start_time": "2019-12-24T11:54:03.713457Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state year pop\n", "0 Ohio 2000 1.5\n", "1 Ohio 2001 1.7\n", "2 Ohio 2002 3.6\n", "3 Nevada 2001 2.4\n", "4 Nevada 2002 2.9\n", "5 Nevada 2003 3.2\n", "\n" ] } ], "source": [ "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],\n", " 'year': [2000, 2001, 2002, 2001, 2002, 2003],\n", " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n", "\n", "frame = pd.DataFrame(data)\n", "print(frame)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.041094Z", "start_time": "2019-12-24T11:54:03.886930Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>year</th>\n", " <th>state</th>\n", " <th>pop</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2000</td>\n", " <td>Ohio</td>\n", " <td>1.5</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2001</td>\n", " <td>Ohio</td>\n", " <td>1.7</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2002</td>\n", " <td>Ohio</td>\n", " <td>3.6</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2001</td>\n", " <td>Nevada</td>\n", " <td>2.4</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2002</td>\n", " <td>Nevada</td>\n", " <td>2.9</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>2003</td>\n", " <td>Nevada</td>\n", " <td>3.2</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " year state pop\n", "0 2000 Ohio 1.5\n", "1 2001 Ohio 1.7\n", "2 2002 Ohio 3.6\n", "3 2001 Nevada 2.4\n", "4 2002 Nevada 2.9\n", "5 2003 Nevada 3.2" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(data, columns=['year', 'state', 'pop'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.244915Z", "start_time": "2019-12-24T11:54:04.043184Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 NaN\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN\n", "six 2003 Nevada 3.2 NaN\n", "\n", "Index(['year', 'state', 'pop', 'debt'], dtype='object')\n", "\n" ] } ], "source": [ "# u pass a column that isn’t contained in the dict, \n", "# it will appear with missing values\n", "\n", "frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],\n", " index=['one', 'two', 'three', 'four', 'five', 'six'])\n", "\n", "print(frame2)\n", "print(frame2.columns)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.373163Z", "start_time": "2019-12-24T11:54:04.244915Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 Ohio\n", "1 Ohio\n", "2 Ohio\n", "3 Nevada\n", "4 Nevada\n", "5 Nevada\n", "Name: state, dtype: object\n", "\n", "0 2000\n", "1 2001\n", "2 2002\n", "3 2001\n", "4 2002\n", "5 2003\n", "Name: year, dtype: int64\n", "\n" ] } ], "source": [ "print(frame['state'])\n", "print(frame.year)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.502687Z", "start_time": "2019-12-24T11:54:04.374158Z" } }, "outputs": [ { "data": { "text/plain": [ "year 2002\n", "state Ohio\n", "pop 3.6\n", "debt NaN\n", "Name: three, dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows can also be retrieved by \n", "# position or name with the special loc attribute\n", "frame2.loc['three']" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.699057Z", "start_time": "2019-12-24T11:54:04.503681Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 NaN\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN\n", "six 2003 Nevada 3.2 NaN\n", "\n", " year state pop debt\n", "one 2000 Ohio 1.5 16.5\n", "two 2001 Ohio 1.7 16.5\n", "three 2002 Ohio 3.6 16.5\n", "four 2001 Nevada 2.4 16.5\n", "five 2002 Nevada 2.9 16.5\n", "six 2003 Nevada 3.2 16.5\n", "\n", " year state pop debt\n", "one 2000 Ohio 1.5 0.0\n", "two 2001 Ohio 1.7 1.0\n", "three 2002 Ohio 3.6 2.0\n", "four 2001 Nevada 2.4 3.0\n", "five 2002 Nevada 2.9 4.0\n", "six 2003 Nevada 3.2 5.0\n", "\n" ] } ], "source": [ "print(frame2)\n", "frame2['debt'] = 16.5\n", "print(frame2)\n", "frame2['debt'] = np.arange(6.)\n", "print(frame2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.822895Z", "start_time": "2019-12-24T11:54:04.700056Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7\n", "six 2003 Nevada 3.2 NaN\n", "\n" ] } ], "source": [ "val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n", "frame2['debt'] = val\n", "print(frame2)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.996306Z", "start_time": "2019-12-24T11:54:04.825542Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt eastern\n", "one 2000 Ohio 1.5 NaN True\n", "two 2001 Ohio 1.7 -1.2 True\n", "three 2002 Ohio 3.6 NaN True\n", "four 2001 Nevada 2.4 -1.5 False\n", "five 2002 Nevada 2.9 -1.7 False\n", "six 2003 Nevada 3.2 NaN False\n", "\n" ] }, { "data": { "text/plain": [ "Index(['year', 'state', 'pop', 'debt'], dtype='object')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Assigning a column that doesn’t exist will create a new column. \n", "# The del keyword will delete columns as with a dict.\n", "# As an example of del, I first add a new column of boolean \n", "# values where the state column equals 'Ohio':\n", "\n", "frame2['eastern'] = frame2['state'] == 'Ohio'\n", "print(frame2)\n", "\n", "del frame2['eastern']\n", "frame2.columns" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:05.121544Z", "start_time": "2019-12-24T11:54:04.998624Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Nevada Ohio\n", "2001 2.4 1.7\n", "2002 2.9 3.6\n", "2000 NaN 1.5\n", "\n" ] } ], "source": [ "# another way is a nested dict of dicts\n", "pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n", " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}\n", "\n", "# If the nested dict is passed to the DataFrame, \n", "# pandas will interpret the outer dict keys\n", "# as the columns and the inner keys as the row indices\n", "\n", "frame3 = pd.DataFrame(pop)\n", "print(frame3)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:05.295926Z", "start_time": "2019-12-24T11:54:05.126331Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>2001</th>\n", " <th>2002</th>\n", " <th>2000</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Nevada</th>\n", " <td>2.4</td>\n", " <td>2.9</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>1.7</td>\n", " <td>3.6</td>\n", " <td>1.5</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 2001 2002 2000\n", "Nevada 2.4 2.9 NaN\n", "Ohio 1.7 3.6 1.5" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# transpose dataframe\n", "frame3.T" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:05.432228Z", "start_time": "2019-12-24T11:54:05.299058Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "state Nevada Ohio\n", "year \n", "2001 2.4 1.7\n", "2002 2.9 3.6\n", "2000 NaN 1.5\n", "\n" ] } ], "source": [ "frame3.index.name = 'year'; frame3.columns.name = 'state'\n", "print(frame3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the axis parameter \n", "\n", "1) axis = 0 => row\n", " * Move across/down the row axis\n", " * Direction of operation is down\n", "\n", "2) axis = 1 => column\n", " * Move along the column axis\n", " * Direction of operation is sideways\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:06.582318Z", "start_time": "2019-12-24T11:54:05.434315Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv('http://bit.ly/drinksbycountry')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:06.610698Z", "start_time": "2019-12-24T11:54:06.587032Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Algeria</td>\n", " <td>25</td>\n", " <td>0</td>\n", " <td>14</td>\n", " <td>0.7</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol \n", "0 0.0 \n", "1 4.9 \n", "2 0.7 \n", "3 12.4 \n", "4 5.9 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dropping a column\n", "\n", "drinks.drop('continent', axis=1).head() " ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:07.987950Z", "start_time": "2019-12-24T11:54:06.613218Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>country</th>\n", " <th>beer_servings</th>\n", " <th>spirit_servings</th>\n", " <th>wine_servings</th>\n", " <th>total_litres_of_pure_alcohol</th>\n", " <th>continent</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Afghanistan</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.0</td>\n", " <td>Asia</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Albania</td>\n", " <td>89</td>\n", " <td>132</td>\n", " <td>54</td>\n", " <td>4.9</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Andorra</td>\n", " <td>245</td>\n", " <td>138</td>\n", " <td>312</td>\n", " <td>12.4</td>\n", " <td>Europe</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>Angola</td>\n", " <td>217</td>\n", " <td>57</td>\n", " <td>45</td>\n", " <td>5.9</td>\n", " <td>Africa</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>Antigua & Barbuda</td>\n", " <td>102</td>\n", " <td>128</td>\n", " <td>45</td>\n", " <td>4.9</td>\n", " <td>North America</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "5 Antigua & Barbuda 102 128 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "3 12.4 Europe \n", "4 5.9 Africa \n", "5 4.9 North America " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dropping a row\n", "\n", "drinks.drop(2, axis=0).head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.176714Z", "start_time": "2019-12-24T11:54:07.990944Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(4,)\n", "\n" ] }, { "data": { "text/plain": [ "beer_servings 106.160622\n", "spirit_servings 80.994819\n", "wine_servings 49.450777\n", "total_litres_of_pure_alcohol 4.717098\n", "dtype: float64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# default axis=0\n", "# mean of each column\n", "\n", "print(drinks.mean().shape)\n", "drinks.mean() " ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.296361Z", "start_time": "2019-12-24T11:54:08.178873Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(193,)\n", "\n" ] }, { "data": { "text/plain": [ "0 0.000\n", "1 69.975\n", "2 9.925\n", "3 176.850\n", "4 81.225\n", " ... \n", "188 110.925\n", "189 29.000\n", "190 1.525\n", "191 14.375\n", "192 22.675\n", "Length: 193, dtype: float64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# mean of each row\n", "\n", "print(drinks.mean(axis=1).shape)\n", "drinks.mean(axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Essesntial Functionality" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reindexing\n", "\n", "An important method on pandas objects is **reindex**, which means to create a new object with the data conformed to a new index" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.467311Z", "start_time": "2019-12-24T11:54:08.296361Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "d 4.5\n", "b 7.2\n", "a -5.3\n", "c 3.6\n", "dtype: float64\n", "\n", "a -5.3\n", "b 7.2\n", "c 3.6\n", "d 4.5\n", "dtype: float64\n", "\n" ] } ], "source": [ "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", "print(obj)\n", "\n", "obj2 = obj.reindex(['a', 'b', 'c', 'd'])\n", "print(obj2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The **method** option allows us to do this, using a\n", "method such as **ffill**, which forward-fills the values" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.580511Z", "start_time": "2019-12-24T11:54:08.470651Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 blue\n", "2 purple\n", "4 yellow\n", "dtype: object\n", "\n" ] }, { "data": { "text/plain": [ "0 blue\n", "1 blue\n", "2 purple\n", "3 purple\n", "4 yellow\n", "5 yellow\n", "dtype: object" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])\n", "print(obj3)\n", "\n", "obj3.reindex(range(6), method='ffill')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.694431Z", "start_time": "2019-12-24T11:54:08.582639Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Ohio</th>\n", " <th>Texas</th>\n", " <th>California</th>\n", " </tr>\n", " <tr>\n", " <th>letters</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Ohio Texas California\n", "letters \n", "a 0 1 2\n", "c 3 4 5\n", "d 6 7 8" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],\n", " columns=['Ohio', 'Texas', 'California'])\n", "frame.index.name = 'letters'\n", "frame" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.812642Z", "start_time": "2019-12-24T11:54:08.696663Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Ohio</th>\n", " <th>Texas</th>\n", " <th>California</th>\n", " </tr>\n", " <tr>\n", " <th>letters</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>3.0</td>\n", " <td>4.0</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>6.0</td>\n", " <td>7.0</td>\n", " <td>8.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Ohio Texas California\n", "letters \n", "a 0.0 1.0 2.0\n", "b NaN NaN NaN\n", "c 3.0 4.0 5.0\n", "d 6.0 7.0 8.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n", "frame2" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.910495Z", "start_time": "2019-12-24T11:54:08.814770Z" }, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Texas</th>\n", " <th>Utah</th>\n", " <th>California</th>\n", " </tr>\n", " <tr>\n", " <th>letters</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>1</td>\n", " <td>NaN</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>4</td>\n", " <td>NaN</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>7</td>\n", " <td>NaN</td>\n", " <td>8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Texas Utah California\n", "letters \n", "a 1 NaN 2\n", "c 4 NaN 5\n", "d 7 NaN 8" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The columns can be reindexed with the columns keyword:\n", "states = ['Texas', 'Utah', 'California']\n", "frame.reindex(columns=states)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.024940Z", "start_time": "2019-12-24T11:54:08.911493Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Texas</th>\n", " <th>Utah</th>\n", " <th>California</th>\n", " </tr>\n", " <tr>\n", " <th>letters</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>1.0</td>\n", " <td>NaN</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>4.0</td>\n", " <td>NaN</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>7.0</td>\n", " <td>NaN</td>\n", " <td>8.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Texas Utah California\n", "letters \n", "a 1.0 NaN 2.0\n", "b NaN NaN NaN\n", "c 4.0 NaN 5.0\n", "d 7.0 NaN 8.0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.reindex(['a', 'b', 'c', 'd'], columns=states)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.141015Z", "start_time": "2019-12-24T11:54:09.024940Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Texas</th>\n", " <th>Utah</th>\n", " <th>California</th>\n", " </tr>\n", " <tr>\n", " <th>letters</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>4</td>\n", " <td>0</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>7</td>\n", " <td>0</td>\n", " <td>8</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Texas Utah California\n", "letters \n", "a 1 0 2\n", "b 0 0 0\n", "c 4 0 5\n", "d 7 0 8" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.reindex(['a', 'b', 'c', 'd'], columns=states, fill_value=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dropping Entries from an axis\n", "\n", "**drop** method will return a new object with the indicated value or values deleted from an axis" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.250070Z", "start_time": "2019-12-24T11:54:09.141015Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "c 2.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.396400Z", "start_time": "2019-12-24T11:54:09.251603Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_obj = obj.drop('c')\n", "new_obj" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.598954Z", "start_time": "2019-12-24T11:54:09.396400Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.drop(['d', 'c'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "With DataFrame, index values can be deleted from either axis" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.802608Z", "start_time": "2019-12-24T11:54:09.599952Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " <td>15</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.966588Z", "start_time": "2019-12-24T11:54:09.803605Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " <td>15</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calling drop with a sequence of labels will \n", "# drop values from the row labels (axis 0):\n", "data.drop(['Colorado', 'Ohio'])" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.096537Z", "start_time": "2019-12-24T11:54:09.966588Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>three</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>4</td>\n", " <td>6</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>10</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>14</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one three\n", "Ohio 0 2\n", "Colorado 4 6\n", "Utah 8 10\n", "New York 12 14" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop values from columns: pass axis=1 or axis='columns'\n", "\n", "data.drop(['two', 'four'], axis=1)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.243523Z", "start_time": "2019-12-24T11:54:10.097533Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 0.0\n", "b 1.0\n", "c 2.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64\n", "\n", "a 0.0\n", "b 1.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj)\n", "obj.drop('c', inplace=True)\n", "print(obj)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing, Selection, and Filtering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### indexing\n", "\n", "Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers." ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.362801Z", "start_time": "2019-12-24T11:54:10.243523Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "c 2.0\n", "d 3.0\n", "dtype: float64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.493177Z", "start_time": "2019-12-24T11:54:10.362801Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0\n", "\n", "1.0\n", "\n", "c 2.0\n", "d 3.0\n", "dtype: float64\n", "\n", "b 1.0\n", "a 0.0\n", "d 3.0\n", "dtype: float64\n", "\n", "b 1.0\n", "d 3.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj['b'])\n", "print(obj[1])\n", "print(obj[2:4])\n", "print(obj[['b', 'a', 'd']])\n", "print(obj[[1, 3]])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.631638Z", "start_time": "2019-12-24T11:54:10.494174Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 0.0\n", "b 1.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj[obj < 2])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Slicing with labels behaves differently than normal Python slicing in that the **end‐point is inclusive**" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.937208Z", "start_time": "2019-12-24T11:54:10.634638Z" } }, "outputs": [ { "data": { "text/plain": [ "b 1.0\n", "c 2.0\n", "dtype: float64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b': 'c']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.110423Z", "start_time": "2019-12-24T11:54:10.939206Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " <td>15</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n", " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.335656Z", "start_time": "2019-12-24T11:54:11.111464Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ohio 1\n", "Colorado 5\n", "Utah 9\n", "New York 13\n", "Name: two, dtype: int32\n", "\n", " three four\n", "Ohio 2 3\n", "Colorado 6 7\n", "Utah 10 11\n", "New York 14 15\n", "\n" ] } ], "source": [ "print(data['two'])\n", "print(data[['three', 'four']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The row selection syntax **data[:2]** is provided as a convenience. Passing a single element or a list to the [ ] operator selects columns." ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.486571Z", "start_time": "2019-12-24T11:54:11.335656Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[:2]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.626605Z", "start_time": "2019-12-24T11:54:11.491623Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " <td>15</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data['three'] > 5]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.731106Z", "start_time": "2019-12-24T11:54:11.626605Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>True</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Ohio True True True True\n", "Colorado True False False False\n", "Utah False False False False\n", "New York False False False False" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data < 5" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T16:14:52.753331Z", "start_time": "2019-12-22T16:14:52.745801Z" } }, "source": [ "#### setting\n", "Setting using these methods modifies the corresponding section of the Series" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.854456Z", "start_time": "2019-12-24T11:54:11.733101Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 5.0\n", "c 5.0\n", "d 3.0\n", "dtype: float64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b': 'c'] = 5\n", "obj" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.971222Z", "start_time": "2019-12-24T11:54:11.855652Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " <td>15</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n", " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.124945Z", "start_time": "2019-12-24T11:54:11.972250Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " <th>four</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>0</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " <td>11</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " <td>15</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three four\n", "Ohio 0 0 0 0\n", "Colorado 0 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data < 5] = 0\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selection with loc, iloc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc)." ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.244168Z", "start_time": "2019-12-24T11:54:12.124945Z" } }, "outputs": [ { "data": { "text/plain": [ "two 5\n", "three 6\n", "Name: Colorado, dtype: int32" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select a single row and multiple columns by label\n", "# loc[Row, col]\n", "data.loc['Colorado', ['two', 'three']]" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.359593Z", "start_time": "2019-12-24T11:54:12.244168Z" } }, "outputs": [ { "data": { "text/plain": [ "two 5\n", "three 6\n", "Name: Colorado, dtype: int32" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# with integers using iloc\n", "data.iloc[1, [1, 2]]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.517066Z", "start_time": "2019-12-24T11:54:12.361868Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one 8\n", "two 9\n", "three 10\n", "four 11\n", "Name: Utah, dtype: int32\n", "\n", " four one two\n", "Colorado 7 0 5\n", "Utah 11 8 9\n", "\n" ] } ], "source": [ "print(data.iloc[2])\n", "print(data.iloc[[1, 2], [3, 0, 1]])" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.680750Z", "start_time": "2019-12-24T11:54:12.519063Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " one two three four\n", "Ohio 0 0 0 0\n", "Colorado 0 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15\n", "\n", "Ohio 0\n", "Colorado 5\n", "Utah 9\n", "Name: two, dtype: int32\n", "\n" ] } ], "source": [ "print(data)\n", "print(data.loc[:'Utah', 'two'])" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.840886Z", "start_time": "2019-12-24T11:54:12.682744Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " <th>three</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>0</td>\n", " <td>5</td>\n", " <td>6</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>8</td>\n", " <td>9</td>\n", " <td>10</td>\n", " </tr>\n", " <tr>\n", " <th>New York</th>\n", " <td>12</td>\n", " <td>13</td>\n", " <td>14</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two three\n", "Colorado 0 5 6\n", "Utah 8 9 10\n", "New York 12 13 14" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[:, :3][data.three > 5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Arithmetic and Data Alignment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An important pandas feature for some applications is the behavior of arithmetic\n", "between objects with different indexes. When you are adding together objects, if any index pairs are not the same, \n", "the respective index in the result will be the union of the index pairs. \n", "For users with database experience, this is similar to an automatic outer join on the index labels." ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.950857Z", "start_time": "2019-12-24T11:54:12.841887Z" } }, "outputs": [], "source": [ "s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n", "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.066293Z", "start_time": "2019-12-24T11:54:12.950857Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 7.3\n", "c -2.5\n", "d 3.4\n", "e 1.5\n", "dtype: float64\n", "\n", "a -2.1\n", "c 3.6\n", "e -1.5\n", "f 4.0\n", "g 3.1\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(s1)\n", "print(s2)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.203969Z", "start_time": "2019-12-24T11:54:13.068249Z" } }, "outputs": [ { "data": { "text/plain": [ "a 5.2\n", "c 1.1\n", "d NaN\n", "e 0.0\n", "f NaN\n", "g NaN\n", "dtype: float64" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1 + s2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the case of DataFrame, alignment is performed on both the rows and the columns" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.326232Z", "start_time": "2019-12-24T11:54:13.205001Z" } }, "outputs": [], "source": [ "df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),\n", " index=['Ohio', 'Texas', 'Colorado'])\n", "\n", "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", " index=['Utah', 'Ohio', 'Texas', 'Oregon'])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.459283Z", "start_time": "2019-12-24T11:54:13.326232Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b c d\n", "Ohio 0.0 1.0 2.0\n", "Texas 3.0 4.0 5.0\n", "Colorado 6.0 7.0 8.0\n", "\n", " b d e\n", "Utah 0.0 1.0 2.0\n", "Ohio 3.0 4.0 5.0\n", "Texas 6.0 7.0 8.0\n", "Oregon 9.0 10.0 11.0\n", "\n" ] } ], "source": [ "print(df1)\n", "print(df2)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.621405Z", "start_time": "2019-12-24T11:54:13.461210Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Colorado</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>3.0</td>\n", " <td>NaN</td>\n", " <td>6.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Oregon</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Texas</th>\n", " <td>9.0</td>\n", " <td>NaN</td>\n", " <td>12.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>Utah</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b c d e\n", "Colorado NaN NaN NaN NaN\n", "Ohio 3.0 NaN 6.0 NaN\n", "Oregon NaN NaN NaN NaN\n", "Texas 9.0 NaN 12.0 NaN\n", "Utah NaN NaN NaN NaN" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 + df2" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.778639Z", "start_time": "2019-12-24T11:54:13.623331Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " A\n", "0 1\n", "1 2\n", "\n", " B\n", "0 3\n", "1 4\n", "\n" ] } ], "source": [ "df1 = pd.DataFrame({'A': [1, 2]})\n", "df2 = pd.DataFrame({'B': [3, 4]})\n", "print(df1)\n", "print(df2)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.907441Z", "start_time": "2019-12-24T11:54:13.788272Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>A</th>\n", " <th>B</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " A B\n", "0 NaN NaN\n", "1 NaN NaN" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 - df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In arithmetic operations between differently indexed objects, you might want to fill\n", "with a special value, like 0, when an axis label is found in one object but not the other" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.021709Z", "start_time": "2019-12-24T11:54:13.915170Z" } }, "outputs": [], "source": [ "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list(\"abcd\"))\n", "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.142549Z", "start_time": "2019-12-24T11:54:14.021709Z" } }, "outputs": [], "source": [ "df2.loc[1, 'b'] = np.nan" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.273758Z", "start_time": "2019-12-24T11:54:14.143900Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " a b c d\n", "0 0.0 1.0 2.0 3.0\n", "1 4.0 5.0 6.0 7.0\n", "2 8.0 9.0 10.0 11.0\n", "\n", " a b c d e\n", "0 0.0 1.0 2.0 3.0 4.0\n", "1 5.0 NaN 7.0 8.0 9.0\n", "2 10.0 11.0 12.0 13.0 14.0\n", "3 15.0 16.0 17.0 18.0 19.0\n", "\n" ] } ], "source": [ "print(df1)\n", "print(df2)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.395478Z", "start_time": "2019-12-24T11:54:14.273758Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>6.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.0</td>\n", " <td>NaN</td>\n", " <td>13.0</td>\n", " <td>15.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>18.0</td>\n", " <td>20.0</td>\n", " <td>22.0</td>\n", " <td>24.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d e\n", "0 0.0 2.0 4.0 6.0 NaN\n", "1 9.0 NaN 13.0 15.0 NaN\n", "2 18.0 20.0 22.0 24.0 NaN\n", "3 NaN NaN NaN NaN NaN" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Adding these together results in NA values in the locations that don’t overlap\n", "df1 + df2" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.522987Z", "start_time": "2019-12-24T11:54:14.397472Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>0.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>6.0</td>\n", " <td>4.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.0</td>\n", " <td>5.0</td>\n", " <td>13.0</td>\n", " <td>15.0</td>\n", " <td>9.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>18.0</td>\n", " <td>20.0</td>\n", " <td>22.0</td>\n", " <td>24.0</td>\n", " <td>14.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>15.0</td>\n", " <td>16.0</td>\n", " <td>17.0</td>\n", " <td>18.0</td>\n", " <td>19.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d e\n", "0 0.0 2.0 4.0 6.0 4.0\n", "1 9.0 5.0 13.0 15.0 9.0\n", "2 18.0 20.0 22.0 24.0 14.0\n", "3 15.0 16.0 17.0 18.0 19.0" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Using the add method on df1,one pass df2 and an argument to fill_value\n", "df1.add(df2, fill_value=0)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.645859Z", "start_time": "2019-12-24T11:54:14.524282Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>inf</td>\n", " <td>1.000000</td>\n", " <td>0.500000</td>\n", " <td>0.333333</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>0.250</td>\n", " <td>0.200000</td>\n", " <td>0.166667</td>\n", " <td>0.142857</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>0.125</td>\n", " <td>0.111111</td>\n", " <td>0.100000</td>\n", " <td>0.090909</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d\n", "0 inf 1.000000 0.500000 0.333333\n", "1 0.250 0.200000 0.166667 0.142857\n", "2 0.125 0.111111 0.100000 0.090909" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 1 / df1\n", "# or\n", "df1.rdiv(1)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.762982Z", "start_time": "2019-12-24T11:54:14.648478Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b d e\n", "utah 0.0 1.0 2.0\n", "ohio 3.0 4.0 5.0\n", "texas 6.0 7.0 8.0\n", "oregon 9.0 10.0 11.0\n", "\n", "b 0.0\n", "d 1.0\n", "e 2.0\n", "Name: utah, dtype: float64\n", "\n" ] } ], "source": [ "# operations between DataFrame and Series\n", "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", " index=(\"utah\", \"ohio\", \"texas\", \"oregon\"))\n", "series = frame.iloc[0]\n", "print(frame)\n", "print(series)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "By default, arithmetic between DataFrame and Series matches the index of the Series\n", "on the DataFrame’s columns, broadcasting down the rows" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:15.272570Z", "start_time": "2019-12-24T11:54:14.764980Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>utah</th>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>ohio</th>\n", " <td>3.0</td>\n", " <td>3.0</td>\n", " <td>3.0</td>\n", " </tr>\n", " <tr>\n", " <th>texas</th>\n", " <td>6.0</td>\n", " <td>6.0</td>\n", " <td>6.0</td>\n", " </tr>\n", " <tr>\n", " <th>oregon</th>\n", " <td>9.0</td>\n", " <td>9.0</td>\n", " <td>9.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e\n", "utah 0.0 0.0 0.0\n", "ohio 3.0 3.0 3.0\n", "texas 6.0 6.0 6.0\n", "oregon 9.0 9.0 9.0" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame - series" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If an index value is not found in either the DataFrame’s columns or the Series’s index,\n", "the objects will be reindexed to form the union" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:15.785596Z", "start_time": "2019-12-24T11:54:15.274639Z" } }, "outputs": [], "source": [ "series2 = pd.Series(range(3), index=list('bef'))" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:15.975844Z", "start_time": "2019-12-24T11:54:15.789282Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " <th>f</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>utah</th>\n", " <td>0.0</td>\n", " <td>NaN</td>\n", " <td>3.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>ohio</th>\n", " <td>3.0</td>\n", " <td>NaN</td>\n", " <td>6.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>texas</th>\n", " <td>6.0</td>\n", " <td>NaN</td>\n", " <td>9.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>oregon</th>\n", " <td>9.0</td>\n", " <td>NaN</td>\n", " <td>12.0</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e f\n", "utah 0.0 NaN 3.0 NaN\n", "ohio 3.0 NaN 6.0 NaN\n", "texas 6.0 NaN 9.0 NaN\n", "oregon 9.0 NaN 12.0 NaN" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame + series2" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.113109Z", "start_time": "2019-12-24T11:54:15.978953Z" } }, "outputs": [ { "data": { "text/plain": [ "utah 1.0\n", "ohio 4.0\n", "texas 7.0\n", "oregon 10.0\n", "Name: d, dtype: float64" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# broadcasting over columns\n", "series3 = frame['d']\n", "series3" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.261347Z", "start_time": "2019-12-24T11:54:16.115103Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>utah</th>\n", " <td>-1.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>ohio</th>\n", " <td>-1.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>texas</th>\n", " <td>-1.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>oregon</th>\n", " <td>-1.0</td>\n", " <td>0.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e\n", "utah -1.0 0.0 1.0\n", "ohio -1.0 0.0 1.0\n", "texas -1.0 0.0 1.0\n", "oregon -1.0 0.0 1.0" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sub(series3, axis='index')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The axis number that you pass is the axis to match on. In this case we mean to match\n", "on the DataFrame’s row index (axis='index' or axis=0) and broadcast across." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Function Application and Mapping" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.578884Z", "start_time": "2019-12-24T11:54:16.263428Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Utah</th>\n", " <td>-1.846691</td>\n", " <td>-0.622582</td>\n", " <td>-1.542595</td>\n", " </tr>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0.818697</td>\n", " <td>0.730596</td>\n", " <td>0.458859</td>\n", " </tr>\n", " <tr>\n", " <th>Texas</th>\n", " <td>0.612735</td>\n", " <td>0.593502</td>\n", " <td>-0.631133</td>\n", " </tr>\n", " <tr>\n", " <th>Oregon</th>\n", " <td>-0.067950</td>\n", " <td>-1.023214</td>\n", " <td>0.115138</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e\n", "Utah -1.846691 -0.622582 -1.542595\n", "Ohio 0.818697 0.730596 0.458859\n", "Texas 0.612735 0.593502 -0.631133\n", "Oregon -0.067950 -1.023214 0.115138" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),\n", " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.763901Z", "start_time": "2019-12-24T11:54:16.581885Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Utah</th>\n", " <td>1.846691</td>\n", " <td>0.622582</td>\n", " <td>1.542595</td>\n", " </tr>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0.818697</td>\n", " <td>0.730596</td>\n", " <td>0.458859</td>\n", " </tr>\n", " <tr>\n", " <th>Texas</th>\n", " <td>0.612735</td>\n", " <td>0.593502</td>\n", " <td>0.631133</td>\n", " </tr>\n", " <tr>\n", " <th>Oregon</th>\n", " <td>0.067950</td>\n", " <td>1.023214</td>\n", " <td>0.115138</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e\n", "Utah 1.846691 0.622582 1.542595\n", "Ohio 0.818697 0.730596 0.458859\n", "Texas 0.612735 0.593502 0.631133\n", "Oregon 0.067950 1.023214 0.115138" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.abs(frame)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.884947Z", "start_time": "2019-12-24T11:54:16.766103Z" } }, "outputs": [ { "data": { "text/plain": [ "b 2.665389\n", "d 1.753811\n", "e 2.001455\n", "dtype: float64" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Another frequent operation is applying a function on \n", "# one-dimensional arrays to each column or row. \n", "# DataFrame’s apply method does exactly this\n", "\n", "f = lambda x: x.max() - x.min()\n", "frame.apply(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here the function f, which computes the difference between the maximum and minimum of a Series, is **invoked once on each column** in frame. The result is a Series having the columns of frame as its index." ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.003427Z", "start_time": "2019-12-24T11:54:16.887837Z" } }, "outputs": [ { "data": { "text/plain": [ "Utah 1.224110\n", "Ohio 0.359838\n", "Texas 1.243868\n", "Oregon 1.138353\n", "dtype: float64" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If you pass axis='columns' to apply, \n", "# the function will be invoked once per row\n", "\n", "frame.apply(f, axis='columns')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.152859Z", "start_time": "2019-12-24T11:54:17.003427Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>min</th>\n", " <td>-1.846691</td>\n", " <td>-1.023214</td>\n", " <td>-1.542595</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>0.818697</td>\n", " <td>0.730596</td>\n", " <td>0.458859</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e\n", "min -1.846691 -1.023214 -1.542595\n", "max 0.818697 0.730596 0.458859" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# returning a Series with multiple values\n", "\n", "def f(x):\n", " return pd.Series([x.min(), x.max()], index=['min', 'max'])\n", "\n", "frame.apply(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Element-wise Python functions can be used, too. Suppose you wanted to compute a\n", "formatted string from each floating-point value in frame. You can do this with \n", "**applymap**" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.261634Z", "start_time": "2019-12-24T11:54:17.155897Z" } }, "outputs": [], "source": [ "format = lambda x: f'{x:.2f}'" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.398915Z", "start_time": "2019-12-24T11:54:17.263630Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b d e\n", "Utah -1.846691 -0.622582 -1.542595\n", "Ohio 0.818697 0.730596 0.458859\n", "Texas 0.612735 0.593502 -0.631133\n", "Oregon -0.067950 -1.023214 0.115138\n", "\n" ] }, { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>d</th>\n", " <th>e</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Utah</th>\n", " <td>-1.85</td>\n", " <td>-0.62</td>\n", " <td>-1.54</td>\n", " </tr>\n", " <tr>\n", " <th>Ohio</th>\n", " <td>0.82</td>\n", " <td>0.73</td>\n", " <td>0.46</td>\n", " </tr>\n", " <tr>\n", " <th>Texas</th>\n", " <td>0.61</td>\n", " <td>0.59</td>\n", " <td>-0.63</td>\n", " </tr>\n", " <tr>\n", " <th>Oregon</th>\n", " <td>-0.07</td>\n", " <td>-1.02</td>\n", " <td>0.12</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b d e\n", "Utah -1.85 -0.62 -1.54\n", "Ohio 0.82 0.73 0.46\n", "Texas 0.61 0.59 -0.63\n", "Oregon -0.07 -1.02 0.12" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(frame)\n", "frame.applymap(format)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The reason for the name applymap is that Series has a **map** method for applying an\n", "element-wise function" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.518935Z", "start_time": "2019-12-24T11:54:17.400935Z" } }, "outputs": [ { "data": { "text/plain": [ "Utah -1.54\n", "Ohio 0.46\n", "Texas -0.63\n", "Oregon 0.12\n", "Name: e, dtype: object" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame['e'].map(format)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sorting and Ranking" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sorting\n", "\n", "Sorting a dataset by some criterion is another important built-in operation. To sort\n", "lexicographically by row or column index, use the **sort_index** method, which returns\n", "a new, sorted object" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.662604Z", "start_time": "2019-12-24T11:54:17.522098Z" } }, "outputs": [ { "data": { "text/plain": [ "b 0\n", "d 1\n", "a 2\n", "c 3\n", "dtype: int64" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.793647Z", "start_time": "2019-12-24T11:54:17.664561Z" } }, "outputs": [ { "data": { "text/plain": [ "a 2\n", "b 0\n", "c 3\n", "d 1\n", "dtype: int64" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.sort_index()" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.984142Z", "start_time": "2019-12-24T11:54:17.795974Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>d</th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>one</th>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " </tr>\n", " <tr>\n", " <th>three</th>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " d a b c\n", "one 4 5 6 7\n", "three 0 1 2 3" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],\n", " columns=['d', 'a', 'b', 'c'])\n", "frame.sort_index()" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.202334Z", "start_time": "2019-12-24T11:54:17.985140Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>a</th>\n", " <th>b</th>\n", " <th>c</th>\n", " <th>d</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>three</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>one</th>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>4</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " a b c d\n", "three 1 2 3 0\n", "one 5 6 7 4" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.322544Z", "start_time": "2019-12-24T11:54:18.205743Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>d</th>\n", " <th>c</th>\n", " <th>b</th>\n", " <th>a</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>three</th>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>one</th>\n", " <td>4</td>\n", " <td>7</td>\n", " <td>6</td>\n", " <td>5</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " d c b a\n", "three 0 3 2 1\n", "one 4 7 6 5" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1, ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To sort a Series by its values, use its **sort_values** method.\n", "Any missing values are sorted to the end of the Series by default" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.444610Z", "start_time": "2019-12-24T11:54:18.323543Z" } }, "outputs": [ { "data": { "text/plain": [ "0 -4\n", "3 2\n", "2 3\n", "1 7\n", "dtype: int64" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series([-4, 7, 3, 2])\n", "obj.sort_values()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.561066Z", "start_time": "2019-12-24T11:54:18.448348Z" } }, "outputs": [ { "data": { "text/plain": [ "4 -3.0\n", "5 2.0\n", "0 4.0\n", "2 7.0\n", "1 NaN\n", "3 NaN\n", "dtype: float64" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])\n", "obj.sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When sorting a DataFrame, you can use the data in one or more columns as the sort\n", "keys. To do so, pass one or more column names to the by option of sort_values" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.674098Z", "start_time": "2019-12-24T11:54:18.562063Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>a</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>7</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>-3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b a\n", "0 4 0\n", "1 7 1\n", "2 -3 0\n", "3 2 1" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.782844Z", "start_time": "2019-12-24T11:54:18.675129Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>a</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2</th>\n", " <td>-3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>0</th>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>7</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b a\n", "2 -3 0\n", "3 2 1\n", "0 4 0\n", "1 7 1" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by='b')" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.885242Z", "start_time": "2019-12-24T11:54:18.783879Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>a</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2</th>\n", " <td>-3</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>0</th>\n", " <td>4</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>7</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b a\n", "2 -3 0\n", "0 4 0\n", "3 2 1\n", "1 7 1" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by=['a', 'b'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ranking\n", "\n", "*Ranking* assigns ranks from one through the number of valid data points in an array.\n", "The rank methods for Series and DataFrame are the place to look; by default rank\n", "breaks ties by assigning each group the mean rank" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.058017Z", "start_time": "2019-12-24T11:54:18.887690Z" } }, "outputs": [ { "data": { "text/plain": [ "0 6.5\n", "1 1.0\n", "2 6.5\n", "3 4.5\n", "4 3.0\n", "5 2.0\n", "6 4.5\n", "dtype: float64" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n", "obj.rank()" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.209466Z", "start_time": "2019-12-24T11:54:19.059022Z" } }, "outputs": [ { "data": { "text/plain": [ "0 6.0\n", "1 1.0\n", "2 7.0\n", "3 4.0\n", "4 3.0\n", "5 2.0\n", "6 5.0\n", "dtype: float64" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# assigning rank according to the order they're observed first in the data\n", "obj.rank(method='first')" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.352734Z", "start_time": "2019-12-24T11:54:19.211753Z" } }, "outputs": [ { "data": { "text/plain": [ "0 2.0\n", "1 7.0\n", "2 2.0\n", "3 4.0\n", "4 5.0\n", "5 6.0\n", "6 4.0\n", "dtype: float64" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.rank(ascending=False, method='max')" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.499605Z", "start_time": "2019-12-24T11:54:19.352734Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>a</th>\n", " <th>c</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>4.3</td>\n", " <td>0</td>\n", " <td>-2.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>7.0</td>\n", " <td>1</td>\n", " <td>5.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>-3.0</td>\n", " <td>0</td>\n", " <td>8.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2.0</td>\n", " <td>1</td>\n", " <td>-2.5</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b a c\n", "0 4.3 0 -2.0\n", "1 7.0 1 5.0\n", "2 -3.0 0 8.0\n", "3 2.0 1 -2.5" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.644123Z", "start_time": "2019-12-24T11:54:19.502600Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>b</th>\n", " <th>a</th>\n", " <th>c</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>3.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>3.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " b a c\n", "0 3.0 2.0 1.0\n", "1 3.0 1.0 2.0\n", "2 1.0 2.0 3.0\n", "3 3.0 2.0 1.0" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.rank(axis='columns')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Axis Indexes with Duplicate Labels" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.825688Z", "start_time": "2019-12-24T11:54:19.646865Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "b 2\n", "b 3\n", "c 4\n", "dtype: int64" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.978965Z", "start_time": "2019-12-24T11:54:19.828121Z" } }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index.is_unique" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.122066Z", "start_time": "2019-12-24T11:54:19.980367Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "dtype: int64" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.a" ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.293573Z", "start_time": "2019-12-24T11:54:20.124045Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>-2.054623</td>\n", " <td>0.654964</td>\n", " <td>-0.431453</td>\n", " </tr>\n", " <tr>\n", " <th>a</th>\n", " <td>0.654782</td>\n", " <td>0.776429</td>\n", " <td>1.175684</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>-0.485586</td>\n", " <td>-0.960232</td>\n", " <td>-1.368606</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>0.205797</td>\n", " <td>0.266294</td>\n", " <td>2.007870</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2\n", "a -2.054623 0.654964 -0.431453\n", "a 0.654782 0.776429 1.175684\n", "b -0.485586 -0.960232 -1.368606\n", "b 0.205797 0.266294 2.007870" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n", "df" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.444163Z", "start_time": "2019-12-24T11:54:20.295569Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>-2.054623</td>\n", " <td>0.654964</td>\n", " <td>-0.431453</td>\n", " </tr>\n", " <tr>\n", " <th>a</th>\n", " <td>0.654782</td>\n", " <td>0.776429</td>\n", " <td>1.175684</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2\n", "a -2.054623 0.654964 -0.431453\n", "a 0.654782 0.776429 1.175684" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc['a']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summarizing and Computing Descriptive Statistics" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.549520Z", "start_time": "2019-12-24T11:54:20.446595Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>1.40</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>7.10</td>\n", " <td>-4.50</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>0.75</td>\n", " <td>-1.43</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 7.10 -4.50\n", "c NaN NaN\n", "d 0.75 -1.43" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan],\n", " [0.75, -1.43]],\n", " index=['a', 'b', 'c', 'd'],\n", " columns=['one', 'two'])\n", "df" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.684771Z", "start_time": "2019-12-24T11:54:20.549520Z" } }, "outputs": [ { "data": { "text/plain": [ "one 9.25\n", "two -5.93\n", "dtype: float64" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum()" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.818099Z", "start_time": "2019-12-24T11:54:20.687422Z" } }, "outputs": [ { "data": { "text/plain": [ "a 1.40\n", "b 2.60\n", "c 0.00\n", "d -0.68\n", "dtype: float64" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum(axis='columns')" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.936482Z", "start_time": "2019-12-24T11:54:20.818099Z" } }, "outputs": [ { "data": { "text/plain": [ "a NaN\n", "b 2.60\n", "c NaN\n", "d -0.68\n", "dtype: float64" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum(axis='columns', skipna=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some methods, like idxmin and idxmax, return indirect statistics like the index value\n", "where the minimum or maximum values are attained" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.074783Z", "start_time": "2019-12-24T11:54:20.937480Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one b\n", "two d\n", "dtype: object\n", "\n", "one d\n", "two b\n", "dtype: object\n", "\n" ] } ], "source": [ "print(df.idxmax())\n", "print(df.idxmin())" ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.194122Z", "start_time": "2019-12-24T11:54:21.076618Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>1.40</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>8.50</td>\n", " <td>-4.50</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>9.25</td>\n", " <td>-5.93</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 8.50 -4.50\n", "c NaN NaN\n", "d 9.25 -5.93" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# accmulation\n", "df.cumsum()" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T20:42:48.976671Z", "start_time": "2019-12-22T20:42:48.961382Z" } }, "source": [ "Another type of method is neither a reduction nor an accumulation. **describe** is one\n", "such example, producing multiple summary statistics in one shot" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.344016Z", "start_time": "2019-12-24T11:54:21.195119Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>count</th>\n", " <td>3.000000</td>\n", " <td>2.000000</td>\n", " </tr>\n", " <tr>\n", " <th>mean</th>\n", " <td>3.083333</td>\n", " <td>-2.965000</td>\n", " </tr>\n", " <tr>\n", " <th>std</th>\n", " <td>3.493685</td>\n", " <td>2.170818</td>\n", " </tr>\n", " <tr>\n", " <th>min</th>\n", " <td>0.750000</td>\n", " <td>-4.500000</td>\n", " </tr>\n", " <tr>\n", " <th>25%</th>\n", " <td>1.075000</td>\n", " <td>-3.732500</td>\n", " </tr>\n", " <tr>\n", " <th>50%</th>\n", " <td>1.400000</td>\n", " <td>-2.965000</td>\n", " </tr>\n", " <tr>\n", " <th>75%</th>\n", " <td>4.250000</td>\n", " <td>-2.197500</td>\n", " </tr>\n", " <tr>\n", " <th>max</th>\n", " <td>7.100000</td>\n", " <td>-1.430000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two\n", "count 3.000000 2.000000\n", "mean 3.083333 -2.965000\n", "std 3.493685 2.170818\n", "min 0.750000 -4.500000\n", "25% 1.075000 -3.732500\n", "50% 1.400000 -2.965000\n", "75% 4.250000 -2.197500\n", "max 7.100000 -1.430000" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.511393Z", "start_time": "2019-12-24T11:54:21.344016Z" } }, "outputs": [ { "data": { "text/plain": [ "count 16\n", "unique 3\n", "top a\n", "freq 8\n", "dtype: object" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# On non-numeric data, describe produces alternative summary statistics:\n", "\n", "obj = pd.Series(['a', 'a', 'b', 'c'] * 4)\n", "obj.describe()" ] }, { "cell_type": "code", "execution_count": 116, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.653404Z", "start_time": "2019-12-24T11:54:21.515120Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>one</th>\n", " <th>two</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>a</th>\n", " <td>1.40</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>b</th>\n", " <td>7.10</td>\n", " <td>-4.50</td>\n", " </tr>\n", " <tr>\n", " <th>c</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>d</th>\n", " <td>0.75</td>\n", " <td>-1.43</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 7.10 -4.50\n", "c NaN NaN\n", "d 0.75 -1.43" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T20:47:42.840846Z", "start_time": "2019-12-22T20:47:42.833552Z" } }, "source": [ "### Correlation and Covariance" ] }, { "cell_type": "code", "execution_count": 117, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:23.076066Z", "start_time": "2019-12-24T11:54:21.655517Z" } }, "outputs": [], "source": [ "import pandas_datareader as web" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:27.953329Z", "start_time": "2019-12-24T11:54:23.079097Z" } }, "outputs": [], "source": [ "all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:28.701569Z", "start_time": "2019-12-24T11:54:27.953329Z" } }, "outputs": [], "source": [ "price = pd.DataFrame({ticker: data['Adj Close']\n", " for ticker, data in all_data.items()})\n", "volume = pd.DataFrame({ticker: data['Volume']\n", " for ticker, data in all_data.items()})" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:28.944129Z", "start_time": "2019-12-24T11:54:28.701569Z" } }, "outputs": [], "source": [ "returns = price.pct_change()" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:29.202785Z", "start_time": "2019-12-24T11:54:28.944129Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>AAPL</th>\n", " <th>IBM</th>\n", " <th>MSFT</th>\n", " <th>GOOG</th>\n", " </tr>\n", " <tr>\n", " <th>Date</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2019-12-17</th>\n", " <td>0.001965</td>\n", " <td>0.000671</td>\n", " <td>-0.005401</td>\n", " <td>-0.004445</td>\n", " </tr>\n", " <tr>\n", " <th>2019-12-18</th>\n", " <td>-0.002389</td>\n", " <td>0.001416</td>\n", " <td>-0.002069</td>\n", " <td>-0.001845</td>\n", " </tr>\n", " <tr>\n", " <th>2019-12-19</th>\n", " <td>0.001001</td>\n", " <td>0.001116</td>\n", " <td>0.008681</td>\n", " <td>0.002528</td>\n", " </tr>\n", " <tr>\n", " <th>2019-12-20</th>\n", " <td>-0.002071</td>\n", " <td>0.007655</td>\n", " <td>0.010918</td>\n", " <td>-0.004757</td>\n", " </tr>\n", " <tr>\n", " <th>2019-12-23</th>\n", " <td>0.016318</td>\n", " <td>-0.000295</td>\n", " <td>0.000000</td>\n", " <td>-0.000556</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " AAPL IBM MSFT GOOG\n", "Date \n", "2019-12-17 0.001965 0.000671 -0.005401 -0.004445\n", "2019-12-18 -0.002389 0.001416 -0.002069 -0.001845\n", "2019-12-19 0.001001 0.001116 0.008681 0.002528\n", "2019-12-20 -0.002071 0.007655 0.010918 -0.004757\n", "2019-12-23 0.016318 -0.000295 0.000000 -0.000556" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The **corr** method of Series computes the correlation of the overlapping, non-NA,\n", "aligned-by-index values in two Series. Relatedly, **cov** computes the covariance" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.201205Z", "start_time": "2019-12-24T11:54:29.206950Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9.227138328874263e-05\n", "\n", "0.48490365831130916\n", "\n" ] } ], "source": [ "print(returns['MSFT'].cov(returns['IBM']))\n", "print(returns['MSFT'].corr(returns['IBM']))" ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.217210Z", "start_time": "2019-12-24T11:54:30.204759Z" } }, "outputs": [ { "data": { "text/plain": [ "0.48490365831130916" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.MSFT.corr(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 124, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.398635Z", "start_time": "2019-12-24T11:54:30.219543Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>AAPL</th>\n", " <th>IBM</th>\n", " <th>MSFT</th>\n", " <th>GOOG</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>AAPL</th>\n", " <td>1.000000</td>\n", " <td>0.399340</td>\n", " <td>0.573224</td>\n", " <td>0.521823</td>\n", " </tr>\n", " <tr>\n", " <th>IBM</th>\n", " <td>0.399340</td>\n", " <td>1.000000</td>\n", " <td>0.484904</td>\n", " <td>0.410386</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <td>0.573224</td>\n", " <td>0.484904</td>\n", " <td>1.000000</td>\n", " <td>0.658704</td>\n", " </tr>\n", " <tr>\n", " <th>GOOG</th>\n", " <td>0.521823</td>\n", " <td>0.410386</td>\n", " <td>0.658704</td>\n", " <td>1.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " AAPL IBM MSFT GOOG\n", "AAPL 1.000000 0.399340 0.573224 0.521823\n", "IBM 0.399340 1.000000 0.484904 0.410386\n", "MSFT 0.573224 0.484904 1.000000 0.658704\n", "GOOG 0.521823 0.410386 0.658704 1.000000" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corr()" ] }, { "cell_type": "code", "execution_count": 125, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.569188Z", "start_time": "2019-12-24T11:54:30.400959Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>AAPL</th>\n", " <th>IBM</th>\n", " <th>MSFT</th>\n", " <th>GOOG</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>AAPL</th>\n", " <td>0.000245</td>\n", " <td>0.000081</td>\n", " <td>0.000132</td>\n", " <td>0.000124</td>\n", " </tr>\n", " <tr>\n", " <th>IBM</th>\n", " <td>0.000081</td>\n", " <td>0.000168</td>\n", " <td>0.000092</td>\n", " <td>0.000080</td>\n", " </tr>\n", " <tr>\n", " <th>MSFT</th>\n", " <td>0.000132</td>\n", " <td>0.000092</td>\n", " <td>0.000216</td>\n", " <td>0.000146</td>\n", " </tr>\n", " <tr>\n", " <th>GOOG</th>\n", " <td>0.000124</td>\n", " <td>0.000080</td>\n", " <td>0.000146</td>\n", " <td>0.000229</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " AAPL IBM MSFT GOOG\n", "AAPL 0.000245 0.000081 0.000132 0.000124\n", "IBM 0.000081 0.000168 0.000092 0.000080\n", "MSFT 0.000132 0.000092 0.000216 0.000146\n", "GOOG 0.000124 0.000080 0.000146 0.000229" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.cov()" ] }, { "cell_type": "code", "execution_count": 126, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.727892Z", "start_time": "2019-12-24T11:54:30.572979Z" } }, "outputs": [ { "data": { "text/plain": [ "AAPL 0.399340\n", "IBM 1.000000\n", "MSFT 0.484904\n", "GOOG 0.410386\n", "dtype: float64" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# corrwith method - compute pairwise correlations\n", "# between a DataFrame’s columns or rows with another Series oDataFrame’sme\n", "returns.corrwith(returns.IBM)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unique Values, Value Counts, and Membership\n" ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.844787Z", "start_time": "2019-12-24T11:54:30.727892Z" } }, "outputs": [ { "data": { "text/plain": [ "array(['c', 'a', 'd', 'b'], dtype=object)" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# unique - gives you an array of the unique values in a Series\n", "\n", "obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])\n", "obj.unique()" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.977262Z", "start_time": "2019-12-24T11:54:30.846938Z" } }, "outputs": [ { "data": { "text/plain": [ "c 3\n", "a 3\n", "b 2\n", "d 1\n", "dtype: int64" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# value_counts - computes a Series containing value frequencies\n", "\n", "pd.value_counts(obj.values, sort=True)\n" ] }, { "cell_type": "code", "execution_count": 129, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.112737Z", "start_time": "2019-12-24T11:54:30.978817Z" } }, "outputs": [ { "data": { "text/plain": [ "0 c\n", "1 a\n", "2 d\n", "3 a\n", "4 a\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# isin - performs a vectorized set membership check and can be useful in filtering a\n", "# dataset down to a subset of values in a Series or column in a DataFrame\n", "\n", "obj" ] }, { "cell_type": "code", "execution_count": 130, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.299318Z", "start_time": "2019-12-24T11:54:31.114761Z" } }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "dtype: bool" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask = obj.isin(['b', 'c'])\n", "mask" ] }, { "cell_type": "code", "execution_count": 131, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.517280Z", "start_time": "2019-12-24T11:54:31.300318Z" } }, "outputs": [ { "data": { "text/plain": [ "0 c\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj[mask]" ] }, { "cell_type": "code", "execution_count": 132, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.801449Z", "start_time": "2019-12-24T11:54:31.520294Z" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Qu1</th>\n", " <th>Qu2</th>\n", " <th>Qu3</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>3</td>\n", " <td>3</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>3</td>\n", " <td>2</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>4</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Qu1 Qu2 Qu3\n", "0 1 2 1\n", "1 3 3 5\n", "2 4 1 2\n", "3 3 2 4\n", "4 4 3 4" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],\n", " 'Qu2': [2, 3, 1, 2, 3],\n", " 'Qu3': [1, 5, 2, 4, 4]})\n", "data" ] }, { "cell_type": "code", "execution_count": 133, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:32.315990Z", "start_time": "2019-12-24T11:54:31.804884Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4 2\n", "3 2\n", "1 1\n", "Name: Qu1, dtype: int64\n", "\n", "3 2\n", "2 2\n", "1 1\n", "Name: Qu2, dtype: int64\n", "\n", "4 2\n", "5 1\n", "2 1\n", "1 1\n", "Name: Qu3, dtype: int64\n", "\n", " Qu1 Qu2 Qu3\n", "1 1.0 1.0 1.0\n", "2 0.0 2.0 1.0\n", "3 2.0 2.0 0.0\n", "4 2.0 0.0 2.0\n", "5 0.0 0.0 1.0\n", "\n" ] } ], "source": [ "print(pd.value_counts(data.Qu1))\n", "print(pd.value_counts(data.Qu2))\n", "print(pd.value_counts(data.Qu3))\n", "print(data.apply(pd.value_counts).fillna(0))\n", "# index - distinct values in the df\n", "# column values - how many times the distinct value occur in that column" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "202.837px" }, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }