{ "cells": [ { "cell_type": "markdown", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:01.227037Z", "start_time": "2019-12-24T11:54:00.439377Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:01.232635Z", "start_time": "2019-12-24T11:54:01.227037Z" } }, "outputs": [], "source": [ "import builtins\n", "\n", "def print(*args, **kwargs):\n", " builtins.print(*args, **kwargs, end='\\n\\n')" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T12:01:19.037222Z", "start_time": "2019-12-22T12:01:19.029951Z" } }, "source": [ "## Series\n", "\n", "A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.268631Z", "start_time": "2019-12-24T11:54:01.233953Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 4\n", "1 7\n", "2 -5\n", "3 3\n", "dtype: int64\n", "\n", "RangeIndex(start=0, stop=4, step=1)\n", "\n", "[ 4 7 -5 3]\n", "\n" ] } ], "source": [ "obj = pd.Series([4, 7, -5, 3])\n", "print(obj) \n", "print(obj.index)\n", "print(obj.values)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.430806Z", "start_time": "2019-12-24T11:54:02.271623Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "d 4\n", "a 7\n", "c -5\n", "b 3\n", "dtype: int64\n", "\n", "Index(['d', 'a', 'c', 'b'], dtype='object')\n", "\n" ] } ], "source": [ "obj2 = pd.Series([4, 7, -5, 3], index=['d', 'a', 'c', 'b'])\n", "print(obj2)\n", "print(obj2.index)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.643902Z", "start_time": "2019-12-24T11:54:02.432505Z" } }, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2['a']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.812307Z", "start_time": "2019-12-24T11:54:02.645897Z" } }, "outputs": [ { "data": { "text/plain": [ "c -5\n", "a 7\n", "d 4\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2[['c', 'a', 'd']]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:02.962361Z", "start_time": "2019-12-24T11:54:02.814622Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ohio 35000\n", "Texas 71000\n", "Oregon 16000\n", "Utah 5000\n", "dtype: int64\n", "\n" ] } ], "source": [ "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n", "obj3 = pd.Series(sdata)\n", "print(obj3)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.145871Z", "start_time": "2019-12-24T11:54:02.964783Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "states = ['California', 'Ohio', 'Oregon', 'Texas']\n", "obj4 = pd.Series(sdata, index=states)\n", "print(obj4)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.282293Z", "start_time": "2019-12-24T11:54:03.148390Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "California True\n", "Ohio False\n", "Oregon False\n", "Texas False\n", "dtype: bool\n", "\n", "California False\n", "Ohio True\n", "Oregon True\n", "Texas True\n", "dtype: bool\n", "\n" ] } ], "source": [ "print(obj4.isnull())\n", "print(obj4.notnull())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.477576Z", "start_time": "2019-12-24T11:54:03.282293Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ohio 35000\n", "Texas 71000\n", "Oregon 16000\n", "Utah 5000\n", "dtype: int64\n", "\n", "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "dtype: float64\n", "\n", "California NaN\n", "Ohio 70000.0\n", "Oregon 32000.0\n", "Texas 142000.0\n", "Utah NaN\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj3)\n", "print(obj4)\n", "print(obj3 + obj4)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.586203Z", "start_time": "2019-12-24T11:54:03.480359Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "state\n", "California NaN\n", "Ohio 35000.0\n", "Oregon 16000.0\n", "Texas 71000.0\n", "Name: population, dtype: float64\n", "\n" ] } ], "source": [ "obj4.name = 'population'\n", "obj4.index.name = 'state'\n", "print(obj4)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.710465Z", "start_time": "2019-12-24T11:54:03.588187Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 4\n", "1 7\n", "2 -5\n", "3 3\n", "dtype: int64\n", "\n", "Bob 4\n", "Steve 7\n", "Jeff -5\n", "Ryan 3\n", "dtype: int64\n", "\n" ] } ], "source": [ "print(obj)\n", "obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']\n", "print(obj)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame\n", "\n", "A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.)." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:03.884935Z", "start_time": "2019-12-24T11:54:03.713457Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " state year pop\n", "0 Ohio 2000 1.5\n", "1 Ohio 2001 1.7\n", "2 Ohio 2002 3.6\n", "3 Nevada 2001 2.4\n", "4 Nevada 2002 2.9\n", "5 Nevada 2003 3.2\n", "\n" ] } ], "source": [ "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],\n", " 'year': [2000, 2001, 2002, 2001, 2002, 2003],\n", " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n", "\n", "frame = pd.DataFrame(data)\n", "print(frame)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.041094Z", "start_time": "2019-12-24T11:54:03.886930Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepop
02000Ohio1.5
12001Ohio1.7
22002Ohio3.6
32001Nevada2.4
42002Nevada2.9
52003Nevada3.2
\n", "
" ], "text/plain": [ " year state pop\n", "0 2000 Ohio 1.5\n", "1 2001 Ohio 1.7\n", "2 2002 Ohio 3.6\n", "3 2001 Nevada 2.4\n", "4 2002 Nevada 2.9\n", "5 2003 Nevada 3.2" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(data, columns=['year', 'state', 'pop'])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.244915Z", "start_time": "2019-12-24T11:54:04.043184Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 NaN\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN\n", "six 2003 Nevada 3.2 NaN\n", "\n", "Index(['year', 'state', 'pop', 'debt'], dtype='object')\n", "\n" ] } ], "source": [ "# u pass a column that isn’t contained in the dict, \n", "# it will appear with missing values\n", "\n", "frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],\n", " index=['one', 'two', 'three', 'four', 'five', 'six'])\n", "\n", "print(frame2)\n", "print(frame2.columns)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.373163Z", "start_time": "2019-12-24T11:54:04.244915Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 Ohio\n", "1 Ohio\n", "2 Ohio\n", "3 Nevada\n", "4 Nevada\n", "5 Nevada\n", "Name: state, dtype: object\n", "\n", "0 2000\n", "1 2001\n", "2 2002\n", "3 2001\n", "4 2002\n", "5 2003\n", "Name: year, dtype: int64\n", "\n" ] } ], "source": [ "print(frame['state'])\n", "print(frame.year)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.502687Z", "start_time": "2019-12-24T11:54:04.374158Z" } }, "outputs": [ { "data": { "text/plain": [ "year 2002\n", "state Ohio\n", "pop 3.6\n", "debt NaN\n", "Name: three, dtype: object" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows can also be retrieved by \n", "# position or name with the special loc attribute\n", "frame2.loc['three']" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.699057Z", "start_time": "2019-12-24T11:54:04.503681Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 NaN\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN\n", "six 2003 Nevada 3.2 NaN\n", "\n", " year state pop debt\n", "one 2000 Ohio 1.5 16.5\n", "two 2001 Ohio 1.7 16.5\n", "three 2002 Ohio 3.6 16.5\n", "four 2001 Nevada 2.4 16.5\n", "five 2002 Nevada 2.9 16.5\n", "six 2003 Nevada 3.2 16.5\n", "\n", " year state pop debt\n", "one 2000 Ohio 1.5 0.0\n", "two 2001 Ohio 1.7 1.0\n", "three 2002 Ohio 3.6 2.0\n", "four 2001 Nevada 2.4 3.0\n", "five 2002 Nevada 2.9 4.0\n", "six 2003 Nevada 3.2 5.0\n", "\n" ] } ], "source": [ "print(frame2)\n", "frame2['debt'] = 16.5\n", "print(frame2)\n", "frame2['debt'] = np.arange(6.)\n", "print(frame2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.822895Z", "start_time": "2019-12-24T11:54:04.700056Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7\n", "six 2003 Nevada 3.2 NaN\n", "\n" ] } ], "source": [ "val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n", "frame2['debt'] = val\n", "print(frame2)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:04.996306Z", "start_time": "2019-12-24T11:54:04.825542Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " year state pop debt eastern\n", "one 2000 Ohio 1.5 NaN True\n", "two 2001 Ohio 1.7 -1.2 True\n", "three 2002 Ohio 3.6 NaN True\n", "four 2001 Nevada 2.4 -1.5 False\n", "five 2002 Nevada 2.9 -1.7 False\n", "six 2003 Nevada 3.2 NaN False\n", "\n" ] }, { "data": { "text/plain": [ "Index(['year', 'state', 'pop', 'debt'], dtype='object')" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Assigning a column that doesn’t exist will create a new column. \n", "# The del keyword will delete columns as with a dict.\n", "# As an example of del, I first add a new column of boolean \n", "# values where the state column equals 'Ohio':\n", "\n", "frame2['eastern'] = frame2['state'] == 'Ohio'\n", "print(frame2)\n", "\n", "del frame2['eastern']\n", "frame2.columns" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:05.121544Z", "start_time": "2019-12-24T11:54:04.998624Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Nevada Ohio\n", "2001 2.4 1.7\n", "2002 2.9 3.6\n", "2000 NaN 1.5\n", "\n" ] } ], "source": [ "# another way is a nested dict of dicts\n", "pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n", " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}\n", "\n", "# If the nested dict is passed to the DataFrame, \n", "# pandas will interpret the outer dict keys\n", "# as the columns and the inner keys as the row indices\n", "\n", "frame3 = pd.DataFrame(pop)\n", "print(frame3)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:05.295926Z", "start_time": "2019-12-24T11:54:05.126331Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
200120022000
Nevada2.42.9NaN
Ohio1.73.61.5
\n", "
" ], "text/plain": [ " 2001 2002 2000\n", "Nevada 2.4 2.9 NaN\n", "Ohio 1.7 3.6 1.5" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# transpose dataframe\n", "frame3.T" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:05.432228Z", "start_time": "2019-12-24T11:54:05.299058Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "state Nevada Ohio\n", "year \n", "2001 2.4 1.7\n", "2002 2.9 3.6\n", "2000 NaN 1.5\n", "\n" ] } ], "source": [ "frame3.index.name = 'year'; frame3.columns.name = 'state'\n", "print(frame3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using the axis parameter \n", "\n", "1) axis = 0 => row\n", " * Move across/down the row axis\n", " * Direction of operation is down\n", "\n", "2) axis = 1 => column\n", " * Move along the column axis\n", " * Direction of operation is sideways\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:06.582318Z", "start_time": "2019-12-24T11:54:05.434315Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
2Algeria250140.7Africa
3Andorra24513831212.4Europe
4Angola21757455.9Africa
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "2 0.7 Africa \n", "3 12.4 Europe \n", "4 5.9 Africa " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "drinks = pd.read_csv('http://bit.ly/drinksbycountry')\n", "drinks.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:06.610698Z", "start_time": "2019-12-24T11:54:06.587032Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcohol
0Afghanistan0000.0
1Albania89132544.9
2Algeria250140.7
3Andorra24513831212.4
4Angola21757455.9
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "2 Algeria 25 0 14 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "\n", " total_litres_of_pure_alcohol \n", "0 0.0 \n", "1 4.9 \n", "2 0.7 \n", "3 12.4 \n", "4 5.9 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dropping a column\n", "\n", "drinks.drop('continent', axis=1).head() " ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:07.987950Z", "start_time": "2019-12-24T11:54:06.613218Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0Asia
1Albania89132544.9Europe
3Andorra24513831212.4Europe
4Angola21757455.9Africa
5Antigua & Barbuda102128454.9North America
\n", "
" ], "text/plain": [ " country beer_servings spirit_servings wine_servings \\\n", "0 Afghanistan 0 0 0 \n", "1 Albania 89 132 54 \n", "3 Andorra 245 138 312 \n", "4 Angola 217 57 45 \n", "5 Antigua & Barbuda 102 128 45 \n", "\n", " total_litres_of_pure_alcohol continent \n", "0 0.0 Asia \n", "1 4.9 Europe \n", "3 12.4 Europe \n", "4 5.9 Africa \n", "5 4.9 North America " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# dropping a row\n", "\n", "drinks.drop(2, axis=0).head()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.176714Z", "start_time": "2019-12-24T11:54:07.990944Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(4,)\n", "\n" ] }, { "data": { "text/plain": [ "beer_servings 106.160622\n", "spirit_servings 80.994819\n", "wine_servings 49.450777\n", "total_litres_of_pure_alcohol 4.717098\n", "dtype: float64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# default axis=0\n", "# mean of each column\n", "\n", "print(drinks.mean().shape)\n", "drinks.mean() " ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.296361Z", "start_time": "2019-12-24T11:54:08.178873Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(193,)\n", "\n" ] }, { "data": { "text/plain": [ "0 0.000\n", "1 69.975\n", "2 9.925\n", "3 176.850\n", "4 81.225\n", " ... \n", "188 110.925\n", "189 29.000\n", "190 1.525\n", "191 14.375\n", "192 22.675\n", "Length: 193, dtype: float64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# mean of each row\n", "\n", "print(drinks.mean(axis=1).shape)\n", "drinks.mean(axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Essesntial Functionality" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reindexing\n", "\n", "An important method on pandas objects is **reindex**, which means to create a new object with the data conformed to a new index" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.467311Z", "start_time": "2019-12-24T11:54:08.296361Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "d 4.5\n", "b 7.2\n", "a -5.3\n", "c 3.6\n", "dtype: float64\n", "\n", "a -5.3\n", "b 7.2\n", "c 3.6\n", "d 4.5\n", "dtype: float64\n", "\n" ] } ], "source": [ "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", "print(obj)\n", "\n", "obj2 = obj.reindex(['a', 'b', 'c', 'd'])\n", "print(obj2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The **method** option allows us to do this, using a\n", "method such as **ffill**, which forward-fills the values" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.580511Z", "start_time": "2019-12-24T11:54:08.470651Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 blue\n", "2 purple\n", "4 yellow\n", "dtype: object\n", "\n" ] }, { "data": { "text/plain": [ "0 blue\n", "1 blue\n", "2 purple\n", "3 purple\n", "4 yellow\n", "5 yellow\n", "dtype: object" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])\n", "print(obj3)\n", "\n", "obj3.reindex(range(6), method='ffill')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.694431Z", "start_time": "2019-12-24T11:54:08.582639Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioTexasCalifornia
letters
a012
c345
d678
\n", "
" ], "text/plain": [ " Ohio Texas California\n", "letters \n", "a 0 1 2\n", "c 3 4 5\n", "d 6 7 8" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],\n", " columns=['Ohio', 'Texas', 'California'])\n", "frame.index.name = 'letters'\n", "frame" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.812642Z", "start_time": "2019-12-24T11:54:08.696663Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioTexasCalifornia
letters
a0.01.02.0
bNaNNaNNaN
c3.04.05.0
d6.07.08.0
\n", "
" ], "text/plain": [ " Ohio Texas California\n", "letters \n", "a 0.0 1.0 2.0\n", "b NaN NaN NaN\n", "c 3.0 4.0 5.0\n", "d 6.0 7.0 8.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n", "frame2" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:08.910495Z", "start_time": "2019-12-24T11:54:08.814770Z" }, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TexasUtahCalifornia
letters
a1NaN2
c4NaN5
d7NaN8
\n", "
" ], "text/plain": [ " Texas Utah California\n", "letters \n", "a 1 NaN 2\n", "c 4 NaN 5\n", "d 7 NaN 8" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The columns can be reindexed with the columns keyword:\n", "states = ['Texas', 'Utah', 'California']\n", "frame.reindex(columns=states)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.024940Z", "start_time": "2019-12-24T11:54:08.911493Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TexasUtahCalifornia
letters
a1.0NaN2.0
bNaNNaNNaN
c4.0NaN5.0
d7.0NaN8.0
\n", "
" ], "text/plain": [ " Texas Utah California\n", "letters \n", "a 1.0 NaN 2.0\n", "b NaN NaN NaN\n", "c 4.0 NaN 5.0\n", "d 7.0 NaN 8.0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.reindex(['a', 'b', 'c', 'd'], columns=states)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.141015Z", "start_time": "2019-12-24T11:54:09.024940Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TexasUtahCalifornia
letters
a102
b000
c405
d708
\n", "
" ], "text/plain": [ " Texas Utah California\n", "letters \n", "a 1 0 2\n", "b 0 0 0\n", "c 4 0 5\n", "d 7 0 8" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.reindex(['a', 'b', 'c', 'd'], columns=states, fill_value=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dropping Entries from an axis\n", "\n", "**drop** method will return a new object with the indicated value or values deleted from an axis" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.250070Z", "start_time": "2019-12-24T11:54:09.141015Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "c 2.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.396400Z", "start_time": "2019-12-24T11:54:09.251603Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_obj = obj.drop('c')\n", "new_obj" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.598954Z", "start_time": "2019-12-24T11:54:09.396400Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.drop(['d', 'c'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "With DataFrame, index values can be deleted from either axis" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.802608Z", "start_time": "2019-12-24T11:54:09.599952Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
New York12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:09.966588Z", "start_time": "2019-12-24T11:54:09.803605Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Utah891011
New York12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calling drop with a sequence of labels will \n", "# drop values from the row labels (axis 0):\n", "data.drop(['Colorado', 'Ohio'])" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.096537Z", "start_time": "2019-12-24T11:54:09.966588Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onethree
Ohio02
Colorado46
Utah810
New York1214
\n", "
" ], "text/plain": [ " one three\n", "Ohio 0 2\n", "Colorado 4 6\n", "Utah 8 10\n", "New York 12 14" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop values from columns: pass axis=1 or axis='columns'\n", "\n", "data.drop(['two', 'four'], axis=1)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.243523Z", "start_time": "2019-12-24T11:54:10.097533Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 0.0\n", "b 1.0\n", "c 2.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64\n", "\n", "a 0.0\n", "b 1.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj)\n", "obj.drop('c', inplace=True)\n", "print(obj)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Indexing, Selection, and Filtering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### indexing\n", "\n", "Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers." ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.362801Z", "start_time": "2019-12-24T11:54:10.243523Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "c 2.0\n", "d 3.0\n", "dtype: float64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.493177Z", "start_time": "2019-12-24T11:54:10.362801Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.0\n", "\n", "1.0\n", "\n", "c 2.0\n", "d 3.0\n", "dtype: float64\n", "\n", "b 1.0\n", "a 0.0\n", "d 3.0\n", "dtype: float64\n", "\n", "b 1.0\n", "d 3.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj['b'])\n", "print(obj[1])\n", "print(obj[2:4])\n", "print(obj[['b', 'a', 'd']])\n", "print(obj[[1, 3]])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.631638Z", "start_time": "2019-12-24T11:54:10.494174Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 0.0\n", "b 1.0\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(obj[obj < 2])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Slicing with labels behaves differently than normal Python slicing in that the **end‐point is inclusive**" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:10.937208Z", "start_time": "2019-12-24T11:54:10.634638Z" } }, "outputs": [ { "data": { "text/plain": [ "b 1.0\n", "c 2.0\n", "dtype: float64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b': 'c']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.110423Z", "start_time": "2019-12-24T11:54:10.939206Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
New York12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n", " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.335656Z", "start_time": "2019-12-24T11:54:11.111464Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ohio 1\n", "Colorado 5\n", "Utah 9\n", "New York 13\n", "Name: two, dtype: int32\n", "\n", " three four\n", "Ohio 2 3\n", "Colorado 6 7\n", "Utah 10 11\n", "New York 14 15\n", "\n" ] } ], "source": [ "print(data['two'])\n", "print(data[['three', 'four']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The row selection syntax **data[:2]** is provided as a convenience. Passing a single element or a list to the [ ] operator selects columns." ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.486571Z", "start_time": "2019-12-24T11:54:11.335656Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[:2]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.626605Z", "start_time": "2019-12-24T11:54:11.491623Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Colorado4567
Utah891011
New York12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data['three'] > 5]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.731106Z", "start_time": "2019-12-24T11:54:11.626605Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
OhioTrueTrueTrueTrue
ColoradoTrueFalseFalseFalse
UtahFalseFalseFalseFalse
New YorkFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio True True True True\n", "Colorado True False False False\n", "Utah False False False False\n", "New York False False False False" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data < 5" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T16:14:52.753331Z", "start_time": "2019-12-22T16:14:52.745801Z" } }, "source": [ "#### setting\n", "Setting using these methods modifies the corresponding section of the Series" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.854456Z", "start_time": "2019-12-24T11:54:11.733101Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 5.0\n", "c 5.0\n", "d 3.0\n", "dtype: float64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b': 'c'] = 5\n", "obj" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:11.971222Z", "start_time": "2019-12-24T11:54:11.855652Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
New York12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n", " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.124945Z", "start_time": "2019-12-24T11:54:11.972250Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0000
Colorado0567
Utah891011
New York12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 0 0 0\n", "Colorado 0 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data < 5] = 0\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Selection with loc, iloc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc)." ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.244168Z", "start_time": "2019-12-24T11:54:12.124945Z" } }, "outputs": [ { "data": { "text/plain": [ "two 5\n", "three 6\n", "Name: Colorado, dtype: int32" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select a single row and multiple columns by label\n", "# loc[Row, col]\n", "data.loc['Colorado', ['two', 'three']]" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.359593Z", "start_time": "2019-12-24T11:54:12.244168Z" } }, "outputs": [ { "data": { "text/plain": [ "two 5\n", "three 6\n", "Name: Colorado, dtype: int32" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# with integers using iloc\n", "data.iloc[1, [1, 2]]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.517066Z", "start_time": "2019-12-24T11:54:12.361868Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one 8\n", "two 9\n", "three 10\n", "four 11\n", "Name: Utah, dtype: int32\n", "\n", " four one two\n", "Colorado 7 0 5\n", "Utah 11 8 9\n", "\n" ] } ], "source": [ "print(data.iloc[2])\n", "print(data.iloc[[1, 2], [3, 0, 1]])" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.680750Z", "start_time": "2019-12-24T11:54:12.519063Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " one two three four\n", "Ohio 0 0 0 0\n", "Colorado 0 5 6 7\n", "Utah 8 9 10 11\n", "New York 12 13 14 15\n", "\n", "Ohio 0\n", "Colorado 5\n", "Utah 9\n", "Name: two, dtype: int32\n", "\n" ] } ], "source": [ "print(data)\n", "print(data.loc[:'Utah', 'two'])" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.840886Z", "start_time": "2019-12-24T11:54:12.682744Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothree
Colorado056
Utah8910
New York121314
\n", "
" ], "text/plain": [ " one two three\n", "Colorado 0 5 6\n", "Utah 8 9 10\n", "New York 12 13 14" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.iloc[:, :3][data.three > 5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Arithmetic and Data Alignment" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An important pandas feature for some applications is the behavior of arithmetic\n", "between objects with different indexes. When you are adding together objects, if any index pairs are not the same, \n", "the respective index in the result will be the union of the index pairs. \n", "For users with database experience, this is similar to an automatic outer join on the index labels." ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:12.950857Z", "start_time": "2019-12-24T11:54:12.841887Z" } }, "outputs": [], "source": [ "s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n", "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.066293Z", "start_time": "2019-12-24T11:54:12.950857Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 7.3\n", "c -2.5\n", "d 3.4\n", "e 1.5\n", "dtype: float64\n", "\n", "a -2.1\n", "c 3.6\n", "e -1.5\n", "f 4.0\n", "g 3.1\n", "dtype: float64\n", "\n" ] } ], "source": [ "print(s1)\n", "print(s2)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.203969Z", "start_time": "2019-12-24T11:54:13.068249Z" } }, "outputs": [ { "data": { "text/plain": [ "a 5.2\n", "c 1.1\n", "d NaN\n", "e 0.0\n", "f NaN\n", "g NaN\n", "dtype: float64" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1 + s2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the case of DataFrame, alignment is performed on both the rows and the columns" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.326232Z", "start_time": "2019-12-24T11:54:13.205001Z" } }, "outputs": [], "source": [ "df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),\n", " index=['Ohio', 'Texas', 'Colorado'])\n", "\n", "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", " index=['Utah', 'Ohio', 'Texas', 'Oregon'])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.459283Z", "start_time": "2019-12-24T11:54:13.326232Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b c d\n", "Ohio 0.0 1.0 2.0\n", "Texas 3.0 4.0 5.0\n", "Colorado 6.0 7.0 8.0\n", "\n", " b d e\n", "Utah 0.0 1.0 2.0\n", "Ohio 3.0 4.0 5.0\n", "Texas 6.0 7.0 8.0\n", "Oregon 9.0 10.0 11.0\n", "\n" ] } ], "source": [ "print(df1)\n", "print(df2)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.621405Z", "start_time": "2019-12-24T11:54:13.461210Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bcde
ColoradoNaNNaNNaNNaN
Ohio3.0NaN6.0NaN
OregonNaNNaNNaNNaN
Texas9.0NaN12.0NaN
UtahNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " b c d e\n", "Colorado NaN NaN NaN NaN\n", "Ohio 3.0 NaN 6.0 NaN\n", "Oregon NaN NaN NaN NaN\n", "Texas 9.0 NaN 12.0 NaN\n", "Utah NaN NaN NaN NaN" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 + df2" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.778639Z", "start_time": "2019-12-24T11:54:13.623331Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " A\n", "0 1\n", "1 2\n", "\n", " B\n", "0 3\n", "1 4\n", "\n" ] } ], "source": [ "df1 = pd.DataFrame({'A': [1, 2]})\n", "df2 = pd.DataFrame({'B': [3, 4]})\n", "print(df1)\n", "print(df2)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:13.907441Z", "start_time": "2019-12-24T11:54:13.788272Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
0NaNNaN
1NaNNaN
\n", "
" ], "text/plain": [ " A B\n", "0 NaN NaN\n", "1 NaN NaN" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 - df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In arithmetic operations between differently indexed objects, you might want to fill\n", "with a special value, like 0, when an axis label is found in one object but not the other" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.021709Z", "start_time": "2019-12-24T11:54:13.915170Z" } }, "outputs": [], "source": [ "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list(\"abcd\"))\n", "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.142549Z", "start_time": "2019-12-24T11:54:14.021709Z" } }, "outputs": [], "source": [ "df2.loc[1, 'b'] = np.nan" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.273758Z", "start_time": "2019-12-24T11:54:14.143900Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " a b c d\n", "0 0.0 1.0 2.0 3.0\n", "1 4.0 5.0 6.0 7.0\n", "2 8.0 9.0 10.0 11.0\n", "\n", " a b c d e\n", "0 0.0 1.0 2.0 3.0 4.0\n", "1 5.0 NaN 7.0 8.0 9.0\n", "2 10.0 11.0 12.0 13.0 14.0\n", "3 15.0 16.0 17.0 18.0 19.0\n", "\n" ] } ], "source": [ "print(df1)\n", "print(df2)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.395478Z", "start_time": "2019-12-24T11:54:14.273758Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
00.02.04.06.0NaN
19.0NaN13.015.0NaN
218.020.022.024.0NaN
3NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " a b c d e\n", "0 0.0 2.0 4.0 6.0 NaN\n", "1 9.0 NaN 13.0 15.0 NaN\n", "2 18.0 20.0 22.0 24.0 NaN\n", "3 NaN NaN NaN NaN NaN" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Adding these together results in NA values in the locations that don’t overlap\n", "df1 + df2" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.522987Z", "start_time": "2019-12-24T11:54:14.397472Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
00.02.04.06.04.0
19.05.013.015.09.0
218.020.022.024.014.0
315.016.017.018.019.0
\n", "
" ], "text/plain": [ " a b c d e\n", "0 0.0 2.0 4.0 6.0 4.0\n", "1 9.0 5.0 13.0 15.0 9.0\n", "2 18.0 20.0 22.0 24.0 14.0\n", "3 15.0 16.0 17.0 18.0 19.0" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Using the add method on df1,one pass df2 and an argument to fill_value\n", "df1.add(df2, fill_value=0)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.645859Z", "start_time": "2019-12-24T11:54:14.524282Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0inf1.0000000.5000000.333333
10.2500.2000000.1666670.142857
20.1250.1111110.1000000.090909
\n", "
" ], "text/plain": [ " a b c d\n", "0 inf 1.000000 0.500000 0.333333\n", "1 0.250 0.200000 0.166667 0.142857\n", "2 0.125 0.111111 0.100000 0.090909" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 1 / df1\n", "# or\n", "df1.rdiv(1)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:14.762982Z", "start_time": "2019-12-24T11:54:14.648478Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b d e\n", "utah 0.0 1.0 2.0\n", "ohio 3.0 4.0 5.0\n", "texas 6.0 7.0 8.0\n", "oregon 9.0 10.0 11.0\n", "\n", "b 0.0\n", "d 1.0\n", "e 2.0\n", "Name: utah, dtype: float64\n", "\n" ] } ], "source": [ "# operations between DataFrame and Series\n", "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", " index=(\"utah\", \"ohio\", \"texas\", \"oregon\"))\n", "series = frame.iloc[0]\n", "print(frame)\n", "print(series)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "By default, arithmetic between DataFrame and Series matches the index of the Series\n", "on the DataFrame’s columns, broadcasting down the rows" ] }, { "cell_type": "code", "execution_count": 75, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:15.272570Z", "start_time": "2019-12-24T11:54:14.764980Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
utah0.00.00.0
ohio3.03.03.0
texas6.06.06.0
oregon9.09.09.0
\n", "
" ], "text/plain": [ " b d e\n", "utah 0.0 0.0 0.0\n", "ohio 3.0 3.0 3.0\n", "texas 6.0 6.0 6.0\n", "oregon 9.0 9.0 9.0" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame - series" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If an index value is not found in either the DataFrame’s columns or the Series’s index,\n", "the objects will be reindexed to form the union" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:15.785596Z", "start_time": "2019-12-24T11:54:15.274639Z" } }, "outputs": [], "source": [ "series2 = pd.Series(range(3), index=list('bef'))" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:15.975844Z", "start_time": "2019-12-24T11:54:15.789282Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bdef
utah0.0NaN3.0NaN
ohio3.0NaN6.0NaN
texas6.0NaN9.0NaN
oregon9.0NaN12.0NaN
\n", "
" ], "text/plain": [ " b d e f\n", "utah 0.0 NaN 3.0 NaN\n", "ohio 3.0 NaN 6.0 NaN\n", "texas 6.0 NaN 9.0 NaN\n", "oregon 9.0 NaN 12.0 NaN" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame + series2" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.113109Z", "start_time": "2019-12-24T11:54:15.978953Z" } }, "outputs": [ { "data": { "text/plain": [ "utah 1.0\n", "ohio 4.0\n", "texas 7.0\n", "oregon 10.0\n", "Name: d, dtype: float64" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# broadcasting over columns\n", "series3 = frame['d']\n", "series3" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.261347Z", "start_time": "2019-12-24T11:54:16.115103Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
utah-1.00.01.0
ohio-1.00.01.0
texas-1.00.01.0
oregon-1.00.01.0
\n", "
" ], "text/plain": [ " b d e\n", "utah -1.0 0.0 1.0\n", "ohio -1.0 0.0 1.0\n", "texas -1.0 0.0 1.0\n", "oregon -1.0 0.0 1.0" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sub(series3, axis='index')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The axis number that you pass is the axis to match on. In this case we mean to match\n", "on the DataFrame’s row index (axis='index' or axis=0) and broadcast across." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Function Application and Mapping" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.578884Z", "start_time": "2019-12-24T11:54:16.263428Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah-1.846691-0.622582-1.542595
Ohio0.8186970.7305960.458859
Texas0.6127350.593502-0.631133
Oregon-0.067950-1.0232140.115138
\n", "
" ], "text/plain": [ " b d e\n", "Utah -1.846691 -0.622582 -1.542595\n", "Ohio 0.818697 0.730596 0.458859\n", "Texas 0.612735 0.593502 -0.631133\n", "Oregon -0.067950 -1.023214 0.115138" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),\n", " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 81, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.763901Z", "start_time": "2019-12-24T11:54:16.581885Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah1.8466910.6225821.542595
Ohio0.8186970.7305960.458859
Texas0.6127350.5935020.631133
Oregon0.0679501.0232140.115138
\n", "
" ], "text/plain": [ " b d e\n", "Utah 1.846691 0.622582 1.542595\n", "Ohio 0.818697 0.730596 0.458859\n", "Texas 0.612735 0.593502 0.631133\n", "Oregon 0.067950 1.023214 0.115138" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.abs(frame)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:16.884947Z", "start_time": "2019-12-24T11:54:16.766103Z" } }, "outputs": [ { "data": { "text/plain": [ "b 2.665389\n", "d 1.753811\n", "e 2.001455\n", "dtype: float64" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Another frequent operation is applying a function on \n", "# one-dimensional arrays to each column or row. \n", "# DataFrame’s apply method does exactly this\n", "\n", "f = lambda x: x.max() - x.min()\n", "frame.apply(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here the function f, which computes the difference between the maximum and minimum of a Series, is **invoked once on each column** in frame. The result is a Series having the columns of frame as its index." ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.003427Z", "start_time": "2019-12-24T11:54:16.887837Z" } }, "outputs": [ { "data": { "text/plain": [ "Utah 1.224110\n", "Ohio 0.359838\n", "Texas 1.243868\n", "Oregon 1.138353\n", "dtype: float64" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If you pass axis='columns' to apply, \n", "# the function will be invoked once per row\n", "\n", "frame.apply(f, axis='columns')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.152859Z", "start_time": "2019-12-24T11:54:17.003427Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
min-1.846691-1.023214-1.542595
max0.8186970.7305960.458859
\n", "
" ], "text/plain": [ " b d e\n", "min -1.846691 -1.023214 -1.542595\n", "max 0.818697 0.730596 0.458859" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# returning a Series with multiple values\n", "\n", "def f(x):\n", " return pd.Series([x.min(), x.max()], index=['min', 'max'])\n", "\n", "frame.apply(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Element-wise Python functions can be used, too. Suppose you wanted to compute a\n", "formatted string from each floating-point value in frame. You can do this with \n", "**applymap**" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.261634Z", "start_time": "2019-12-24T11:54:17.155897Z" } }, "outputs": [], "source": [ "format = lambda x: f'{x:.2f}'" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.398915Z", "start_time": "2019-12-24T11:54:17.263630Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " b d e\n", "Utah -1.846691 -0.622582 -1.542595\n", "Ohio 0.818697 0.730596 0.458859\n", "Texas 0.612735 0.593502 -0.631133\n", "Oregon -0.067950 -1.023214 0.115138\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah-1.85-0.62-1.54
Ohio0.820.730.46
Texas0.610.59-0.63
Oregon-0.07-1.020.12
\n", "
" ], "text/plain": [ " b d e\n", "Utah -1.85 -0.62 -1.54\n", "Ohio 0.82 0.73 0.46\n", "Texas 0.61 0.59 -0.63\n", "Oregon -0.07 -1.02 0.12" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(frame)\n", "frame.applymap(format)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The reason for the name applymap is that Series has a **map** method for applying an\n", "element-wise function" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.518935Z", "start_time": "2019-12-24T11:54:17.400935Z" } }, "outputs": [ { "data": { "text/plain": [ "Utah -1.54\n", "Ohio 0.46\n", "Texas -0.63\n", "Oregon 0.12\n", "Name: e, dtype: object" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame['e'].map(format)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sorting and Ranking" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Sorting\n", "\n", "Sorting a dataset by some criterion is another important built-in operation. To sort\n", "lexicographically by row or column index, use the **sort_index** method, which returns\n", "a new, sorted object" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.662604Z", "start_time": "2019-12-24T11:54:17.522098Z" } }, "outputs": [ { "data": { "text/plain": [ "b 0\n", "d 1\n", "a 2\n", "c 3\n", "dtype: int64" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.793647Z", "start_time": "2019-12-24T11:54:17.664561Z" } }, "outputs": [ { "data": { "text/plain": [ "a 2\n", "b 0\n", "c 3\n", "d 1\n", "dtype: int64" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.sort_index()" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:17.984142Z", "start_time": "2019-12-24T11:54:17.795974Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dabc
one4567
three0123
\n", "
" ], "text/plain": [ " d a b c\n", "one 4 5 6 7\n", "three 0 1 2 3" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],\n", " columns=['d', 'a', 'b', 'c'])\n", "frame.sort_index()" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.202334Z", "start_time": "2019-12-24T11:54:17.985140Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
three1230
one5674
\n", "
" ], "text/plain": [ " a b c d\n", "three 1 2 3 0\n", "one 5 6 7 4" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.322544Z", "start_time": "2019-12-24T11:54:18.205743Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dcba
three0321
one4765
\n", "
" ], "text/plain": [ " d c b a\n", "three 0 3 2 1\n", "one 4 7 6 5" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1, ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To sort a Series by its values, use its **sort_values** method.\n", "Any missing values are sorted to the end of the Series by default" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.444610Z", "start_time": "2019-12-24T11:54:18.323543Z" } }, "outputs": [ { "data": { "text/plain": [ "0 -4\n", "3 2\n", "2 3\n", "1 7\n", "dtype: int64" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series([-4, 7, 3, 2])\n", "obj.sort_values()" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.561066Z", "start_time": "2019-12-24T11:54:18.448348Z" } }, "outputs": [ { "data": { "text/plain": [ "4 -3.0\n", "5 2.0\n", "0 4.0\n", "2 7.0\n", "1 NaN\n", "3 NaN\n", "dtype: float64" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])\n", "obj.sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When sorting a DataFrame, you can use the data in one or more columns as the sort\n", "keys. To do so, pass one or more column names to the by option of sort_values" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.674098Z", "start_time": "2019-12-24T11:54:18.562063Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ba
040
171
2-30
321
\n", "
" ], "text/plain": [ " b a\n", "0 4 0\n", "1 7 1\n", "2 -3 0\n", "3 2 1" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.782844Z", "start_time": "2019-12-24T11:54:18.675129Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ba
2-30
321
040
171
\n", "
" ], "text/plain": [ " b a\n", "2 -3 0\n", "3 2 1\n", "0 4 0\n", "1 7 1" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by='b')" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:18.885242Z", "start_time": "2019-12-24T11:54:18.783879Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ba
2-30
040
321
171
\n", "
" ], "text/plain": [ " b a\n", "2 -3 0\n", "0 4 0\n", "3 2 1\n", "1 7 1" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by=['a', 'b'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Ranking\n", "\n", "*Ranking* assigns ranks from one through the number of valid data points in an array.\n", "The rank methods for Series and DataFrame are the place to look; by default rank\n", "breaks ties by assigning each group the mean rank" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.058017Z", "start_time": "2019-12-24T11:54:18.887690Z" } }, "outputs": [ { "data": { "text/plain": [ "0 6.5\n", "1 1.0\n", "2 6.5\n", "3 4.5\n", "4 3.0\n", "5 2.0\n", "6 4.5\n", "dtype: float64" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n", "obj.rank()" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.209466Z", "start_time": "2019-12-24T11:54:19.059022Z" } }, "outputs": [ { "data": { "text/plain": [ "0 6.0\n", "1 1.0\n", "2 7.0\n", "3 4.0\n", "4 3.0\n", "5 2.0\n", "6 5.0\n", "dtype: float64" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# assigning rank according to the order they're observed first in the data\n", "obj.rank(method='first')" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.352734Z", "start_time": "2019-12-24T11:54:19.211753Z" } }, "outputs": [ { "data": { "text/plain": [ "0 2.0\n", "1 7.0\n", "2 2.0\n", "3 4.0\n", "4 5.0\n", "5 6.0\n", "6 4.0\n", "dtype: float64" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.rank(ascending=False, method='max')" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.499605Z", "start_time": "2019-12-24T11:54:19.352734Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bac
04.30-2.0
17.015.0
2-3.008.0
32.01-2.5
\n", "
" ], "text/plain": [ " b a c\n", "0 4.3 0 -2.0\n", "1 7.0 1 5.0\n", "2 -3.0 0 8.0\n", "3 2.0 1 -2.5" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.644123Z", "start_time": "2019-12-24T11:54:19.502600Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bac
03.02.01.0
13.01.02.0
21.02.03.0
33.02.01.0
\n", "
" ], "text/plain": [ " b a c\n", "0 3.0 2.0 1.0\n", "1 3.0 1.0 2.0\n", "2 1.0 2.0 3.0\n", "3 3.0 2.0 1.0" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.rank(axis='columns')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Axis Indexes with Duplicate Labels" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.825688Z", "start_time": "2019-12-24T11:54:19.646865Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "b 2\n", "b 3\n", "c 4\n", "dtype: int64" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:19.978965Z", "start_time": "2019-12-24T11:54:19.828121Z" } }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index.is_unique" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.122066Z", "start_time": "2019-12-24T11:54:19.980367Z" } }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "dtype: int64" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.a" ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.293573Z", "start_time": "2019-12-24T11:54:20.124045Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
a-2.0546230.654964-0.431453
a0.6547820.7764291.175684
b-0.485586-0.960232-1.368606
b0.2057970.2662942.007870
\n", "
" ], "text/plain": [ " 0 1 2\n", "a -2.054623 0.654964 -0.431453\n", "a 0.654782 0.776429 1.175684\n", "b -0.485586 -0.960232 -1.368606\n", "b 0.205797 0.266294 2.007870" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n", "df" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.444163Z", "start_time": "2019-12-24T11:54:20.295569Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
a-2.0546230.654964-0.431453
a0.6547820.7764291.175684
\n", "
" ], "text/plain": [ " 0 1 2\n", "a -2.054623 0.654964 -0.431453\n", "a 0.654782 0.776429 1.175684" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc['a']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summarizing and Computing Descriptive Statistics" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.549520Z", "start_time": "2019-12-24T11:54:20.446595Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b7.10-4.50
cNaNNaN
d0.75-1.43
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 7.10 -4.50\n", "c NaN NaN\n", "d 0.75 -1.43" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan],\n", " [0.75, -1.43]],\n", " index=['a', 'b', 'c', 'd'],\n", " columns=['one', 'two'])\n", "df" ] }, { "cell_type": "code", "execution_count": 109, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.684771Z", "start_time": "2019-12-24T11:54:20.549520Z" } }, "outputs": [ { "data": { "text/plain": [ "one 9.25\n", "two -5.93\n", "dtype: float64" ] }, "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum()" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.818099Z", "start_time": "2019-12-24T11:54:20.687422Z" } }, "outputs": [ { "data": { "text/plain": [ "a 1.40\n", "b 2.60\n", "c 0.00\n", "d -0.68\n", "dtype: float64" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum(axis='columns')" ] }, { "cell_type": "code", "execution_count": 111, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:20.936482Z", "start_time": "2019-12-24T11:54:20.818099Z" } }, "outputs": [ { "data": { "text/plain": [ "a NaN\n", "b 2.60\n", "c NaN\n", "d -0.68\n", "dtype: float64" ] }, "execution_count": 111, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum(axis='columns', skipna=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some methods, like idxmin and idxmax, return indirect statistics like the index value\n", "where the minimum or maximum values are attained" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.074783Z", "start_time": "2019-12-24T11:54:20.937480Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "one b\n", "two d\n", "dtype: object\n", "\n", "one d\n", "two b\n", "dtype: object\n", "\n" ] } ], "source": [ "print(df.idxmax())\n", "print(df.idxmin())" ] }, { "cell_type": "code", "execution_count": 113, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.194122Z", "start_time": "2019-12-24T11:54:21.076618Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b8.50-4.50
cNaNNaN
d9.25-5.93
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 8.50 -4.50\n", "c NaN NaN\n", "d 9.25 -5.93" ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# accmulation\n", "df.cumsum()" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T20:42:48.976671Z", "start_time": "2019-12-22T20:42:48.961382Z" } }, "source": [ "Another type of method is neither a reduction nor an accumulation. **describe** is one\n", "such example, producing multiple summary statistics in one shot" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.344016Z", "start_time": "2019-12-24T11:54:21.195119Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
count3.0000002.000000
mean3.083333-2.965000
std3.4936852.170818
min0.750000-4.500000
25%1.075000-3.732500
50%1.400000-2.965000
75%4.250000-2.197500
max7.100000-1.430000
\n", "
" ], "text/plain": [ " one two\n", "count 3.000000 2.000000\n", "mean 3.083333 -2.965000\n", "std 3.493685 2.170818\n", "min 0.750000 -4.500000\n", "25% 1.075000 -3.732500\n", "50% 1.400000 -2.965000\n", "75% 4.250000 -2.197500\n", "max 7.100000 -1.430000" ] }, "execution_count": 114, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 115, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.511393Z", "start_time": "2019-12-24T11:54:21.344016Z" } }, "outputs": [ { "data": { "text/plain": [ "count 16\n", "unique 3\n", "top a\n", "freq 8\n", "dtype: object" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# On non-numeric data, describe produces alternative summary statistics:\n", "\n", "obj = pd.Series(['a', 'a', 'b', 'c'] * 4)\n", "obj.describe()" ] }, { "cell_type": "code", "execution_count": 116, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:21.653404Z", "start_time": "2019-12-24T11:54:21.515120Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b7.10-4.50
cNaNNaN
d0.75-1.43
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 7.10 -4.50\n", "c NaN NaN\n", "d 0.75 -1.43" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-12-22T20:47:42.840846Z", "start_time": "2019-12-22T20:47:42.833552Z" } }, "source": [ "### Correlation and Covariance" ] }, { "cell_type": "code", "execution_count": 117, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:23.076066Z", "start_time": "2019-12-24T11:54:21.655517Z" } }, "outputs": [], "source": [ "import pandas_datareader as web" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:27.953329Z", "start_time": "2019-12-24T11:54:23.079097Z" } }, "outputs": [], "source": [ "all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}" ] }, { "cell_type": "code", "execution_count": 119, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:28.701569Z", "start_time": "2019-12-24T11:54:27.953329Z" } }, "outputs": [], "source": [ "price = pd.DataFrame({ticker: data['Adj Close']\n", " for ticker, data in all_data.items()})\n", "volume = pd.DataFrame({ticker: data['Volume']\n", " for ticker, data in all_data.items()})" ] }, { "cell_type": "code", "execution_count": 120, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:28.944129Z", "start_time": "2019-12-24T11:54:28.701569Z" } }, "outputs": [], "source": [ "returns = price.pct_change()" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:29.202785Z", "start_time": "2019-12-24T11:54:28.944129Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLIBMMSFTGOOG
Date
2019-12-170.0019650.000671-0.005401-0.004445
2019-12-18-0.0023890.001416-0.002069-0.001845
2019-12-190.0010010.0011160.0086810.002528
2019-12-20-0.0020710.0076550.010918-0.004757
2019-12-230.016318-0.0002950.000000-0.000556
\n", "
" ], "text/plain": [ " AAPL IBM MSFT GOOG\n", "Date \n", "2019-12-17 0.001965 0.000671 -0.005401 -0.004445\n", "2019-12-18 -0.002389 0.001416 -0.002069 -0.001845\n", "2019-12-19 0.001001 0.001116 0.008681 0.002528\n", "2019-12-20 -0.002071 0.007655 0.010918 -0.004757\n", "2019-12-23 0.016318 -0.000295 0.000000 -0.000556" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The **corr** method of Series computes the correlation of the overlapping, non-NA,\n", "aligned-by-index values in two Series. Relatedly, **cov** computes the covariance" ] }, { "cell_type": "code", "execution_count": 122, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.201205Z", "start_time": "2019-12-24T11:54:29.206950Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "9.227138328874263e-05\n", "\n", "0.48490365831130916\n", "\n" ] } ], "source": [ "print(returns['MSFT'].cov(returns['IBM']))\n", "print(returns['MSFT'].corr(returns['IBM']))" ] }, { "cell_type": "code", "execution_count": 123, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.217210Z", "start_time": "2019-12-24T11:54:30.204759Z" } }, "outputs": [ { "data": { "text/plain": [ "0.48490365831130916" ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.MSFT.corr(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 124, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.398635Z", "start_time": "2019-12-24T11:54:30.219543Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLIBMMSFTGOOG
AAPL1.0000000.3993400.5732240.521823
IBM0.3993401.0000000.4849040.410386
MSFT0.5732240.4849041.0000000.658704
GOOG0.5218230.4103860.6587041.000000
\n", "
" ], "text/plain": [ " AAPL IBM MSFT GOOG\n", "AAPL 1.000000 0.399340 0.573224 0.521823\n", "IBM 0.399340 1.000000 0.484904 0.410386\n", "MSFT 0.573224 0.484904 1.000000 0.658704\n", "GOOG 0.521823 0.410386 0.658704 1.000000" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corr()" ] }, { "cell_type": "code", "execution_count": 125, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.569188Z", "start_time": "2019-12-24T11:54:30.400959Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLIBMMSFTGOOG
AAPL0.0002450.0000810.0001320.000124
IBM0.0000810.0001680.0000920.000080
MSFT0.0001320.0000920.0002160.000146
GOOG0.0001240.0000800.0001460.000229
\n", "
" ], "text/plain": [ " AAPL IBM MSFT GOOG\n", "AAPL 0.000245 0.000081 0.000132 0.000124\n", "IBM 0.000081 0.000168 0.000092 0.000080\n", "MSFT 0.000132 0.000092 0.000216 0.000146\n", "GOOG 0.000124 0.000080 0.000146 0.000229" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.cov()" ] }, { "cell_type": "code", "execution_count": 126, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.727892Z", "start_time": "2019-12-24T11:54:30.572979Z" } }, "outputs": [ { "data": { "text/plain": [ "AAPL 0.399340\n", "IBM 1.000000\n", "MSFT 0.484904\n", "GOOG 0.410386\n", "dtype: float64" ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# corrwith method - compute pairwise correlations\n", "# between a DataFrame’s columns or rows with another Series oDataFrame’sme\n", "returns.corrwith(returns.IBM)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Unique Values, Value Counts, and Membership\n" ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.844787Z", "start_time": "2019-12-24T11:54:30.727892Z" } }, "outputs": [ { "data": { "text/plain": [ "array(['c', 'a', 'd', 'b'], dtype=object)" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# unique - gives you an array of the unique values in a Series\n", "\n", "obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])\n", "obj.unique()" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:30.977262Z", "start_time": "2019-12-24T11:54:30.846938Z" } }, "outputs": [ { "data": { "text/plain": [ "c 3\n", "a 3\n", "b 2\n", "d 1\n", "dtype: int64" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# value_counts - computes a Series containing value frequencies\n", "\n", "pd.value_counts(obj.values, sort=True)\n" ] }, { "cell_type": "code", "execution_count": 129, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.112737Z", "start_time": "2019-12-24T11:54:30.978817Z" } }, "outputs": [ { "data": { "text/plain": [ "0 c\n", "1 a\n", "2 d\n", "3 a\n", "4 a\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# isin - performs a vectorized set membership check and can be useful in filtering a\n", "# dataset down to a subset of values in a Series or column in a DataFrame\n", "\n", "obj" ] }, { "cell_type": "code", "execution_count": 130, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.299318Z", "start_time": "2019-12-24T11:54:31.114761Z" } }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "dtype: bool" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask = obj.isin(['b', 'c'])\n", "mask" ] }, { "cell_type": "code", "execution_count": 131, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.517280Z", "start_time": "2019-12-24T11:54:31.300318Z" } }, "outputs": [ { "data": { "text/plain": [ "0 c\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj[mask]" ] }, { "cell_type": "code", "execution_count": 132, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:31.801449Z", "start_time": "2019-12-24T11:54:31.520294Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Qu1Qu2Qu3
0121
1335
2412
3324
4434
\n", "
" ], "text/plain": [ " Qu1 Qu2 Qu3\n", "0 1 2 1\n", "1 3 3 5\n", "2 4 1 2\n", "3 3 2 4\n", "4 4 3 4" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],\n", " 'Qu2': [2, 3, 1, 2, 3],\n", " 'Qu3': [1, 5, 2, 4, 4]})\n", "data" ] }, { "cell_type": "code", "execution_count": 133, "metadata": { "ExecuteTime": { "end_time": "2019-12-24T11:54:32.315990Z", "start_time": "2019-12-24T11:54:31.804884Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4 2\n", "3 2\n", "1 1\n", "Name: Qu1, dtype: int64\n", "\n", "3 2\n", "2 2\n", "1 1\n", "Name: Qu2, dtype: int64\n", "\n", "4 2\n", "5 1\n", "2 1\n", "1 1\n", "Name: Qu3, dtype: int64\n", "\n", " Qu1 Qu2 Qu3\n", "1 1.0 1.0 1.0\n", "2 0.0 2.0 1.0\n", "3 2.0 2.0 0.0\n", "4 2.0 0.0 2.0\n", "5 0.0 0.0 1.0\n", "\n" ] } ], "source": [ "print(pd.value_counts(data.Qu1))\n", "print(pd.value_counts(data.Qu2))\n", "print(pd.value_counts(data.Qu3))\n", "print(data.apply(pd.value_counts).fillna(0))\n", "# index - distinct values in the df\n", "# column values - how many times the distinct value occur in that column" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "202.837px" }, "toc_section_display": true, "toc_window_display": false }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }