{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"toc": true
},
"source": [
"
Table of Contents
\n",
""
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:01.227037Z",
"start_time": "2019-12-24T11:54:00.439377Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:01.232635Z",
"start_time": "2019-12-24T11:54:01.227037Z"
}
},
"outputs": [],
"source": [
"import builtins\n",
"\n",
"def print(*args, **kwargs):\n",
" builtins.print(*args, **kwargs, end='\\n\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-22T12:01:19.037222Z",
"start_time": "2019-12-22T12:01:19.029951Z"
}
},
"source": [
"## Series\n",
"\n",
"A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:02.268631Z",
"start_time": "2019-12-24T11:54:01.233953Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 4\n",
"1 7\n",
"2 -5\n",
"3 3\n",
"dtype: int64\n",
"\n",
"RangeIndex(start=0, stop=4, step=1)\n",
"\n",
"[ 4 7 -5 3]\n",
"\n"
]
}
],
"source": [
"obj = pd.Series([4, 7, -5, 3])\n",
"print(obj) \n",
"print(obj.index)\n",
"print(obj.values)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:02.430806Z",
"start_time": "2019-12-24T11:54:02.271623Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d 4\n",
"a 7\n",
"c -5\n",
"b 3\n",
"dtype: int64\n",
"\n",
"Index(['d', 'a', 'c', 'b'], dtype='object')\n",
"\n"
]
}
],
"source": [
"obj2 = pd.Series([4, 7, -5, 3], index=['d', 'a', 'c', 'b'])\n",
"print(obj2)\n",
"print(obj2.index)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:02.643902Z",
"start_time": "2019-12-24T11:54:02.432505Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj2['a']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:02.812307Z",
"start_time": "2019-12-24T11:54:02.645897Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"c -5\n",
"a 7\n",
"d 4\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj2[['c', 'a', 'd']]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:02.962361Z",
"start_time": "2019-12-24T11:54:02.814622Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ohio 35000\n",
"Texas 71000\n",
"Oregon 16000\n",
"Utah 5000\n",
"dtype: int64\n",
"\n"
]
}
],
"source": [
"sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n",
"obj3 = pd.Series(sdata)\n",
"print(obj3)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:03.145871Z",
"start_time": "2019-12-24T11:54:02.964783Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"California NaN\n",
"Ohio 35000.0\n",
"Oregon 16000.0\n",
"Texas 71000.0\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"states = ['California', 'Ohio', 'Oregon', 'Texas']\n",
"obj4 = pd.Series(sdata, index=states)\n",
"print(obj4)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:03.282293Z",
"start_time": "2019-12-24T11:54:03.148390Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"California True\n",
"Ohio False\n",
"Oregon False\n",
"Texas False\n",
"dtype: bool\n",
"\n",
"California False\n",
"Ohio True\n",
"Oregon True\n",
"Texas True\n",
"dtype: bool\n",
"\n"
]
}
],
"source": [
"print(obj4.isnull())\n",
"print(obj4.notnull())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:03.477576Z",
"start_time": "2019-12-24T11:54:03.282293Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ohio 35000\n",
"Texas 71000\n",
"Oregon 16000\n",
"Utah 5000\n",
"dtype: int64\n",
"\n",
"California NaN\n",
"Ohio 35000.0\n",
"Oregon 16000.0\n",
"Texas 71000.0\n",
"dtype: float64\n",
"\n",
"California NaN\n",
"Ohio 70000.0\n",
"Oregon 32000.0\n",
"Texas 142000.0\n",
"Utah NaN\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"print(obj3)\n",
"print(obj4)\n",
"print(obj3 + obj4)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:03.586203Z",
"start_time": "2019-12-24T11:54:03.480359Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"state\n",
"California NaN\n",
"Ohio 35000.0\n",
"Oregon 16000.0\n",
"Texas 71000.0\n",
"Name: population, dtype: float64\n",
"\n"
]
}
],
"source": [
"obj4.name = 'population'\n",
"obj4.index.name = 'state'\n",
"print(obj4)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:03.710465Z",
"start_time": "2019-12-24T11:54:03.588187Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 4\n",
"1 7\n",
"2 -5\n",
"3 3\n",
"dtype: int64\n",
"\n",
"Bob 4\n",
"Steve 7\n",
"Jeff -5\n",
"Ryan 3\n",
"dtype: int64\n",
"\n"
]
}
],
"source": [
"print(obj)\n",
"obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']\n",
"print(obj)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataFrame\n",
"\n",
"A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.)."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:03.884935Z",
"start_time": "2019-12-24T11:54:03.713457Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" state year pop\n",
"0 Ohio 2000 1.5\n",
"1 Ohio 2001 1.7\n",
"2 Ohio 2002 3.6\n",
"3 Nevada 2001 2.4\n",
"4 Nevada 2002 2.9\n",
"5 Nevada 2003 3.2\n",
"\n"
]
}
],
"source": [
"data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],\n",
" 'year': [2000, 2001, 2002, 2001, 2002, 2003],\n",
" 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n",
"\n",
"frame = pd.DataFrame(data)\n",
"print(frame)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.041094Z",
"start_time": "2019-12-24T11:54:03.886930Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" state | \n",
" pop | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2000 | \n",
" Ohio | \n",
" 1.5 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2001 | \n",
" Ohio | \n",
" 1.7 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2002 | \n",
" Ohio | \n",
" 3.6 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2001 | \n",
" Nevada | \n",
" 2.4 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2002 | \n",
" Nevada | \n",
" 2.9 | \n",
"
\n",
" \n",
" | 5 | \n",
" 2003 | \n",
" Nevada | \n",
" 3.2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year state pop\n",
"0 2000 Ohio 1.5\n",
"1 2001 Ohio 1.7\n",
"2 2002 Ohio 3.6\n",
"3 2001 Nevada 2.4\n",
"4 2002 Nevada 2.9\n",
"5 2003 Nevada 3.2"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(data, columns=['year', 'state', 'pop'])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.244915Z",
"start_time": "2019-12-24T11:54:04.043184Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 NaN\n",
"two 2001 Ohio 1.7 NaN\n",
"three 2002 Ohio 3.6 NaN\n",
"four 2001 Nevada 2.4 NaN\n",
"five 2002 Nevada 2.9 NaN\n",
"six 2003 Nevada 3.2 NaN\n",
"\n",
"Index(['year', 'state', 'pop', 'debt'], dtype='object')\n",
"\n"
]
}
],
"source": [
"# u pass a column that isn’t contained in the dict, \n",
"# it will appear with missing values\n",
"\n",
"frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],\n",
" index=['one', 'two', 'three', 'four', 'five', 'six'])\n",
"\n",
"print(frame2)\n",
"print(frame2.columns)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.373163Z",
"start_time": "2019-12-24T11:54:04.244915Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 Ohio\n",
"1 Ohio\n",
"2 Ohio\n",
"3 Nevada\n",
"4 Nevada\n",
"5 Nevada\n",
"Name: state, dtype: object\n",
"\n",
"0 2000\n",
"1 2001\n",
"2 2002\n",
"3 2001\n",
"4 2002\n",
"5 2003\n",
"Name: year, dtype: int64\n",
"\n"
]
}
],
"source": [
"print(frame['state'])\n",
"print(frame.year)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.502687Z",
"start_time": "2019-12-24T11:54:04.374158Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"year 2002\n",
"state Ohio\n",
"pop 3.6\n",
"debt NaN\n",
"Name: three, dtype: object"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows can also be retrieved by \n",
"# position or name with the special loc attribute\n",
"frame2.loc['three']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.699057Z",
"start_time": "2019-12-24T11:54:04.503681Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 NaN\n",
"two 2001 Ohio 1.7 NaN\n",
"three 2002 Ohio 3.6 NaN\n",
"four 2001 Nevada 2.4 NaN\n",
"five 2002 Nevada 2.9 NaN\n",
"six 2003 Nevada 3.2 NaN\n",
"\n",
" year state pop debt\n",
"one 2000 Ohio 1.5 16.5\n",
"two 2001 Ohio 1.7 16.5\n",
"three 2002 Ohio 3.6 16.5\n",
"four 2001 Nevada 2.4 16.5\n",
"five 2002 Nevada 2.9 16.5\n",
"six 2003 Nevada 3.2 16.5\n",
"\n",
" year state pop debt\n",
"one 2000 Ohio 1.5 0.0\n",
"two 2001 Ohio 1.7 1.0\n",
"three 2002 Ohio 3.6 2.0\n",
"four 2001 Nevada 2.4 3.0\n",
"five 2002 Nevada 2.9 4.0\n",
"six 2003 Nevada 3.2 5.0\n",
"\n"
]
}
],
"source": [
"print(frame2)\n",
"frame2['debt'] = 16.5\n",
"print(frame2)\n",
"frame2['debt'] = np.arange(6.)\n",
"print(frame2)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.822895Z",
"start_time": "2019-12-24T11:54:04.700056Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" year state pop debt\n",
"one 2000 Ohio 1.5 NaN\n",
"two 2001 Ohio 1.7 -1.2\n",
"three 2002 Ohio 3.6 NaN\n",
"four 2001 Nevada 2.4 -1.5\n",
"five 2002 Nevada 2.9 -1.7\n",
"six 2003 Nevada 3.2 NaN\n",
"\n"
]
}
],
"source": [
"val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n",
"frame2['debt'] = val\n",
"print(frame2)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:04.996306Z",
"start_time": "2019-12-24T11:54:04.825542Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" year state pop debt eastern\n",
"one 2000 Ohio 1.5 NaN True\n",
"two 2001 Ohio 1.7 -1.2 True\n",
"three 2002 Ohio 3.6 NaN True\n",
"four 2001 Nevada 2.4 -1.5 False\n",
"five 2002 Nevada 2.9 -1.7 False\n",
"six 2003 Nevada 3.2 NaN False\n",
"\n"
]
},
{
"data": {
"text/plain": [
"Index(['year', 'state', 'pop', 'debt'], dtype='object')"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Assigning a column that doesn’t exist will create a new column. \n",
"# The del keyword will delete columns as with a dict.\n",
"# As an example of del, I first add a new column of boolean \n",
"# values where the state column equals 'Ohio':\n",
"\n",
"frame2['eastern'] = frame2['state'] == 'Ohio'\n",
"print(frame2)\n",
"\n",
"del frame2['eastern']\n",
"frame2.columns"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:05.121544Z",
"start_time": "2019-12-24T11:54:04.998624Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Nevada Ohio\n",
"2001 2.4 1.7\n",
"2002 2.9 3.6\n",
"2000 NaN 1.5\n",
"\n"
]
}
],
"source": [
"# another way is a nested dict of dicts\n",
"pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n",
" 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}\n",
"\n",
"# If the nested dict is passed to the DataFrame, \n",
"# pandas will interpret the outer dict keys\n",
"# as the columns and the inner keys as the row indices\n",
"\n",
"frame3 = pd.DataFrame(pop)\n",
"print(frame3)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:05.295926Z",
"start_time": "2019-12-24T11:54:05.126331Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 2001 | \n",
" 2002 | \n",
" 2000 | \n",
"
\n",
" \n",
" \n",
" \n",
" | Nevada | \n",
" 2.4 | \n",
" 2.9 | \n",
" NaN | \n",
"
\n",
" \n",
" | Ohio | \n",
" 1.7 | \n",
" 3.6 | \n",
" 1.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 2001 2002 2000\n",
"Nevada 2.4 2.9 NaN\n",
"Ohio 1.7 3.6 1.5"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# transpose dataframe\n",
"frame3.T"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:05.432228Z",
"start_time": "2019-12-24T11:54:05.299058Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"state Nevada Ohio\n",
"year \n",
"2001 2.4 1.7\n",
"2002 2.9 3.6\n",
"2000 NaN 1.5\n",
"\n"
]
}
],
"source": [
"frame3.index.name = 'year'; frame3.columns.name = 'state'\n",
"print(frame3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using the axis parameter \n",
"\n",
"1) axis = 0 => row\n",
" * Move across/down the row axis\n",
" * Direction of operation is down\n",
"\n",
"2) axis = 1 => column\n",
" * Move along the column axis\n",
" * Direction of operation is sideways\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:06.582318Z",
"start_time": "2019-12-24T11:54:05.434315Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" country | \n",
" beer_servings | \n",
" spirit_servings | \n",
" wine_servings | \n",
" total_litres_of_pure_alcohol | \n",
" continent | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Afghanistan | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" Asia | \n",
"
\n",
" \n",
" | 1 | \n",
" Albania | \n",
" 89 | \n",
" 132 | \n",
" 54 | \n",
" 4.9 | \n",
" Europe | \n",
"
\n",
" \n",
" | 2 | \n",
" Algeria | \n",
" 25 | \n",
" 0 | \n",
" 14 | \n",
" 0.7 | \n",
" Africa | \n",
"
\n",
" \n",
" | 3 | \n",
" Andorra | \n",
" 245 | \n",
" 138 | \n",
" 312 | \n",
" 12.4 | \n",
" Europe | \n",
"
\n",
" \n",
" | 4 | \n",
" Angola | \n",
" 217 | \n",
" 57 | \n",
" 45 | \n",
" 5.9 | \n",
" Africa | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" country beer_servings spirit_servings wine_servings \\\n",
"0 Afghanistan 0 0 0 \n",
"1 Albania 89 132 54 \n",
"2 Algeria 25 0 14 \n",
"3 Andorra 245 138 312 \n",
"4 Angola 217 57 45 \n",
"\n",
" total_litres_of_pure_alcohol continent \n",
"0 0.0 Asia \n",
"1 4.9 Europe \n",
"2 0.7 Africa \n",
"3 12.4 Europe \n",
"4 5.9 Africa "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"drinks = pd.read_csv('http://bit.ly/drinksbycountry')\n",
"drinks.head()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:06.610698Z",
"start_time": "2019-12-24T11:54:06.587032Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" country | \n",
" beer_servings | \n",
" spirit_servings | \n",
" wine_servings | \n",
" total_litres_of_pure_alcohol | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Afghanistan | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" Albania | \n",
" 89 | \n",
" 132 | \n",
" 54 | \n",
" 4.9 | \n",
"
\n",
" \n",
" | 2 | \n",
" Algeria | \n",
" 25 | \n",
" 0 | \n",
" 14 | \n",
" 0.7 | \n",
"
\n",
" \n",
" | 3 | \n",
" Andorra | \n",
" 245 | \n",
" 138 | \n",
" 312 | \n",
" 12.4 | \n",
"
\n",
" \n",
" | 4 | \n",
" Angola | \n",
" 217 | \n",
" 57 | \n",
" 45 | \n",
" 5.9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" country beer_servings spirit_servings wine_servings \\\n",
"0 Afghanistan 0 0 0 \n",
"1 Albania 89 132 54 \n",
"2 Algeria 25 0 14 \n",
"3 Andorra 245 138 312 \n",
"4 Angola 217 57 45 \n",
"\n",
" total_litres_of_pure_alcohol \n",
"0 0.0 \n",
"1 4.9 \n",
"2 0.7 \n",
"3 12.4 \n",
"4 5.9 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# dropping a column\n",
"\n",
"drinks.drop('continent', axis=1).head() "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:07.987950Z",
"start_time": "2019-12-24T11:54:06.613218Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" country | \n",
" beer_servings | \n",
" spirit_servings | \n",
" wine_servings | \n",
" total_litres_of_pure_alcohol | \n",
" continent | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Afghanistan | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" Asia | \n",
"
\n",
" \n",
" | 1 | \n",
" Albania | \n",
" 89 | \n",
" 132 | \n",
" 54 | \n",
" 4.9 | \n",
" Europe | \n",
"
\n",
" \n",
" | 3 | \n",
" Andorra | \n",
" 245 | \n",
" 138 | \n",
" 312 | \n",
" 12.4 | \n",
" Europe | \n",
"
\n",
" \n",
" | 4 | \n",
" Angola | \n",
" 217 | \n",
" 57 | \n",
" 45 | \n",
" 5.9 | \n",
" Africa | \n",
"
\n",
" \n",
" | 5 | \n",
" Antigua & Barbuda | \n",
" 102 | \n",
" 128 | \n",
" 45 | \n",
" 4.9 | \n",
" North America | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" country beer_servings spirit_servings wine_servings \\\n",
"0 Afghanistan 0 0 0 \n",
"1 Albania 89 132 54 \n",
"3 Andorra 245 138 312 \n",
"4 Angola 217 57 45 \n",
"5 Antigua & Barbuda 102 128 45 \n",
"\n",
" total_litres_of_pure_alcohol continent \n",
"0 0.0 Asia \n",
"1 4.9 Europe \n",
"3 12.4 Europe \n",
"4 5.9 Africa \n",
"5 4.9 North America "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# dropping a row\n",
"\n",
"drinks.drop(2, axis=0).head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.176714Z",
"start_time": "2019-12-24T11:54:07.990944Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(4,)\n",
"\n"
]
},
{
"data": {
"text/plain": [
"beer_servings 106.160622\n",
"spirit_servings 80.994819\n",
"wine_servings 49.450777\n",
"total_litres_of_pure_alcohol 4.717098\n",
"dtype: float64"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# default axis=0\n",
"# mean of each column\n",
"\n",
"print(drinks.mean().shape)\n",
"drinks.mean() "
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.296361Z",
"start_time": "2019-12-24T11:54:08.178873Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(193,)\n",
"\n"
]
},
{
"data": {
"text/plain": [
"0 0.000\n",
"1 69.975\n",
"2 9.925\n",
"3 176.850\n",
"4 81.225\n",
" ... \n",
"188 110.925\n",
"189 29.000\n",
"190 1.525\n",
"191 14.375\n",
"192 22.675\n",
"Length: 193, dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mean of each row\n",
"\n",
"print(drinks.mean(axis=1).shape)\n",
"drinks.mean(axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Essesntial Functionality"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reindexing\n",
"\n",
"An important method on pandas objects is **reindex**, which means to create a new object with the data conformed to a new index"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.467311Z",
"start_time": "2019-12-24T11:54:08.296361Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"d 4.5\n",
"b 7.2\n",
"a -5.3\n",
"c 3.6\n",
"dtype: float64\n",
"\n",
"a -5.3\n",
"b 7.2\n",
"c 3.6\n",
"d 4.5\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n",
"print(obj)\n",
"\n",
"obj2 = obj.reindex(['a', 'b', 'c', 'd'])\n",
"print(obj2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For ordered data like time series, it may be desirable to do some interpolation or filling of values when reindexing. The **method** option allows us to do this, using a\n",
"method such as **ffill**, which forward-fills the values"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.580511Z",
"start_time": "2019-12-24T11:54:08.470651Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 blue\n",
"2 purple\n",
"4 yellow\n",
"dtype: object\n",
"\n"
]
},
{
"data": {
"text/plain": [
"0 blue\n",
"1 blue\n",
"2 purple\n",
"3 purple\n",
"4 yellow\n",
"5 yellow\n",
"dtype: object"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])\n",
"print(obj3)\n",
"\n",
"obj3.reindex(range(6), method='ffill')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.694431Z",
"start_time": "2019-12-24T11:54:08.582639Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Ohio | \n",
" Texas | \n",
" California | \n",
"
\n",
" \n",
" | letters | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | c | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" | d | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Ohio Texas California\n",
"letters \n",
"a 0 1 2\n",
"c 3 4 5\n",
"d 6 7 8"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],\n",
" columns=['Ohio', 'Texas', 'California'])\n",
"frame.index.name = 'letters'\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.812642Z",
"start_time": "2019-12-24T11:54:08.696663Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Ohio | \n",
" Texas | \n",
" California | \n",
"
\n",
" \n",
" | letters | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 0.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" | b | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | c | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" | d | \n",
" 6.0 | \n",
" 7.0 | \n",
" 8.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Ohio Texas California\n",
"letters \n",
"a 0.0 1.0 2.0\n",
"b NaN NaN NaN\n",
"c 3.0 4.0 5.0\n",
"d 6.0 7.0 8.0"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n",
"frame2"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:08.910495Z",
"start_time": "2019-12-24T11:54:08.814770Z"
},
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Texas | \n",
" Utah | \n",
" California | \n",
"
\n",
" \n",
" | letters | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 1 | \n",
" NaN | \n",
" 2 | \n",
"
\n",
" \n",
" | c | \n",
" 4 | \n",
" NaN | \n",
" 5 | \n",
"
\n",
" \n",
" | d | \n",
" 7 | \n",
" NaN | \n",
" 8 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Texas Utah California\n",
"letters \n",
"a 1 NaN 2\n",
"c 4 NaN 5\n",
"d 7 NaN 8"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The columns can be reindexed with the columns keyword:\n",
"states = ['Texas', 'Utah', 'California']\n",
"frame.reindex(columns=states)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.024940Z",
"start_time": "2019-12-24T11:54:08.911493Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Texas | \n",
" Utah | \n",
" California | \n",
"
\n",
" \n",
" | letters | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 1.0 | \n",
" NaN | \n",
" 2.0 | \n",
"
\n",
" \n",
" | b | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | c | \n",
" 4.0 | \n",
" NaN | \n",
" 5.0 | \n",
"
\n",
" \n",
" | d | \n",
" 7.0 | \n",
" NaN | \n",
" 8.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Texas Utah California\n",
"letters \n",
"a 1.0 NaN 2.0\n",
"b NaN NaN NaN\n",
"c 4.0 NaN 5.0\n",
"d 7.0 NaN 8.0"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.reindex(['a', 'b', 'c', 'd'], columns=states)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.141015Z",
"start_time": "2019-12-24T11:54:09.024940Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Texas | \n",
" Utah | \n",
" California | \n",
"
\n",
" \n",
" | letters | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 1 | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | b | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | c | \n",
" 4 | \n",
" 0 | \n",
" 5 | \n",
"
\n",
" \n",
" | d | \n",
" 7 | \n",
" 0 | \n",
" 8 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Texas Utah California\n",
"letters \n",
"a 1 0 2\n",
"b 0 0 0\n",
"c 4 0 5\n",
"d 7 0 8"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.reindex(['a', 'b', 'c', 'd'], columns=states, fill_value=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dropping Entries from an axis\n",
"\n",
"**drop** method will return a new object with the indicated value or values deleted from an axis"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.250070Z",
"start_time": "2019-12-24T11:54:09.141015Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0.0\n",
"b 1.0\n",
"c 2.0\n",
"d 3.0\n",
"e 4.0\n",
"dtype: float64"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.396400Z",
"start_time": "2019-12-24T11:54:09.251603Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0.0\n",
"b 1.0\n",
"d 3.0\n",
"e 4.0\n",
"dtype: float64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_obj = obj.drop('c')\n",
"new_obj"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.598954Z",
"start_time": "2019-12-24T11:54:09.396400Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0.0\n",
"b 1.0\n",
"e 4.0\n",
"dtype: float64"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj.drop(['d', 'c'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With DataFrame, index values can be deleted from either axis"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.802608Z",
"start_time": "2019-12-24T11:54:09.599952Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" | Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],\n",
" columns=['one', 'two', 'three', 'four'])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:09.966588Z",
"start_time": "2019-12-24T11:54:09.803605Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Calling drop with a sequence of labels will \n",
"# drop values from the row labels (axis 0):\n",
"data.drop(['Colorado', 'Ohio'])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:10.096537Z",
"start_time": "2019-12-24T11:54:09.966588Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" three | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" 0 | \n",
" 2 | \n",
"
\n",
" \n",
" | Colorado | \n",
" 4 | \n",
" 6 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 10 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 14 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one three\n",
"Ohio 0 2\n",
"Colorado 4 6\n",
"Utah 8 10\n",
"New York 12 14"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop values from columns: pass axis=1 or axis='columns'\n",
"\n",
"data.drop(['two', 'four'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:10.243523Z",
"start_time": "2019-12-24T11:54:10.097533Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a 0.0\n",
"b 1.0\n",
"c 2.0\n",
"d 3.0\n",
"e 4.0\n",
"dtype: float64\n",
"\n",
"a 0.0\n",
"b 1.0\n",
"d 3.0\n",
"e 4.0\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"print(obj)\n",
"obj.drop('c', inplace=True)\n",
"print(obj)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Indexing, Selection, and Filtering"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### indexing\n",
"\n",
"Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:10.362801Z",
"start_time": "2019-12-24T11:54:10.243523Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0.0\n",
"b 1.0\n",
"c 2.0\n",
"d 3.0\n",
"dtype: float64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:10.493177Z",
"start_time": "2019-12-24T11:54:10.362801Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n",
"\n",
"1.0\n",
"\n",
"c 2.0\n",
"d 3.0\n",
"dtype: float64\n",
"\n",
"b 1.0\n",
"a 0.0\n",
"d 3.0\n",
"dtype: float64\n",
"\n",
"b 1.0\n",
"d 3.0\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"print(obj['b'])\n",
"print(obj[1])\n",
"print(obj[2:4])\n",
"print(obj[['b', 'a', 'd']])\n",
"print(obj[[1, 3]])"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:10.631638Z",
"start_time": "2019-12-24T11:54:10.494174Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a 0.0\n",
"b 1.0\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"print(obj[obj < 2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Slicing with labels behaves differently than normal Python slicing in that the **end‐point is inclusive**"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:10.937208Z",
"start_time": "2019-12-24T11:54:10.634638Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"b 1.0\n",
"c 2.0\n",
"dtype: float64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj['b': 'c']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.110423Z",
"start_time": "2019-12-24T11:54:10.939206Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" | Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n",
" index=['Ohio', 'Colorado', 'Utah', 'New York'],\n",
" columns=['one', 'two', 'three', 'four'])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.335656Z",
"start_time": "2019-12-24T11:54:11.111464Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ohio 1\n",
"Colorado 5\n",
"Utah 9\n",
"New York 13\n",
"Name: two, dtype: int32\n",
"\n",
" three four\n",
"Ohio 2 3\n",
"Colorado 6 7\n",
"Utah 10 11\n",
"New York 14 15\n",
"\n"
]
}
],
"source": [
"print(data['two'])\n",
"print(data[['three', 'four']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The row selection syntax **data[:2]** is provided as a convenience. Passing a single element or a list to the [ ] operator selects columns."
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.486571Z",
"start_time": "2019-12-24T11:54:11.335656Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" | Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[:2]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.626605Z",
"start_time": "2019-12-24T11:54:11.491623Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Colorado 4 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data['three'] > 5]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.731106Z",
"start_time": "2019-12-24T11:54:11.626605Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
" | Colorado | \n",
" True | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | Utah | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
" | New York | \n",
" False | \n",
" False | \n",
" False | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio True True True True\n",
"Colorado True False False False\n",
"Utah False False False False\n",
"New York False False False False"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data < 5"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-22T16:14:52.753331Z",
"start_time": "2019-12-22T16:14:52.745801Z"
}
},
"source": [
"#### setting\n",
"Setting using these methods modifies the corresponding section of the Series"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.854456Z",
"start_time": "2019-12-24T11:54:11.733101Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0.0\n",
"b 5.0\n",
"c 5.0\n",
"d 3.0\n",
"dtype: float64"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj['b': 'c'] = 5\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:11.971222Z",
"start_time": "2019-12-24T11:54:11.855652Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" | Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n",
" index=['Ohio', 'Colorado', 'Utah', 'New York'],\n",
" columns=['one', 'two', 'three', 'four'])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.124945Z",
"start_time": "2019-12-24T11:54:11.972250Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" | Ohio | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" | Colorado | \n",
" 0 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio 0 0 0 0\n",
"Colorado 0 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[data < 5] = 0\n",
"data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Selection with loc, iloc"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For DataFrame label-indexing on the rows, I introduce the special indexing operators loc and iloc. They enable you to select a subset of the rows and columns from a DataFrame with NumPy-like notation using either axis labels (loc) or integers (iloc)."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.244168Z",
"start_time": "2019-12-24T11:54:12.124945Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"two 5\n",
"three 6\n",
"Name: Colorado, dtype: int32"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# select a single row and multiple columns by label\n",
"# loc[Row, col]\n",
"data.loc['Colorado', ['two', 'three']]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.359593Z",
"start_time": "2019-12-24T11:54:12.244168Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"two 5\n",
"three 6\n",
"Name: Colorado, dtype: int32"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# with integers using iloc\n",
"data.iloc[1, [1, 2]]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.517066Z",
"start_time": "2019-12-24T11:54:12.361868Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"one 8\n",
"two 9\n",
"three 10\n",
"four 11\n",
"Name: Utah, dtype: int32\n",
"\n",
" four one two\n",
"Colorado 7 0 5\n",
"Utah 11 8 9\n",
"\n"
]
}
],
"source": [
"print(data.iloc[2])\n",
"print(data.iloc[[1, 2], [3, 0, 1]])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.680750Z",
"start_time": "2019-12-24T11:54:12.519063Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" one two three four\n",
"Ohio 0 0 0 0\n",
"Colorado 0 5 6 7\n",
"Utah 8 9 10 11\n",
"New York 12 13 14 15\n",
"\n",
"Ohio 0\n",
"Colorado 5\n",
"Utah 9\n",
"Name: two, dtype: int32\n",
"\n"
]
}
],
"source": [
"print(data)\n",
"print(data.loc[:'Utah', 'two'])"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.840886Z",
"start_time": "2019-12-24T11:54:12.682744Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
"
\n",
" \n",
" \n",
" \n",
" | Colorado | \n",
" 0 | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" | Utah | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
"
\n",
" \n",
" | New York | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three\n",
"Colorado 0 5 6\n",
"Utah 8 9 10\n",
"New York 12 13 14"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.iloc[:, :3][data.three > 5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Arithmetic and Data Alignment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An important pandas feature for some applications is the behavior of arithmetic\n",
"between objects with different indexes. When you are adding together objects, if any index pairs are not the same, \n",
"the respective index in the result will be the union of the index pairs. \n",
"For users with database experience, this is similar to an automatic outer join on the index labels."
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:12.950857Z",
"start_time": "2019-12-24T11:54:12.841887Z"
}
},
"outputs": [],
"source": [
"s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n",
"s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.066293Z",
"start_time": "2019-12-24T11:54:12.950857Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a 7.3\n",
"c -2.5\n",
"d 3.4\n",
"e 1.5\n",
"dtype: float64\n",
"\n",
"a -2.1\n",
"c 3.6\n",
"e -1.5\n",
"f 4.0\n",
"g 3.1\n",
"dtype: float64\n",
"\n"
]
}
],
"source": [
"print(s1)\n",
"print(s2)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.203969Z",
"start_time": "2019-12-24T11:54:13.068249Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 5.2\n",
"c 1.1\n",
"d NaN\n",
"e 0.0\n",
"f NaN\n",
"g NaN\n",
"dtype: float64"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s1 + s2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the case of DataFrame, alignment is performed on both the rows and the columns"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.326232Z",
"start_time": "2019-12-24T11:54:13.205001Z"
}
},
"outputs": [],
"source": [
"df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),\n",
" index=['Ohio', 'Texas', 'Colorado'])\n",
"\n",
"df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n",
" index=['Utah', 'Ohio', 'Texas', 'Oregon'])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.459283Z",
"start_time": "2019-12-24T11:54:13.326232Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" b c d\n",
"Ohio 0.0 1.0 2.0\n",
"Texas 3.0 4.0 5.0\n",
"Colorado 6.0 7.0 8.0\n",
"\n",
" b d e\n",
"Utah 0.0 1.0 2.0\n",
"Ohio 3.0 4.0 5.0\n",
"Texas 6.0 7.0 8.0\n",
"Oregon 9.0 10.0 11.0\n",
"\n"
]
}
],
"source": [
"print(df1)\n",
"print(df2)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.621405Z",
"start_time": "2019-12-24T11:54:13.461210Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" c | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | Colorado | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | Ohio | \n",
" 3.0 | \n",
" NaN | \n",
" 6.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | Oregon | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | Texas | \n",
" 9.0 | \n",
" NaN | \n",
" 12.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | Utah | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b c d e\n",
"Colorado NaN NaN NaN NaN\n",
"Ohio 3.0 NaN 6.0 NaN\n",
"Oregon NaN NaN NaN NaN\n",
"Texas 9.0 NaN 12.0 NaN\n",
"Utah NaN NaN NaN NaN"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 + df2"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.778639Z",
"start_time": "2019-12-24T11:54:13.623331Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" A\n",
"0 1\n",
"1 2\n",
"\n",
" B\n",
"0 3\n",
"1 4\n",
"\n"
]
}
],
"source": [
"df1 = pd.DataFrame({'A': [1, 2]})\n",
"df2 = pd.DataFrame({'B': [3, 4]})\n",
"print(df1)\n",
"print(df2)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:13.907441Z",
"start_time": "2019-12-24T11:54:13.788272Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"0 NaN NaN\n",
"1 NaN NaN"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 - df2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In arithmetic operations between differently indexed objects, you might want to fill\n",
"with a special value, like 0, when an axis label is found in one object but not the other"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.021709Z",
"start_time": "2019-12-24T11:54:13.915170Z"
}
},
"outputs": [],
"source": [
"df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list(\"abcd\"))\n",
"df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.142549Z",
"start_time": "2019-12-24T11:54:14.021709Z"
}
},
"outputs": [],
"source": [
"df2.loc[1, 'b'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.273758Z",
"start_time": "2019-12-24T11:54:14.143900Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" a b c d\n",
"0 0.0 1.0 2.0 3.0\n",
"1 4.0 5.0 6.0 7.0\n",
"2 8.0 9.0 10.0 11.0\n",
"\n",
" a b c d e\n",
"0 0.0 1.0 2.0 3.0 4.0\n",
"1 5.0 NaN 7.0 8.0 9.0\n",
"2 10.0 11.0 12.0 13.0 14.0\n",
"3 15.0 16.0 17.0 18.0 19.0\n",
"\n"
]
}
],
"source": [
"print(df1)\n",
"print(df2)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.395478Z",
"start_time": "2019-12-24T11:54:14.273758Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 4.0 | \n",
" 6.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 1 | \n",
" 9.0 | \n",
" NaN | \n",
" 13.0 | \n",
" 15.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 2 | \n",
" 18.0 | \n",
" 20.0 | \n",
" 22.0 | \n",
" 24.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | 3 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d e\n",
"0 0.0 2.0 4.0 6.0 NaN\n",
"1 9.0 NaN 13.0 15.0 NaN\n",
"2 18.0 20.0 22.0 24.0 NaN\n",
"3 NaN NaN NaN NaN NaN"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Adding these together results in NA values in the locations that don’t overlap\n",
"df1 + df2"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.522987Z",
"start_time": "2019-12-24T11:54:14.397472Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.0 | \n",
" 2.0 | \n",
" 4.0 | \n",
" 6.0 | \n",
" 4.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 9.0 | \n",
" 5.0 | \n",
" 13.0 | \n",
" 15.0 | \n",
" 9.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 18.0 | \n",
" 20.0 | \n",
" 22.0 | \n",
" 24.0 | \n",
" 14.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 15.0 | \n",
" 16.0 | \n",
" 17.0 | \n",
" 18.0 | \n",
" 19.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d e\n",
"0 0.0 2.0 4.0 6.0 4.0\n",
"1 9.0 5.0 13.0 15.0 9.0\n",
"2 18.0 20.0 22.0 24.0 14.0\n",
"3 15.0 16.0 17.0 18.0 19.0"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Using the add method on df1,one pass df2 and an argument to fill_value\n",
"df1.add(df2, fill_value=0)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.645859Z",
"start_time": "2019-12-24T11:54:14.524282Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" inf | \n",
" 1.000000 | \n",
" 0.500000 | \n",
" 0.333333 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0.250 | \n",
" 0.200000 | \n",
" 0.166667 | \n",
" 0.142857 | \n",
"
\n",
" \n",
" | 2 | \n",
" 0.125 | \n",
" 0.111111 | \n",
" 0.100000 | \n",
" 0.090909 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 inf 1.000000 0.500000 0.333333\n",
"1 0.250 0.200000 0.166667 0.142857\n",
"2 0.125 0.111111 0.100000 0.090909"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1 / df1\n",
"# or\n",
"df1.rdiv(1)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:14.762982Z",
"start_time": "2019-12-24T11:54:14.648478Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" b d e\n",
"utah 0.0 1.0 2.0\n",
"ohio 3.0 4.0 5.0\n",
"texas 6.0 7.0 8.0\n",
"oregon 9.0 10.0 11.0\n",
"\n",
"b 0.0\n",
"d 1.0\n",
"e 2.0\n",
"Name: utah, dtype: float64\n",
"\n"
]
}
],
"source": [
"# operations between DataFrame and Series\n",
"frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n",
" index=(\"utah\", \"ohio\", \"texas\", \"oregon\"))\n",
"series = frame.iloc[0]\n",
"print(frame)\n",
"print(series)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"By default, arithmetic between DataFrame and Series matches the index of the Series\n",
"on the DataFrame’s columns, broadcasting down the rows"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:15.272570Z",
"start_time": "2019-12-24T11:54:14.764980Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | utah | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | ohio | \n",
" 3.0 | \n",
" 3.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" | texas | \n",
" 6.0 | \n",
" 6.0 | \n",
" 6.0 | \n",
"
\n",
" \n",
" | oregon | \n",
" 9.0 | \n",
" 9.0 | \n",
" 9.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e\n",
"utah 0.0 0.0 0.0\n",
"ohio 3.0 3.0 3.0\n",
"texas 6.0 6.0 6.0\n",
"oregon 9.0 9.0 9.0"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame - series"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If an index value is not found in either the DataFrame’s columns or the Series’s index,\n",
"the objects will be reindexed to form the union"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:15.785596Z",
"start_time": "2019-12-24T11:54:15.274639Z"
}
},
"outputs": [],
"source": [
"series2 = pd.Series(range(3), index=list('bef'))"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:15.975844Z",
"start_time": "2019-12-24T11:54:15.789282Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
" f | \n",
"
\n",
" \n",
" \n",
" \n",
" | utah | \n",
" 0.0 | \n",
" NaN | \n",
" 3.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | ohio | \n",
" 3.0 | \n",
" NaN | \n",
" 6.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | texas | \n",
" 6.0 | \n",
" NaN | \n",
" 9.0 | \n",
" NaN | \n",
"
\n",
" \n",
" | oregon | \n",
" 9.0 | \n",
" NaN | \n",
" 12.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e f\n",
"utah 0.0 NaN 3.0 NaN\n",
"ohio 3.0 NaN 6.0 NaN\n",
"texas 6.0 NaN 9.0 NaN\n",
"oregon 9.0 NaN 12.0 NaN"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame + series2"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:16.113109Z",
"start_time": "2019-12-24T11:54:15.978953Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"utah 1.0\n",
"ohio 4.0\n",
"texas 7.0\n",
"oregon 10.0\n",
"Name: d, dtype: float64"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# broadcasting over columns\n",
"series3 = frame['d']\n",
"series3"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:16.261347Z",
"start_time": "2019-12-24T11:54:16.115103Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | utah | \n",
" -1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | ohio | \n",
" -1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | texas | \n",
" -1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | oregon | \n",
" -1.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e\n",
"utah -1.0 0.0 1.0\n",
"ohio -1.0 0.0 1.0\n",
"texas -1.0 0.0 1.0\n",
"oregon -1.0 0.0 1.0"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.sub(series3, axis='index')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The axis number that you pass is the axis to match on. In this case we mean to match\n",
"on the DataFrame’s row index (axis='index' or axis=0) and broadcast across."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Function Application and Mapping"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:16.578884Z",
"start_time": "2019-12-24T11:54:16.263428Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | Utah | \n",
" -1.846691 | \n",
" -0.622582 | \n",
" -1.542595 | \n",
"
\n",
" \n",
" | Ohio | \n",
" 0.818697 | \n",
" 0.730596 | \n",
" 0.458859 | \n",
"
\n",
" \n",
" | Texas | \n",
" 0.612735 | \n",
" 0.593502 | \n",
" -0.631133 | \n",
"
\n",
" \n",
" | Oregon | \n",
" -0.067950 | \n",
" -1.023214 | \n",
" 0.115138 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e\n",
"Utah -1.846691 -0.622582 -1.542595\n",
"Ohio 0.818697 0.730596 0.458859\n",
"Texas 0.612735 0.593502 -0.631133\n",
"Oregon -0.067950 -1.023214 0.115138"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),\n",
" index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:16.763901Z",
"start_time": "2019-12-24T11:54:16.581885Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | Utah | \n",
" 1.846691 | \n",
" 0.622582 | \n",
" 1.542595 | \n",
"
\n",
" \n",
" | Ohio | \n",
" 0.818697 | \n",
" 0.730596 | \n",
" 0.458859 | \n",
"
\n",
" \n",
" | Texas | \n",
" 0.612735 | \n",
" 0.593502 | \n",
" 0.631133 | \n",
"
\n",
" \n",
" | Oregon | \n",
" 0.067950 | \n",
" 1.023214 | \n",
" 0.115138 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e\n",
"Utah 1.846691 0.622582 1.542595\n",
"Ohio 0.818697 0.730596 0.458859\n",
"Texas 0.612735 0.593502 0.631133\n",
"Oregon 0.067950 1.023214 0.115138"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.abs(frame)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:16.884947Z",
"start_time": "2019-12-24T11:54:16.766103Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"b 2.665389\n",
"d 1.753811\n",
"e 2.001455\n",
"dtype: float64"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Another frequent operation is applying a function on \n",
"# one-dimensional arrays to each column or row. \n",
"# DataFrame’s apply method does exactly this\n",
"\n",
"f = lambda x: x.max() - x.min()\n",
"frame.apply(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here the function f, which computes the difference between the maximum and minimum of a Series, is **invoked once on each column** in frame. The result is a Series having the columns of frame as its index."
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.003427Z",
"start_time": "2019-12-24T11:54:16.887837Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Utah 1.224110\n",
"Ohio 0.359838\n",
"Texas 1.243868\n",
"Oregon 1.138353\n",
"dtype: float64"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# If you pass axis='columns' to apply, \n",
"# the function will be invoked once per row\n",
"\n",
"frame.apply(f, axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.152859Z",
"start_time": "2019-12-24T11:54:17.003427Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | min | \n",
" -1.846691 | \n",
" -1.023214 | \n",
" -1.542595 | \n",
"
\n",
" \n",
" | max | \n",
" 0.818697 | \n",
" 0.730596 | \n",
" 0.458859 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e\n",
"min -1.846691 -1.023214 -1.542595\n",
"max 0.818697 0.730596 0.458859"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# returning a Series with multiple values\n",
"\n",
"def f(x):\n",
" return pd.Series([x.min(), x.max()], index=['min', 'max'])\n",
"\n",
"frame.apply(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Element-wise Python functions can be used, too. Suppose you wanted to compute a\n",
"formatted string from each floating-point value in frame. You can do this with \n",
"**applymap**"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.261634Z",
"start_time": "2019-12-24T11:54:17.155897Z"
}
},
"outputs": [],
"source": [
"format = lambda x: f'{x:.2f}'"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.398915Z",
"start_time": "2019-12-24T11:54:17.263630Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" b d e\n",
"Utah -1.846691 -0.622582 -1.542595\n",
"Ohio 0.818697 0.730596 0.458859\n",
"Texas 0.612735 0.593502 -0.631133\n",
"Oregon -0.067950 -1.023214 0.115138\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" d | \n",
" e | \n",
"
\n",
" \n",
" \n",
" \n",
" | Utah | \n",
" -1.85 | \n",
" -0.62 | \n",
" -1.54 | \n",
"
\n",
" \n",
" | Ohio | \n",
" 0.82 | \n",
" 0.73 | \n",
" 0.46 | \n",
"
\n",
" \n",
" | Texas | \n",
" 0.61 | \n",
" 0.59 | \n",
" -0.63 | \n",
"
\n",
" \n",
" | Oregon | \n",
" -0.07 | \n",
" -1.02 | \n",
" 0.12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b d e\n",
"Utah -1.85 -0.62 -1.54\n",
"Ohio 0.82 0.73 0.46\n",
"Texas 0.61 0.59 -0.63\n",
"Oregon -0.07 -1.02 0.12"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(frame)\n",
"frame.applymap(format)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The reason for the name applymap is that Series has a **map** method for applying an\n",
"element-wise function"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.518935Z",
"start_time": "2019-12-24T11:54:17.400935Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"Utah -1.54\n",
"Ohio 0.46\n",
"Texas -0.63\n",
"Oregon 0.12\n",
"Name: e, dtype: object"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame['e'].map(format)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Sorting and Ranking"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Sorting\n",
"\n",
"Sorting a dataset by some criterion is another important built-in operation. To sort\n",
"lexicographically by row or column index, use the **sort_index** method, which returns\n",
"a new, sorted object"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.662604Z",
"start_time": "2019-12-24T11:54:17.522098Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"b 0\n",
"d 1\n",
"a 2\n",
"c 3\n",
"dtype: int64"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.793647Z",
"start_time": "2019-12-24T11:54:17.664561Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 2\n",
"b 0\n",
"c 3\n",
"d 1\n",
"dtype: int64"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj.sort_index()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:17.984142Z",
"start_time": "2019-12-24T11:54:17.795974Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" d | \n",
" a | \n",
" b | \n",
" c | \n",
"
\n",
" \n",
" \n",
" \n",
" | one | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" | three | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" d a b c\n",
"one 4 5 6 7\n",
"three 0 1 2 3"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],\n",
" columns=['d', 'a', 'b', 'c'])\n",
"frame.sort_index()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.202334Z",
"start_time": "2019-12-24T11:54:17.985140Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" | three | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 0 | \n",
"
\n",
" \n",
" | one | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"three 1 2 3 0\n",
"one 5 6 7 4"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.sort_index(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.322544Z",
"start_time": "2019-12-24T11:54:18.205743Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" d | \n",
" c | \n",
" b | \n",
" a | \n",
"
\n",
" \n",
" \n",
" \n",
" | three | \n",
" 0 | \n",
" 3 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" | one | \n",
" 4 | \n",
" 7 | \n",
" 6 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" d c b a\n",
"three 0 3 2 1\n",
"one 4 7 6 5"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.sort_index(axis=1, ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To sort a Series by its values, use its **sort_values** method.\n",
"Any missing values are sorted to the end of the Series by default"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.444610Z",
"start_time": "2019-12-24T11:54:18.323543Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 -4\n",
"3 2\n",
"2 3\n",
"1 7\n",
"dtype: int64"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series([-4, 7, 3, 2])\n",
"obj.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.561066Z",
"start_time": "2019-12-24T11:54:18.448348Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"4 -3.0\n",
"5 2.0\n",
"0 4.0\n",
"2 7.0\n",
"1 NaN\n",
"3 NaN\n",
"dtype: float64"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])\n",
"obj.sort_values()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When sorting a DataFrame, you can use the data in one or more columns as the sort\n",
"keys. To do so, pass one or more column names to the by option of sort_values"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.674098Z",
"start_time": "2019-12-24T11:54:18.562063Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" a | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 4 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 7 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" -3 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b a\n",
"0 4 0\n",
"1 7 1\n",
"2 -3 0\n",
"3 2 1"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.782844Z",
"start_time": "2019-12-24T11:54:18.675129Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" a | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2 | \n",
" -3 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" | 0 | \n",
" 4 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 7 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b a\n",
"2 -3 0\n",
"3 2 1\n",
"0 4 0\n",
"1 7 1"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.sort_values(by='b')"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:18.885242Z",
"start_time": "2019-12-24T11:54:18.783879Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" a | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2 | \n",
" -3 | \n",
" 0 | \n",
"
\n",
" \n",
" | 0 | \n",
" 4 | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 7 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b a\n",
"2 -3 0\n",
"0 4 0\n",
"3 2 1\n",
"1 7 1"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.sort_values(by=['a', 'b'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Ranking\n",
"\n",
"*Ranking* assigns ranks from one through the number of valid data points in an array.\n",
"The rank methods for Series and DataFrame are the place to look; by default rank\n",
"breaks ties by assigning each group the mean rank"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.058017Z",
"start_time": "2019-12-24T11:54:18.887690Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 6.5\n",
"1 1.0\n",
"2 6.5\n",
"3 4.5\n",
"4 3.0\n",
"5 2.0\n",
"6 4.5\n",
"dtype: float64"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n",
"obj.rank()"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.209466Z",
"start_time": "2019-12-24T11:54:19.059022Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 6.0\n",
"1 1.0\n",
"2 7.0\n",
"3 4.0\n",
"4 3.0\n",
"5 2.0\n",
"6 5.0\n",
"dtype: float64"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# assigning rank according to the order they're observed first in the data\n",
"obj.rank(method='first')"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.352734Z",
"start_time": "2019-12-24T11:54:19.211753Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 2.0\n",
"1 7.0\n",
"2 2.0\n",
"3 4.0\n",
"4 5.0\n",
"5 6.0\n",
"6 4.0\n",
"dtype: float64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj.rank(ascending=False, method='max')"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.499605Z",
"start_time": "2019-12-24T11:54:19.352734Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" a | \n",
" c | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 4.3 | \n",
" 0 | \n",
" -2.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 7.0 | \n",
" 1 | \n",
" 5.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" -3.0 | \n",
" 0 | \n",
" 8.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2.0 | \n",
" 1 | \n",
" -2.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b a c\n",
"0 4.3 0 -2.0\n",
"1 7.0 1 5.0\n",
"2 -3.0 0 8.0\n",
"3 2.0 1 -2.5"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.644123Z",
"start_time": "2019-12-24T11:54:19.502600Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" b | \n",
" a | \n",
" c | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3.0 | \n",
" 2.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 3.0 | \n",
" 1.0 | \n",
" 2.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3.0 | \n",
" 2.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" b a c\n",
"0 3.0 2.0 1.0\n",
"1 3.0 1.0 2.0\n",
"2 1.0 2.0 3.0\n",
"3 3.0 2.0 1.0"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frame.rank(axis='columns')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Axis Indexes with Duplicate Labels"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.825688Z",
"start_time": "2019-12-24T11:54:19.646865Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0\n",
"a 1\n",
"b 2\n",
"b 3\n",
"c 4\n",
"dtype: int64"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:19.978965Z",
"start_time": "2019-12-24T11:54:19.828121Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj.index.is_unique"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.122066Z",
"start_time": "2019-12-24T11:54:19.980367Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 0\n",
"a 1\n",
"dtype: int64"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj.a"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.293573Z",
"start_time": "2019-12-24T11:54:20.124045Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" -2.054623 | \n",
" 0.654964 | \n",
" -0.431453 | \n",
"
\n",
" \n",
" | a | \n",
" 0.654782 | \n",
" 0.776429 | \n",
" 1.175684 | \n",
"
\n",
" \n",
" | b | \n",
" -0.485586 | \n",
" -0.960232 | \n",
" -1.368606 | \n",
"
\n",
" \n",
" | b | \n",
" 0.205797 | \n",
" 0.266294 | \n",
" 2.007870 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2\n",
"a -2.054623 0.654964 -0.431453\n",
"a 0.654782 0.776429 1.175684\n",
"b -0.485586 -0.960232 -1.368606\n",
"b 0.205797 0.266294 2.007870"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.444163Z",
"start_time": "2019-12-24T11:54:20.295569Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" -2.054623 | \n",
" 0.654964 | \n",
" -0.431453 | \n",
"
\n",
" \n",
" | a | \n",
" 0.654782 | \n",
" 0.776429 | \n",
" 1.175684 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2\n",
"a -2.054623 0.654964 -0.431453\n",
"a 0.654782 0.776429 1.175684"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc['a']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summarizing and Computing Descriptive Statistics"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.549520Z",
"start_time": "2019-12-24T11:54:20.446595Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 1.40 | \n",
" NaN | \n",
"
\n",
" \n",
" | b | \n",
" 7.10 | \n",
" -4.50 | \n",
"
\n",
" \n",
" | c | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | d | \n",
" 0.75 | \n",
" -1.43 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two\n",
"a 1.40 NaN\n",
"b 7.10 -4.50\n",
"c NaN NaN\n",
"d 0.75 -1.43"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan],\n",
" [0.75, -1.43]],\n",
" index=['a', 'b', 'c', 'd'],\n",
" columns=['one', 'two'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.684771Z",
"start_time": "2019-12-24T11:54:20.549520Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"one 9.25\n",
"two -5.93\n",
"dtype: float64"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sum()"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.818099Z",
"start_time": "2019-12-24T11:54:20.687422Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a 1.40\n",
"b 2.60\n",
"c 0.00\n",
"d -0.68\n",
"dtype: float64"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sum(axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:20.936482Z",
"start_time": "2019-12-24T11:54:20.818099Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"a NaN\n",
"b 2.60\n",
"c NaN\n",
"d -0.68\n",
"dtype: float64"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sum(axis='columns', skipna=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some methods, like idxmin and idxmax, return indirect statistics like the index value\n",
"where the minimum or maximum values are attained"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:21.074783Z",
"start_time": "2019-12-24T11:54:20.937480Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"one b\n",
"two d\n",
"dtype: object\n",
"\n",
"one d\n",
"two b\n",
"dtype: object\n",
"\n"
]
}
],
"source": [
"print(df.idxmax())\n",
"print(df.idxmin())"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:21.194122Z",
"start_time": "2019-12-24T11:54:21.076618Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 1.40 | \n",
" NaN | \n",
"
\n",
" \n",
" | b | \n",
" 8.50 | \n",
" -4.50 | \n",
"
\n",
" \n",
" | c | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | d | \n",
" 9.25 | \n",
" -5.93 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two\n",
"a 1.40 NaN\n",
"b 8.50 -4.50\n",
"c NaN NaN\n",
"d 9.25 -5.93"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# accmulation\n",
"df.cumsum()"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-22T20:42:48.976671Z",
"start_time": "2019-12-22T20:42:48.961382Z"
}
},
"source": [
"Another type of method is neither a reduction nor an accumulation. **describe** is one\n",
"such example, producing multiple summary statistics in one shot"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:21.344016Z",
"start_time": "2019-12-24T11:54:21.195119Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 3.000000 | \n",
" 2.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 3.083333 | \n",
" -2.965000 | \n",
"
\n",
" \n",
" | std | \n",
" 3.493685 | \n",
" 2.170818 | \n",
"
\n",
" \n",
" | min | \n",
" 0.750000 | \n",
" -4.500000 | \n",
"
\n",
" \n",
" | 25% | \n",
" 1.075000 | \n",
" -3.732500 | \n",
"
\n",
" \n",
" | 50% | \n",
" 1.400000 | \n",
" -2.965000 | \n",
"
\n",
" \n",
" | 75% | \n",
" 4.250000 | \n",
" -2.197500 | \n",
"
\n",
" \n",
" | max | \n",
" 7.100000 | \n",
" -1.430000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two\n",
"count 3.000000 2.000000\n",
"mean 3.083333 -2.965000\n",
"std 3.493685 2.170818\n",
"min 0.750000 -4.500000\n",
"25% 1.075000 -3.732500\n",
"50% 1.400000 -2.965000\n",
"75% 4.250000 -2.197500\n",
"max 7.100000 -1.430000"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:21.511393Z",
"start_time": "2019-12-24T11:54:21.344016Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"count 16\n",
"unique 3\n",
"top a\n",
"freq 8\n",
"dtype: object"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# On non-numeric data, describe produces alternative summary statistics:\n",
"\n",
"obj = pd.Series(['a', 'a', 'b', 'c'] * 4)\n",
"obj.describe()"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:21.653404Z",
"start_time": "2019-12-24T11:54:21.515120Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
"
\n",
" \n",
" \n",
" \n",
" | a | \n",
" 1.40 | \n",
" NaN | \n",
"
\n",
" \n",
" | b | \n",
" 7.10 | \n",
" -4.50 | \n",
"
\n",
" \n",
" | c | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" | d | \n",
" 0.75 | \n",
" -1.43 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two\n",
"a 1.40 NaN\n",
"b 7.10 -4.50\n",
"c NaN NaN\n",
"d 0.75 -1.43"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-22T20:47:42.840846Z",
"start_time": "2019-12-22T20:47:42.833552Z"
}
},
"source": [
"### Correlation and Covariance"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:23.076066Z",
"start_time": "2019-12-24T11:54:21.655517Z"
}
},
"outputs": [],
"source": [
"import pandas_datareader as web"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:27.953329Z",
"start_time": "2019-12-24T11:54:23.079097Z"
}
},
"outputs": [],
"source": [
"all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:28.701569Z",
"start_time": "2019-12-24T11:54:27.953329Z"
}
},
"outputs": [],
"source": [
"price = pd.DataFrame({ticker: data['Adj Close']\n",
" for ticker, data in all_data.items()})\n",
"volume = pd.DataFrame({ticker: data['Volume']\n",
" for ticker, data in all_data.items()})"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:28.944129Z",
"start_time": "2019-12-24T11:54:28.701569Z"
}
},
"outputs": [],
"source": [
"returns = price.pct_change()"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:29.202785Z",
"start_time": "2019-12-24T11:54:28.944129Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" AAPL | \n",
" IBM | \n",
" MSFT | \n",
" GOOG | \n",
"
\n",
" \n",
" | Date | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2019-12-17 | \n",
" 0.001965 | \n",
" 0.000671 | \n",
" -0.005401 | \n",
" -0.004445 | \n",
"
\n",
" \n",
" | 2019-12-18 | \n",
" -0.002389 | \n",
" 0.001416 | \n",
" -0.002069 | \n",
" -0.001845 | \n",
"
\n",
" \n",
" | 2019-12-19 | \n",
" 0.001001 | \n",
" 0.001116 | \n",
" 0.008681 | \n",
" 0.002528 | \n",
"
\n",
" \n",
" | 2019-12-20 | \n",
" -0.002071 | \n",
" 0.007655 | \n",
" 0.010918 | \n",
" -0.004757 | \n",
"
\n",
" \n",
" | 2019-12-23 | \n",
" 0.016318 | \n",
" -0.000295 | \n",
" 0.000000 | \n",
" -0.000556 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" AAPL IBM MSFT GOOG\n",
"Date \n",
"2019-12-17 0.001965 0.000671 -0.005401 -0.004445\n",
"2019-12-18 -0.002389 0.001416 -0.002069 -0.001845\n",
"2019-12-19 0.001001 0.001116 0.008681 0.002528\n",
"2019-12-20 -0.002071 0.007655 0.010918 -0.004757\n",
"2019-12-23 0.016318 -0.000295 0.000000 -0.000556"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"returns.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The **corr** method of Series computes the correlation of the overlapping, non-NA,\n",
"aligned-by-index values in two Series. Relatedly, **cov** computes the covariance"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.201205Z",
"start_time": "2019-12-24T11:54:29.206950Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9.227138328874263e-05\n",
"\n",
"0.48490365831130916\n",
"\n"
]
}
],
"source": [
"print(returns['MSFT'].cov(returns['IBM']))\n",
"print(returns['MSFT'].corr(returns['IBM']))"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.217210Z",
"start_time": "2019-12-24T11:54:30.204759Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.48490365831130916"
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"returns.MSFT.corr(returns.IBM)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.398635Z",
"start_time": "2019-12-24T11:54:30.219543Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" AAPL | \n",
" IBM | \n",
" MSFT | \n",
" GOOG | \n",
"
\n",
" \n",
" \n",
" \n",
" | AAPL | \n",
" 1.000000 | \n",
" 0.399340 | \n",
" 0.573224 | \n",
" 0.521823 | \n",
"
\n",
" \n",
" | IBM | \n",
" 0.399340 | \n",
" 1.000000 | \n",
" 0.484904 | \n",
" 0.410386 | \n",
"
\n",
" \n",
" | MSFT | \n",
" 0.573224 | \n",
" 0.484904 | \n",
" 1.000000 | \n",
" 0.658704 | \n",
"
\n",
" \n",
" | GOOG | \n",
" 0.521823 | \n",
" 0.410386 | \n",
" 0.658704 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" AAPL IBM MSFT GOOG\n",
"AAPL 1.000000 0.399340 0.573224 0.521823\n",
"IBM 0.399340 1.000000 0.484904 0.410386\n",
"MSFT 0.573224 0.484904 1.000000 0.658704\n",
"GOOG 0.521823 0.410386 0.658704 1.000000"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"returns.corr()"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.569188Z",
"start_time": "2019-12-24T11:54:30.400959Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" AAPL | \n",
" IBM | \n",
" MSFT | \n",
" GOOG | \n",
"
\n",
" \n",
" \n",
" \n",
" | AAPL | \n",
" 0.000245 | \n",
" 0.000081 | \n",
" 0.000132 | \n",
" 0.000124 | \n",
"
\n",
" \n",
" | IBM | \n",
" 0.000081 | \n",
" 0.000168 | \n",
" 0.000092 | \n",
" 0.000080 | \n",
"
\n",
" \n",
" | MSFT | \n",
" 0.000132 | \n",
" 0.000092 | \n",
" 0.000216 | \n",
" 0.000146 | \n",
"
\n",
" \n",
" | GOOG | \n",
" 0.000124 | \n",
" 0.000080 | \n",
" 0.000146 | \n",
" 0.000229 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" AAPL IBM MSFT GOOG\n",
"AAPL 0.000245 0.000081 0.000132 0.000124\n",
"IBM 0.000081 0.000168 0.000092 0.000080\n",
"MSFT 0.000132 0.000092 0.000216 0.000146\n",
"GOOG 0.000124 0.000080 0.000146 0.000229"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"returns.cov()"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.727892Z",
"start_time": "2019-12-24T11:54:30.572979Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"AAPL 0.399340\n",
"IBM 1.000000\n",
"MSFT 0.484904\n",
"GOOG 0.410386\n",
"dtype: float64"
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# corrwith method - compute pairwise correlations\n",
"# between a DataFrame’s columns or rows with another Series oDataFrame’sme\n",
"returns.corrwith(returns.IBM)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Unique Values, Value Counts, and Membership\n"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.844787Z",
"start_time": "2019-12-24T11:54:30.727892Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"array(['c', 'a', 'd', 'b'], dtype=object)"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# unique - gives you an array of the unique values in a Series\n",
"\n",
"obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])\n",
"obj.unique()"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:30.977262Z",
"start_time": "2019-12-24T11:54:30.846938Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"c 3\n",
"a 3\n",
"b 2\n",
"d 1\n",
"dtype: int64"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# value_counts - computes a Series containing value frequencies\n",
"\n",
"pd.value_counts(obj.values, sort=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:31.112737Z",
"start_time": "2019-12-24T11:54:30.978817Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 c\n",
"1 a\n",
"2 d\n",
"3 a\n",
"4 a\n",
"5 b\n",
"6 b\n",
"7 c\n",
"8 c\n",
"dtype: object"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# isin - performs a vectorized set membership check and can be useful in filtering a\n",
"# dataset down to a subset of values in a Series or column in a DataFrame\n",
"\n",
"obj"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:31.299318Z",
"start_time": "2019-12-24T11:54:31.114761Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
"5 True\n",
"6 True\n",
"7 True\n",
"8 True\n",
"dtype: bool"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mask = obj.isin(['b', 'c'])\n",
"mask"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:31.517280Z",
"start_time": "2019-12-24T11:54:31.300318Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0 c\n",
"5 b\n",
"6 b\n",
"7 c\n",
"8 c\n",
"dtype: object"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj[mask]"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:31.801449Z",
"start_time": "2019-12-24T11:54:31.520294Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Qu1 | \n",
" Qu2 | \n",
" Qu3 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 3 | \n",
" 3 | \n",
" 5 | \n",
"
\n",
" \n",
" | 2 | \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 2 | \n",
" 4 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Qu1 Qu2 Qu3\n",
"0 1 2 1\n",
"1 3 3 5\n",
"2 4 1 2\n",
"3 3 2 4\n",
"4 4 3 4"
]
},
"execution_count": 132,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],\n",
" 'Qu2': [2, 3, 1, 2, 3],\n",
" 'Qu3': [1, 5, 2, 4, 4]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"ExecuteTime": {
"end_time": "2019-12-24T11:54:32.315990Z",
"start_time": "2019-12-24T11:54:31.804884Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4 2\n",
"3 2\n",
"1 1\n",
"Name: Qu1, dtype: int64\n",
"\n",
"3 2\n",
"2 2\n",
"1 1\n",
"Name: Qu2, dtype: int64\n",
"\n",
"4 2\n",
"5 1\n",
"2 1\n",
"1 1\n",
"Name: Qu3, dtype: int64\n",
"\n",
" Qu1 Qu2 Qu3\n",
"1 1.0 1.0 1.0\n",
"2 0.0 2.0 1.0\n",
"3 2.0 2.0 0.0\n",
"4 2.0 0.0 2.0\n",
"5 0.0 0.0 1.0\n",
"\n"
]
}
],
"source": [
"print(pd.value_counts(data.Qu1))\n",
"print(pd.value_counts(data.Qu2))\n",
"print(pd.value_counts(data.Qu3))\n",
"print(data.apply(pd.value_counts).fillna(0))\n",
"# index - distinct values in the df\n",
"# column values - how many times the distinct value occur in that column"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": true,
"toc_position": {
"height": "calc(100% - 180px)",
"left": "10px",
"top": "150px",
"width": "202.837px"
},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}