{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Aggregation and Group Operations" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## GroupBy" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2key1key2
0201.993315a1
1211.132069b2
2220.135880a3
323-1.071349b4
4100-0.585936a1
5200-1.501368b2
6300-0.688437a3
74000.926304a4
\n", "
" ], "text/plain": [ " data1 data2 key1 key2\n", "0 20 1.993315 a 1\n", "1 21 1.132069 b 2\n", "2 22 0.135880 a 3\n", "3 23 -1.071349 b 4\n", "4 100 -0.585936 a 1\n", "5 200 -1.501368 b 2\n", "6 300 -0.688437 a 3\n", "7 400 0.926304 a 4" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\"key1\" : [\"a\", \"b\", \"a\", \"b\", \"a\", \"b\", \"a\", \"a\"], \n", " \"key2\" : [\"1\", \"2\", \"3\", \"4\", \"1\", \"2\", \"3\", \"4\"], \n", " \"data1\" : [20, 21, 22, 23, 100, 200, 300, 400], \n", " \"data2\" : np.random.randn(8)})\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
key1
a168.4000000.356225
b81.333333-0.480216
\n", "
" ], "text/plain": [ " data1 data2\n", "key1 \n", "a 168.400000 0.356225\n", "b 81.333333 -0.480216" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(df[\"key1\"]).mean()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
key1
a168.4000000.356225
b81.333333-0.480216
\n", "
" ], "text/plain": [ " data1 data2\n", "key1 \n", "a 168.400000 0.356225\n", "b 81.333333 -0.480216" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(\"key1\").mean()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
key1key2
a160.00.703689
3161.0-0.276278
4400.00.926304
b2110.5-0.184649
423.0-1.071349
\n", "
" ], "text/plain": [ " data1 data2\n", "key1 key2 \n", "a 1 60.0 0.703689\n", " 3 161.0 -0.276278\n", " 4 400.0 0.926304\n", "b 2 110.5 -0.184649\n", " 4 23.0 -1.071349" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "means = df.groupby([df[\"key1\"], df[\"key2\"]]).mean()\n", "means" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
key1key2
a160.00.703689
3161.0-0.276278
4400.00.926304
b2110.5-0.184649
423.0-1.071349
\n", "
" ], "text/plain": [ " data1 data2\n", "key1 key2 \n", "a 1 60.0 0.703689\n", " 3 161.0 -0.276278\n", " 4 400.0 0.926304\n", "b 2 110.5 -0.184649\n", " 4 23.0 -1.071349" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"key1\", \"key2\"]).mean()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
key212341234
key1
a60.0NaN161.0400.00.703689NaN-0.2762780.926304
bNaN110.5NaN23.0NaN-0.184649NaN-1.071349
\n", "
" ], "text/plain": [ " data1 data2 \n", "key2 1 2 3 4 1 2 3 4\n", "key1 \n", "a 60.0 NaN 161.0 400.0 0.703689 NaN -0.276278 0.926304\n", "b NaN 110.5 NaN 23.0 NaN -0.184649 NaN -1.071349" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "means.unstack()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "key1 key2\n", "a 1 2\n", " 3 2\n", " 4 1\n", "b 2 2\n", " 4 1\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"key1\", \"key2\"]).size()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a\n", " data1 data2 key1 key2\n", "0 20 1.993315 a 1\n", "2 22 0.135880 a 3\n", "4 100 -0.585936 a 1\n", "6 300 -0.688437 a 3\n", "7 400 0.926304 a 4\n", "b\n", " data1 data2 key1 key2\n", "1 21 1.132069 b 2\n", "3 23 -1.071349 b 4\n", "5 200 -1.501368 b 2\n" ] } ], "source": [ "# iterating over groups\n", "for name, group in df.groupby(\"key1\"):\n", " print(name)\n", " print(group)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "data1 int64\n", "data2 float64\n", "key1 object\n", "key2 object\n", "dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{dtype('int64'): data1\n", " 0 20\n", " 1 21\n", " 2 22\n", " 3 23\n", " 4 100\n", " 5 200\n", " 6 300\n", " 7 400, dtype('float64'): data2\n", " 0 1.993315\n", " 1 1.132069\n", " 2 0.135880\n", " 3 -1.071349\n", " 4 -0.585936\n", " 5 -1.501368\n", " 6 -0.688437\n", " 7 0.926304, dtype('O'): key1 key2\n", " 0 a 1\n", " 1 b 2\n", " 2 a 3\n", " 3 b 4\n", " 4 a 1\n", " 5 b 2\n", " 6 a 3\n", " 7 a 4}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.groupby(df.dtypes, axis = 1)\n", "dict(list(df2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Select a Column" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1
key1
a168.400000
b81.333333
\n", "
" ], "text/plain": [ " data1\n", "key1 \n", "a 168.400000\n", "b 81.333333" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('key1')[['data1']].mean()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 a\n", "1 b\n", "2 a\n", "3 b\n", "4 a\n", "5 b\n", "6 a\n", "7 a\n", "Name: key1, dtype: object" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"key1\"]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "data1 8\n", "data2 8\n", "key1 8\n", "key2 8\n", "dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.count()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
key1
a3802.681751
b1792.633437
\n", "
" ], "text/plain": [ " data1 data2\n", "key1 \n", "a 380 2.681751\n", "b 179 2.633437" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def my_range(arr):\n", " return arr.max() - arr.min()\n", "\n", "df.groupby(\"key1\").aggregate(my_range)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
key1
a5.0168.400000172.54796420.022.0100.0300.0400.05.00.3047340.896221-1.0904320.2700460.3922290.5536931.398133
b3.081.333333102.77321321.022.023.0111.5200.03.00.5532550.4660060.0646040.3335260.6024470.7975800.992713
\n", "
" ], "text/plain": [ " data1 data2 \\\n", " count mean std min 25% 50% 75% max count \n", "key1 \n", "a 5.0 168.400000 172.547964 20.0 22.0 100.0 300.0 400.0 5.0 \n", "b 3.0 81.333333 102.773213 21.0 22.0 23.0 111.5 200.0 3.0 \n", "\n", " \n", " mean std min 25% 50% 75% max \n", "key1 \n", "a 0.304734 0.896221 -1.090432 0.270046 0.392229 0.553693 1.398133 \n", "b 0.553255 0.466006 0.064604 0.333526 0.602447 0.797580 0.992713 " ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(\"key1\").describe()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1data2
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
key1
a5.0168.400000172.54796420.022.0100.0300.0400.05.00.3562251.121672-0.688437-0.5859360.1358800.9263041.993315
b3.081.333333102.77321321.022.023.0111.5200.03.0-0.4802161.412737-1.501368-1.286358-1.0713490.0303601.132069
\n", "
" ], "text/plain": [ " data1 data2 \\\n", " count mean std min 25% 50% 75% max count \n", "key1 \n", "a 5.0 168.400000 172.547964 20.0 22.0 100.0 300.0 400.0 5.0 \n", "b 3.0 81.333333 102.773213 21.0 22.0 23.0 111.5 200.0 3.0 \n", "\n", " \n", " mean std min 25% 50% 75% max \n", "key1 \n", "a 0.356225 1.121672 -0.688437 -0.585936 0.135880 0.926304 1.993315 \n", "b -0.480216 1.412737 -1.501368 -1.286358 -1.071349 0.030360 1.132069 " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(\"key1\").aggregate(\"describe\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data aggregation" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsmokerdaytimesize
016.991.01NoSunDinner2
110.341.66NoSunDinner3
221.013.50NoSunDinner3
323.683.31NoSunDinner2
424.593.61NoSunDinner4
\n", "
" ], "text/plain": [ " total_bill tip smoker day time size\n", "0 16.99 1.01 No Sun Dinner 2\n", "1 10.34 1.66 No Sun Dinner 3\n", "2 21.01 3.50 No Sun Dinner 3\n", "3 23.68 3.31 No Sun Dinner 2\n", "4 24.59 3.61 No Sun Dinner 4" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"examples/tips.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsmokerdaytimesizetip_pct
016.991.01NoSunDinner25.944673
110.341.66NoSunDinner316.054159
221.013.50NoSunDinner316.658734
323.683.31NoSunDinner213.978041
424.593.61NoSunDinner414.680765
\n", "
" ], "text/plain": [ " total_bill tip smoker day time size tip_pct\n", "0 16.99 1.01 No Sun Dinner 2 5.944673\n", "1 10.34 1.66 No Sun Dinner 3 16.054159\n", "2 21.01 3.50 No Sun Dinner 3 16.658734\n", "3 23.68 3.31 No Sun Dinner 2 13.978041\n", "4 24.59 3.61 No Sun Dinner 4 14.680765" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"tip_pct\"] = df.tip / df.total_bill * 100\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tip_pct
daysmoker
FriNo15.165044
Yes17.478305
SatNo15.804766
Yes14.790607
SunNo16.011294
Yes18.725032
ThurNo16.029808
Yes16.386327
\n", "
" ], "text/plain": [ " tip_pct\n", "day smoker \n", "Fri No 15.165044\n", " Yes 17.478305\n", "Sat No 15.804766\n", " Yes 14.790607\n", "Sun No 16.011294\n", " Yes 18.725032\n", "Thur No 16.029808\n", " Yes 16.386327" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"day\", \"smoker\"])[[\"tip_pct\"]].mean()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countmeanstdmin25%50%75%max
daysmoker
FriNo4.015.1650442.81229512.03852313.72391714.92409316.36522118.773467
Yes15.017.4783055.12926710.35554013.37387117.39130420.92401926.348039
SatNo45.015.8047663.9767305.67966713.62397815.01519818.39145129.198966
Yes42.014.7906076.1374953.5638149.17971315.36243919.05021632.573290
SunNo57.016.0112944.2347235.94467313.97804116.16650518.51851925.267250
Yes19.018.72503215.4134246.5659889.77226513.81215521.53248071.034483
ThurNo45.016.0298083.8774207.29613713.77410515.34919418.48428826.631158
Yes17.016.3863273.9388819.00140614.80384915.38461519.48368224.125452
\n", "
" ], "text/plain": [ " count mean std min 25% 50% \\\n", "day smoker \n", "Fri No 4.0 15.165044 2.812295 12.038523 13.723917 14.924093 \n", " Yes 15.0 17.478305 5.129267 10.355540 13.373871 17.391304 \n", "Sat No 45.0 15.804766 3.976730 5.679667 13.623978 15.015198 \n", " Yes 42.0 14.790607 6.137495 3.563814 9.179713 15.362439 \n", "Sun No 57.0 16.011294 4.234723 5.944673 13.978041 16.166505 \n", " Yes 19.0 18.725032 15.413424 6.565988 9.772265 13.812155 \n", "Thur No 45.0 16.029808 3.877420 7.296137 13.774105 15.349194 \n", " Yes 17.0 16.386327 3.938881 9.001406 14.803849 15.384615 \n", "\n", " 75% max \n", "day smoker \n", "Fri No 16.365221 18.773467 \n", " Yes 20.924019 26.348039 \n", "Sat No 18.391451 29.198966 \n", " Yes 19.050216 32.573290 \n", "Sun No 18.518519 25.267250 \n", " Yes 21.532480 71.034483 \n", "Thur No 18.484288 26.631158 \n", " Yes 19.483682 24.125452 " ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"day\", \"smoker\"]).tip_pct.aggregate('describe')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meansumstd
daysmoker
FriNo15.16504460.6601772.812295
Yes17.478305262.1745785.129267
SatNo15.804766711.2144593.976730
Yes14.790607621.2054746.137495
SunNo16.011294912.6437754.234723
Yes18.725032355.77560115.413424
ThurNo16.029808721.3413683.877420
Yes16.386327278.5675633.938881
\n", "
" ], "text/plain": [ " mean sum std\n", "day smoker \n", "Fri No 15.165044 60.660177 2.812295\n", " Yes 17.478305 262.174578 5.129267\n", "Sat No 15.804766 711.214459 3.976730\n", " Yes 14.790607 621.205474 6.137495\n", "Sun No 16.011294 912.643775 4.234723\n", " Yes 18.725032 355.775601 15.413424\n", "Thur No 16.029808 721.341368 3.877420\n", " Yes 16.386327 278.567563 3.938881" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"day\", \"smoker\"]).tip_pct.agg([\"mean\", \"sum\", np.std])" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
averagesummationstandar dev
daysmoker
FriNo15.16504460.6601772.812295
Yes17.478305262.1745785.129267
SatNo15.804766711.2144593.976730
Yes14.790607621.2054746.137495
SunNo16.011294912.6437754.234723
Yes18.725032355.77560115.413424
ThurNo16.029808721.3413683.877420
Yes16.386327278.5675633.938881
\n", "
" ], "text/plain": [ " average summation standar dev\n", "day smoker \n", "Fri No 15.165044 60.660177 2.812295\n", " Yes 17.478305 262.174578 5.129267\n", "Sat No 15.804766 711.214459 3.976730\n", " Yes 14.790607 621.205474 6.137495\n", "Sun No 16.011294 912.643775 4.234723\n", " Yes 18.725032 355.775601 15.413424\n", "Thur No 16.029808 721.341368 3.877420\n", " Yes 16.386327 278.567563 3.938881" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import warnings\n", "warnings.filterwarnings(action = \"ignore\")\n", "df.groupby([\"day\", \"smoker\"]).tip_pct.agg({\"average\" : \"mean\",\n", " \"summation\": \"sum\", \n", " \"standar dev\" : np.std})" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tip_pcttotal_bill
countmeanmaxcountmeanmax
daysmoker
FriNo415.16504418.773467418.42000022.75
Yes1517.47830526.3480391516.81333340.17
SatNo4515.80476629.1989664519.66177848.33
Yes4214.79060732.5732904221.27666750.81
SunNo5716.01129425.2672505720.50666748.17
Yes1918.72503271.0344831924.12000045.35
ThurNo4516.02980826.6311584517.11311141.19
Yes1716.38632724.1254521719.19058843.11
\n", "
" ], "text/plain": [ " tip_pct total_bill \n", " count mean max count mean max\n", "day smoker \n", "Fri No 4 15.165044 18.773467 4 18.420000 22.75\n", " Yes 15 17.478305 26.348039 15 16.813333 40.17\n", "Sat No 45 15.804766 29.198966 45 19.661778 48.33\n", " Yes 42 14.790607 32.573290 42 21.276667 50.81\n", "Sun No 57 16.011294 25.267250 57 20.506667 48.17\n", " Yes 19 18.725032 71.034483 19 24.120000 45.35\n", "Thur No 45 16.029808 26.631158 45 17.113111 41.19\n", " Yes 17 16.386327 24.125452 17 19.190588 43.11" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "functions = [\"count\", \"mean\", \"max\"]\n", "df.groupby([\"day\", \"smoker\"])[\"tip_pct\", \"total_bill\"].agg(functions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply\n", "apply splits an object into pieces, invokes the epassed function on each piece, and then attempts to concatenate the pieces together" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsmokerdaytimesizetip_pct
18323.176.50YesSunDinner428.053517
23211.613.39NoSatDinner229.198966
673.071.00YesSatDinner132.573290
1789.604.00YesSunDinner241.666667
1727.255.15YesSunDinner271.034483
\n", "
" ], "text/plain": [ " total_bill tip smoker day time size tip_pct\n", "183 23.17 6.50 Yes Sun Dinner 4 28.053517\n", "232 11.61 3.39 No Sat Dinner 2 29.198966\n", "67 3.07 1.00 Yes Sat Dinner 1 32.573290\n", "178 9.60 4.00 Yes Sun Dinner 2 41.666667\n", "172 7.25 5.15 Yes Sun Dinner 2 71.034483" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def top_five(data = df, n = 5, column = \"tip_pct\" ):\n", " return df.sort_values(by = column)[-n:]\n", "\n", "top_five(data = df)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total_billtipsmokerdaytimesizetip_pct
smoker
No18323.176.50YesSunDinner428.053517
23211.613.39NoSatDinner229.198966
673.071.00YesSatDinner132.573290
1789.604.00YesSunDinner241.666667
1727.255.15YesSunDinner271.034483
Yes18323.176.50YesSunDinner428.053517
23211.613.39NoSatDinner229.198966
673.071.00YesSatDinner132.573290
1789.604.00YesSunDinner241.666667
1727.255.15YesSunDinner271.034483
\n", "
" ], "text/plain": [ " total_bill tip smoker day time size tip_pct\n", "smoker \n", "No 183 23.17 6.50 Yes Sun Dinner 4 28.053517\n", " 232 11.61 3.39 No Sat Dinner 2 29.198966\n", " 67 3.07 1.00 Yes Sat Dinner 1 32.573290\n", " 178 9.60 4.00 Yes Sun Dinner 2 41.666667\n", " 172 7.25 5.15 Yes Sun Dinner 2 71.034483\n", "Yes 183 23.17 6.50 Yes Sun Dinner 4 28.053517\n", " 232 11.61 3.39 No Sat Dinner 2 29.198966\n", " 67 3.07 1.00 Yes Sat Dinner 1 32.573290\n", " 178 9.60 4.00 Yes Sun Dinner 2 41.666667\n", " 172 7.25 5.15 Yes Sun Dinner 2 71.034483" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(\"smoker\").apply(top_five)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }