{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Notebook accompanying pbpython article - [Pandas Grouper and Agg Functions Explained](http://pbpython.com/pandas-grouper-agg.html)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import collections" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the sample sales file then convert the date column to a proper date time column" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.read_excel(\"https://github.com/chris1610/pbpython/blob/master/data/sample-salesv3.xlsx?raw=True\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameskuquantityunit priceext pricedate
0740150Barton LLCB1-200003986.693380.912014-01-01 07:21:51
1714466Trantow-BarrowsS2-77896-163.16-63.162014-01-01 10:00:47
2218895Kulas IncB1-699242390.702086.102014-01-01 13:24:58
3307599Kassulke, Ondricka and MetzS1-654814121.05863.052014-01-01 15:05:22
4412290Jerde-HilpertS2-34077683.21499.262014-01-01 23:26:55
\n", "
" ], "text/plain": [ " account number name sku quantity \\\n", "0 740150 Barton LLC B1-20000 39 \n", "1 714466 Trantow-Barrows S2-77896 -1 \n", "2 218895 Kulas Inc B1-69924 23 \n", "3 307599 Kassulke, Ondricka and Metz S1-65481 41 \n", "4 412290 Jerde-Hilpert S2-34077 6 \n", "\n", " unit price ext price date \n", "0 86.69 3380.91 2014-01-01 07:21:51 \n", "1 63.16 -63.16 2014-01-01 10:00:47 \n", "2 90.70 2086.10 2014-01-01 13:24:58 \n", "3 21.05 863.05 2014-01-01 15:05:22 \n", "4 83.21 499.26 2014-01-01 23:26:55 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"date\"] = pd.to_datetime(df['date'])\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "account number int64\n", "name object\n", "sku object\n", "quantity int64\n", "unit price float64\n", "ext price float64\n", "date datetime64[ns]\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Example showing how resample can be used along with set_index" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "date\n", "2014-01-31 185361.66\n", "2014-02-28 146211.62\n", "2014-03-31 203921.38\n", "2014-04-30 174574.11\n", "2014-05-31 165418.55\n", "2014-06-30 174089.33\n", "2014-07-31 191662.11\n", "2014-08-31 153778.59\n", "2014-09-30 168443.17\n", "2014-10-31 171495.32\n", "2014-11-30 119961.22\n", "2014-12-31 163867.26\n", "Freq: M, Name: ext price, dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').resample('M')[\"ext price\"].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A more complex example with a groupby" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-01-31 6177.57\n", " 2014-02-28 12218.03\n", " 2014-03-31 3513.53\n", " 2014-04-30 11474.20\n", " 2014-05-31 10220.17\n", " 2014-06-30 10463.73\n", " 2014-07-31 6750.48\n", " 2014-08-31 17541.46\n", " 2014-09-30 14053.61\n", " 2014-10-31 9351.68\n", " 2014-11-30 4901.14\n", " 2014-12-31 2772.90\n", "Cronin, Oberbrunner and Spencer 2014-01-31 1141.75\n", " 2014-02-28 13976.26\n", " 2014-03-31 11691.62\n", " 2014-04-30 3685.44\n", " 2014-05-31 6760.11\n", " 2014-06-30 5379.67\n", " 2014-07-31 6020.30\n", " 2014-08-31 5399.58\n", "Name: ext price, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').groupby('name')[\"ext price\"].resample(\"M\").sum().head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A simpler example using pd.Grouper" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-01-31 6177.57\n", " 2014-02-28 12218.03\n", " 2014-03-31 3513.53\n", " 2014-04-30 11474.20\n", " 2014-05-31 10220.17\n", " 2014-06-30 10463.73\n", " 2014-07-31 6750.48\n", " 2014-08-31 17541.46\n", " 2014-09-30 14053.61\n", " 2014-10-31 9351.68\n", " 2014-11-30 4901.14\n", " 2014-12-31 2772.90\n", "Cronin, Oberbrunner and Spencer 2014-01-31 1141.75\n", " 2014-02-28 13976.26\n", " 2014-03-31 11691.62\n", " 2014-04-30 3685.44\n", " 2014-05-31 6760.11\n", " 2014-06-30 5379.67\n", " 2014-07-31 6020.30\n", " 2014-08-31 5399.58\n", "Name: ext price, dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(['name', pd.Grouper(key='date', freq='M')])['ext price'].sum().head(20)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-12-31 109438.50\n", "Cronin, Oberbrunner and Spencer 2014-12-31 89734.55\n", "Frami, Hills and Schmidt 2014-12-31 103569.59\n", "Fritsch, Russel and Anderson 2014-12-31 112214.71\n", "Halvorson, Crona and Champlin 2014-12-31 70004.36\n", "Herman LLC 2014-12-31 82865.00\n", "Jerde-Hilpert 2014-12-31 112591.43\n", "Kassulke, Ondricka and Metz 2014-12-31 86451.07\n", "Keeling LLC 2014-12-31 100934.30\n", "Kiehn-Spinka 2014-12-31 99608.77\n", "Koepp Ltd 2014-12-31 103660.54\n", "Kuhn-Gusikowski 2014-12-31 91094.28\n", "Kulas Inc 2014-12-31 137351.96\n", "Pollich LLC 2014-12-31 87347.18\n", "Purdy-Kunde 2014-12-31 77898.21\n", "Sanford and Sons 2014-12-31 98822.98\n", "Stokes LLC 2014-12-31 91535.92\n", "Trantow-Barrows 2014-12-31 123381.38\n", "White-Trantow 2014-12-31 135841.99\n", "Will LLC 2014-12-31 104437.60\n", "Name: ext price, dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(['name', pd.Grouper(key='date', freq='A-DEC')])['ext price'].sum()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# This works but is kind of slow and probably not that useful for this data set\n", "#df.groupby(['name', pd.Grouper(key='date', freq='60s')])['ext price'].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Some more examples using various off set alisases - http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-01-06 3380.91\n", " 2014-01-20 224.86\n", " 2014-01-27 2423.96\n", " 2014-02-03 147.84\n", " 2014-02-10 217.14\n", " 2014-02-17 8442.09\n", " 2014-03-03 3558.80\n", " 2014-03-10 1763.49\n", " 2014-03-17 1750.04\n", " 2014-04-07 1561.14\n", " 2014-04-14 4702.61\n", " 2014-04-21 836.63\n", " 2014-04-28 2540.79\n", " 2014-05-05 4109.99\n", " 2014-05-12 1671.83\n", " 2014-05-26 3251.43\n", " 2014-06-02 6112.87\n", " 2014-06-09 677.35\n", " 2014-06-16 2798.00\n", " 2014-06-23 3895.46\n", " 2014-07-14 1033.26\n", " 2014-07-21 3989.22\n", " 2014-08-04 7384.80\n", " 2014-08-11 84.18\n", " 2014-08-18 3857.10\n", " 2014-08-25 2925.94\n", " 2014-09-01 5017.44\n", " 2014-09-08 1319.28\n", " 2014-09-15 2608.50\n", " 2014-09-22 7371.67\n", " ... \n", "Will LLC 2014-03-31 2595.78\n", " 2014-04-07 991.99\n", " 2014-04-14 2069.20\n", " 2014-04-21 3007.03\n", " 2014-04-28 26.72\n", " 2014-05-12 2220.74\n", " 2014-05-19 3250.88\n", " 2014-06-02 6385.33\n", " 2014-06-16 1440.96\n", " 2014-06-23 56.15\n", " 2014-06-30 922.41\n", " 2014-07-07 2985.85\n", " 2014-07-14 3073.44\n", " 2014-07-21 3951.27\n", " 2014-07-28 1006.98\n", " 2014-08-11 773.90\n", " 2014-08-18 254.04\n", " 2014-08-25 411.88\n", " 2014-09-01 2020.57\n", " 2014-09-29 2325.42\n", " 2014-10-06 371.52\n", " 2014-10-20 99.47\n", " 2014-10-27 1616.14\n", " 2014-11-03 5768.60\n", " 2014-11-10 1734.08\n", " 2014-12-01 705.96\n", " 2014-12-08 4301.00\n", " 2014-12-15 3502.09\n", " 2014-12-22 2993.40\n", " 2015-01-05 1764.72\n", "Name: ext price, Length: 805, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(['name', pd.Grouper(key='date', freq='W-MON')])['ext price'].sum()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name sku date \n", "Barton LLC B1-04202 2014-12-31 2433.20\n", " B1-05914 2014-12-31 324.72\n", " B1-20000 2014-12-31 7212.11\n", " B1-33087 2014-12-31 4159.97\n", " B1-33364 2014-12-31 4369.19\n", " B1-38851 2014-12-31 6627.93\n", " B1-50809 2014-12-31 5087.44\n", " B1-53102 2014-12-31 431.27\n", " B1-53636 2014-12-31 3737.59\n", " B1-65551 2014-12-31 4703.54\n", " B1-69924 2014-12-31 1634.29\n", " B1-86481 2014-12-31 1457.80\n", " S1-06532 2014-12-31 8346.20\n", " S1-27722 2014-12-31 5840.43\n", " S1-30248 2014-12-31 887.97\n", " S1-47412 2014-12-31 5176.74\n", " S1-50961 2014-12-31 1883.31\n", " S1-65481 2014-12-31 3352.62\n", " S1-82801 2014-12-31 5453.39\n", " S1-93683 2014-12-31 1715.28\n", " S2-10342 2014-12-31 12337.83\n", " S2-11481 2014-12-31 728.50\n", " S2-16558 2014-12-31 623.07\n", " S2-34077 2014-12-31 5726.16\n", " S2-77896 2014-12-31 6156.02\n", " S2-78676 2014-12-31 2616.60\n", " S2-82423 2014-12-31 3735.43\n", " S2-83881 2014-12-31 2679.90\n", "Cronin, Oberbrunner and Spencer B1-04202 2014-12-31 5054.84\n", " B1-05914 2014-12-31 4003.64\n", " ... \n", "White-Trantow S2-82423 2014-12-31 5863.69\n", " S2-83881 2014-12-31 6506.07\n", "Will LLC B1-04202 2014-12-31 464.33\n", " B1-05914 2014-12-31 5935.53\n", " B1-20000 2014-12-31 4113.19\n", " B1-33087 2014-12-31 3711.06\n", " B1-33364 2014-12-31 11599.40\n", " B1-38851 2014-12-31 1475.09\n", " B1-53102 2014-12-31 6995.58\n", " B1-53636 2014-12-31 532.80\n", " B1-65551 2014-12-31 3039.01\n", " B1-69924 2014-12-31 10021.68\n", " B1-86481 2014-12-31 1770.34\n", " S1-06532 2014-12-31 3314.72\n", " S1-27722 2014-12-31 4493.11\n", " S1-30248 2014-12-31 975.52\n", " S1-47412 2014-12-31 1642.68\n", " S1-50961 2014-12-31 3513.62\n", " S1-65481 2014-12-31 593.46\n", " S1-82801 2014-12-31 2034.92\n", " S1-93683 2014-12-31 9559.57\n", " S2-00301 2014-12-31 2117.33\n", " S2-10342 2014-12-31 4918.55\n", " S2-11481 2014-12-31 4324.79\n", " S2-16558 2014-12-31 413.52\n", " S2-23246 2014-12-31 1893.25\n", " S2-34077 2014-12-31 991.99\n", " S2-77896 2014-12-31 5321.87\n", " S2-78676 2014-12-31 5016.76\n", " S2-83881 2014-12-31 3653.93\n", "Name: ext price, Length: 544, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(['name', 'sku', pd.Grouper(key='date', freq='A-DEC')])['ext price'].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now show how to use the new .agg function\n", "\n", "First, how to get summary stats without agg" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ext price 2018784.32\n", "quantity 36463.00\n", "dtype: float64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"ext price\", \"quantity\"]].sum()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ext price 1345.856213\n", "quantity 24.308667\n", "dtype: float64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"ext price\", \"quantity\"]].mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using .agg for sums and means across multiple columns" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext pricequantityunit price
sum2.018784e+0636463.00000082511.290000
mean1.345856e+0324.30866755.007527
\n", "
" ], "text/plain": [ " ext price quantity unit price\n", "sum 2.018784e+06 36463.000000 82511.290000\n", "mean 1.345856e+03 24.308667 55.007527" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[[\"ext price\", \"quantity\", \"unit price\"]].agg(['sum', 'mean'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Passing a dictionary containing different operations per column" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext pricequantityunit price
mean1.345856e+0324.30866755.007527
sum2.018784e+0636463.000000NaN
\n", "
" ], "text/plain": [ " ext price quantity unit price\n", "mean 1.345856e+03 24.308667 55.007527\n", "sum 2.018784e+06 36463.000000 NaN" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.agg({'ext price': ['sum', 'mean'], 'quantity': ['sum', 'mean'], 'unit price': ['mean']})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using custom functions" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "get_max = lambda x: x.value_counts(dropna=False).index[0]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext pricequantityunit pricesku
<lambda>NaNNaNNaNS2-77896
mean1.345856e+0324.30866755.007527NaN
sum2.018784e+0636463.000000NaNNaN
\n", "
" ], "text/plain": [ " ext price quantity unit price sku\n", " NaN NaN NaN S2-77896\n", "mean 1.345856e+03 24.308667 55.007527 NaN\n", "sum 2.018784e+06 36463.000000 NaN NaN" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.agg({'ext price': ['sum', 'mean'], 'quantity': ['sum', 'mean'], 'unit price': ['mean'], 'sku': [get_max]})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Clean up the naming in the output by defining the name for get_max" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "get_max.__name__ = \"most frequent\"" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext pricequantityunit pricesku
mean1.345856e+0324.30866755.007527NaN
most frequentNaNNaNNaNS2-77896
sum2.018784e+0636463.000000NaNNaN
\n", "
" ], "text/plain": [ " ext price quantity unit price sku\n", "mean 1.345856e+03 24.308667 55.007527 NaN\n", "most frequent NaN NaN NaN S2-77896\n", "sum 2.018784e+06 36463.000000 NaN NaN" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.agg({'ext price': ['sum', 'mean'], 'quantity': ['sum', 'mean'], 'unit price': ['mean'], 'sku': [get_max]})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using an OrderedDictionary to maintain column order" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "f = collections.OrderedDict([('ext price', ['sum', 'mean']), ('quantity', ['sum', 'mean']), ('sku', [get_max])])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext pricequantitysku
mean1.345856e+0324.308667NaN
most frequentNaNNaNS2-77896
sum2.018784e+0636463.000000NaN
\n", "
" ], "text/plain": [ " ext price quantity sku\n", "mean 1.345856e+03 24.308667 NaN\n", "most frequent NaN NaN S2-77896\n", "sum 2.018784e+06 36463.000000 NaN" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.agg(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda env:pbpython3]", "language": "python", "name": "conda-env-pbpython3-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }