{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Escaping Excel Hell with Python and Pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Minnebar 11, April 23 2016" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, import necessary libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can issue shell commands too" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Combining-Multiple-Excel-File-with-Pandas.ipynb\r\n", "Common-Excel-Part-2.ipynb\r\n", "Google-Forms-Data-Analysis.ipynb\r\n", "Ipython-pandas-tips-and-tricks.ipynb\r\n", "Learn_Pandas-Weighted_Average.ipynb\r\n", "Minnebar\\ 11\\ Demo\\ -\\ Master.ipynb\r\n", "Pandas-Pivot-Table-Explained.ipynb\r\n", "test.xlsx\r\n" ] } ], "source": [ "!dir" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in the excel file (from the github repo) and turn into a DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.read_excel(\"https://github.com/chris1610/pbpython/blob/master/data/sample-salesv3.xlsx?raw=True\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the file" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameskuquantityunit priceext pricedate
0740150Barton LLCB1-200003986.693380.912014-01-01 07:21:51
1714466Trantow-BarrowsS2-77896-163.16-63.162014-01-01 10:00:47
2218895Kulas IncB1-699242390.702086.102014-01-01 13:24:58
3307599Kassulke, Ondricka and MetzS1-654814121.05863.052014-01-01 15:05:22
4412290Jerde-HilpertS2-34077683.21499.262014-01-01 23:26:55
\n", "
" ], "text/plain": [ " account number name sku quantity \\\n", "0 740150 Barton LLC B1-20000 39 \n", "1 714466 Trantow-Barrows S2-77896 -1 \n", "2 218895 Kulas Inc B1-69924 23 \n", "3 307599 Kassulke, Ondricka and Metz S1-65481 41 \n", "4 412290 Jerde-Hilpert S2-34077 6 \n", "\n", " unit price ext price date \n", "0 86.69 3380.91 2014-01-01 07:21:51 \n", "1 63.16 -63.16 2014-01-01 10:00:47 \n", "2 90.70 2086.10 2014-01-01 13:24:58 \n", "3 21.05 863.05 2014-01-01 15:05:22 \n", "4 83.21 499.26 2014-01-01 23:26:55 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameskuquantityunit priceext pricedate
1495737550Fritsch, Russel and AndersonS1-065321255.80669.602014-12-30 13:38:13
1496786968Frami, Hills and SchmidtS1-065323713.14486.182014-12-30 21:42:17
1497239344Stokes LLCS2-103421438.75542.502014-12-30 22:45:19
1498642753Pollich LLCS2-82423365.97197.912014-12-31 10:36:24
1499383080Will LLCS2-003013846.441764.722014-12-31 12:48:35
\n", "
" ], "text/plain": [ " account number name sku quantity \\\n", "1495 737550 Fritsch, Russel and Anderson S1-06532 12 \n", "1496 786968 Frami, Hills and Schmidt S1-06532 37 \n", "1497 239344 Stokes LLC S2-10342 14 \n", "1498 642753 Pollich LLC S2-82423 3 \n", "1499 383080 Will LLC S2-00301 38 \n", "\n", " unit price ext price date \n", "1495 55.80 669.60 2014-12-30 13:38:13 \n", "1496 13.14 486.18 2014-12-30 21:42:17 \n", "1497 38.75 542.50 2014-12-30 22:45:19 \n", "1498 65.97 197.91 2014-12-31 10:36:24 \n", "1499 46.44 1764.72 2014-12-31 12:48:35 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numberquantityunit priceext price
count1500.0000001500.0000001500.0000001500.000000
mean485957.84133324.30866755.0075271345.856213
std223974.04457214.43926525.9032671084.914881
min141962.000000-1.00000010.030000-97.160000
25%257198.00000012.00000032.500000472.177500
50%527099.00000025.00000055.4650001050.390000
75%714466.00000037.00000077.0750002068.330000
max786968.00000049.00000099.8500004824.540000
\n", "
" ], "text/plain": [ " account number quantity unit price ext price\n", "count 1500.000000 1500.000000 1500.000000 1500.000000\n", "mean 485957.841333 24.308667 55.007527 1345.856213\n", "std 223974.044572 14.439265 25.903267 1084.914881\n", "min 141962.000000 -1.000000 10.030000 -97.160000\n", "25% 257198.000000 12.000000 32.500000 472.177500\n", "50% 527099.000000 25.000000 55.465000 1050.390000\n", "75% 714466.000000 37.000000 77.075000 2068.330000\n", "max 786968.000000 49.000000 99.850000 4824.540000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1500 entries, 0 to 1499\n", "Data columns (total 7 columns):\n", "account number 1500 non-null int64\n", "name 1500 non-null object\n", "sku 1500 non-null object\n", "quantity 1500 non-null int64\n", "unit price 1500 non-null float64\n", "ext price 1500 non-null float64\n", "date 1500 non-null object\n", "dtypes: float64(2), int64(2), object(3)\n", "memory usage: 82.1+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can easily perform math operations on the data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "36463" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.quantity.sum()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "82511.289999999994" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"unit price\"].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can show how to aggregate some data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "name\n", "Barton LLC 4409.06\n", "Cronin, Oberbrunner and Spencer 3336.99\n", "Frami, Hills and Schmidt 3942.49\n", "Fritsch, Russel and Anderson 4350.41\n", "Halvorson, Crona and Champlin 3244.92\n", "Herman LLC 3259.15\n", "Jerde-Hilpert 4635.54\n", "Kassulke, Ondricka and Metz 3266.76\n", "Keeling LLC 4223.63\n", "Kiehn-Spinka 4389.32\n", "Koepp Ltd 4459.96\n", "Kuhn-Gusikowski 4075.87\n", "Kulas Inc 5608.19\n", "Pollich LLC 4126.92\n", "Purdy-Kunde 2668.07\n", "Sanford and Sons 4142.25\n", "Stokes LLC 3711.26\n", "Trantow-Barrows 5280.93\n", "White-Trantow 5040.73\n", "Will LLC 4338.84\n", "Name: unit price, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('name')[\"unit price\"].sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do multiple groupbys" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quantityunit price
namesku
Barton LLCB1-0420235.00000069.5200
B1-0591424.00000013.5300
B1-2000022.00000078.9225
B1-3308722.00000096.4150
B1-3336427.66666740.0800
\n", "
" ], "text/plain": [ " quantity unit price\n", "name sku \n", "Barton LLC B1-04202 35.000000 69.5200\n", " B1-05914 24.000000 13.5300\n", " B1-20000 22.000000 78.9225\n", " B1-33087 22.000000 96.4150\n", " B1-33364 27.666667 40.0800" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby([\"name\", \"sku\"])[\"quantity\", \"unit price\"].mean().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Time series analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Make sure to set the date as a date column" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1500 entries, 0 to 1499\n", "Data columns (total 7 columns):\n", "account number 1500 non-null int64\n", "name 1500 non-null object\n", "sku 1500 non-null object\n", "quantity 1500 non-null int64\n", "unit price 1500 non-null float64\n", "ext price 1500 non-null float64\n", "date 1500 non-null datetime64[ns]\n", "dtypes: datetime64[ns](1), float64(2), int64(2), object(2)\n", "memory usage: 82.1+ KB\n" ] } ], "source": [ "df[\"date\"] = pd.to_datetime(df['date'])\n", "df.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we have a date time object" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account numbernameskuquantityunit priceext pricedate
0740150Barton LLCB1-200003986.693380.912014-01-01 07:21:51
1714466Trantow-BarrowsS2-77896-163.16-63.162014-01-01 10:00:47
2218895Kulas IncB1-699242390.702086.102014-01-01 13:24:58
3307599Kassulke, Ondricka and MetzS1-654814121.05863.052014-01-01 15:05:22
4412290Jerde-HilpertS2-34077683.21499.262014-01-01 23:26:55
\n", "
" ], "text/plain": [ " account number name sku quantity \\\n", "0 740150 Barton LLC B1-20000 39 \n", "1 714466 Trantow-Barrows S2-77896 -1 \n", "2 218895 Kulas Inc B1-69924 23 \n", "3 307599 Kassulke, Ondricka and Metz S1-65481 41 \n", "4 412290 Jerde-Hilpert S2-34077 6 \n", "\n", " unit price ext price date \n", "0 86.69 3380.91 2014-01-01 07:21:51 \n", "1 63.16 -63.16 2014-01-01 10:00:47 \n", "2 90.70 2086.10 2014-01-01 13:24:58 \n", "3 21.05 863.05 2014-01-01 15:05:22 \n", "4 83.21 499.26 2014-01-01 23:26:55 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's see how much we sold by month by customer" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-01-31 6177.57\n", " 2014-02-28 12218.03\n", " 2014-03-31 3513.53\n", " 2014-04-30 11474.20\n", " 2014-05-31 10220.17\n", "Name: ext price, dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').groupby('name')[\"ext price\"].resample(\"M\").sum().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What about every 3 Months?" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-01-31 6177.57\n", " 2014-04-30 27205.76\n", " 2014-07-31 27434.38\n", " 2014-10-31 40946.75\n", " 2015-01-31 7674.04\n", "Name: ext price, dtype: float64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').groupby('name')[\"ext price\"].resample(\"3M\").sum().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What about quarterly ending in Dec?" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-03-31 21909.13\n", " 2014-06-30 32158.10\n", " 2014-09-30 38345.55\n", " 2014-12-31 17025.72\n", "Cronin, Oberbrunner and Spencer 2014-03-31 26809.63\n", "Name: ext price, dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').groupby('name')[\"ext price\"].resample('Q-DEC').sum().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Annual Number" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "name date \n", "Barton LLC 2014-12-31 109438.50\n", "Cronin, Oberbrunner and Spencer 2014-12-31 89734.55\n", "Frami, Hills and Schmidt 2014-12-31 103569.59\n", "Fritsch, Russel and Anderson 2014-12-31 112214.71\n", "Halvorson, Crona and Champlin 2014-12-31 70004.36\n", "Herman LLC 2014-12-31 82865.00\n", "Jerde-Hilpert 2014-12-31 112591.43\n", "Kassulke, Ondricka and Metz 2014-12-31 86451.07\n", "Keeling LLC 2014-12-31 100934.30\n", "Kiehn-Spinka 2014-12-31 99608.77\n", "Koepp Ltd 2014-12-31 103660.54\n", "Kuhn-Gusikowski 2014-12-31 91094.28\n", "Kulas Inc 2014-12-31 137351.96\n", "Pollich LLC 2014-12-31 87347.18\n", "Purdy-Kunde 2014-12-31 77898.21\n", "Sanford and Sons 2014-12-31 98822.98\n", "Stokes LLC 2014-12-31 91535.92\n", "Trantow-Barrows 2014-12-31 123381.38\n", "White-Trantow 2014-12-31 135841.99\n", "Will LLC 2014-12-31 104437.60\n", "Name: ext price, dtype: float64" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').groupby('name')[\"ext price\"].resample('A-DEC').sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "DataFrame vs. Series" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext price
namedate
Barton LLC2014-12-31109438.50
Cronin, Oberbrunner and Spencer2014-12-3189734.55
Frami, Hills and Schmidt2014-12-31103569.59
Fritsch, Russel and Anderson2014-12-31112214.71
Halvorson, Crona and Champlin2014-12-3170004.36
Herman LLC2014-12-3182865.00
Jerde-Hilpert2014-12-31112591.43
Kassulke, Ondricka and Metz2014-12-3186451.07
Keeling LLC2014-12-31100934.30
Kiehn-Spinka2014-12-3199608.77
Koepp Ltd2014-12-31103660.54
Kuhn-Gusikowski2014-12-3191094.28
Kulas Inc2014-12-31137351.96
Pollich LLC2014-12-3187347.18
Purdy-Kunde2014-12-3177898.21
Sanford and Sons2014-12-3198822.98
Stokes LLC2014-12-3191535.92
Trantow-Barrows2014-12-31123381.38
White-Trantow2014-12-31135841.99
Will LLC2014-12-31104437.60
\n", "
" ], "text/plain": [ " ext price\n", "name date \n", "Barton LLC 2014-12-31 109438.50\n", "Cronin, Oberbrunner and Spencer 2014-12-31 89734.55\n", "Frami, Hills and Schmidt 2014-12-31 103569.59\n", "Fritsch, Russel and Anderson 2014-12-31 112214.71\n", "Halvorson, Crona and Champlin 2014-12-31 70004.36\n", "Herman LLC 2014-12-31 82865.00\n", "Jerde-Hilpert 2014-12-31 112591.43\n", "Kassulke, Ondricka and Metz 2014-12-31 86451.07\n", "Keeling LLC 2014-12-31 100934.30\n", "Kiehn-Spinka 2014-12-31 99608.77\n", "Koepp Ltd 2014-12-31 103660.54\n", "Kuhn-Gusikowski 2014-12-31 91094.28\n", "Kulas Inc 2014-12-31 137351.96\n", "Pollich LLC 2014-12-31 87347.18\n", "Purdy-Kunde 2014-12-31 77898.21\n", "Sanford and Sons 2014-12-31 98822.98\n", "Stokes LLC 2014-12-31 91535.92\n", "Trantow-Barrows 2014-12-31 123381.38\n", "White-Trantow 2014-12-31 135841.99\n", "Will LLC 2014-12-31 104437.60" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index('date').groupby('name')[\"ext price\",].resample('A-DEC').sum()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.set_index('date').groupby('name')[\"ext price\",].resample('A-DEC').sum().to_excel(\"test.xlsx\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }