{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Grouping, Splitting and Pivoting Data\n", "\n", "In this notebook we'll go over ways to group, split and pivot data using the DataFrame#group_by and and DataFrame#pivot_table functions." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "data": { "application/javascript": [ "if(window['d3'] === undefined ||\n", " window['Nyaplot'] === undefined){\n", " var path = {\"d3\":\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min\",\"downloadable\":\"https://cdn.rawgit.com/domitry/d3-downloadable/master/d3-downloadable\"};\n", "\n", "\n", "\n", " var shim = {\"d3\":{\"exports\":\"d3\"},\"downloadable\":{\"exports\":\"downloadable\"}};\n", "\n", " require.config({paths: path, shim:shim});\n", "\n", "\n", "require(['d3'], function(d3){window['d3']=d3;console.log('finished loading d3');require(['downloadable'], function(downloadable){window['downloadable']=downloadable;console.log('finished loading downloadable');\n", "\n", "\tvar script = d3.select(\"head\")\n", "\t .append(\"script\")\n", "\t .attr(\"src\", \"https://cdn.rawgit.com/domitry/Nyaplotjs/master/release/nyaplot.js\")\n", "\t .attr(\"async\", true);\n", "\n", "\tscript[0][0].onload = script[0][0].onreadystatechange = function(){\n", "\n", "\n", "\t var event = document.createEvent(\"HTMLEvents\");\n", "\t event.initEvent(\"load_nyaplot\",false,false);\n", "\t window.dispatchEvent(event);\n", "\t console.log('Finished loading Nyaplotjs');\n", "\n", "\t};\n", "\n", "\n", "});});\n", "}\n" ], "text/plain": [ "\"if(window['d3'] === undefined ||\\n window['Nyaplot'] === undefined){\\n var path = {\\\"d3\\\":\\\"https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min\\\",\\\"downloadable\\\":\\\"https://cdn.rawgit.com/domitry/d3-downloadable/master/d3-downloadable\\\"};\\n\\n\\n\\n var shim = {\\\"d3\\\":{\\\"exports\\\":\\\"d3\\\"},\\\"downloadable\\\":{\\\"exports\\\":\\\"downloadable\\\"}};\\n\\n require.config({paths: path, shim:shim});\\n\\n\\nrequire(['d3'], function(d3){window['d3']=d3;console.log('finished loading d3');require(['downloadable'], function(downloadable){window['downloadable']=downloadable;console.log('finished loading downloadable');\\n\\n\\tvar script = d3.select(\\\"head\\\")\\n\\t .append(\\\"script\\\")\\n\\t .attr(\\\"src\\\", \\\"https://cdn.rawgit.com/domitry/Nyaplotjs/master/release/nyaplot.js\\\")\\n\\t .attr(\\\"async\\\", true);\\n\\n\\tscript[0][0].onload = script[0][0].onreadystatechange = function(){\\n\\n\\n\\t var event = document.createEvent(\\\"HTMLEvents\\\");\\n\\t event.initEvent(\\\"load_nyaplot\\\",false,false);\\n\\t window.dispatchEvent(event);\\n\\t console.log('Finished loading Nyaplotjs');\\n\\n\\t};\\n\\n\\n});});\\n}\\n\"" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "true" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "require 'daru'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Lets first create a hierarchically indexed DataFrame." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:24162060 rows: 12 cols: 4
[:a, :one, :bar][:a, :two, :baz][:b, :two, :foo][:b, :one, :foo]
[:a, :one, :bar]111111
[:a, :one, :baz]122122
[:a, :two, :bar]133133
[:a, :two, :baz]144144
[:b, :one, :bar]111111
[:b, :two, :bar]122122
[:b, :two, :baz]133133
[:b, :one, :foo]144144
[:c, :one, :bar]111111
[:c, :one, :baz]122122
[:c, :two, :foo]133133
[:c, :two, :bar]144144
" ], "text/plain": [ "\n", "#\n", " [:a, :one, [:a, :two, [:b, :two, [:b, :one, \n", "[:a, :one, 11 1 11 1 \n", "[:a, :one, 12 2 12 2 \n", "[:a, :two, 13 3 13 3 \n", "[:a, :two, 14 4 14 4 \n", "[:b, :one, 11 1 11 1 \n", "[:b, :two, 12 2 12 2 \n", "[:b, :two, 13 3 13 3 \n", "[:b, :one, 14 4 14 4 \n", "[:c, :one, 11 1 11 1 \n", "[:c, :one, 12 2 12 2 \n", "[:c, :two, 13 3 13 3 \n", "[:c, :two, 14 4 14 4 \n" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a multi-indexed DataFrame\n", "\n", "tuples = [\n", " [:a,:one,:bar],\n", " [:a,:one,:baz],\n", " [:a,:two,:bar],\n", " [:a,:two,:baz],\n", " [:b,:one,:bar],\n", " [:b,:two,:bar],\n", " [:b,:two,:baz],\n", " [:b,:one,:foo],\n", " [:c,:one,:bar],\n", " [:c,:one,:baz],\n", " [:c,:two,:foo],\n", " [:c,:two,:bar]\n", "]\n", "multi_index = Daru::MultiIndex.from_tuples(tuples)\n", "\n", "vector_arry1 = [11,12,13,14,11,12,13,14,11,12,13,14]\n", "vector_arry2 = [1,2,3,4,1,2,3,4,1,2,3,4]\n", "\n", "order_mi = Daru::MultiIndex.from_tuples([\n", " [:a,:one,:bar],\n", " [:a,:two,:baz],\n", " [:b,:two,:foo],\n", " [:b,:one,:foo]])\n", "\n", "df_mi = Daru::DataFrame.new([\n", " vector_arry1, \n", " vector_arry2, \n", " vector_arry1, \n", " vector_arry2], order: order_mi, index: multi_index)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To select a row from a multi-indexed dataframe, you should pass the full tuple to the `#row[]` method. Partial tuples will return whatever row that match the tuple partially." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::Vector:22866540 size: 4
0
[:a, :one, :bar]11
[:a, :two, :baz]1
[:b, :two, :foo]11
[:b, :one, :foo]1
" ], "text/plain": [ "\n", "#\n", " 0\n", "[:a, :one, :bar] 11\n", "[:a, :two, :baz] 1\n", "[:b, :two, :foo] 11\n", "[:b, :one, :foo] 1\n" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Specify complete tuple to choose a single row\n", "df_mi.row[:a, :one,:bar]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:22500640 rows: 4 cols: 4
[:a, :one, :bar][:a, :two, :baz][:b, :two, :foo][:b, :one, :foo]
[:one, :bar]111111
[:one, :baz]122122
[:two, :bar]133133
[:two, :baz]144144
" ], "text/plain": [ "\n", "#\n", " [:a, :one, [:a, :two, [:b, :two, [:b, :one, \n", "[:one, :ba 11 1 11 1 \n", "[:one, :ba 12 2 12 2 \n", "[:two, :ba 13 3 13 3 \n", "[:two, :ba 14 4 14 4 \n" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Specify partial tuple to select index hierarchially\n", "df_mi.row[:a]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping\n", "\n", "The DataFrame#group_by method allows you to group elements in the dataframe by name. This is similar to SQL GROUP BY." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{[\"bar\", \"one\"]=>[1], [\"bar\", \"three\"]=>[3], [\"bar\", \"two\"]=>[5], [\"foo\", \"one\"]=>[0, 6], [\"foo\", \"three\"]=>[7], [\"foo\", \"two\"]=>[2, 4]}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# See grouped rows with the 'groups' method\n", "\n", "df = Daru::DataFrame.new({\n", " a: %w{foo bar foo bar foo bar foo foo},\n", " b: %w{one one two three two two one three},\n", " c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],\n", " d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]\n", "})\n", "\n", "# Pass the vectors that are to be grouped in an Array to the group_by method. This \n", "# will return a Daru::Core::GroupBy object.\n", "grouped = df.group_by([:a, :b])\n", "\n", "# See the groups created using the 'groups' method.\n", "grouped.groups" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:21484420 rows: 6 cols: 2
cd
[\"bar\", \"one\"]222
[\"bar\", \"three\"]144
[\"bar\", \"two\"]666
[\"foo\", \"one\"]2.044.0
[\"foo\", \"three\"]888
[\"foo\", \"two\"]3.044.0
" ], "text/plain": [ "\n", "#\n", " c d \n", "[\"bar\", \"o 2 22 \n", "[\"bar\", \"t 1 44 \n", "[\"bar\", \"t 6 66 \n", "[\"foo\", \"o 2.0 44.0 \n", "[\"foo\", \"t 8 88 \n", "[\"foo\", \"t 3.0 44.0 \n" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First group by the columns :a and :b and then calculate mean of the grouped rows.\n", "grouped.mean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The #get_group method can be used for accessing a particualar group(s)." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:21046320 rows: 2 cols: 4
abcd
0fooone111
6fooone377
" ], "text/plain": [ "\n", "#\n", " a b c d \n", " 0 foo one 1 11 \n", " 6 foo one 3 77 \n" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grouped.get_group([\"foo\", \"one\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivoting Data\n", "\n", "Similar to Excel's Pivot Table, DataFrame provides the #pivot_table functions for quickly pivoting data around a particular value(s) and getting quick insights.\n", "\n", "Lets demonstrate using some sales data." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:18812540 rows: 17 cols: 8
AccountManagerNamePriceProductQuantityRepStatus
0714466Debra HenleyTrantow-Barrows30000CPU1Craig Bookerpresented
1714466Debra HenleyTrantow-Barrows10000Software1Craig Bookerpresented
2714466Debra HenleyTrantow-Barrows5000Maintenance2Craig Bookerpending
3737550Debra HenleyFritsch, Russel and Anderson35000CPU1Craig Bookerdeclined
4146832Debra HenleyKiehn-Spinka65000CPU2Daniel Hiltonwon
5218895Debra HenleyKulas Inc40000CPU2Daniel Hiltonpending
6218895Debra HenleyKulas Inc10000Software1Daniel Hiltonpresented
7412290Debra HenleyJerde-Hilpert5000Maintenance2John Smithpending
8740150Debra HenleyBarton LLC35000CPU1John Smithdeclined
9141962Fred AndersonHerman LLC65000CPU2Cedric Mosswon
10163416Fred AndersonPurdy-Kunde30000CPU1Cedric Mosspresented
11239344Fred AndersonStokes LLC5000Maintenance1Cedric Mosspending
12239344Fred AndersonStokes LLC10000Software1Cedric Mosspresented
13307599Fred AndersonKassulke, Ondricka and Metz7000Maintenance3Wendy Yulewon
14688981Fred AndersonKeeling LLC100000CPU5Wendy Yulewon
15729833Fred AndersonKoepp Ltd65000CPU2Wendy Yuledeclined
16729833Fred AndersonKoepp Ltd5000Monitor2Wendy Yulepresented
" ], "text/plain": [ "\n", "#\n", " Account Manager Name Price Product Quantity Rep Status \n", " 0 714466 Debra Henl Trantow-Ba 30000 CPU 1 Craig Book presented \n", " 1 714466 Debra Henl Trantow-Ba 10000 Software 1 Craig Book presented \n", " 2 714466 Debra Henl Trantow-Ba 5000 Maintenanc 2 Craig Book pending \n", " 3 737550 Debra Henl Fritsch, R 35000 CPU 1 Craig Book declined \n", " 4 146832 Debra Henl Kiehn-Spin 65000 CPU 2 Daniel Hil won \n", " 5 218895 Debra Henl Kulas Inc 40000 CPU 2 Daniel Hil pending \n", " 6 218895 Debra Henl Kulas Inc 10000 Software 1 Daniel Hil presented \n", " 7 412290 Debra Henl Jerde-Hilp 5000 Maintenanc 2 John Smith pending \n", " 8 740150 Debra Henl Barton LLC 35000 CPU 1 John Smith declined \n", " 9 141962 Fred Ander Herman LLC 65000 CPU 2 Cedric Mos won \n", " 10 163416 Fred Ander Purdy-Kund 30000 CPU 1 Cedric Mos presented \n", " 11 239344 Fred Ander Stokes LLC 5000 Maintenanc 1 Cedric Mos pending \n", " 12 239344 Fred Ander Stokes LLC 10000 Software 1 Cedric Mos presented \n", " 13 307599 Fred Ander Kassulke, 7000 Maintenanc 3 Wendy Yule won \n", " 14 688981 Fred Ander Keeling LL 100000 CPU 5 Wendy Yule won \n", " ... ... ... ... ... ... ... ... ... \n" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales = Daru::DataFrame.from_csv 'data/sales-funnel.csv'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The #pivot_table method accepts an option :index, in which you can specify what vectors you want to index your DataFrame against." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:17258300 rows: 5 cols: 3
AccountPriceQuantity
[\"Debra Henley\", \"Craig Booker\"]720237.020000.01.25
[\"Debra Henley\", \"Daniel Hilton\"]194874.038333.3333333333361.6666666666666667
[\"Debra Henley\", \"John Smith\"]576220.020000.01.5
[\"Fred Anderson\", \"Cedric Moss\"]196016.527500.01.25
[\"Fred Anderson\", \"Wendy Yule\"]614061.544250.03.0
" ], "text/plain": [ "\n", "#\n", " Account Price Quantity \n", "[\"Debra He 720237.0 20000.0 1.25 \n", "[\"Debra He 194874.0 38333.3333 1.66666666 \n", "[\"Debra He 576220.0 20000.0 1.5 \n", "[\"Fred And 196016.5 27500.0 1.25 \n", "[\"Fred And 614061.5 44250.0 3.0 \n" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales.pivot_table index: ['Manager', 'Rep']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can also specify the `:values` option for specifying which Vector is to be used for the values.\n", "\n", "The `:vectors` options lets you specify the columns to pivot against.\n", "\n", "The `:agg` option specifies the aggregation function. This can be any stats method like :mean, :median, :product, etc." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
Daru::DataFrame:15752920 rows: 5 cols: 12
[\"Account\", \"CPU\"][\"Account\", \"Software\"][\"Account\", \"Maintenance\"][\"Price\", \"CPU\"][\"Price\", \"Software\"][\"Price\", \"Maintenance\"][\"Quantity\", \"CPU\"][\"Quantity\", \"Software\"][\"Quantity\", \"Maintenance\"][\"Account\", \"Monitor\"][\"Price\", \"Monitor\"][\"Quantity\", \"Monitor\"]
[\"Debra Henley\", \"Craig Booker\"]145201671446671446665000100005000212
[\"Debra Henley\", \"Daniel Hilton\"]3657272188951050001000041
[\"Debra Henley\", \"John Smith\"]74015041229035000500012
[\"Fred Anderson\", \"Cedric Moss\"]30537823934423934495000100005000311
[\"Fred Anderson\", \"Wendy Yule\"]141881430759916500070007372983350002
" ], "text/plain": [ "\n", "#\n", " [\"Account\" [\"Account\" [\"Account\" [\"Price\", [\"Price\", [\"Price\", [\"Quantity [\"Quantity [\"Quantity [\"Account\" [\"Price\", [\"Quantity \n", "[\"Debra He 1452016 714466 714466 65000 10000 5000 2 1 2 nil nil nil \n", "[\"Debra He 365727 218895 nil 105000 10000 nil 4 1 nil nil nil nil \n", "[\"Debra He 740150 nil 412290 35000 nil 5000 1 nil 2 nil nil nil \n", "[\"Fred And 305378 239344 239344 95000 10000 5000 3 1 1 nil nil nil \n", "[\"Fred And 1418814 nil 307599 165000 nil 7000 7 nil 3 729833 5000 2 \n" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sales.pivot_table(index: ['Manager','Rep'], values: 'Price', vectors: ['Product'], agg: :sum)" ] } ], "metadata": { "kernelspec": { "display_name": "Ruby 2.2.1", "language": "ruby", "name": "ruby" }, "language_info": { "file_extension": ".rb", "mimetype": "application/x-ruby", "name": "ruby", "version": "2.2.1" } }, "nbformat": 4, "nbformat_minor": 0 }