{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Reshaping and Pivoting" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Reshaping with Hierarchical Indexing\n", "Stack and Unstack" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numberonetwothree
state
Ohio012
Colorado345
\n", "
" ], "text/plain": [ "number one two three\n", "state \n", "Ohio 0 1 2\n", "Colorado 3 4 5" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(6).reshape((2, 3)),\n", " index=pd.Index(['Ohio', 'Colorado'], name='state'),\n", " columns=pd.Index(['one', 'two', 'three'], name='number'))\n", "data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "state number\n", "Ohio one 0\n", " two 1\n", " three 2\n", "Colorado one 3\n", " two 4\n", " three 5\n", "dtype: int32" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# stacking\n", "res = data.stack()\n", "res" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numberonetwothree
state
Ohio012
Colorado345
\n", "
" ], "text/plain": [ "number one two three\n", "state \n", "Ohio 0 1 2\n", "Colorado 3 4 5" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.unstack()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
number
one03
two14
three25
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "number \n", "one 0 3\n", "two 1 4\n", "three 2 5" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By default the innermost level is unstacked (same with stack). You can unstack a different\n", "# level by passing a level number or name\n", "\n", "res.unstack(0)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
numberonetwothree
state
Ohio012
Colorado345
\n", "
" ], "text/plain": [ "number one two three\n", "state \n", "Ohio 0 1 2\n", "Colorado 3 4 5" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.unstack(1)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
number
one03
two14
three25
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "number \n", "one 0 3\n", "two 1 4\n", "three 2 5" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.unstack('state')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one a 0\n", " b 1\n", " c 2\n", " d 3\n", "two c 4\n", " d 5\n", " e 6\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])\n", "s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])\n", "\n", "data2 = pd.concat([s1, s2], keys=['one', 'two'])\n", "data2" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0
\n", "
" ], "text/plain": [ " a b c d e\n", "one 0.0 1.0 2.0 3.0 NaN\n", "two NaN NaN 4.0 5.0 6.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2.unstack()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a0.0NaN
b1.0NaN
c2.04.0
d3.05.0
eNaN6.0
\n", "
" ], "text/plain": [ " one two\n", "a 0.0 NaN\n", "b 1.0 NaN\n", "c 2.0 4.0\n", "d 3.0 5.0\n", "e NaN 6.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2.unstack(0)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one a 0.0\n", " b 1.0\n", " c 2.0\n", " d 3.0\n", "two c 4.0\n", " d 5.0\n", " e 6.0\n", "dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Unstacking might introduce missing data if all of the values in the level aren’t found in\n", "# each of the subgroups\n", "# Stacking filters out missing data by default, so the operation is easily invertible\n", "data2.unstack().stack()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one a 0.0\n", " b 1.0\n", " c 2.0\n", " d 3.0\n", " e NaN\n", "two a NaN\n", " b NaN\n", " c 4.0\n", " d 5.0\n", " e 6.0\n", "dtype: float64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2.unstack().stack(dropna=False)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sideleftright
statenumber
Ohioone05
two16
three27
Coloradoone38
two49
three510
\n", "
" ], "text/plain": [ "side left right\n", "state number \n", "Ohio one 0 5\n", " two 1 6\n", " three 2 7\n", "Colorado one 3 8\n", " two 4 9\n", " three 5 10" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# When unstacking in a DataFrame, the level unstacked becomes the lowest level in the result\n", "df = pd.DataFrame({'left': res, 'right': res + 5},\n", " columns=pd.Index(['left', 'right'], name='side'))\n", "df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sideleftright
stateOhioColoradoOhioColorado
number
one0358
two1469
three25710
\n", "
" ], "text/plain": [ "side left right \n", "state Ohio Colorado Ohio Colorado\n", "number \n", "one 0 3 5 8\n", "two 1 4 6 9\n", "three 2 5 7 10" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.unstack('state')" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
numberside
oneleft03
right58
twoleft14
right69
threeleft25
right710
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "number side \n", "one left 0 3\n", " right 5 8\n", "two left 1 4\n", " right 6 9\n", "three left 2 5\n", " right 7 10" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.unstack('state').stack('side')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "state number side \n", "Ohio one left 0\n", " right 5\n", " two left 1\n", " right 6\n", " three left 2\n", " right 7\n", "Colorado one left 3\n", " right 8\n", " two left 4\n", " right 9\n", " three left 5\n", " right 10\n", "dtype: int32" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.stack('side')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
numberside
oneleft03
right58
twoleft14
right69
threeleft25
right710
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "number side \n", "one left 0 3\n", " right 5 8\n", "two left 1 4\n", " right 6 9\n", "three left 2 5\n", " right 7 10" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.stack('side').unstack('state') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### If moving Header as a column use 'stack' else 'unstack'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pivoting “long” to “wide” Format" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateitemvalue
01959-03-31realgdp2710.349
11959-03-31infl0.000
21959-03-31unemp5.800
31959-06-30realgdp2778.801
41959-06-30infl2.340
51959-06-30unemp5.100
61959-09-30realgdp2775.488
71959-09-30infl2.740
81959-09-30unemp5.300
91959-12-31realgdp2785.204
\n", "
" ], "text/plain": [ " date item value\n", "0 1959-03-31 realgdp 2710.349\n", "1 1959-03-31 infl 0.000\n", "2 1959-03-31 unemp 5.800\n", "3 1959-06-30 realgdp 2778.801\n", "4 1959-06-30 infl 2.340\n", "5 1959-06-30 unemp 5.100\n", "6 1959-09-30 realgdp 2775.488\n", "7 1959-09-30 infl 2.740\n", "8 1959-09-30 unemp 5.300\n", "9 1959-12-31 realgdp 2785.204" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\"dataset/stack.txt\", sep='|')\n", "df" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iteminflrealgdpunemp
date
1959-03-310.002710.3495.8
1959-06-302.342778.8015.1
1959-09-302.742775.4885.3
1959-12-31NaN2785.204NaN
\n", "
" ], "text/plain": [ "item infl realgdp unemp\n", "date \n", "1959-03-31 0.00 2710.349 5.8\n", "1959-06-30 2.34 2778.801 5.1\n", "1959-09-30 2.74 2775.488 5.3\n", "1959-12-31 NaN 2785.204 NaN" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pivoted = df.pivot('date', 'item', 'value')\n", "pivoted" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dateitemvaluevalue2
01959-03-31realgdp2710.349-1.083851
11959-03-31infl0.0000.644205
21959-03-31unemp5.8001.590181
31959-06-30realgdp2778.8010.550000
41959-06-30infl2.3400.149167
51959-06-30unemp5.1001.316712
61959-09-30realgdp2775.488-0.026140
71959-09-30infl2.740-0.054482
81959-09-30unemp5.300-0.167150
91959-12-31realgdp2785.204-0.728173
\n", "
" ], "text/plain": [ " date item value value2\n", "0 1959-03-31 realgdp 2710.349 -1.083851\n", "1 1959-03-31 infl 0.000 0.644205\n", "2 1959-03-31 unemp 5.800 1.590181\n", "3 1959-06-30 realgdp 2778.801 0.550000\n", "4 1959-06-30 infl 2.340 0.149167\n", "5 1959-06-30 unemp 5.100 1.316712\n", "6 1959-09-30 realgdp 2775.488 -0.026140\n", "7 1959-09-30 infl 2.740 -0.054482\n", "8 1959-09-30 unemp 5.300 -0.167150\n", "9 1959-12-31 realgdp 2785.204 -0.728173" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['value2'] = np.random.randn(len(df))\n", "df" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuevalue2
iteminflrealgdpunempinflrealgdpunemp
date
1959-03-310.002710.3495.80.644205-1.0838511.590181
1959-06-302.342778.8015.10.1491670.5500001.316712
1959-09-302.742775.4885.3-0.054482-0.026140-0.167150
1959-12-31NaN2785.204NaNNaN-0.728173NaN
\n", "
" ], "text/plain": [ " value value2 \n", "item infl realgdp unemp infl realgdp unemp\n", "date \n", "1959-03-31 0.00 2710.349 5.8 0.644205 -1.083851 1.590181\n", "1959-06-30 2.34 2778.801 5.1 0.149167 0.550000 1.316712\n", "1959-09-30 2.74 2775.488 5.3 -0.054482 -0.026140 -0.167150\n", "1959-12-31 NaN 2785.204 NaN NaN -0.728173 NaN" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pivoted = df.pivot('date', 'item')\n", "pivoted" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
valuevalue2
iteminflrealgdpunempinflrealgdpunemp
date
1959-03-310.002710.3495.80.644205-1.0838511.590181
1959-06-302.342778.8015.10.1491670.5500001.316712
1959-09-302.742775.4885.3-0.054482-0.026140-0.167150
1959-12-31NaN2785.204NaNNaN-0.728173NaN
\n", "
" ], "text/plain": [ " value value2 \n", "item infl realgdp unemp infl realgdp unemp\n", "date \n", "1959-03-31 0.00 2710.349 5.8 0.644205 -1.083851 1.590181\n", "1959-06-30 2.34 2778.801 5.1 0.149167 0.550000 1.316712\n", "1959-09-30 2.74 2775.488 5.3 -0.054482 -0.026140 -0.167150\n", "1959-12-31 NaN 2785.204 NaN NaN -0.728173 NaN" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note that pivot is just a shortcut for creating a hierarchical index using set_index and\n", "# reshaping with unstack:\n", "unstacked = df.set_index(['date', 'item']).unstack('item')\n", "unstacked" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }