{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Wrangling: Join, Combine, and, Reshape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hierachical Indexing" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "a 1 0.528770\n", " 2 -1.294225\n", " 3 0.656741\n", "b 1 -1.678594\n", " 3 1.599596\n", "c 1 0.461214\n", " 2 0.321586\n", "d 2 0.850888\n", " 3 1.942281\n", "dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.Series(np.random.randn(9), \n", " index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], \n", " [1, 2, 3, 1, 3, 1, 2, 2, 3]])\n", "data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],\n", " labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.index" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 -1.678594\n", "3 1.599596\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[\"b\"]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123
a0.528770-1.2942250.656741
b-1.678594NaN1.599596
c0.4612140.321586NaN
dNaN0.8508881.942281
\n", "
" ], "text/plain": [ " 1 2 3\n", "a 0.528770 -1.294225 0.656741\n", "b -1.678594 NaN 1.599596\n", "c 0.461214 0.321586 NaN\n", "d NaN 0.850888 1.942281" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.unstack()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['a', 'b', 'c', 'd'], dtype='object')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.unstack().index" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Int64Index([1, 2, 3], dtype='int64')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.unstack().columns" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
a10.528770
2-1.294225
30.656741
b1-1.678594
31.599596
c10.461214
20.321586
d20.850888
31.942281
\n", "
" ], "text/plain": [ " 0\n", "a 1 0.528770\n", " 2 -1.294225\n", " 3 0.656741\n", "b 1 -1.678594\n", " 3 1.599596\n", "c 1 0.461214\n", " 2 0.321586\n", "d 2 0.850888\n", " 3 1.942281" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(data)\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Merging" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key
00a
11b
22c
33d
44b
55b
66a
\n", "
" ], "text/plain": [ " data1 key\n", "0 0 a\n", "1 1 b\n", "2 2 c\n", "3 3 d\n", "4 4 b\n", "5 5 b\n", "6 6 a" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 = pd.DataFrame({\"key\": [\"a\", \"b\", \"c\", \"d\", \"b\", \"b\", \"a\"], \n", " \"data1\": range(7)})\n", "df1" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data2key
00a
11b
22b
33c
\n", "
" ], "text/plain": [ " data2 key\n", "0 0 a\n", "1 1 b\n", "2 2 b\n", "3 3 c" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"b\", \"c\"], \n", " 'data2': range(4)})\n", "df2" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00a0
16a0
21b1
31b2
44b1
54b2
65b1
75b2
82c3
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 a 0\n", "1 6 a 0\n", "2 1 b 1\n", "3 1 b 2\n", "4 4 b 1\n", "5 4 b 2\n", "6 5 b 1\n", "7 5 b 2\n", "8 2 c 3" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = pd.merge(df1, df2)\n", "df3" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00a0
16a0
21b1
31b2
44b1
54b2
65b1
75b2
82c3
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 a 0\n", "1 6 a 0\n", "2 1 b 1\n", "3 1 b 2\n", "4 4 b 1\n", "5 4 b 2\n", "6 5 b 1\n", "7 5 b 2\n", "8 2 c 3" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.merge(df2)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00a0.0
11b1.0
21b2.0
32c3.0
43dNaN
54b1.0
64b2.0
75b1.0
85b2.0
96a0.0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 a 0.0\n", "1 1 b 1.0\n", "2 1 b 2.0\n", "3 2 c 3.0\n", "4 3 d NaN\n", "5 4 b 1.0\n", "6 4 b 2.0\n", "7 5 b 1.0\n", "8 5 b 2.0\n", "9 6 a 0.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.merge(df2, on = \"key\", how = \"left\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "a 0\n", "b 1\n", "dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1 = pd.Series([0, 1], index=['a', 'b'])\n", "s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])\n", "s3 = pd.Series([5, 6], index=['f', 'g'])\n", "s1" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "c 2\n", "d 3\n", "e 4\n", "dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s2" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "f 5\n", "g 6\n", "dtype: int64" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s3" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "a 0\n", "b 1\n", "c 2\n", "d 3\n", "e 4\n", "f 5\n", "g 6\n", "dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([s1, s2, s3])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCkey
0147foo
1258bar
2369baz
\n", "
" ], "text/plain": [ " A B C key\n", "0 1 4 7 foo\n", "1 2 5 8 bar\n", "2 3 6 9 baz" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({'key': ['foo', 'bar', 'baz'], \n", " 'A': [1, 2, 3], \n", " 'B': [4, 5, 6], \n", " 'C': [7, 8, 9]})\n", "df" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keyvariablevalue
0fooA1
1barA2
2bazA3
3fooB4
4barB5
5bazB6
6fooC7
7barC8
8bazC9
\n", "
" ], "text/plain": [ " key variable value\n", "0 foo A 1\n", "1 bar A 2\n", "2 baz A 3\n", "3 foo B 4\n", "4 bar B 5\n", "5 baz B 6\n", "6 foo C 7\n", "7 bar C 8\n", "8 baz C 9" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "melted = pd.melt(df, [\"key\"])\n", "melted" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variableABC
key
bar258
baz369
foo147
\n", "
" ], "text/plain": [ "variable A B C\n", "key \n", "bar 2 5 8\n", "baz 3 6 9\n", "foo 1 4 7" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pivoted = melted.pivot(index = \"key\", values = \"value\", columns = \"variable\")\n", "pivoted" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }