{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Wrangling: Join, Combine, and, Reshape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hierachical Indexing"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"a 1 0.528770\n",
" 2 -1.294225\n",
" 3 0.656741\n",
"b 1 -1.678594\n",
" 3 1.599596\n",
"c 1 0.461214\n",
" 2 0.321586\n",
"d 2 0.850888\n",
" 3 1.942281\n",
"dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.Series(np.random.randn(9), \n",
" index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], \n",
" [1, 2, 3, 1, 3, 1, 2, 2, 3]])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],\n",
" labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 -1.678594\n",
"3 1.599596\n",
"dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"b\"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 0.528770 | \n",
" -1.294225 | \n",
" 0.656741 | \n",
"
\n",
" \n",
" b | \n",
" -1.678594 | \n",
" NaN | \n",
" 1.599596 | \n",
"
\n",
" \n",
" c | \n",
" 0.461214 | \n",
" 0.321586 | \n",
" NaN | \n",
"
\n",
" \n",
" d | \n",
" NaN | \n",
" 0.850888 | \n",
" 1.942281 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 1 2 3\n",
"a 0.528770 -1.294225 0.656741\n",
"b -1.678594 NaN 1.599596\n",
"c 0.461214 0.321586 NaN\n",
"d NaN 0.850888 1.942281"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.unstack()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['a', 'b', 'c', 'd'], dtype='object')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.unstack().index"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Int64Index([1, 2, 3], dtype='int64')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.unstack().columns"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" | \n",
" 0 | \n",
"
\n",
" \n",
" \n",
" \n",
" a | \n",
" 1 | \n",
" 0.528770 | \n",
"
\n",
" \n",
" 2 | \n",
" -1.294225 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.656741 | \n",
"
\n",
" \n",
" b | \n",
" 1 | \n",
" -1.678594 | \n",
"
\n",
" \n",
" 3 | \n",
" 1.599596 | \n",
"
\n",
" \n",
" c | \n",
" 1 | \n",
" 0.461214 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.321586 | \n",
"
\n",
" \n",
" d | \n",
" 2 | \n",
" 0.850888 | \n",
"
\n",
" \n",
" 3 | \n",
" 1.942281 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0\n",
"a 1 0.528770\n",
" 2 -1.294225\n",
" 3 0.656741\n",
"b 1 -1.678594\n",
" 3 1.599596\n",
"c 1 0.461214\n",
" 2 0.321586\n",
"d 2 0.850888\n",
" 3 1.942281"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(data)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merging"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data1 | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" a | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" b | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" c | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" b | \n",
"
\n",
" \n",
" 5 | \n",
" 5 | \n",
" b | \n",
"
\n",
" \n",
" 6 | \n",
" 6 | \n",
" a | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data1 key\n",
"0 0 a\n",
"1 1 b\n",
"2 2 c\n",
"3 3 d\n",
"4 4 b\n",
"5 5 b\n",
"6 6 a"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = pd.DataFrame({\"key\": [\"a\", \"b\", \"c\", \"d\", \"b\", \"b\", \"a\"], \n",
" \"data1\": range(7)})\n",
"df1"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data2 | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" a | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" b | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" b | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" c | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data2 key\n",
"0 0 a\n",
"1 1 b\n",
"2 2 b\n",
"3 3 c"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = pd.DataFrame({\"key\": [\"a\", \"b\", \"b\", \"c\"], \n",
" 'data2': range(4)})\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data1 | \n",
" key | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 6 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" b | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 4 | \n",
" b | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 5 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 5 | \n",
" b | \n",
" 2 | \n",
"
\n",
" \n",
" 8 | \n",
" 2 | \n",
" c | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data1 key data2\n",
"0 0 a 0\n",
"1 6 a 0\n",
"2 1 b 1\n",
"3 1 b 2\n",
"4 4 b 1\n",
"5 4 b 2\n",
"6 5 b 1\n",
"7 5 b 2\n",
"8 2 c 3"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3 = pd.merge(df1, df2)\n",
"df3"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data1 | \n",
" key | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 6 | \n",
" a | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" b | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 4 | \n",
" b | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" 5 | \n",
" b | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 5 | \n",
" b | \n",
" 2 | \n",
"
\n",
" \n",
" 8 | \n",
" 2 | \n",
" c | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data1 key data2\n",
"0 0 a 0\n",
"1 6 a 0\n",
"2 1 b 1\n",
"3 1 b 2\n",
"4 4 b 1\n",
"5 4 b 2\n",
"6 5 b 1\n",
"7 5 b 2\n",
"8 2 c 3"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.merge(df2)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data1 | \n",
" key | \n",
" data2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" a | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" b | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" b | \n",
" 2.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" c | \n",
" 3.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 3 | \n",
" d | \n",
" NaN | \n",
"
\n",
" \n",
" 5 | \n",
" 4 | \n",
" b | \n",
" 1.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 4 | \n",
" b | \n",
" 2.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 5 | \n",
" b | \n",
" 1.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 5 | \n",
" b | \n",
" 2.0 | \n",
"
\n",
" \n",
" 9 | \n",
" 6 | \n",
" a | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data1 key data2\n",
"0 0 a 0.0\n",
"1 1 b 1.0\n",
"2 1 b 2.0\n",
"3 2 c 3.0\n",
"4 3 d NaN\n",
"5 4 b 1.0\n",
"6 4 b 2.0\n",
"7 5 b 1.0\n",
"8 5 b 2.0\n",
"9 6 a 0.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.merge(df2, on = \"key\", how = \"left\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"a 0\n",
"b 1\n",
"dtype: int64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s1 = pd.Series([0, 1], index=['a', 'b'])\n",
"s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])\n",
"s3 = pd.Series([5, 6], index=['f', 'g'])\n",
"s1"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"c 2\n",
"d 3\n",
"e 4\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s2"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"f 5\n",
"g 6\n",
"dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s3"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"a 0\n",
"b 1\n",
"c 2\n",
"d 3\n",
"e 4\n",
"f 5\n",
"g 6\n",
"dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.concat([s1, s2, s3])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
" C | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 4 | \n",
" 7 | \n",
" foo | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 5 | \n",
" 8 | \n",
" bar | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 6 | \n",
" 9 | \n",
" baz | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B C key\n",
"0 1 4 7 foo\n",
"1 2 5 8 bar\n",
"2 3 6 9 baz"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({'key': ['foo', 'bar', 'baz'], \n",
" 'A': [1, 2, 3], \n",
" 'B': [4, 5, 6], \n",
" 'C': [7, 8, 9]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" variable | \n",
" value | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" foo | \n",
" A | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" bar | \n",
" A | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" baz | \n",
" A | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" foo | \n",
" B | \n",
" 4 | \n",
"
\n",
" \n",
" 4 | \n",
" bar | \n",
" B | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" baz | \n",
" B | \n",
" 6 | \n",
"
\n",
" \n",
" 6 | \n",
" foo | \n",
" C | \n",
" 7 | \n",
"
\n",
" \n",
" 7 | \n",
" bar | \n",
" C | \n",
" 8 | \n",
"
\n",
" \n",
" 8 | \n",
" baz | \n",
" C | \n",
" 9 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key variable value\n",
"0 foo A 1\n",
"1 bar A 2\n",
"2 baz A 3\n",
"3 foo B 4\n",
"4 bar B 5\n",
"5 baz B 6\n",
"6 foo C 7\n",
"7 bar C 8\n",
"8 baz C 9"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"melted = pd.melt(df, [\"key\"])\n",
"melted"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" variable | \n",
" A | \n",
" B | \n",
" C | \n",
"
\n",
" \n",
" key | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" bar | \n",
" 2 | \n",
" 5 | \n",
" 8 | \n",
"
\n",
" \n",
" baz | \n",
" 3 | \n",
" 6 | \n",
" 9 | \n",
"
\n",
" \n",
" foo | \n",
" 1 | \n",
" 4 | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"variable A B C\n",
"key \n",
"bar 2 5 8\n",
"baz 3 6 9\n",
"foo 1 4 7"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pivoted = melted.pivot(index = \"key\", values = \"value\", columns = \"variable\")\n",
"pivoted"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}