{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Combining and Merging Data Sets\n", "\n", "```\n", "pandas.merge connects rows in DataFrames based on one or more keys. \n", "pandas.concat glues or stacks together objects along an axis.\n", "combine_first instance method enables splicing together overlapping data to fill\n", " in missing values in one object with values from another.\n", "```" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " data1 key\n", "0 0 b\n", "1 1 b\n", "2 2 a\n", "3 3 c\n", "4 4 a\n", "5 5 a\n", "6 6 b \n", "\n", " data2 key\n", "0 0 a\n", "1 1 b\n", "2 2 d\n" ] } ], "source": [ "df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", " 'data1': range(7)})\n", "df2 = pd.DataFrame({'key': ['a', 'b', 'd'],\n", " 'data2': range(3)})\n", "\n", "print(df1, \"\\n\\n\", df2)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 b 1\n", "1 1 b 1\n", "2 6 b 1\n", "3 2 a 0\n", "4 4 a 0\n", "5 5 a 0" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 b 1\n", "1 1 b 1\n", "2 6 b 1\n", "3 2 a 0\n", "4 4 a 0\n", "5 5 a 0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, on='key')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key_xdata2key_y
00b0a
11b1b
22a2d
\n", "
" ], "text/plain": [ " data1 key_x data2 key_y\n", "0 0 b 0 a\n", "1 1 b 1 b\n", "2 2 a 2 d" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if column name are diff, then use as below - \n", "\n", "pd.merge(df1, df2, left_on='data1', right_on='data2')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00.0b1.0
11.0b1.0
26.0b1.0
32.0a0.0
44.0a0.0
55.0a0.0
63.0cNaN
7NaNd2.0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0.0 b 1.0\n", "1 1.0 b 1.0\n", "2 6.0 b 1.0\n", "3 2.0 a 0.0\n", "4 4.0 a 0.0\n", "5 5.0 a 0.0\n", "6 3.0 c NaN\n", "7 NaN d 2.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# by default, merge use the inner join\n", "\n", "pd.merge(df1, df2, how='outer')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00b1.0
11b1.0
22a0.0
33cNaN
44a0.0
55a0.0
66b1.0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0 b 1.0\n", "1 1 b 1.0\n", "2 2 a 0.0\n", "3 3 c NaN\n", "4 4 a 0.0\n", "5 5 a 0.0\n", "6 6 b 1.0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(df1, df2, how='left')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1keydata2
00.0bNaN
11.0b1.0
22.0aNaN
33.0cNaN
44.0aNaN
55.0aNaN
66.0bNaN
7NaNa0.0
8NaNd2.0
\n", "
" ], "text/plain": [ " data1 key data2\n", "0 0.0 b NaN\n", "1 1.0 b 1.0\n", "2 2.0 a NaN\n", "3 3.0 c NaN\n", "4 4.0 a NaN\n", "5 5.0 a NaN\n", "6 6.0 b NaN\n", "7 NaN a 0.0\n", "8 NaN d 2.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To merge with multiple keys, pass a list of column names\n", "pd.merge(df1, df2, left_on=['key', 'data1'], right_on=['key', 'data2'], how='outer')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key_xdata2key_y
00b0a
11b1b
22a2d
\n", "
" ], "text/plain": [ " data1 key_x data2 key_y\n", "0 0 b 0 a\n", "1 1 b 1 b\n", "2 2 a 2 d" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check the column name for KEY\n", "pd.merge(df1, df2, left_on='data1', right_on='data2')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key_leftdata2key_right
00b0a
11b1b
22a2d
\n", "
" ], "text/plain": [ " data1 key_left data2 key_right\n", "0 0 b 0 a\n", "1 1 b 1 b\n", "2 2 a 2 d" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# for proper renaming of overlapping columns, use suffixes option\n", "pd.merge(df1, df2, left_on='data1', right_on='data2', suffixes=[\"_left\", \"_right\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### merge function arguments\n", "```\n", "left DataFrame to be merged on the left side\n", "right DataFrame to be merged on the right side\n", "how One of 'inner', 'outer', 'left' or 'right'. 'inner' by default\n", "on Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys\n", "left_on Columns in left DataFrame to use as join keys\n", "right_on Analogous to left_on for left DataFrame\n", "left_index Use row index in left as its join key (or keys, if a MultiIndex)\n", "right_index Analogous to left_index\n", "sort Sort merged data lexicographically by join keys; True by default. Disable to get better performance in some cases on large datasets\n", "suffixes Tuple of string values to append to column names in case of overlap; defaults to ('_x', '_y'). For example, if 'data' in both DataFrame objects, would appear as 'data_x' and 'data_y' in result\n", "copy If False, avoid copying data into resulting data structure in some exceptional cases. By default always copies\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Merging on Index" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],\n", " 'value': range(6)})\n", "right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
\n", "
" ], "text/plain": [ " key value group_val\n", "0 a 0 3.5\n", "2 a 2 3.5\n", "3 a 3 3.5\n", "1 b 1 7.0\n", "4 b 4 7.0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(left1, right1, left_on='key', right_index=True)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " data key1 key2\n", "0 0.0 Ohio 2000\n", "1 1.0 Ohio 2001\n", "2 2.0 Ohio 2002\n", "3 3.0 Nevada 2001\n", "4 4.0 Nevada 2002 \n", "\n", " event1 event2\n", "Nevada 2001 0 1\n", " 2000 2 3\n", "Ohio 2000 4 5\n", " 2000 6 7\n", " 2001 8 9\n", " 2002 10 11\n" ] } ], "source": [ "# With hierarchically-indexed data, things are a bit more complicated\n", "# we need to pass the multiple keys to join \n", "\n", "lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", " 'key2': [2000, 2001, 2002, 2001, 2002],\n", " 'data': np.arange(5.)})\n", "\n", "righth = pd.DataFrame(np.arange(12).reshape((6, 2)),\n", " index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],\n", " [2001, 2000, 2000, 2000, 2001, 2002]],\n", " columns=['event1', 'event2'])\n", "\n", "print(lefth, \"\\n\\n\", righth)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
datakey1key2event1event2
00.0Ohio200045
00.0Ohio200067
11.0Ohio200189
22.0Ohio20021011
33.0Nevada200101
\n", "
" ], "text/plain": [ " data key1 key2 event1 event2\n", "0 0.0 Ohio 2000 4 5\n", "0 0.0 Ohio 2000 6 7\n", "1 1.0 Ohio 2001 8 9\n", "2 2.0 Ohio 2002 10 11\n", "3 3.0 Nevada 2001 0 1" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Ohio Nevada\n", "a 1.0 2.0\n", "c 3.0 4.0\n", "e 5.0 6.0 \n", "\n", " Missouri Alabama\n", "b 7.0 8.0\n", "c 9.0 10.0\n", "d 11.0 12.0\n", "e 13.0 14.0\n" ] } ], "source": [ "# Using the indexes of both sides of the merge is also not an issue\n", "\n", "left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],\n", " columns=['Ohio', 'Nevada'])\n", "right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n", " index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])\n", "\n", "print(left2, \"\\n\\n\", right2)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
\n", "
" ], "text/plain": [ " Ohio Nevada Missouri Alabama\n", "a 1.0 2.0 NaN NaN\n", "b NaN NaN 7.0 8.0\n", "c 3.0 4.0 9.0 10.0\n", "d NaN NaN 11.0 12.0\n", "e 5.0 6.0 13.0 14.0" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.merge(left2, right2, how='outer', left_index=True, right_index=True)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
\n", "
" ], "text/plain": [ " Ohio Nevada Missouri Alabama\n", "a 1.0 2.0 NaN NaN\n", "b NaN NaN 7.0 8.0\n", "c 3.0 4.0 9.0 10.0\n", "d NaN NaN 11.0 12.0\n", "e 5.0 6.0 13.0 14.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# DataFrame has a more convenient join instance for merging by index. It can also be\n", "# used to combine together many DataFrame objects having the same or similar indexes\n", "# but non-overlapping columns.\n", "\n", "left2.join(right2, how='outer')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
\n", "
" ], "text/plain": [ " key value group_val\n", "0 a 0 3.5\n", "2 a 2 3.5\n", "3 a 3 3.5\n", "1 b 1 7.0\n", "4 b 4 7.0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left1.join(right1, on='key', how='inner')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " New York Oregon\n", "a 7.0 8.0\n", "c 9.0 10.0\n", "e 11.0 12.0\n", "f 16.0 17.0\n" ] } ], "source": [ "another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n", " index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])\n", "print(another)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioNevadaMissouriAlabamaNew YorkOregon
a1.02.0NaNNaN7.08.0
c3.04.09.010.09.010.0
e5.06.013.014.011.012.0
\n", "
" ], "text/plain": [ " Ohio Nevada Missouri Alabama New York Oregon\n", "a 1.0 2.0 NaN NaN 7.0 8.0\n", "c 3.0 4.0 9.0 10.0 9.0 10.0\n", "e 5.0 6.0 13.0 14.0 11.0 12.0" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left2.join([right2, another])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioNevadaMissouriAlabamaNew YorkOregon
a1.02.0NaNNaN7.08.0
bNaNNaN7.08.0NaNNaN
c3.04.09.010.09.010.0
dNaNNaN11.012.0NaNNaN
e5.06.013.014.011.012.0
fNaNNaNNaNNaN16.017.0
\n", "
" ], "text/plain": [ " Ohio Nevada Missouri Alabama New York Oregon\n", "a 1.0 2.0 NaN NaN 7.0 8.0\n", "b NaN NaN 7.0 8.0 NaN NaN\n", "c 3.0 4.0 9.0 10.0 9.0 10.0\n", "d NaN NaN 11.0 12.0 NaN NaN\n", "e 5.0 6.0 13.0 14.0 11.0 12.0\n", "f NaN NaN NaN NaN 16.0 17.0" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "left2.join([right2, another], how='outer')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatenating Along an Axis" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "a 0\n", "b 1\n", "dtype: int64 \n", "\n", " c 2\n", "d 3\n", "e 4\n", "dtype: int64 \n", "\n", " f 5\n", "g 6\n", "dtype: int64\n" ] } ], "source": [ "s1 = pd.Series([0, 1], index=['a', 'b'])\n", "s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])\n", "s3 = pd.Series([5, 6], index=['f', 'g'])\n", "\n", "print(s1, \"\\n\\n\",s2, \"\\n\\n\",s3)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "b 1\n", "c 2\n", "d 3\n", "e 4\n", "f 5\n", "g 6\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([s1, s2, s3])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "a 0.0 NaN NaN\n", "b 1.0 NaN NaN\n", "c NaN 2.0 NaN\n", "d NaN 3.0 NaN\n", "e NaN 4.0 NaN\n", "f NaN NaN 5.0\n", "g NaN NaN 6.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# By default concat works along axis=0, producing another Series. If you pass axis=1, the\n", "# result will instead be a DataFrame (axis=1 is the columns):\n", "pd.concat([s1, s2, s3], axis=1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "b 5\n", "f 5\n", "g 6\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# In this case there is no overlap on the other axis, which as you can see is the sorted\n", "# union (the 'outer' join) of the indexes. You can instead intersect them by passing join='inner':\n", "\n", "s4 = pd.concat([s1 * 5, s3])\n", "s4" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
a0.00
b1.05
fNaN5
gNaN6
\n", "
" ], "text/plain": [ " 0 1\n", "a 0.0 0\n", "b 1.0 5\n", "f NaN 5\n", "g NaN 6" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([s1, s4], axis=1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
a00
b15
\n", "
" ], "text/plain": [ " 0 1\n", "a 0 0\n", "b 1 5" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([s1, s4], axis=1, join='inner')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
a0.00.0
cNaNNaN
b1.05.0
eNaNNaN
\n", "
" ], "text/plain": [ " 0 1\n", "a 0.0 0.0\n", "c NaN NaN\n", "b 1.0 5.0\n", "e NaN NaN" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one a 0\n", " b 1\n", "two a 0\n", " b 1\n", "three f 5\n", " g 6\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to create a hierarchical index on the concatenation axis. To do this, use the keys argument:\n", "result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])\n", "result" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abfg
one0.01.0NaNNaN
two0.01.0NaNNaN
threeNaNNaN5.06.0
\n", "
" ], "text/plain": [ " a b f g\n", "one 0.0 1.0 NaN NaN\n", "two 0.0 1.0 NaN NaN\n", "three NaN NaN 5.0 6.0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result.unstack()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothree
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
\n", "
" ], "text/plain": [ " one two three\n", "a 0.0 NaN NaN\n", "b 1.0 NaN NaN\n", "c NaN 2.0 NaN\n", "d NaN 3.0 NaN\n", "e NaN 4.0 NaN\n", "f NaN NaN 5.0\n", "g NaN NaN 6.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# In the case of combining Series along axis=1, the keys become the DataFrame column headers:\n", "pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " one two\n", "a 0 1\n", "b 2 3\n", "c 4 5 \n", "\n", " three four\n", "a 5 6\n", "c 7 8\n" ] } ], "source": [ "df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],\n", " columns=['one', 'two'])\n", "df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],\n", " columns=['three', 'four'])\n", "\n", "\n", "print(df1, \"\\n\\n\", df2)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
\n", "
" ], "text/plain": [ " level1 level2 \n", " one two three four\n", "a 0 1 5.0 6.0\n", "b 2 3 NaN NaN\n", "c 4 5 7.0 8.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
\n", "
" ], "text/plain": [ " level1 level2 \n", " one two three four\n", "a 0 1 5.0 6.0\n", "b 2 3 NaN NaN\n", "c 4 5 7.0 8.0" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If you pass a dict of objects instead of a list, the dict’s keys will be used for the keys option:\n", "pd.concat({'level1': df1, 'level2': df2}, axis=1)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
upperlevel1level2
loweronetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
\n", "
" ], "text/plain": [ "upper level1 level2 \n", "lower one two three four\n", "a 0 1 5.0 6.0\n", "b 2 3 NaN NaN\n", "c 4 5 7.0 8.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['upper', 'lower'])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " a b c d\n", "0 -0.743842 0.366076 0.043001 0.315239\n", "1 -0.126674 1.312018 0.689079 -0.647427\n", "2 -0.929330 1.053159 0.470513 -0.418311 \n", "\n", " b d a\n", "0 -1.432140 -1.802894 1.099453\n", "1 -1.137721 -0.052989 0.728343\n" ] } ], "source": [ "# DataFrames in which the row index is not meaningful in the context of the analysis\n", "\n", "df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])\n", "df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])\n", "\n", "print(df1, \"\\n\\n\", df2)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0-0.7438420.3660760.0430010.315239
1-0.1266741.3120180.689079-0.647427
2-0.9293301.0531590.470513-0.418311
01.099453-1.432140NaN-1.802894
10.728343-1.137721NaN-0.052989
\n", "
" ], "text/plain": [ " a b c d\n", "0 -0.743842 0.366076 0.043001 0.315239\n", "1 -0.126674 1.312018 0.689079 -0.647427\n", "2 -0.929330 1.053159 0.470513 -0.418311\n", "0 1.099453 -1.432140 NaN -1.802894\n", "1 0.728343 -1.137721 NaN -0.052989" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df1, df2])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
0-0.7438420.3660760.0430010.315239
1-0.1266741.3120180.689079-0.647427
2-0.9293301.0531590.470513-0.418311
31.099453-1.432140NaN-1.802894
40.728343-1.137721NaN-0.052989
\n", "
" ], "text/plain": [ " a b c d\n", "0 -0.743842 0.366076 0.043001 0.315239\n", "1 -0.126674 1.312018 0.689079 -0.647427\n", "2 -0.929330 1.053159 0.470513 -0.418311\n", "3 1.099453 -1.432140 NaN -1.802894\n", "4 0.728343 -1.137721 NaN -0.052989" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.concat([df1, df2], ignore_index=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### concat function arguments\n", "\n", "```\n", "objs List or dict of pandas objects to be concatenated. The only required argument\n", "axis Axis to concatenate along; defaults to 0\n", "join One of 'inner', 'outer', defaulting to 'outer'; whether to intersection (inner) or union (outer) together indexes along the other axes\n", "join_axes Specific indexes to use for the other n-1 axes instead of performing union/intersection logic\n", "keys Values to associate with objects being concatenated, forming a hierarchical index along the concatenation axis. Can either be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multiple level arrays passed in levels)\n", "levels Specific indexes to use as hierarchical index level or levels if keys passed\n", "names Names for created hierarchical levels if keys and / or levels passed\n", "verify_integrity Check new axis in concatenated object for duplicates and raise exception if so. By default (False) allows duplicates\n", "ignore_index Do not preserve indexes along concatenation axis, instead producing a new range(total_length) index\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Combining Data with Overlap\n", "\n", "Another data combination situation can’t be expressed as either a merge or concatenation\n", "operation. You may have two datasets whose indexes overlap in full or part. As\n", "a motivating example, consider NumPy’s where function, which expressed a vectorized\n", "if-else:\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "f NaN\n", "e 2.5\n", "d NaN\n", "c 3.5\n", "b 4.5\n", "a NaN\n", "dtype: float64 \n", "\n", " f 0.0\n", "e 1.0\n", "d 2.0\n", "c 3.0\n", "b 4.0\n", "a 5.0\n", "dtype: float64\n" ] } ], "source": [ "a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],\n", "index=['f', 'e', 'd', 'c', 'b', 'a'])\n", "\n", "b = pd.Series(np.arange(len(a), dtype=np.float64),\n", "index=['f', 'e', 'd', 'c', 'b', 'a'])\n", "\n", "print(a, \"\\n\\n\", b)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "b[-1] = np.nan" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0. , 2.5, 2. , 3.5, 4.5, nan])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.where(pd.isnull(a), b, a)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a NaN\n", "b 4.5\n", "c 3.0\n", "d 2.0\n", "e 1.0\n", "f 0.0\n", "dtype: float64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Series has a combine_first method, which performs the equivalent of this operation plus data alignment\n", "b[:-2].combine_first(a[2:])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
01.0NaN2.0
14.02.06.0
25.04.010.0
33.06.014.0
47.08.0NaN
\n", "
" ], "text/plain": [ " a b c\n", "0 1.0 NaN 2.0\n", "1 4.0 2.0 6.0\n", "2 5.0 4.0 10.0\n", "3 3.0 6.0 14.0\n", "4 7.0 8.0 NaN" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# With DataFrames, combine_first naturally does the same thing column by column, so\n", "# you can think of it as “patching” missing data in the calling object with data from the\n", "# object you pass:\n", "\n", "df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],\n", "'b': [np.nan, 2., np.nan, 6.],\n", " 'c': range(2, 18, 4)})\n", "df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],\n", " 'b': [np.nan, 3., 4., 6., 8.]})\n", "\n", "df1.combine_first(df2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }