{ "metadata": { "name": "", "signature": "sha256:d6e496e9c0002feda4fc405931b0ef9a5d1c831cef059878999df91f6cf22b7e" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# Now we'll learn how to merge data sets by linking rows by keys.\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from pandas import Series, DataFrame" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "# Let's make a dframe\n", "\n", "dframe1 = DataFrame({'key':['X','Z','Y','Z','X','X'],'data_set_1': np.arange(6)})\n", "\n", "#Show\n", "dframe1" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_1key
0 0 X
1 1 Z
2 2 Y
3 3 Z
4 4 X
5 5 X
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " data_set_1 key\n", "0 0 X\n", "1 1 Z\n", "2 2 Y\n", "3 3 Z\n", "4 4 X\n", "5 5 X" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "#Now lets make another dframe\n", "\n", "dframe2 = DataFrame({'key':['Q','Y','Z'],'data_set_2':[1,2,3]})\n", "\n", "#Show\n", "dframe2" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_2key
0 1 Q
1 2 Y
2 3 Z
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ " data_set_2 key\n", "0 1 Q\n", "1 2 Y\n", "2 3 Z" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "# Now we can use merge the dataframes, this is a \"many-to-one\" situation\n", "\n", "# Merge will automatically choose overlapping columns to merge on\n", "pd.merge(dframe1,dframe2)\n", "\n", "#Note no overlapping 'X's" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_1keydata_set_2
0 1 Z 3
1 3 Z 3
2 2 Y 2
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ " data_set_1 key data_set_2\n", "0 1 Z 3\n", "1 3 Z 3\n", "2 2 Y 2" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# We could have also specified which column to merge on\n", "pd.merge(dframe1,dframe2,on='key')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_1keydata_set_2
0 1 Z 3
1 3 Z 3
2 2 Y 2
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ " data_set_1 key data_set_2\n", "0 1 Z 3\n", "1 3 Z 3\n", "2 2 Y 2" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "# We can choose which DataFrame's keys to use, this will choose left (dframe1)\n", "pd.merge(dframe1,dframe2,on='key',how='left')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_1keydata_set_2
0 0 XNaN
1 1 Z 3
2 2 Y 2
3 3 Z 3
4 4 XNaN
5 5 XNaN
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ " data_set_1 key data_set_2\n", "0 0 X NaN\n", "1 1 Z 3\n", "2 2 Y 2\n", "3 3 Z 3\n", "4 4 X NaN\n", "5 5 X NaN" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "# Choosing the one on the right (dframe2)\n", "pd.merge(dframe1,dframe2,on='key',how='right')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_1keydata_set_2
0 1 Z 3
1 3 Z 3
2 2 Y 2
3NaN Q 1
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ " data_set_1 key data_set_2\n", "0 1 Z 3\n", "1 3 Z 3\n", "2 2 Y 2\n", "3 NaN Q 1" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "#Choosing the \"outer\" method selects the union of both keys\n", "pd.merge(dframe1,dframe2,on='key',how='outer')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_1keydata_set_2
0 0 XNaN
1 4 XNaN
2 5 XNaN
3 1 Z 3
4 3 Z 3
5 2 Y 2
6NaN Q 1
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ " data_set_1 key data_set_2\n", "0 0 X NaN\n", "1 4 X NaN\n", "2 5 X NaN\n", "3 1 Z 3\n", "4 3 Z 3\n", "5 2 Y 2\n", "6 NaN Q 1" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "#Now we'll learn about a many to many merge\n", "\n", "# Nnote that these DataFrames contain more than one instance of the key in BOTH datasets\n", "\n", "dframe3 = DataFrame({'key': ['X', 'X', 'X', 'Y', 'Z', 'Z'],\n", " 'data_set_3': range(6)})\n", "dframe4 = DataFrame({'key': ['Y', 'Y', 'X', 'X', 'Z'],\n", " 'data_set_4': range(5)})\n", "\n", "#Show the merge\n", "pd.merge(dframe3, dframe4)\n" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data_set_3keydata_set_4
0 0 X 2
1 0 X 3
2 1 X 2
3 1 X 3
4 2 X 2
5 2 X 3
6 3 Y 0
7 3 Y 1
8 4 Z 4
9 5 Z 4
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 30, "text": [ " data_set_3 key data_set_4\n", "0 0 X 2\n", "1 0 X 3\n", "2 1 X 2\n", "3 1 X 3\n", "4 2 X 2\n", "5 2 X 3\n", "6 3 Y 0\n", "7 3 Y 1\n", "8 4 Z 4\n", "9 5 Z 4" ] } ], "prompt_number": 30 }, { "cell_type": "markdown", "metadata": {}, "source": [ "So what happened? A many to many merge results in the product of the rows. Because there were 3 'X's in dframe3 and 2 'X's in dframe4 there ended up being a total of 6 'X' rows in the result (2*3=6)! Note how dframe3 repeats its 0,1,2 values for 'X' and dframe4 repeats its '2,3' pairs throughout the key set. " ] }, { "cell_type": "code", "collapsed": false, "input": [ "# We can also merge with multiple keys!\n", "\n", "# Dframe on left\n", "df_left = DataFrame({'key1': ['SF', 'SF', 'LA'],\n", " 'key2': ['one', 'two', 'one'],\n", " 'left_data': [10,20,30]})\n", "\n", "#Dframe on right\n", "df_right = DataFrame({'key1': ['SF', 'SF', 'LA', 'LA'],\n", " 'key2': ['one', 'one', 'one', 'two'],\n", " 'right_data': [40,50,60,70]})\n", "\n", "#Merge\n", "pd.merge(df_left, df_right, on=['key1', 'key2'], how='outer')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
key1key2left_dataright_data
0 SF one 10 40
1 SF one 10 50
2 SF two 20NaN
3 LA one 30 60
4 LA twoNaN 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 33, "text": [ " key1 key2 left_data right_data\n", "0 SF one 10 40\n", "1 SF one 10 50\n", "2 SF two 20 NaN\n", "3 LA one 30 60\n", "4 LA two NaN 70" ] } ], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "# Now using the above you can check mulitple data sets for multiple key combos, for instance what did the left data set have for LA,one?\n", "# Answer = 60" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "#Note that the left and right DataFrames have overlapping key names (key1 and key2).\n", "# pandas automatically adds suffixes to them\n", "\n", "pd.merge(df_left,df_right,on='key1')" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
key1key2_xleft_datakey2_yright_data
0 SF one 10 one 40
1 SF one 10 one 50
2 SF two 20 one 40
3 SF two 20 one 50
4 LA one 30 one 60
5 LA one 30 two 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 35, "text": [ " key1 key2_x left_data key2_y right_data\n", "0 SF one 10 one 40\n", "1 SF one 10 one 50\n", "2 SF two 20 one 40\n", "3 SF two 20 one 50\n", "4 LA one 30 one 60\n", "5 LA one 30 two 70" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "# We can also specify what the suffix becomes\n", "pd.merge(df_left,df_right, on='key1',suffixes=('_lefty','_righty'))" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
key1key2_leftyleft_datakey2_rightyright_data
0 SF one 10 one 40
1 SF one 10 one 50
2 SF two 20 one 40
3 SF two 20 one 50
4 LA one 30 one 60
5 LA one 30 two 70
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 36, "text": [ " key1 key2_lefty left_data key2_righty right_data\n", "0 SF one 10 one 40\n", "1 SF one 10 one 50\n", "2 SF two 20 one 40\n", "3 SF two 20 one 50\n", "4 LA one 30 one 60\n", "5 LA one 30 two 70" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "# For more info on merge parameters check out:\n", "url = 'http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.merge.html'\n", "\n", "# Next we'll learn how to merge on Index!" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }