{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from pandas import Series, DataFrame\n", "import pandas as pd\n", "import numpy as np\n", "\n", "pd.options.display.max_rows = 10" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Series\n", "one dimensional array-like object containing an array\n", "of data and an associated array of data labels, called its index.\n", "\n", "(a fixed-length, ordered dict)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 4\n", "1 7\n", "2 -5\n", "3 3\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = Series([4, 7, -5, 3])\n", "obj" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(array([ 4, 7, -5, 3]), RangeIndex(start=0, stop=4, step=1))" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.values, obj.index" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "d 4\n", "b 7\n", "a -5\n", "c 3\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to attach an index\n", "\n", "obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])\n", "obj2" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a -5\n", "dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2[['a']] #Object" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "-5" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2['a'] #value correspondinf to index 'a'" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "d 4\n", "b 7\n", "c 3\n", "dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2[obj2 > 0]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "d 54.598150\n", "b 1096.633158\n", "a 0.006738\n", "c 20.085537\n", "dtype: float64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.exp(obj2)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'b' in obj2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create Series from a Dict" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sdata = {'Ohio': 3500, 'Texas': 7100, 'Utah':5000}" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Ohio 3500\n", "Texas 7100\n", "Utah 5000\n", "dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj3 = Series(sdata)\n", "obj3" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "NY NaN\n", "Utah 5000.0\n", "Texas 7100.0\n", "dtype: float64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# with different index\n", "\n", "states = ['NY', 'Utah', 'Texas']\n", "obj4 = Series(sdata, index=states)\n", "obj4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### .isnull(), .notnull()" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "NY True\n", "Utah False\n", "Texas False\n", "dtype: bool" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj4.isnull()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Arithmetic operations" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "NY NaN\n", "Ohio NaN\n", "Texas 14200.0\n", "Utah 10000.0\n", "dtype: float64" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj3 + obj4" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "state\n", "NY NaN\n", "Utah 5000.0\n", "Texas 7100.0\n", "Name: population, dtype: float64" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj4.name = 'population'\n", "obj4.index.name = 'state'\n", "obj4" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a NaN\n", "b 5000.0\n", "c 7100.0\n", "Name: population, dtype: float64" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj4.index = ['a','b','c']\n", "obj4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame\n", "a tabular spreadsheet-like data structure containing an ordered\n", "collection of columns." ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", " 'year': [2000, 2001, 2002, 2001, 2002],\n", " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}\n", "frame = DataFrame(data)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
popstateyear
01.5Ohio2000
11.7Ohio2001
23.6Ohio2002
32.4Nevada2001
42.9Nevada2002
\n", "
" ], "text/plain": [ " pop state year\n", "0 1.5 Ohio 2000\n", "1 1.7 Ohio 2001\n", "2 3.6 Ohio 2002\n", "3 2.4 Nevada 2001\n", "4 2.9 Nevada 2002" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepop
02000Ohio1.5
12001Ohio1.7
22002Ohio3.6
32001Nevada2.4
42002Nevada2.9
\n", "
" ], "text/plain": [ " year state pop\n", "0 2000 Ohio 1.5\n", "1 2001 Ohio 1.7\n", "2 2002 Ohio 3.6\n", "3 2001 Nevada 2.4\n", "4 2002 Nevada 2.9" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# specify the columns order \n", "\n", "DataFrame(data, columns= ['year', 'state', 'pop'])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7NaN
three2002Ohio3.6NaN
four2001Nevada2.4NaN
five2002Nevada2.9NaN
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 NaN\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 NaN\n", "five 2002 Nevada 2.9 NaN" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2 = DataFrame(data, columns= ['year', 'state', 'pop', 'debt'], \n", " index =['one', 'two', 'three', 'four', 'five'])\n", "frame2" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['year', 'state', 'pop', 'debt'], dtype='object')" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2.columns" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one Ohio\n", "two Ohio\n", "three Ohio\n", "Name: state, dtype: object" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select colums\n", "\n", "frame2['state'][:3]" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one 2000\n", "two 2001\n", "three 2002\n", "four 2001\n", "five 2002\n", "Name: year, dtype: int64" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2.year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### .ix\n", "indexing field" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "year 2002\n", "state Nevada\n", "pop 2.9\n", "debt NaN\n", "Name: five, dtype: object" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select rows\n", "\n", "frame2.ix['five']" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.50
two2001Ohio1.71
three2002Ohio3.62
four2001Nevada2.43
five2002Nevada2.94
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 0\n", "two 2001 Ohio 1.7 1\n", "three 2002 Ohio 3.6 2\n", "four 2001 Nevada 2.4 3\n", "five 2002 Nevada 2.9 4" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2.debt = np.arange(5)\n", "frame2" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Assign values using Series\n", "\n", "val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])\n", "frame2.debt = val\n", "frame2" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebteastern
one2000Ohio1.5NaNTrue
two2001Ohio1.7-1.2True
three2002Ohio3.6NaNTrue
four2001Nevada2.4-1.5False
five2002Nevada2.9-1.7False
\n", "
" ], "text/plain": [ " year state pop debt eastern\n", "one 2000 Ohio 1.5 NaN True\n", "two 2001 Ohio 1.7 -1.2 True\n", "three 2002 Ohio 3.6 NaN True\n", "four 2001 Nevada 2.4 -1.5 False\n", "five 2002 Nevada 2.9 -1.7 False" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# if the columns doesn't exist, it'll be created\n", "\n", "frame2['eastern'] = frame2.state == 'Ohio'\n", "frame2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### del column" ] }, { "cell_type": "code", "execution_count": 104, "metadata": { "collapsed": false }, "outputs": [], "source": [ "del frame2['eastern']" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
\n", "
" ], "text/plain": [ " year state pop debt\n", "one 2000 Ohio 1.5 NaN\n", "two 2001 Ohio 1.7 -1.2\n", "three 2002 Ohio 3.6 NaN\n", "four 2001 Nevada 2.4 -1.5\n", "five 2002 Nevada 2.9 -1.7" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Nested dict of dicts " ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pop = {'Nevada': {2001: 2.4, 2002: 2.9}, \n", " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NevadaOhio
2000NaN1.5
20012.41.7
20022.93.6
\n", "
" ], "text/plain": [ " Nevada Ohio\n", "2000 NaN 1.5\n", "2001 2.4 1.7\n", "2002 2.9 3.6" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame3 = DataFrame(pop)\n", "frame3" ] }, { "cell_type": "code", "execution_count": 118, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
200020012002
NevadaNaN2.42.9
Ohio1.51.73.6
\n", "
" ], "text/plain": [ " 2000 2001 2002\n", "Nevada NaN 2.4 2.9\n", "Ohio 1.5 1.7 3.6" ] }, "execution_count": 118, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame3.T" ] }, { "cell_type": "code", "execution_count": 121, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NevadaOhio
20012.41.7
20022.93.6
2003NaNNaN
\n", "
" ], "text/plain": [ " Nevada Ohio\n", "2001 2.4 1.7\n", "2002 2.9 3.6\n", "2003 NaN NaN" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DataFrame(pop, index =[2001, 2002, 2003])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dicts of Series" ] }, { "cell_type": "code", "execution_count": 125, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'Nevada': 2000 NaN\n", " Name: Nevada, dtype: float64, 'Ohio': 2000 1.5\n", " 2001 1.7\n", " Name: Ohio, dtype: float64}" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdata = {'Ohio': frame3['Ohio'][:-1],\n", " 'Nevada': frame3['Nevada'][:1]}\n", "pdata" ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NevadaOhio
2000NaN1.5
2001NaN1.7
\n", "
" ], "text/plain": [ " Nevada Ohio\n", "2000 NaN 1.5\n", "2001 NaN 1.7" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DataFrame(pdata)" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "collapsed": true }, "outputs": [], "source": [ "frame3.index.name = 'year'; frame3.columns.name = 'state'" ] }, { "cell_type": "code", "execution_count": 129, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateNevadaOhio
year
2000NaN1.5
20012.41.7
20022.93.6
\n", "
" ], "text/plain": [ "state Nevada Ohio\n", "year \n", "2000 NaN 1.5\n", "2001 2.4 1.7\n", "2002 2.9 3.6" ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame3" ] }, { "cell_type": "code", "execution_count": 131, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ nan, 1.5],\n", " [ 2.4, 1.7],\n", " [ 2.9, 3.6]])" ] }, "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame3.values" ] }, { "cell_type": "code", "execution_count": 133, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[2000, 'Ohio', 1.5, nan],\n", " [2001, 'Ohio', 1.7, -1.2],\n", " [2002, 'Ohio', 3.6, nan],\n", " [2001, 'Nevada', 2.4, -1.5],\n", " [2002, 'Nevada', 2.9, -1.7]], dtype=object)" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#columns of different types\n", "\n", "frame2.values" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Index Objects" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series(range(3), index =['a', 'b', 'c'])\n", "index = obj.index" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['a', 'b', 'c'], dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Index objects are immutable\n", "\n", "index" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Int64Index([0, 1, 2], dtype='int64')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index = pd.Index(np.arange(3))\n", "index" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj2 = Series([1.4, 2.3, 0], index=index)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2.index is index" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NevadaOhio
2000NaN1.5
20012.41.7
20022.93.6
\n", "
" ], "text/plain": [ " Nevada Ohio\n", "2000 NaN 1.5\n", "2001 2.4 1.7\n", "2002 2.9 3.6" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# an Index als functions as a fixed-size set\n", "\n", "frame3" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "'Ohio' in frame3.columns" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "2003 in frame3.index" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Essential Functionality" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### reindex\n", "arranges the data according to the new index, introducing missing values" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "d 4.5\n", "b 7.2\n", "a -5.3\n", "c 3.6\n", "dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a -5.3\n", "b 7.2\n", "c 3.6\n", "d 4.5\n", "e 0.0\n", "dtype: float64" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)\n", "obj2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### method (ffill)\n", "interpolation" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# It may be desirable to do some interpolation of filling of values when reindexing\n", "\n", "obj3 = Series(['blue', 'purple', 'yellow'], index= [0, 2, 4])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 blue\n", "1 blue\n", "2 purple\n", "3 purple\n", "4 yellow\n", "5 yellow\n", "dtype: object" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj3.reindex(range(6), method='ffill')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioTexasCalifornia
a012
b345
d678
\n", "
" ], "text/plain": [ " Ohio Texas California\n", "a 0 1 2\n", "b 3 4 5\n", "d 6 7 8" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reindex can alter either the (row) index, columsn or both\n", "\n", "frame = DataFrame(np.arange(9).reshape(3, 3), index= ['a', 'b', 'd'],\n", " columns= ['Ohio', 'Texas', 'California'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioTexasCalifornia
a0.01.02.0
b3.04.05.0
cNaNNaNNaN
d6.07.08.0
\n", "
" ], "text/plain": [ " Ohio Texas California\n", "a 0.0 1.0 2.0\n", "b 3.0 4.0 5.0\n", "c NaN NaN NaN\n", "d 6.0 7.0 8.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n", "frame2" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TexasUtahCalifornia
a1NaN2
b4NaN5
d7NaN8
\n", "
" ], "text/plain": [ " Texas Utah California\n", "a 1 NaN 2\n", "b 4 NaN 5\n", "d 7 NaN 8" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "states = ['Texas', 'Utah', 'California']\n", "frame.reindex(columns=states)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TexasUtahCalifornia
a1NaN2
b4NaN5
c4NaN5
d7NaN8
\n", "
" ], "text/plain": [ " Texas Utah California\n", "a 1 NaN 2\n", "b 4 NaN 5\n", "c 4 NaN 5\n", "d 7 NaN 8" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.reindex(index =['a', 'b', 'c', 'd'], method='ffill', columns= states)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### ix\n", "indexing field" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TexasUtahCalifornia
a1.0NaN2.0
b4.0NaN5.0
cNaNNaNNaN
d7.0NaN8.0
\n", "
" ], "text/plain": [ " Texas Utah California\n", "a 1.0 NaN 2.0\n", "b 4.0 NaN 5.0\n", "c NaN NaN NaN\n", "d 7.0 NaN 8.0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.ix[['a', 'b', 'c', 'd'], states]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Droping entries from an axis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### drop" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0.0\n", "b 1.0\n", "d 3.0\n", "e 4.0\n", "dtype: float64" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_obj = obj.drop('c')\n", "new_obj" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = DataFrame(np.arange(16).reshape(4, 4), \n", " index =['Ohio', 'Colorado', 'Utah', 'NY'],\n", " columns =['one', 'two', 'three', 'four'])" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
NY12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "NY 12 13 14 15" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.drop(['Colorado', 'Utah'])" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onethreefour
Ohio023
Colorado467
Utah81011
NY121415
\n", "
" ], "text/plain": [ " one three four\n", "Ohio 0 2 3\n", "Colorado 4 6 7\n", "Utah 8 10 11\n", "NY 12 14 15" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.drop('two', axis=1)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onethree
Ohio02
Colorado46
Utah810
NY1214
\n", "
" ], "text/plain": [ " one three\n", "Ohio 0 2\n", "Colorado 4 6\n", "Utah 8 10\n", "NY 12 14" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.drop(['two', 'four'], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Indexing, selection and filtering\n", "works analogously to NumPy" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b']" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "c 2\n", "d 3\n", "dtype: int64" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj[2:4]" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 1\n", "d 3\n", "dtype: int64" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj[[1, 3]]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "d 3\n", "dtype: int64" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj[obj > 2]" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 1\n", "c 2\n", "dtype: int64" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b':'c']" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "b 5\n", "c 5\n", "d 3\n", "dtype: int64" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['b':'c'] = 5\n", "obj" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
NY12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "NY 12 13 14 15" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = DataFrame(np.arange(16).reshape(4, 4), \n", " index =['Ohio', 'Colorado', 'Utah', 'NY'],\n", " columns =['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Ohio 1\n", "Colorado 5\n", "Utah 9\n", "NY 13\n", "Name: two, dtype: int64" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['two']" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
threeone
Ohio20
Colorado64
Utah108
NY1412
\n", "
" ], "text/plain": [ " three one\n", "Ohio 2 0\n", "Colorado 6 4\n", "Utah 10 8\n", "NY 14 12" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[['three', 'one']]" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[:2]" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Colorado4567
Utah891011
NY12131415
\n", "
" ], "text/plain": [ " one two three four\n", "Colorado 4 5 6 7\n", "Utah 8 9 10 11\n", "NY 12 13 14 15" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data['three'] > 5]" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
OhioTrueTrueTrueTrue
ColoradoTrueFalseFalseFalse
UtahFalseFalseFalseFalse
NYFalseFalseFalseFalse
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio True True True True\n", "Colorado True False False False\n", "Utah False False False False\n", "NY False False False False" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data < 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### .ix\n", "indexing field" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "two 5\n", "three 6\n", "Name: Colorado, dtype: int64" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.ix['Colorado', ['two', 'three']]" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one 8\n", "two 9\n", "three 10\n", "four 11\n", "Name: Utah, dtype: int64" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.ix[2]" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothree
Colorado456
Utah8910
NY121314
\n", "
" ], "text/plain": [ " one two three\n", "Colorado 4 5 6\n", "Utah 8 9 10\n", "NY 12 13 14" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.ix[data.three > 5, :3]" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Arithmetic" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s1 = Series([4., 2., 4., 9.], index = ['a', 'c', 'd', 'e'])\n", "s2 = Series([-2.3, 3.2, 3.2, 0.4, 1], index = ['a', 'c', 'e', 'f', 'g'])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1.7\n", "c 5.2\n", "d NaN\n", "e 12.2\n", "f NaN\n", "g NaN\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s1 + s2" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# In the case of DataFrame\n", "\n", "df1 = DataFrame(np.arange(9).reshape(3, 3), columns= list('abc'),\n", " index =['Ohio', 'Texas', 'Colorado'])\n", "df2 = DataFrame(np.arange(12).reshape(4, 3), columns=list('bde'), \n", " index= ['Utah', 'Ohio', 'Texas', 'Orego'])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
Ohio012
Texas345
Colorado678
\n", "
" ], "text/plain": [ " a b c\n", "Ohio 0 1 2\n", "Texas 3 4 5\n", "Colorado 6 7 8" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah012
Ohio345
Texas678
Orego91011
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0 1 2\n", "Ohio 3 4 5\n", "Texas 6 7 8\n", "Orego 9 10 11" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
ColoradoNaNNaNNaNNaNNaN
OhioNaN4.0NaNNaNNaN
OregoNaNNaNNaNNaNNaN
TexasNaN10.0NaNNaNNaN
UtahNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " a b c d e\n", "Colorado NaN NaN NaN NaN NaN\n", "Ohio NaN 4.0 NaN NaN NaN\n", "Orego NaN NaN NaN NaN NaN\n", "Texas NaN 10.0 NaN NaN NaN\n", "Utah NaN NaN NaN NaN NaN" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 + df2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### fill_value" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df1 = DataFrame(np.arange(12).reshape(3, 4), columns= list('abcd'))\n", "df2 = DataFrame(np.arange(20).reshape(4, 5), columns= list('abcde')) " ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
00.02.04.06.0NaN
19.011.013.015.0NaN
218.020.022.024.0NaN
3NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " a b c d e\n", "0 0.0 2.0 4.0 6.0 NaN\n", "1 9.0 11.0 13.0 15.0 NaN\n", "2 18.0 20.0 22.0 24.0 NaN\n", "3 NaN NaN NaN NaN NaN" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1 + df2" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
00.02.04.06.04.0
19.011.013.015.09.0
218.020.022.024.014.0
315.016.017.018.019.0
\n", "
" ], "text/plain": [ " a b c d e\n", "0 0.0 2.0 4.0 6.0 4.0\n", "1 9.0 11.0 13.0 15.0 9.0\n", "2 18.0 20.0 22.0 24.0 14.0\n", "3 15.0 16.0 17.0 18.0 19.0" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.add(df2, fill_value=0)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
00123NaN
14567NaN
2891011NaN
\n", "
" ], "text/plain": [ " a b c d e\n", "0 0 1 2 3 NaN\n", "1 4 5 6 7 NaN\n", "2 8 9 10 11 NaN" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# when reindexing\n", "\n", "df1.reindex(columns = df2.columns)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcde
001230
145670
28910110
\n", "
" ], "text/plain": [ " a b c d e\n", "0 0 1 2 3 0\n", "1 4 5 6 7 0\n", "2 8 9 10 11 0" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.reindex(columns = df2.columns, fill_value=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Operations between DataFrame and Series" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### broadcasting" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2, 3],\n", " [ 4, 5, 6, 7],\n", " [ 8, 9, 10, 11]])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(12).reshape(3, 4)\n", "arr" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, 0],\n", " [4, 4, 4, 4],\n", " [8, 8, 8, 8]])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr - arr[0]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah012
Ohio345
Texas678
Orego91011
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0 1 2\n", "Ohio 3 4 5\n", "Texas 6 7 8\n", "Orego 9 10 11" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = DataFrame(np.arange(12).reshape(4, 3), columns =list('bde'),\n", " index =['Utah', 'Ohio', 'Texas', 'Orego'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 0\n", "d 1\n", "e 2\n", "Name: Utah, dtype: int64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series = frame.ix[0]\n", "series" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah000
Ohio333
Texas666
Orego999
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0 0 0\n", "Ohio 3 3 3\n", "Texas 6 6 6\n", "Orego 9 9 9" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame - series" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Utah 1\n", "Ohio 4\n", "Texas 7\n", "Orego 10\n", "Name: d, dtype: int64" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If you want to broadcast over the colums\n", "\n", "series3 = frame['d']\n", "series3" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah-101
Ohio-101
Texas-101
Orego-101
\n", "
" ], "text/plain": [ " b d e\n", "Utah -1 0 1\n", "Ohio -1 0 1\n", "Texas -1 0 1\n", "Orego -1 0 1" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sub(series3, axis= 0)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah012
Ohio345
Texas678
Orego91011
\n", "
" ], "text/plain": [ " b d e\n", "Utah 0 1 2\n", "Ohio 3 4 5\n", "Texas 6 7 8\n", "Orego 9 10 11" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Function application and mapping\n", "Numpy functions work fine with pandas object\n", "**apply** works on a row / column basis of a DataFrame, **applymap** works element-wise on a DataFrame, and **map** works element-wise on a Series." ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah-1.702298-0.7294680.696094
Ohio0.782241-1.547337-0.573393
Texas1.2380660.317191-1.200809
Orego-0.7565882.5128170.663570
\n", "
" ], "text/plain": [ " b d e\n", "Utah -1.702298 -0.729468 0.696094\n", "Ohio 0.782241 -1.547337 -0.573393\n", "Texas 1.238066 0.317191 -1.200809\n", "Orego -0.756588 2.512817 0.663570" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = DataFrame(np.random.randn(4, 3), columns =list('bde'),\n", " index =['Utah', 'Ohio', 'Texas', 'Orego'])\n", "frame" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah1.7022980.7294680.696094
Ohio0.7822411.5473370.573393
Texas1.2380660.3171911.200809
Orego0.7565882.5128170.663570
\n", "
" ], "text/plain": [ " b d e\n", "Utah 1.702298 0.729468 0.696094\n", "Ohio 0.782241 1.547337 0.573393\n", "Texas 1.238066 0.317191 1.200809\n", "Orego 0.756588 2.512817 0.663570" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.abs(frame)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [], "source": [ "f= lambda x: x.max() - x.min()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### apply method" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 2.940364\n", "d 4.060153\n", "e 1.896903\n", "dtype: float64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Utah 2.398391\n", "Ohio 2.329578\n", "Texas 2.438875\n", "Orego 3.269405\n", "dtype: float64" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f, axis=1)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def f(x):\n", " return Series([x.min(), x.max()], index = ['min', 'max'])" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
min-1.702298-1.547337-1.200809
max1.2380662.5128170.696094
\n", "
" ], "text/plain": [ " b d e\n", "min -1.702298 -1.547337 -1.200809\n", "max 1.238066 2.512817 0.696094" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.apply(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### applymap\n", "element-wise" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "format = lambda x: '%.2f' % x" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
bde
Utah-1.70-0.730.70
Ohio0.78-1.55-0.57
Texas1.240.32-1.20
Orego-0.762.510.66
\n", "
" ], "text/plain": [ " b d e\n", "Utah -1.70 -0.73 0.70\n", "Ohio 0.78 -1.55 -0.57\n", "Texas 1.24 0.32 -1.20\n", "Orego -0.76 2.51 0.66" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.applymap(format)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Utah 0.70\n", "Ohio -0.57\n", "Texas -1.20\n", "Orego 0.66\n", "Name: e, dtype: object" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame['e'].map(format)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sorting and ranking" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sort_index method" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series(range(4), index = ['d', 'a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1\n", "b 2\n", "c 3\n", "d 0\n", "dtype: int64" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.sort_index()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": true }, "outputs": [], "source": [ "frame = DataFrame(np.arange(8).reshape(2, 4), index=['three', 'one'],\n", " columns= ['d', 'a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dabc
one4567
three0123
\n", "
" ], "text/plain": [ " d a b c\n", "one 4 5 6 7\n", "three 0 1 2 3" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
three1230
one5674
\n", "
" ], "text/plain": [ " a b c d\n", "three 1 2 3 0\n", "one 5 6 7 4" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_index(axis=1)" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dcba
three0321
one4765
\n", "
" ], "text/plain": [ " d c b a\n", "three 0 3 2 1\n", "one 4 7 6 5" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sortind in descending order\n", "\n", "frame.sort_index(axis=1, ascending = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### order\n", "sort_values" ] }, { "cell_type": "code", "execution_count": 67, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series([4, 7, -3, 2])" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2 -3\n", "3 2\n", "0 4\n", "1 7\n", "dtype: int64" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.sort_values()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# When missing values are present\n", "\n", "obj = Series([4, np.nan, 7, np.nan, -3, 2])" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "4 -3.0\n", "5 2.0\n", "0 4.0\n", "2 7.0\n", "1 NaN\n", "3 NaN\n", "dtype: float64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.sort_values()" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
004
117
20-3
312
\n", "
" ], "text/plain": [ " a b\n", "0 0 4\n", "1 1 7\n", "2 0 -3\n", "3 1 2" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
20-3
312
004
117
\n", "
" ], "text/plain": [ " a b\n", "2 0 -3\n", "3 1 2\n", "0 0 4\n", "1 1 7" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to sort columns\n", "\n", "frame.sort_values(by= 'b')" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
20-3
004
312
117
\n", "
" ], "text/plain": [ " a b\n", "2 0 -3\n", "0 0 4\n", "3 1 2\n", "1 1 7" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sort_values(by = ['a', 'b'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### rank method\n", "ranking is closely realted to sorting: but it breaks ties by assigning each group the mean rank" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series([7, -5, 7, 4, 2, 0, 4])" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 6.5\n", "1 1.0\n", "2 6.5\n", "3 4.5\n", "4 3.0\n", "5 2.0\n", "6 4.5\n", "dtype: float64" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.rank()" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 6.0\n", "1 1.0\n", "2 7.0\n", "3 4.0\n", "4 3.0\n", "5 2.0\n", "6 5.0\n", "dtype: float64" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# according to the order they're observed in the data\n", "\n", "obj.rank(method= 'first')" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 2.0\n", "1 7.0\n", "2 2.0\n", "3 4.0\n", "4 5.0\n", "5 6.0\n", "6 4.0\n", "dtype: float64" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# in descending order\n", "\n", "obj.rank( ascending=False, method='max')" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
004.3-2.0
114.05.0
20-3.08.0
312.0-2.5
\n", "
" ], "text/plain": [ " a b c\n", "0 0 4.3 -2.0\n", "1 1 4.0 5.0\n", "2 0 -3.0 8.0\n", "3 1 2.0 -2.5" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = DataFrame({'b': [4.3, 4, -3, 2], 'a': [0, 1, 0, 1], \n", " 'c': [-2, 5, 8, -2.5]})\n", "frame" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
02.03.01.0
11.02.03.0
22.01.03.0
32.03.01.0
\n", "
" ], "text/plain": [ " a b c\n", "0 2.0 3.0 1.0\n", "1 1.0 2.0 3.0\n", "2 2.0 1.0 3.0\n", "3 2.0 3.0 1.0" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.rank(axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Axis indexes with duplicate values" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "b 2\n", "b 3\n", "c 4\n", "dtype: int64" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])\n", "obj" ] }, { "cell_type": "code", "execution_count": 90, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index.is_unique" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0\n", "a 1\n", "dtype: int64" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj['a']" ] }, { "cell_type": "code", "execution_count": 92, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = DataFrame(np.random.randn(4, 3) , index = ['a', 'a', 'b', 'b'])" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
b0.890856-0.2415520.458379
b-0.3257160.4338330.195558
\n", "
" ], "text/plain": [ " 0 1 2\n", "b 0.890856 -0.241552 0.458379\n", "b -0.325716 0.433833 0.195558" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.ix['b']" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Computing Descriptive Statistics" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b7.10-4.5
cNaNNaN
d0.75-1.3
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 7.10 -4.5\n", "c NaN NaN\n", "d 0.75 -1.3" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame([[1.4, np.nan], [7.1, -4.5], \n", " [np.nan, np.nan], [0.75, -1.3]], \n", " index=['a', 'b', 'c', 'd'], columns = ['one', 'two'])\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one 9.25\n", "two -5.80\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1.40\n", "b 2.60\n", "c 0.00\n", "d -0.55\n", "dtype: float64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sum(axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### skipna" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a NaN\n", "b 1.300\n", "c NaN\n", "d -0.275\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.mean(axis = 1, skipna = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### idxmin, idxmax" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "one d\n", "two b\n", "dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.idxmin()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### accumulations" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
a1.40NaN
b8.50-4.5
cNaNNaN
d9.25-5.8
\n", "
" ], "text/plain": [ " one two\n", "a 1.40 NaN\n", "b 8.50 -4.5\n", "c NaN NaN\n", "d 9.25 -5.8" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.cumsum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### describe" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
count3.0000002.000000
mean3.083333-2.900000
std3.4936852.262742
min0.750000-4.500000
25%1.075000-3.700000
50%1.400000-2.900000
75%4.250000-2.100000
max7.100000-1.300000
\n", "
" ], "text/plain": [ " one two\n", "count 3.000000 2.000000\n", "mean 3.083333 -2.900000\n", "std 3.493685 2.262742\n", "min 0.750000 -4.500000\n", "25% 1.075000 -3.700000\n", "50% 1.400000 -2.900000\n", "75% 4.250000 -2.100000\n", "max 7.100000 -1.300000" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# On non-numeric-dat\n", "\n", "obj = Series(['a', 'a', 'b', 'c']*4)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "count 16\n", "unique 3\n", "top a\n", "freq 8\n", "dtype: object" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Correlation and Covariance" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas.io.data as web" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [], "source": [ "all_data = {}\n", "for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:\n", " all_data[ticker] = web.get_data_yahoo(ticker)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [], "source": [ "price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})\n", "volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGIBMMSFT
Date
2016-04-22-0.002737-0.053179-0.005358-0.071710
2016-04-25-0.0056770.0060940.0020880.006373
2016-04-26-0.006947-0.0207560.001814-0.012857
2016-04-27-0.062578-0.0032480.009324-0.009720
2016-04-28-0.030566-0.020996-0.022596-0.020416
\n", "
" ], "text/plain": [ " AAPL GOOG IBM MSFT\n", "Date \n", "2016-04-22 -0.002737 -0.053179 -0.005358 -0.071710\n", "2016-04-25 -0.005677 0.006094 0.002088 0.006373\n", "2016-04-26 -0.006947 -0.020756 0.001814 -0.012857\n", "2016-04-27 -0.062578 -0.003248 0.009324 -0.009720\n", "2016-04-28 -0.030566 -0.020996 -0.022596 -0.020416" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns = price.pct_change()\n", "returns.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### corr, cov" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.49435383397487709" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.MSFT.corr(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "8.9771958168328864e-05" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.MSFT.cov(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGIBMMSFT
AAPL1.0000000.4075260.3877510.395076
GOOG0.4075261.0000000.4000160.462501
IBM0.3877510.4000161.0000000.494354
MSFT0.3950760.4625010.4943541.000000
\n", "
" ], "text/plain": [ " AAPL GOOG IBM MSFT\n", "AAPL 1.000000 0.407526 0.387751 0.395076\n", "GOOG 0.407526 1.000000 0.400016 0.462501\n", "IBM 0.387751 0.400016 1.000000 0.494354\n", "MSFT 0.395076 0.462501 0.494354 1.000000" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corr()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLGOOGIBMMSFT
AAPL0.0002860.0001120.0000800.000099
GOOG0.0001120.0002630.0000790.000111
IBM0.0000800.0000790.0001490.000090
MSFT0.0000990.0001110.0000900.000221
\n", "
" ], "text/plain": [ " AAPL GOOG IBM MSFT\n", "AAPL 0.000286 0.000112 0.000080 0.000099\n", "GOOG 0.000112 0.000263 0.000079 0.000111\n", "IBM 0.000080 0.000079 0.000149 0.000090\n", "MSFT 0.000099 0.000111 0.000090 0.000221" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.cov()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### corrwith\n", "pair-wise correlations between a DataFrame's colums or rows with another\n", "Series or DataFrame" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "AAPL 0.387751\n", "GOOG 0.400016\n", "IBM 1.000000\n", "MSFT 0.494354\n", "dtype: float64" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corrwith(returns.IBM)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "AAPL -0.079616\n", "GOOG -0.004106\n", "IBM -0.206886\n", "MSFT -0.093043\n", "dtype: float64" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "returns.corrwith(volume)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Unique values, value counts, membership" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['c', 'a', 'd', 'b'], dtype=object)" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "uniques = obj.unique()\n", "uniques" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "c 3\n", "a 3\n", "b 2\n", "d 1\n", "dtype: int64" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.value_counts()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 3\n", "c 3\n", "b 2\n", "d 1\n", "dtype: int64" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(obj.values, sort=False)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "dtype: bool" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask = obj.isin(['b', 'c'])\n", "mask" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 c\n", "5 b\n", "6 b\n", "7 c\n", "8 c\n", "dtype: object" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj[mask]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### value_counts" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Qu1Qu2Qu3
0121
1335
2412
3324
4434
\n", "
" ], "text/plain": [ " Qu1 Qu2 Qu3\n", "0 1 2 1\n", "1 3 3 5\n", "2 4 1 2\n", "3 3 2 4\n", "4 4 3 4" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to compute a histogram on multiple related columns in a DataFrame\n", "\n", "data = DataFrame({'Qu1': [1,3,4,3,4],\n", " 'Qu2': [2,3,1,2,3],\n", " 'Qu3': [1,5,2,4,4]})\n", "data" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Qu1Qu2Qu3
11.01.01.0
20.02.01.0
32.02.00.0
42.00.02.0
50.00.01.0
\n", "
" ], "text/plain": [ " Qu1 Qu2 Qu3\n", "1 1.0 1.0 1.0\n", "2 0.0 2.0 1.0\n", "3 2.0 2.0 0.0\n", "4 2.0 0.0 2.0\n", "5 0.0 0.0 1.0" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = data.apply(pd.value_counts).fillna(0)\n", "result" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAD8CAYAAACB3pQWAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEfBJREFUeJzt3X+Q1PV9x/HXCyjoESA1TO74IXfGxBqdMjSpRNNOsxl/\n8aOjIbViC2qI0xoDSWYy0tRqyGYiWv/ADIlUw5SE6Bg1zdiENFiUSc5EGa2D3iTTyAQZDsRyNzKB\nEH4EPe7dP26F49i7/S5873Y/3PMxs+Pu9/ve776/H+5efudzu/txRAgAkKYRtW4AAHDqCHEASBgh\nDgAJI8QBIGGEOAAkjBAHgIRVDHHbU23/1Pb/2v6V7c/3U/cN21ttt9mekX+rAIC+RmWo6ZL0xYho\ns/0uSZttPx0RW94psD1b0vkR8QHbH5H0kKRLB6dlAMA7Kl6JR0RHRLSV7h+Q9KqkKX3KrpX0cKnm\nRUkTbDfm3CsAoI+q5sRtt0iaIenFPrumSHq91+M3dHLQAwBylmU6RZJUmkr5gaQvlK7Iq2abz/gD\nwCmICJfbninEbY9ST4A/EhE/KlPyhqRzez2eWtpWrpEsLzmoisWiisVirdsYUralYpkdP5P08T7b\nivXx71QN25Ky9sxY9Koue37lfkdsVzPCyY1bf+ohL3r+TcvLOp3ybUm/joiV/exfJ+mm0otdKmlf\nRHRW0yQAoHoVr8Rt/4WkBZJ+ZfsV9fwv/l8kNUuKiFgdEettz7H9mqSDkhYNZtMAgB4VQzwinpc0\nMkPdklw6GgKFQqHWLdSPllo3UEdaat1A/eB35Lh6HwsP5byV7ThT5slS0++ceDnF9OYzc5kTL6d4\n5o9F1vMbrnPi9cD26f1hEwAGQ0tLi3bs2FHrNupGc3Oz2tvbq3oOIQ6gZnbs2MEVey8DvQulP3wB\nFgAkjBAHgIQR4gCQMEIcABJGiANAwghxAHWlqalFtgft1tTUUlU/a9eu1fTp0zV27FhNnjxZixcv\n1v79+zM9d9WqVbrkkkt01lln6dOf/vQpjEZlhDiAutLZuUM9H1YanFvP8bNZsWKF7rjjDq1YsUL7\n9+/XCy+8oPb2dl111VU6evRoxedPmTJFX/7yl3XLLbdkfs1qEeIAUMbvf/97FYtFPfDAA7ryyis1\ncuRITZs2Td///ve1fft2Pfroo1q0aJGWLVt27DnPPvuszj33+Be6fuITn9A111yjc845Z9D6JMQB\noIxNmzbpyJEjmjdv3gnbx44dq9mzZ+vpp58u++GcU/nAzukgxAGgjD179mjixIkaMeLkmJw0aZL2\n7NlTg65ORogDQBkTJ07Unj171N3dfdK+3bt3a+LEiTXo6mSEOACUcdlll2nMmDF68sknT9h+4MAB\nPfXUUyoUCmpoaNChQ4eO7du9e/dQt0mIA0A548eP17Jly/S5z31OGzZsUFdXl9rb2zV//ny9973v\n1YIFCzRjxgytX79ee/fuVUdHh1auPHHxs6NHj+oPf/iDjh49qq6uLh05ciTTu1qqQYgDqCuNjc3q\n+Ubywbn1HD+bpUuX6p577tHtt9+ucePG6X3ve58OHz6sZ555RmeffbZuvPFGTZ8+XS0tLZo1a5Zu\nuOGGE55/9913q6GhQffdd58effRRNTQ0aPny5ac8NuWwKMQwwaIQJ1QzFsera7ooRGmxg4xHrb3v\nfve7WrZsmZ5//nlNnTo19+P3Nx4sCgEAObj55ps1atQobdq0Sddff32t25FEiANAVRYsWFDrFk7A\nnDgAJIwQB4CEEeIAkDBCHAASRogDQMIIcQBIGCEOAAkjxAHUlaapTYO7PNvUplqfYq74sA+AutL5\nRmf2r0U4leMXO6uqX7t2re6//35t27ZNEyZM0Lx583Tvvfdq/PjxAz7vrbfe0mc/+1lt3LhRe/fu\n1fnnn6977rlHs2bNOp32T8KVOAD043TW2Ozq6tK0adP0i1/8Qr/73e/0ta99Tddff7127tyZa4+E\nOACUcbprbDY0NGjZsmXHHs+dO1fnnXeeNm/enGufhDgAlJH3GpudnZ3aunWrLr744lz7JMQBoIw8\n19js6urSwoUL9alPfUoXXHBBnm0S4gBQTl5rbEaEFi5cqDFjxuib3/xm3m0S4gBQTl5rbN5yyy3a\ns2ePnnzySY0cOTL3PnmLIYC60jilseq3AVZ7/Cx6r7E5btw4XX755dq1a5cWL158bI1NSbr//vt1\n55136siRIyetsfmZz3xGW7Zs0caNGzV69Ojcz0XiShxAnenY1aGIGLRbx66OzL2czhqbO3fu1OrV\nq9XW1qbGxkaNGzdO48eP12OPPZbreLHG5jDBGpsnVDMWx6tZY7MKrLEJAAljjU0ASBxrbAIAckOI\nA0DCCHEASBghDgAJI8QBIGGEOAAkjBAHUFdamgZ3ebaWpuqWZ1u7dq2mT5+usWPHavLkyVq8eLH2\n79+f6bk33nijJk2apHe/+9268MILtWbNmlMZkgFVDHHba2x32v5lP/s/Znuf7ZdLt7ty7xLAsLGj\ns1MhDdptR2f272U5nZV9JOmOO+7Q9u3btW/fPq1bt0533XWXXnnllcyvn0WWK/HvSLq6Qs3PI+JD\npdvdOfQFADV1uiv7SNJFF12ks846S1LP1xDY1rZt23Lts2KIR8RzkvZWKCu/lAUAJCqvlX0WL16s\nsWPH6oMf/KAmT56sOXPm5NpnXnPil9lus/0T2xfldEwAqJm8VvZZtWqVDhw4oOeee06f/OQnNWbM\nmFz7zOO7UzZLmhYRh2zPlvRDSf2uP1QsFo/dLxQKKhQKObQAAPnqvbJP3yCvZmUfqefq/KMf/age\neeQRPfjgg1qyZMmA9a2trWptbc107NMO8Yg40Ov+U7b/zfY5EfHbcvW9QxwA6lXvlX2uu+66Y9vf\nWdln+fLlamtrq7iyT29dXV2Z5sT7XuB+9atf7bc263SK1c+8t+3GXvdnquc7yssGOACkovfKPhs2\nbFBXV5fa29s1f/78Yyv7zJgxQ+vXr9fevXvV0dFxwso+b775pp544gkdPHhQ3d3d2rBhgx5//HFd\nccUVufZZ8Urc9vckFSS9x/ZOSV+RNFpSRMRqSdfZvk3S25IOS5qfa4cAhpXmxka5ircBnsrxs1q6\ndKkmTpyo22+/Xa+99pqOHDmiQqFwwso+GzduVEtLi8477zwtWrRIK1askNQzhfLggw/qtttuU3d3\nt5qbm7Vy5UrNnTs31/NhZZ9hgpV9TqhmLI5Xs7JPFVjZBwASxso+AJA4VvYBAOSGEAeAhBHiAJAw\nQhwAEsYfNgHUTHNzc9kvkRqumpubq34OIQ6gZtrb22vdQvKYTgGAhBHiAJAwQhwAEkaIA0DCCHEA\nSBghDgAJI8QBIGGEOAAkjBAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACSPEASBhhDgAJIwQB4CE\nEeIAkDBCHAASRogDQMIIcQBIGCEOAAkjxAEgYYQ4ACSMEAeAhBHiAJAwQhwAEkaIA0DCCHEASBgh\nDgAJI8QBIGGEOAAkjBAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACSPEASBhFUPc9hrbnbZ/OUDN\nN2xvtd1me0a+LQIA+pPlSvw7kq7ub6ft2ZLOj4gPSLpV0kM59QYAqKBiiEfEc5L2DlByraSHS7Uv\nSppguzGf9gAAA8ljTnyKpNd7PX6jtA0AMMhGDfULFovFY/cLhYIKhUJVz29qalFn545MtSNGj1D3\nW92ZahtGjNCh7my1zY2Nau/oyFSbojGSbGeqPdPHAiinpalJOzo7M9Weyu9Ia2urWltbM9U6IioX\n2c2SfhwR08vse0jSzyLiidLjLZI+FhEnnaHtyPJ6FXqRlPUYlooZS4tVHVWnex5DzT6zx2JQfy7q\n4PyqUe1YZD0/28n9XAyWoR4L24qIsldWWadTXLqVs07STaUXulTSvnIBDgDIX8XpFNvfk1SQ9B7b\nOyV9RdJoSRERqyNive05tl+TdFDSosFsGABwXMUQj4i/z1CzJJ92AADV4BObAJAwQhwAEkaIA0DC\nCHEASBghDgAJI8QBIGGEOAAkjBAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACSPEASBhhDgAJIwQ\nB4CEEeIAkDBCHAASRogDQMIIcQBIGCEOAAkjxAEgYYQ4ACSMEAeAhBHiAJAwQhwAEkaIA0DCCHEA\nSBghDgAJI8QBIGGEOAAkjBAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACSPEASBhhDgAJIwQB4CE\nEeIAkDBCHAASRogDQMIIcQBIGCEOAAkjxAEgYYQ4ACQsU4jbnmV7i+3f2P5Smf0fs73P9sul2135\ntwoA6GtUpQLbIyQ9IOlySf8n6SXbP4qILX1Kfx4R1wxCjwCAfmS5Ep8paWtE7IiItyU9LunaMnXO\ntTMAQEVZQnyKpNd7Pd5V2tbXZbbbbP/E9kW5dAcAGFDF6ZSMNkuaFhGHbM+W9ENJF5QrLBaLx+4X\nCgUVCoWcWgCAM0Nra6taW1sz1ToiBi6wL5VUjIhZpcf/LCki4r4BnrNd0ocj4rd9tkel16vYsC0p\n6zEsFTOWFqs6qk73PIaafWaPxaD+XNTB+VWj2rHIen62k/u5GCxDPRa2FRFlp6yzTKe8JOn9tptt\nj5Z0g6R1fV6gsdf9mer5n8NvBQAYVBWnUyLiqO0lkp5WT+iviYhXbd/asztWS7rO9m2S3pZ0WNL8\nwWwaANAj05x4RPy3pD/ps+1bve6vkrQq39YAAJXwiU0ASBghDgAJI8QBIGGEOAAkjBAHgIQR4gCQ\nMEIcABJGiANAwghxAEgYIQ4ACSPEASBhhDgAJIwQB4CEEeIAkDBCHAASRogDQMIIcQBIGCEOAAkj\nxAEgYYQ4ACSMEAeAhBHiAJAwQhwAEkaIA0DCCHEASBghDgAJI8QBIGGEOAAkjBAHgIQR4gCQMEIc\nABJGiANAwghxAEgYIQ4ACSPEASBhhDgAJIwQB4CEEeIAkDBCHAASRogDQMIIcQBIGCEOAAkjxAEg\nYYQ4ACSMEAeAhBHiAJCwTCFue5btLbZ/Y/tL/dR8w/ZW2222Z+TbZs6217qBOsJYHMdYHNPa2lrr\nFupGvY9FxRC3PULSA5KulnSxpL+zfWGfmtmSzo+ID0i6VdJDg9Brftpr3UAdaa91A3WkvdYN1I96\nD66hVO9jkeVKfKakrRGxIyLelvS4pGv71Fwr6WFJiogXJU2w3ZhrpwCAk2QJ8SmSXu/1eFdp20A1\nb5SpAQDkzBExcIH9N5Kujoh/LD1eKGlmRHy+V82PJd0bEZtKjzdK+qeIeLnPsQZ+MQBAWRHhcttH\nZXjuG5Km9Xo8tbStb825FWr6bQIAcGqyTKe8JOn9tpttj5Z0g6R1fWrWSbpJkmxfKmlfRHTm2ikA\n4CQVr8Qj4qjtJZKeVk/or4mIV23f2rM7VkfEettzbL8m6aCkRYPbNgBAyjAnDgCoX3xiEwASRogD\nQMII8WHE9oW2L7f9rj7bZ9Wqp1qxPdP2JaX7F9n+ou05te6rHth+uNY91APbf1n6ubiq1r0MZFjP\nidteFBHfqXUfQ8H25yUtlvSqpBmSvhARPyrtezkiPlTL/oaS7a9Imq2eP+w/I+kjkn4m6UpJGyJi\neQ3bG1K2+77TzJI+LumnkhQR1wx5UzVi+38iYmbp/j+o5/flPyVdJenHEfGvteyvP8M9xHdGxLTK\nlemz/StJl0XEAdstkn4g6ZGIWGn7lYj4s5o2OIRKYzFD0hhJHZKmRsR+22dLejEipte0wSFk+2VJ\nv5b075JCPSH+mHreSqyIeLZ23Q2t3r8Htl+SNCci3rQ9VtILEfGnte2wvCwf9kma7V/2t0vScPp+\nlxERcUCSIqLddkHSD2w3q2cshpOuiDgq6ZDtbRGxX5Ii4rDt7hr3NtT+XNIXJN0paWlEtNk+PJzC\nu5cRtv9YPdPMIyPiTUmKiIO2u2rbWv/O+BBXT1BfLWlvn+2WtGno26mZTtszIqJNkkpX5H8t6duS\n6vIKYxC9ZbshIg5J+vA7G21PkDSsQjwiuiV93fZ/lP7bqeGRC+VMkLRZPdkQtidFxO7S35Dq9kJn\nOPxj/Zekd70TXr3Zbh36dmrmJkknXE1ERJekm2x/qzYt1cxfRcQR6ViIveOPJN1cm5ZqKyJ2Sfpb\n23Ml7a91P7UQES397OqWNG8IW6nKsJ4TB4DU8RZDAEgYIQ4ACSPEASBhhDgAJOz/AYrGVGpCxduO\nAAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "result.plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Handling missing data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 aardvark\n", "1 artichoke\n", "2 NaN\n", "3 avocado\n", "dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])\n", "string_data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 True\n", "3 False\n", "dtype: bool" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data.isnull()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "string_data[0] = None" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 False\n", "2 True\n", "3 False\n", "dtype: bool" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data.isnull()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Filtering out Missing data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### dropna" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from numpy import nan as NA" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = Series([1, NA, 3.5, NA, 7])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "2 3.5\n", "4 7.0\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# = data[data.notnull()]\n", "\n", "data.dropna()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.05.63.0
11.0NaNNaN
2NaNNaNNaN
3NaN6.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 5.6 3.0\n", "1 1.0 NaN NaN\n", "2 NaN NaN NaN\n", "3 NaN 6.5 3.0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data =DataFrame([[1., 5.6, 3.], [1, NA, NA], \n", " [NA, NA, NA], [NA, 6.5, 3.]])\n", "data" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.05.63.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 5.6 3.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### how='all'\n", "only drop rows that are all NA" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.05.63.0
11.0NaNNaN
3NaN6.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 5.6 3.0\n", "1 1.0 NaN NaN\n", "3 NaN 6.5 3.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(how = 'all')" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0124
01.05.63.0NaN
11.0NaNNaNNaN
2NaNNaNNaNNaN
3NaN6.53.0NaN
\n", "
" ], "text/plain": [ " 0 1 2 4\n", "0 1.0 5.6 3.0 NaN\n", "1 1.0 NaN NaN NaN\n", "2 NaN NaN NaN NaN\n", "3 NaN 6.5 3.0 NaN" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Droping columns\n", "\n", "data[4] = NA\n", "data" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
01.05.63.0
11.0NaNNaN
2NaNNaNNaN
3NaN6.53.0
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 1.0 5.6 3.0\n", "1 1.0 NaN NaN\n", "2 NaN NaN NaN\n", "3 NaN 6.5 3.0" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.dropna(axis =1, how='all')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### thresh\n", "keep rows containing a certain number of observations" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.5718670.1457320.424709
10.6878450.3365390.215066
20.9982690.1183640.459211
30.0323190.8870720.534793
40.6742300.6043410.921974
50.8194320.4170090.265412
60.3918730.6678320.675651
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.571867 0.145732 0.424709\n", "1 0.687845 0.336539 0.215066\n", "2 0.998269 0.118364 0.459211\n", "3 0.032319 0.887072 0.534793\n", "4 0.674230 0.604341 0.921974\n", "5 0.819432 0.417009 0.265412\n", "6 0.391873 0.667832 0.675651" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame(np.random.rand(7, 3))\n", "df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.ix[:4, 1] = NA; df.ix[:2, 2] = NA" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.571867NaNNaN
10.687845NaNNaN
20.998269NaNNaN
30.032319NaN0.534793
40.674230NaN0.921974
50.8194320.4170090.265412
60.3918730.6678320.675651
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.571867 NaN NaN\n", "1 0.687845 NaN NaN\n", "2 0.998269 NaN NaN\n", "3 0.032319 NaN 0.534793\n", "4 0.674230 NaN 0.921974\n", "5 0.819432 0.417009 0.265412\n", "6 0.391873 0.667832 0.675651" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
30.032319NaN0.534793
40.674230NaN0.921974
50.8194320.4170090.265412
60.3918730.6678320.675651
\n", "
" ], "text/plain": [ " 0 1 2\n", "3 0.032319 NaN 0.534793\n", "4 0.674230 NaN 0.921974\n", "5 0.819432 0.417009 0.265412\n", "6 0.391873 0.667832 0.675651" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dropna(thresh=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filling Missing Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### fillna" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.5718670.0000000.000000
10.6878450.0000000.000000
20.9982690.0000000.000000
30.0323190.0000000.534793
40.6742300.0000000.921974
50.8194320.4170090.265412
60.3918730.6678320.675651
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.571867 0.000000 0.000000\n", "1 0.687845 0.000000 0.000000\n", "2 0.998269 0.000000 0.000000\n", "3 0.032319 0.000000 0.534793\n", "4 0.674230 0.000000 0.921974\n", "5 0.819432 0.417009 0.265412\n", "6 0.391873 0.667832 0.675651" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.5718670.500000NaN
10.6878450.500000NaN
20.9982690.500000NaN
30.0323190.5000000.534793
40.6742300.5000000.921974
50.8194320.4170090.265412
60.3918730.6678320.675651
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.571867 0.500000 NaN\n", "1 0.687845 0.500000 NaN\n", "2 0.998269 0.500000 NaN\n", "3 0.032319 0.500000 0.534793\n", "4 0.674230 0.500000 0.921974\n", "5 0.819432 0.417009 0.265412\n", "6 0.391873 0.667832 0.675651" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# use a different fill value for each column\n", "\n", "df.fillna({1: 0.5, 3: -1})" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# fillna returns a new object, but you can \n", "#modify the existing object\n", "\n", "_ = df.fillna(0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.5718670.0000000.000000
10.6878450.0000000.000000
20.9982690.0000000.000000
30.0323190.0000000.534793
40.6742300.0000000.921974
50.8194320.4170090.265412
60.3918730.6678320.675651
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.571867 0.000000 0.000000\n", "1 0.687845 0.000000 0.000000\n", "2 0.998269 0.000000 0.000000\n", "3 0.032319 0.000000 0.534793\n", "4 0.674230 0.000000 0.921974\n", "5 0.819432 0.417009 0.265412\n", "6 0.391873 0.667832 0.675651" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### interpolation methods" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.4400300.9429090.390700
10.3382240.6577350.457466
20.596110NaN0.124546
30.585392NaN0.692310
40.153977NaNNaN
50.775935NaNNaN
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.440030 0.942909 0.390700\n", "1 0.338224 0.657735 0.457466\n", "2 0.596110 NaN 0.124546\n", "3 0.585392 NaN 0.692310\n", "4 0.153977 NaN NaN\n", "5 0.775935 NaN NaN" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = DataFrame(np.random.rand(6, 3))\n", "df.ix[2:,1] = NA; df.ix[4:, 2] = NA\n", "df" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.4400300.9429090.390700
10.3382240.6577350.457466
20.5961100.6577350.124546
30.5853920.6577350.692310
40.1539770.6577350.692310
50.7759350.6577350.692310
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.440030 0.942909 0.390700\n", "1 0.338224 0.657735 0.457466\n", "2 0.596110 0.657735 0.124546\n", "3 0.585392 0.657735 0.692310\n", "4 0.153977 0.657735 0.692310\n", "5 0.775935 0.657735 0.692310" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(method ='ffill')" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
00.4400300.9429090.390700
10.3382240.6577350.457466
20.5961100.6577350.124546
30.5853920.6577350.692310
40.153977NaN0.692310
50.775935NaN0.692310
\n", "
" ], "text/plain": [ " 0 1 2\n", "0 0.440030 0.942909 0.390700\n", "1 0.338224 0.657735 0.457466\n", "2 0.596110 0.657735 0.124546\n", "3 0.585392 0.657735 0.692310\n", "4 0.153977 NaN 0.692310\n", "5 0.775935 NaN 0.692310" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(method ='ffill', limit=2)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.000000\n", "1 3.833333\n", "2 3.500000\n", "3 3.833333\n", "4 7.000000\n", "dtype: float64" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = Series([1, NA, 3.5, NA, 7])\n", "data.fillna(data.mean())" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Hierarchical Indexing" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1 0.527096\n", " 2 0.919174\n", " 3 0.877325\n", "b 1 0.062808\n", " 2 0.144843\n", " 3 0.605347\n", "c 1 0.593008\n", " 2 0.629646\n", "d 2 0.019268\n", " 3 0.600728\n", "dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = Series(np.random.rand(10), \n", " index =[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],\n", " [1, 2,3, 1, 2,3, 1, 2,2, 3]])\n", "data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],\n", " labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.index" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 1 0.062808\n", " 2 0.144843\n", " 3 0.605347\n", "c 1 0.593008\n", " 2 0.629646\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['b':'c']" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "b 1 0.062808\n", " 2 0.144843\n", " 3 0.605347\n", "d 2 0.019268\n", " 3 0.600728\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.ix[['b', 'd']]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 0.919174\n", "b 0.144843\n", "c 0.629646\n", "d 0.019268\n", "dtype: float64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[:, 2]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
123
a0.5270960.9191740.877325
b0.0628080.1448430.605347
c0.5930080.629646NaN
dNaN0.0192680.600728
\n", "
" ], "text/plain": [ " 1 2 3\n", "a 0.527096 0.919174 0.877325\n", "b 0.062808 0.144843 0.605347\n", "c 0.593008 0.629646 NaN\n", "d NaN 0.019268 0.600728" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# arrange it into a DataFrame\n", "\n", "data.unstack()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "a 1 0.527096\n", " 2 0.919174\n", " 3 0.877325\n", "b 1 0.062808\n", " 2 0.144843\n", " 3 0.605347\n", "c 1 0.593008\n", " 2 0.629646\n", "d 2 0.019268\n", " 3 0.600728\n", "dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.unstack().stack()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Either axis can have a hierarchical index" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OhioColorado
GreenRedGreen
a1012
2345
b1678
291011
\n", "
" ], "text/plain": [ " Ohio Colorado\n", " Green Red Green\n", "a 1 0 1 2\n", " 2 3 4 5\n", "b 1 6 7 8\n", " 2 9 10 11" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = DataFrame(np.arange(12).reshape((4,3)),\n", " index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],\n", " columns = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']])\n", "frame" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key1key2
a1012
2345
b1678
291011
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key1 key2 \n", "a 1 0 1 2\n", " 2 3 4 5\n", "b 1 6 7 8\n", " 2 9 10 11" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.index.names = ['key1', 'key2']\n", "frame.columns.names = ['state', 'color']\n", "frame" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colorGreenRed
key1key2
a101
234
b167
2910
\n", "
" ], "text/plain": [ "color Green Red\n", "key1 key2 \n", "a 1 0 1\n", " 2 3 4\n", "b 1 6 7\n", " 2 9 10" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame['Ohio']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### MultiIndex" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],\n", " labels=[[1, 1, 0], [0, 1, 0]],\n", " names=['state', 'color'])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],\n", " names =['state', 'color'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reordering and Sorting Levels" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### swaplevel" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key2key1
1a012
2a345
1b678
2b91011
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key2 key1 \n", "1 a 0 1 2\n", "2 a 3 4 5\n", "1 b 6 7 8\n", "2 b 9 10 11" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.swaplevel('key1', 'key2')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### sortlevel" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key1key2
a1012
b1678
a2345
b291011
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key1 key2 \n", "a 1 0 1 2\n", "b 1 6 7 8\n", "a 2 3 4 5\n", "b 2 9 10 11" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sortlevel(1)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key2key1
1a012
2a345
1b678
2b91011
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key2 key1 \n", "1 a 0 1 2\n", "2 a 3 4 5\n", "1 b 6 7 8\n", "2 b 9 10 11" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.swaplevel(0,1).sortlevel(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summary Statistics by Level" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### level\n", "you can specify the level you want to (sum) by on a particular axis" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateOhioColorado
colorGreenRedGreen
key2
16810
2121416
\n", "
" ], "text/plain": [ "state Ohio Colorado\n", "color Green Red Green\n", "key2 \n", "1 6 8 10\n", "2 12 14 16" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sum(level = 'key2')" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
colorGreenRed
key1key2
a121
284
b1147
22010
\n", "
" ], "text/plain": [ "color Green Red\n", "key1 key2 \n", "a 1 2 1\n", " 2 8 4\n", "b 1 14 7\n", " 2 20 10" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.sum(level = 'color', axis= 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using a DataFrame's Columns" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
007one0
116one1
225one2
334two0
443two1
552two2
661two3
\n", "
" ], "text/plain": [ " a b c d\n", "0 0 7 one 0\n", "1 1 6 one 1\n", "2 2 5 one 2\n", "3 3 4 two 0\n", "4 4 3 two 1\n", "5 5 2 two 2\n", "6 6 1 two 3" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),\n", " 'c': ['one', 'one','one', 'two', 'two', 'two', 'two'],\n", " 'd': [0,1,2,0,1,2,3]})\n", "frame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### set_index" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ab
cd
one007
116
225
two034
143
252
361
\n", "
" ], "text/plain": [ " a b\n", "c d \n", "one 0 0 7\n", " 1 1 6\n", " 2 2 5\n", "two 0 3 4\n", " 1 4 3\n", " 2 5 2\n", " 3 6 1" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame2 = frame.set_index(['c', 'd'])\n", "frame2" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abcd
cd
one007one0
116one1
225one2
two034two0
143two1
252two2
361two3
\n", "
" ], "text/plain": [ " a b c d\n", "c d \n", "one 0 0 7 one 0\n", " 1 1 6 one 1\n", " 2 2 5 one 2\n", "two 0 3 4 two 0\n", " 1 4 3 two 1\n", " 2 5 2 two 2\n", " 3 6 1 two 3" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# but still leave the colums\n", "\n", "frame.set_index(['c', 'd'], drop=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### reset_index" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cdab
0one007
1one116
2one225
3two034
4two143
5two252
6two361
\n", "
" ], "text/plain": [ " c d a b\n", "0 one 0 0 7\n", "1 one 1 1 6\n", "2 one 2 2 5\n", "3 two 0 3 4\n", "4 two 1 4 3\n", "5 two 2 5 2\n", "6 two 3 6 1" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# and the opposite, move an index to the colums\n", "\n", "frame2.reset_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Other pandas Topics " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### iloc\n", "usually used when you have integers as index to avoid confusion " ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ser3 = Series(range(3), index=[-5, 1, 3])\n", "ser3.iloc[2]" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "frame = DataFrame(np.arange(6).reshape(3, 2), index =[2, 0, 1])" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 1\n", "Name: 2, dtype: int64" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.iloc[0]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2 0\n", "0 2\n", "1 4\n", "Name: 0, dtype: int64" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.iloc[:, 0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Panel data\n", "three dimensional analogue of DataFrame" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas.io.data as web" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "Dimensions: 4 (items) x 1611 (major_axis) x 6 (minor_axis)\n", "Items axis: AAPL to MSFT\n", "Major_axis axis: 2010-01-04 00:00:00 to 2016-04-28 00:00:00\n", "Minor_axis axis: Open to Adj Close" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))\n", " for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))\n", "pdata" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pdata = pdata.swapaxes('items', 'minor')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLDELLGOOGMSFT
Date
2010-01-0428.31319514.06528313.06246826.227603
2010-01-0528.36214514.38450311.68384426.236076
2010-01-0627.91100814.10397303.82668526.075067
2010-01-0727.85941214.23940296.75374925.803894
2010-01-0828.04463014.36516300.70980825.981851
...............
2016-04-22105.680000NaN718.77002051.779999
2016-04-25105.080002NaN723.15002452.110001
2016-04-26104.349998NaN708.14001551.439999
2016-04-2797.820000NaN705.84002750.939999
2016-04-2894.830002NaN691.02002049.900002
\n", "

1611 rows × 4 columns

\n", "
" ], "text/plain": [ " AAPL DELL GOOG MSFT\n", "Date \n", "2010-01-04 28.313195 14.06528 313.062468 26.227603\n", "2010-01-05 28.362145 14.38450 311.683844 26.236076\n", "2010-01-06 27.911008 14.10397 303.826685 26.075067\n", "2010-01-07 27.859412 14.23940 296.753749 25.803894\n", "2010-01-08 28.044630 14.36516 300.709808 25.981851\n", "... ... ... ... ...\n", "2016-04-22 105.680000 NaN 718.770020 51.779999\n", "2016-04-25 105.080002 NaN 723.150024 52.110001\n", "2016-04-26 104.349998 NaN 708.140015 51.439999\n", "2016-04-27 97.820000 NaN 705.840027 50.939999\n", "2016-04-28 94.830002 NaN 691.020020 49.900002\n", "\n", "[1611 rows x 4 columns]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdata['Adj Close']" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OpenHighLowCloseVolumeAdj Close
AAPL569.159996572.650009560.520012560.989983130246900.074.218116
DELL12.15000012.30000012.04500012.07000019397600.011.675920
GOOG571.790972572.650996568.350996570.9810006138700.0285.205295
MSFT28.76000028.95999928.44000128.45000156634300.025.598227
\n", "
" ], "text/plain": [ " Open High Low Close Volume Adj Close\n", "AAPL 569.159996 572.650009 560.520012 560.989983 130246900.0 74.218116\n", "DELL 12.150000 12.300000 12.045000 12.070000 19397600.0 11.675920\n", "GOOG 571.790972 572.650996 568.350996 570.981000 6138700.0 285.205295\n", "MSFT 28.760000 28.959999 28.440001 28.450001 56634300.0 25.598227" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdata.ix[:, '6/1/2012', :]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AAPLDELLGOOGMSFT
Date
2012-05-2273.68628214.58765300.10041226.776915
2012-05-2375.48421112.08221304.42610626.192070
2012-05-2474.79097312.04351301.52897826.156079
2012-05-2574.39010512.05319295.47005026.147081
2012-05-28NaN12.05319NaNNaN
...............
2016-04-22105.680000NaN718.77002051.779999
2016-04-25105.080002NaN723.15002452.110001
2016-04-26104.349998NaN708.14001551.439999
2016-04-2797.820000NaN705.84002750.939999
2016-04-2894.830002NaN691.02002049.900002
\n", "

1004 rows × 4 columns

\n", "
" ], "text/plain": [ " AAPL DELL GOOG MSFT\n", "Date \n", "2012-05-22 73.686282 14.58765 300.100412 26.776915\n", "2012-05-23 75.484211 12.08221 304.426106 26.192070\n", "2012-05-24 74.790973 12.04351 301.528978 26.156079\n", "2012-05-25 74.390105 12.05319 295.470050 26.147081\n", "2012-05-28 NaN 12.05319 NaN NaN\n", "... ... ... ... ...\n", "2016-04-22 105.680000 NaN 718.770020 51.779999\n", "2016-04-25 105.080002 NaN 723.150024 52.110001\n", "2016-04-26 104.349998 NaN 708.140015 51.439999\n", "2016-04-27 97.820000 NaN 705.840027 50.939999\n", "2016-04-28 94.830002 NaN 691.020020 49.900002\n", "\n", "[1004 rows x 4 columns]" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdata.ix['Adj Close', '5/22/2012':, :]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### to_frame, to_panel" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OpenHighLowCloseVolumeAdj Close
Dateminor
2012-05-30AAPL569.199997579.989990566.559990579.169998132357400.076.623304
DELL12.59000012.70000012.46000012.56000019787800.012.149920
GOOG588.161028591.901014583.530999588.2309923827600.0293.821674
MSFT29.35000029.48000029.12000129.34000041585500.026.399015
2012-05-31AAPL580.740021581.499985571.460022577.730019122918600.076.432797
........................
2016-04-27GOOG707.289978708.979980692.364990705.8400273086700.0705.840027
MSFT51.48000051.50000050.54999950.93999943145900.050.939999
2016-04-28AAPL97.61000197.87999794.25000094.83000281990700.094.830002
GOOG708.260010714.169983689.549988691.0200202851100.0691.020020
MSFT50.61999950.77000049.56000149.90000242758200.049.900002
\n", "

3325 rows × 6 columns

\n", "
" ], "text/plain": [ " Open High Low Close Volume \\\n", "Date minor \n", "2012-05-30 AAPL 569.199997 579.989990 566.559990 579.169998 132357400.0 \n", " DELL 12.590000 12.700000 12.460000 12.560000 19787800.0 \n", " GOOG 588.161028 591.901014 583.530999 588.230992 3827600.0 \n", " MSFT 29.350000 29.480000 29.120001 29.340000 41585500.0 \n", "2012-05-31 AAPL 580.740021 581.499985 571.460022 577.730019 122918600.0 \n", "... ... ... ... ... ... \n", "2016-04-27 GOOG 707.289978 708.979980 692.364990 705.840027 3086700.0 \n", " MSFT 51.480000 51.500000 50.549999 50.939999 43145900.0 \n", "2016-04-28 AAPL 97.610001 97.879997 94.250000 94.830002 81990700.0 \n", " GOOG 708.260010 714.169983 689.549988 691.020020 2851100.0 \n", " MSFT 50.619999 50.770000 49.560001 49.900002 42758200.0 \n", "\n", " Adj Close \n", "Date minor \n", "2012-05-30 AAPL 76.623304 \n", " DELL 12.149920 \n", " GOOG 293.821674 \n", " MSFT 26.399015 \n", "2012-05-31 AAPL 76.432797 \n", "... ... \n", "2016-04-27 GOOG 705.840027 \n", " MSFT 50.939999 \n", "2016-04-28 AAPL 94.830002 \n", " GOOG 691.020020 \n", " MSFT 49.900002 \n", "\n", "[3325 rows x 6 columns]" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stacked = pdata.ix[:, '5/30/2012':, :].to_frame()\n", "stacked" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }