{ "metadata": { "name": "", "signature": "sha256:533ad596008b3b6dc22f84b97cf2b92c150f08391d00dbc30deabd21a8bc80d9" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Purpose of This Notebook" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* how to use `apply` on a pandas `Series` and `DataFrame`\n", "* show a bit about how `lambda` functions work" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# numpy and pandas related imports \n", "\n", "import numpy as np\n", "from pandas import Series, DataFrame\n", "import pandas as pd\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Setup: create Series and DataFrames" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's make two Series and a DataFrame to use for our example" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# for example, using lower and uppercase English letters\n", "\n", "import string\n", "string.lowercase, string.uppercase" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ "('abcdefghijklmnopqrstuvwxyz', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')" ] } ], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "# we can make a list composed of the individual lowercase letters \n", "\n", "list(string.lowercase)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "['a',\n", " 'b',\n", " 'c',\n", " 'd',\n", " 'e',\n", " 'f',\n", " 'g',\n", " 'h',\n", " 'i',\n", " 'j',\n", " 'k',\n", " 'l',\n", " 'm',\n", " 'n',\n", " 'o',\n", " 'p',\n", " 'q',\n", " 'r',\n", " 's',\n", " 't',\n", " 'u',\n", " 'v',\n", " 'w',\n", " 'x',\n", " 'y',\n", " 'z']" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ 
"# create a pandas Series out of the list of lowercase letters\n", "\n", "lower = Series(list(string.lowercase), name='lower')\n", "print type(lower)\n", "lower.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "<class 'pandas.core.series.Series'>\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "0 a\n", "1 b\n", "2 c\n", "3 d\n", "4 e\n", "Name: lower, dtype: object" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "# create a pandas Series out of the list of uppercase letters\n", "\n", "upper = Series(list(string.uppercase), name='upper')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# concatenate the two Series as columns, using axis=1 \n", "# axis=0 would instead stack them end to end into a single 52-element Series, not a DataFrame\n", "\n", "df = pd.concat((lower, upper), axis=1)\n", "df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lowerupper
0 a A
1 b B
2 c C
3 d D
4 e E
\n", "

5 rows \u00d7 2 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ " lower upper\n", "0 a A\n", "1 b B\n", "2 c C\n", "3 d D\n", "4 e E\n", "\n", "[5 rows x 2 columns]" ] } ], "prompt_number": 6 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Using apply" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Series.apply" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.apply.html:\n", " \n", " Series.apply(func, convert_dtype=True, args=(), **kwds)\n", "\n", " Invoke function on values of Series." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Let's start by using Series.apply\n", "# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.apply.html\n", "\n", "# first of all, it's useful to find a way to use apply to return the exact same Series\n", "\n", "def identity(s):\n", " return s\n", "\n", "lower.apply(identity)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "0 a\n", "1 b\n", "2 c\n", "3 d\n", "4 e\n", "5 f\n", "6 g\n", "7 h\n", "8 i\n", "9 j\n", "10 k\n", "11 l\n", "12 m\n", "13 n\n", "14 o\n", "15 p\n", "16 q\n", "17 r\n", "18 s\n", "19 t\n", "20 u\n", "21 v\n", "22 w\n", "23 x\n", "24 y\n", "25 z\n", "Name: lower, dtype: object" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# show that identity yields the same Series -- first on element by element basis\n", "\n", "lower.apply(identity) == lower" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "0 True\n", "1 True\n", "2 True\n", "3 True\n", "4 True\n", "5 True\n", "6 True\n", "7 True\n", "8 True\n", "9 True\n", "10 True\n", "11 True\n", "12 True\n", "13 True\n", "14 True\n", "15 True\n", "16 True\n", "17 True\n", "18 True\n", "19 True\n", "20 True\n", "21 
True\n", "22 True\n", "23 True\n", "24 True\n", "25 True\n", "Name: lower, dtype: bool" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Check that match happens for every element in the Series using numpy.all\n", "# http://docs.scipy.org/doc/numpy/reference/generated/numpy.all.html\n", "\n", "np.all(lower.apply(identity) == lower)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "True" ] } ], "prompt_number": 9 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Let's use `lambda`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sometimes it's convenient to write functions using `lambda`, especially short functions for doing a simple transformation of the parameters. Only functions whose body is a single expression can be rewritten with `lambda`. " ] }, { "cell_type": "code", "collapsed": false, "input": [ "def add_preface(s):\n", " return 'letter ' + s\n", "\n", "lower.apply(add_preface)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "0 letter a\n", "1 letter b\n", "2 letter c\n", "3 letter d\n", "4 letter e\n", "5 letter f\n", "6 letter g\n", "7 letter h\n", "8 letter i\n", "9 letter j\n", "10 letter k\n", "11 letter l\n", "12 letter m\n", "13 letter n\n", "14 letter o\n", "15 letter p\n", "16 letter q\n", "17 letter r\n", "18 letter s\n", "19 letter t\n", "20 letter u\n", "21 letter v\n", "22 letter w\n", "23 letter x\n", "24 letter y\n", "25 letter z\n", "Name: lower, dtype: object" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "# rewrite with lambda\n", "\n", "lower.apply(lambda s: 'letter ' + s)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "0 letter a\n", "1 letter b\n", "2 letter c\n", "3 letter d\n", "4 letter e\n", "5 letter f\n", "6 letter 
g\n", "7 letter h\n", "8 letter i\n", "9 letter j\n", "10 letter k\n", "11 letter l\n", "12 letter m\n", "13 letter n\n", "14 letter o\n", "15 letter p\n", "16 letter q\n", "17 letter r\n", "18 letter s\n", "19 letter t\n", "20 letter u\n", "21 letter v\n", "22 letter w\n", "23 letter x\n", "24 letter y\n", "25 letter z\n", "Name: lower, dtype: object" ] } ], "prompt_number": 11 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Another illustration of apply" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another illustration of using `apply` -- using `ord` and `chr`" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# ord: Given a string of length one, return an integer representing the Unicode code \n", "# point of the character when the argument is a unicode object, or the value of the \n", "# byte when the argument is an 8-bit string. \n", "# http://docs.python.org/2.7/library/functions.html#ord\n", "\n", "ord('a')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "97" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# chr: Return a string of one character whose ASCII code is the integer i.\n", "# http://docs.python.org/2.7/library/functions.html#chr\n", "\n", "chr(97)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ "'a'" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "# show that for the case of 'a', chr(ord()) returns what we start with:'a'\n", "\n", "chr(ord('a')) == 'a'" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ "True" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# we can test whether chr reverses ord for all the lower case letters\n", "# note how we chain two apply 
together\n", "\n", "np.all(lower.apply(ord).apply(chr) == lower)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ "True" ] } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that we read off a specific series from the DataFrame" ] }, { "cell_type": "code", "collapsed": false, "input": [ "type(df.upper)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ "pandas.core.series.Series" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "# transform\n", "df.upper.apply(lambda s: s.lower())" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ "0 a\n", "1 b\n", "2 c\n", "3 d\n", "4 e\n", "5 f\n", "6 g\n", "7 h\n", "8 i\n", "9 j\n", "10 k\n", "11 l\n", "12 m\n", "13 n\n", "14 o\n", "15 p\n", "16 q\n", "17 r\n", "18 s\n", "19 t\n", "20 u\n", "21 v\n", "22 w\n", "23 x\n", "24 y\n", "25 z\n", "Name: upper, dtype: object" ] } ], "prompt_number": 17 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "DataFrame.apply" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`apply` can also be applied to a `DataFrame`\n", "\n", "http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.apply.html\n", "\n", " DataFrame.apply(func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds)\n", " Applies function along input axis of DataFrame.\n", "\n", " Objects passed to functions are Series objects having index either the DataFrame\u2019s index (axis=0) or the columns (axis=1). Return type depends on whether passed function aggregates, or the reduce argument if the DataFrame is empty." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "# let's show that applying the identity function along columns (axis=0) or along rows (axis=1) \n", "# gives the same result\n", "\n", "def identity(s):\n", " return s\n", "\n", "np.all(df.apply(identity, axis=0) == df.apply(identity, axis=1))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ "True" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "# for each column, first lower and then upper, return the index\n", "\n", "def index(s):\n", " return s.index\n", "\n", "df.apply(index, axis=0)\n" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lowerupper
0 0 0
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
6 6 6
7 7 7
8 8 8
9 9 9
10 10 10
11 11 11
12 12 12
13 13 13
14 14 14
15 15 15
16 16 16
17 17 17
18 18 18
19 19 19
20 20 20
21 21 21
22 22 22
23 23 23
24 24 24
25 25 25
\n", "

26 rows \u00d7 2 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ " lower upper\n", "0 0 0\n", "1 1 1\n", "2 2 2\n", "3 3 3\n", "4 4 4\n", "5 5 5\n", "6 6 6\n", "7 7 7\n", "8 8 8\n", "9 9 9\n", "10 10 10\n", "11 11 11\n", "12 12 12\n", "13 13 13\n", "14 14 14\n", "15 15 15\n", "16 16 16\n", "17 17 17\n", "18 18 18\n", "19 19 19\n", "20 20 20\n", "21 21 21\n", "22 22 22\n", "23 23 23\n", "24 24 24\n", "25 25 25\n", "\n", "[26 rows x 2 columns]" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "# for each row (axis=1), first lower and then upper, return the index \n", "# (which are the column names)\n", "\n", "def index(s):\n", " return s.index\n", "\n", "df.apply(index, axis=1)\n" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lowerupper
0 lower upper
1 lower upper
2 lower upper
3 lower upper
4 lower upper
5 lower upper
6 lower upper
7 lower upper
8 lower upper
9 lower upper
10 lower upper
11 lower upper
12 lower upper
13 lower upper
14 lower upper
15 lower upper
16 lower upper
17 lower upper
18 lower upper
19 lower upper
20 lower upper
21 lower upper
22 lower upper
23 lower upper
24 lower upper
25 lower upper
\n", "

26 rows \u00d7 2 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 20, "text": [ " lower upper\n", "0 lower upper\n", "1 lower upper\n", "2 lower upper\n", "3 lower upper\n", "4 lower upper\n", "5 lower upper\n", "6 lower upper\n", "7 lower upper\n", "8 lower upper\n", "9 lower upper\n", "10 lower upper\n", "11 lower upper\n", "12 lower upper\n", "13 lower upper\n", "14 lower upper\n", "15 lower upper\n", "16 lower upper\n", "17 lower upper\n", "18 lower upper\n", "19 lower upper\n", "20 lower upper\n", "21 lower upper\n", "22 lower upper\n", "23 lower upper\n", "24 lower upper\n", "25 lower upper\n", "\n", "[26 rows x 2 columns]" ] } ], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "# it might be easier to see the difference between axis=0 vs axis=1\n", "# by using join\n", "\n", "# Consider what you get with\n", "\n", "\"\".join(df.lower)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 21, "text": [ "'abcdefghijklmnopqrstuvwxyz'" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "# Now compare (axis=0)\n", "\n", "df.apply(lambda s: \"\".join(s), axis=0)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ "lower abcdefghijklmnopqrstuvwxyz\n", "upper ABCDEFGHIJKLMNOPQRSTUVWXYZ\n", "dtype: object" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "# join with axis=1\n", "\n", "df.apply(lambda s: \"\".join(s), axis=1)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ "0 aA\n", "1 bB\n", "2 cC\n", "3 dD\n", "4 eE\n", "5 fF\n", "6 gG\n", "7 hH\n", "8 iI\n", "9 jJ\n", "10 kK\n", "11 lL\n", "12 mM\n", "13 nN\n", "14 oO\n", "15 pP\n", "16 qQ\n", "17 rR\n", "18 sS\n", "19 tT\n", "20 uU\n", "21 vV\n", "22 wW\n", "23 xX\n", "24 yY\n", "25 zZ\n", "dtype: object" ] } 
], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "# note that you can use the index labels (with axis=1, the column names) in the function passed to apply\n", "\n", "df.apply(lambda s: s['upper'] + s['lower'], axis=1)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "0 Aa\n", "1 Bb\n", "2 Cc\n", "3 Dd\n", "4 Ee\n", "5 Ff\n", "6 Gg\n", "7 Hh\n", "8 Ii\n", "9 Jj\n", "10 Kk\n", "11 Ll\n", "12 Mm\n", "13 Nn\n", "14 Oo\n", "15 Pp\n", "16 Qq\n", "17 Rr\n", "18 Ss\n", "19 Tt\n", "20 Uu\n", "21 Vv\n", "22 Ww\n", "23 Xx\n", "24 Yy\n", "25 Zz\n", "dtype: object" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }