{
 "metadata": {
  "name": "",
  "signature": "sha256:a8ab151540de39baa3706b4bfc39a6130a4caec520d0fc188221a1017c8e45e3"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "What this notebook is about"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Goal:\n",
      "    \n",
      "  * to learn about the basics of Pandas groupby operations using the census information about states\n",
      " \n",
      "References:\n",
      "\n",
      "* [Group By: split-apply-combine \u2014 pandas 0.13.1 documentation](http://pandas.pydata.org/pandas-docs/stable/groupby.html)\n",
      "\n",
      "* [Python for Data Analysis > 9. Data Aggregation and Group Operations > GroupBy Mechanics : Safari Books Online](http://my.safaribooksonline.com/book/programming/python/9781449323592/9dot-data-aggregation-and-group-operations/id2805988)\n",
      "\n",
      "Note the **split-apply-combine** framework of thinking\n",
      "\n",
      "\n",
      " "
     ]
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Version of pandas"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# note version of pandas\n",
      "import pandas\n",
      "pandas.__version__"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 1,
       "text": [
        "'0.13.0'"
       ]
      }
     ],
     "prompt_number": 1
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Setup for grabbing state-related census data"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import us\n",
      "import census\n",
      "import settings\n",
      "\n",
      "import pandas as pd\n",
      "import numpy as np\n",
      "from pandas import DataFrame, Series\n",
      "from itertools import islice\n",
      "\n",
      "c = census.Census(settings.CENSUS_KEY)\n",
      "\n",
      "def states(variables='NAME'):\n",
      "    geo={'for':'state:*'}\n",
      "    \n",
      "    states_fips = set([state.fips for state in us.states.STATES])\n",
      "    # need to filter out non-states\n",
      "    for r in c.sf1.get(variables, geo=geo, year=2010):\n",
      "        if r['state'] in states_fips:\n",
      "            yield r"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# make a dataframe from the total populations of states in the 2010 Census\n",
      "\n",
      "df = DataFrame(states('NAME,P0010001'))\n",
      "df.P0010001 = df.P0010001.astype('int')\n",
      "df.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>NAME</th>\n",
        "      <th>P0010001</th>\n",
        "      <th>state</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>    Alabama</td>\n",
        "      <td>  4779736</td>\n",
        "      <td> 01</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>     Alaska</td>\n",
        "      <td>   710231</td>\n",
        "      <td> 02</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>    Arizona</td>\n",
        "      <td>  6392017</td>\n",
        "      <td> 04</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>   Arkansas</td>\n",
        "      <td>  2915918</td>\n",
        "      <td> 05</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> California</td>\n",
        "      <td> 37253956</td>\n",
        "      <td> 06</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 3 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "         NAME  P0010001 state\n",
        "0     Alabama   4779736    01\n",
        "1      Alaska    710231    02\n",
        "2     Arizona   6392017    04\n",
        "3    Arkansas   2915918    05\n",
        "4  California  37253956    06\n",
        "\n",
        "[5 rows x 3 columns]"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# check that that we have the right total population\n",
      "\n",
      "df.P0010001.sum() == 308745538 "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "True"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# add a column with the first letter \n",
      "# we'll be grouping states based on the first letter of the state NAME\n",
      "\n",
      "df['first_letter'] = df.NAME.apply(lambda s:s[0])\n",
      "df.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>NAME</th>\n",
        "      <th>P0010001</th>\n",
        "      <th>state</th>\n",
        "      <th>first_letter</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>    Alabama</td>\n",
        "      <td>  4779736</td>\n",
        "      <td> 01</td>\n",
        "      <td> A</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>     Alaska</td>\n",
        "      <td>   710231</td>\n",
        "      <td> 02</td>\n",
        "      <td> A</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>    Arizona</td>\n",
        "      <td>  6392017</td>\n",
        "      <td> 04</td>\n",
        "      <td> A</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>   Arkansas</td>\n",
        "      <td>  2915918</td>\n",
        "      <td> 05</td>\n",
        "      <td> A</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> California</td>\n",
        "      <td> 37253956</td>\n",
        "      <td> 06</td>\n",
        "      <td> C</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 4 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "         NAME  P0010001 state first_letter\n",
        "0     Alabama   4779736    01            A\n",
        "1      Alaska    710231    02            A\n",
        "2     Arizona   6392017    04            A\n",
        "3    Arkansas   2915918    05            A\n",
        "4  California  37253956    06            C\n",
        "\n",
        "[5 rows x 4 columns]"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "[Group By: split-apply-combine \u2014 pandas 0.13.1 documentation](http://pandas.pydata.org/pandas-docs/stable/groupby.html)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Possible to do grouping in [many ways](http://pandas.pydata.org/pandas-docs/stable/groupby.html#splitting-an-object-into-groups).  \"The mapping can be specified many different ways\":\n",
      "\n",
      "* A Python function, to be called on each of the axis labels\n",
      "* A list or NumPy array of the same length as the selected axis\n",
      "* A dict or Series, providing a label -> group name mapping\n",
      "* For DataFrame objects, a string indicating a column to be used to group. Of course df.groupby('A') is just syntactic sugar for df.groupby(df['A']), but it makes life simpler\n",
      "* A list of any of the above things\n",
      "\n",
      "\n",
      "We can also group by columns, axis=1, but such functionality is not demonstrated in this notebook."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# we can explicitly name df.first_letter\n",
      "\n",
      "grouped = df.groupby(df.first_letter)\n",
      "grouped"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "<pandas.core.groupby.DataFrameGroupBy object at 0x106212350>"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# shorthand for df.first_letter\n",
      "# same thing as df.groupby(df.first_letter)\n",
      "\n",
      "grouped = df.groupby('first_letter')\n",
      "grouped"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "<pandas.core.groupby.DataFrameGroupBy object at 0x106203f10>"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# count the number of states with each first letter -- look at top of the resulting DataFrame\n",
      "\n",
      "grouped.count().head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>NAME</th>\n",
        "      <th>P0010001</th>\n",
        "      <th>state</th>\n",
        "      <th>first_letter</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>first_letter</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>A</th>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>C</th>\n",
        "      <td> 3</td>\n",
        "      <td> 3</td>\n",
        "      <td> 3</td>\n",
        "      <td> 3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>D</th>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>F</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>G</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 4 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "              NAME  P0010001  state  first_letter\n",
        "first_letter                                     \n",
        "A                4         4      4             4\n",
        "C                3         3      3             3\n",
        "D                2         2      2             2\n",
        "F                1         1      1             1\n",
        "G                1         1      1             1\n",
        "\n",
        "[5 rows x 4 columns]"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# count the number of states with each first letter -- look at bottom of the resulting DataFrame\n",
      "\n",
      "grouped.count().tail()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>NAME</th>\n",
        "      <th>P0010001</th>\n",
        "      <th>state</th>\n",
        "      <th>first_letter</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>first_letter</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>S</th>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>T</th>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>U</th>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>V</th>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>W</th>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "      <td> 4</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 4 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "              NAME  P0010001  state  first_letter\n",
        "first_letter                                     \n",
        "S                2         2      2             2\n",
        "T                2         2      2             2\n",
        "U                1         1      1             1\n",
        "V                2         2      2             2\n",
        "W                4         4      4             4\n",
        "\n",
        "[5 rows x 4 columns]"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# we didn't have to explicitly create a new column -- we could groupby on a dynamically generated Series\n",
      "# note the use of str operations on df.NAME: \n",
      "# http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods\n",
      "\n",
      "df.groupby(df.NAME.str.slice(0,1)).sum()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>P0010001</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>NAME</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>A</th>\n",
        "      <td> 14797902</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>C</th>\n",
        "      <td> 45857249</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>D</th>\n",
        "      <td>  1499657</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>F</th>\n",
        "      <td> 18801310</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>G</th>\n",
        "      <td>  9687653</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>H</th>\n",
        "      <td>  1360301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>I</th>\n",
        "      <td> 23928371</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>K</th>\n",
        "      <td>  7192485</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>L</th>\n",
        "      <td>  4533372</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>M</th>\n",
        "      <td> 38782746</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>N</th>\n",
        "      <td> 46280611</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>O</th>\n",
        "      <td> 19118929</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>P</th>\n",
        "      <td> 12702379</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>R</th>\n",
        "      <td>  1052567</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>S</th>\n",
        "      <td>  5439544</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>T</th>\n",
        "      <td> 31491666</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>U</th>\n",
        "      <td>  2763885</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>V</th>\n",
        "      <td>  8626765</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>W</th>\n",
        "      <td> 14828146</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>19 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "      P0010001\n",
        "NAME          \n",
        "A     14797902\n",
        "C     45857249\n",
        "D      1499657\n",
        "F     18801310\n",
        "G      9687653\n",
        "H      1360301\n",
        "I     23928371\n",
        "K      7192485\n",
        "L      4533372\n",
        "M     38782746\n",
        "N     46280611\n",
        "O     19118929\n",
        "P     12702379\n",
        "R      1052567\n",
        "S      5439544\n",
        "T     31491666\n",
        "U      2763885\n",
        "V      8626765\n",
        "W     14828146\n",
        "\n",
        "[19 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# we can get groups of indexes\n",
      "\n",
      "df.groupby('first_letter').groups"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 11,
       "text": [
        "{u'A': [0, 1, 2, 3],\n",
        " u'C': [4, 5, 6],\n",
        " u'D': [7, 8],\n",
        " u'F': [9],\n",
        " u'G': [10],\n",
        " u'H': [11],\n",
        " u'I': [12, 13, 14, 15],\n",
        " u'K': [16, 17],\n",
        " u'L': [18],\n",
        " u'M': [19, 20, 21, 22, 23, 24, 25, 26],\n",
        " u'N': [27, 28, 29, 30, 31, 32, 33, 34],\n",
        " u'O': [35, 36, 37],\n",
        " u'P': [38],\n",
        " u'R': [39],\n",
        " u'S': [40, 41],\n",
        " u'T': [42, 43],\n",
        " u'U': [44],\n",
        " u'V': [45, 46],\n",
        " u'W': [47, 48, 49, 50]}"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# loop through all groups --> here just the first one\n",
      "\n",
      "for name, group in islice(df.groupby('first_letter'),1):\n",
      "    print(name)\n",
      "    print type(group) # yes -- a DataFrame\n",
      "    print group.index\n",
      "    print(group),"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "A\n",
        "<class 'pandas.core.frame.DataFrame'>\n",
        "Int64Index([0, 1, 2, 3], dtype='int64')\n",
        "       NAME  P0010001 state first_letter\n",
        "0   Alabama   4779736    01            A\n",
        "1    Alaska    710231    02            A\n",
        "2   Arizona   6392017    04            A\n",
        "3  Arkansas   2915918    05            A\n",
        "\n",
        "[4 rows x 4 columns]\n"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# how about accessing group 'C'?\n",
      "# http://stackoverflow.com/a/14734627/7782\n",
      "# http://stackoverflow.com/questions/19804282/in-pandas-is-there-something-like-a-groupby-get-group-but-with-an-optional-defa\n",
      "# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.core.groupby.GroupBy.get_group.html\n",
      "\n",
      "grouped = df.groupby('first_letter')\n",
      "grouped.get_group('C')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>NAME</th>\n",
        "      <th>P0010001</th>\n",
        "      <th>state</th>\n",
        "      <th>first_letter</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td>  California</td>\n",
        "      <td> 37253956</td>\n",
        "      <td> 06</td>\n",
        "      <td> C</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
        "      <td>    Colorado</td>\n",
        "      <td>  5029196</td>\n",
        "      <td> 08</td>\n",
        "      <td> C</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
        "      <td> Connecticut</td>\n",
        "      <td>  3574097</td>\n",
        "      <td> 09</td>\n",
        "      <td> C</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>3 rows \u00d7 4 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 13,
       "text": [
        "          NAME  P0010001 state first_letter\n",
        "4   California  37253956    06            C\n",
        "5     Colorado   5029196    08            C\n",
        "6  Connecticut   3574097    09            C\n",
        "\n",
        "[3 rows x 4 columns]"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# total population of states starting with 'C'\n",
      "\n",
      "grouped.get_group('C').P0010001.sum()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 14,
       "text": [
        "45857249"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Aggregation"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# generate a Series of total populations by first letter\n",
      "\n",
      "grouped = df.groupby('first_letter')\n",
      "s = grouped['P0010001'].sum()\n",
      "s"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 15,
       "text": [
        "first_letter\n",
        "A               14797902\n",
        "C               45857249\n",
        "D                1499657\n",
        "F               18801310\n",
        "G                9687653\n",
        "H                1360301\n",
        "I               23928371\n",
        "K                7192485\n",
        "L                4533372\n",
        "M               38782746\n",
        "N               46280611\n",
        "O               19118929\n",
        "P               12702379\n",
        "R                1052567\n",
        "S                5439544\n",
        "T               31491666\n",
        "U                2763885\n",
        "V                8626765\n",
        "W               14828146\n",
        "Name: P0010001, dtype: int64"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sort the list to get the most populous groups\n",
      "# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort.html\n",
      "# note sorting in place\n",
      "\n",
      "s.sort(ascending=False)\n",
      "s"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 16,
       "text": [
        "first_letter\n",
        "N               46280611\n",
        "C               45857249\n",
        "M               38782746\n",
        "T               31491666\n",
        "I               23928371\n",
        "O               19118929\n",
        "F               18801310\n",
        "W               14828146\n",
        "A               14797902\n",
        "P               12702379\n",
        "G                9687653\n",
        "V                8626765\n",
        "K                7192485\n",
        "S                5439544\n",
        "L                4533372\n",
        "U                2763885\n",
        "D                1499657\n",
        "H                1360301\n",
        "R                1052567\n",
        "Name: P0010001, dtype: int64"
       ]
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "s.index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 17,
       "text": [
        "Index([u'N', u'C', u'M', u'T', u'I', u'O', u'F', u'W', u'A', u'P', u'G', u'V', u'K', u'S', u'L', u'U', u'D', u'H', u'R'], dtype='object')"
       ]
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# first pass at pulling together the letter, the total population, and the list of corresponding states\n",
      "\n",
      "for k in s.index:\n",
      "    print k, s[k], list(grouped.get_group(k).NAME)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "N 46280611 [u'Nebraska', u'Nevada', u'New Hampshire', u'New Jersey', u'New Mexico', u'New York', u'North Carolina', u'North Dakota']\n",
        "C 45857249 [u'California', u'Colorado', u'Connecticut']\n",
        "M 38782746 [u'Maine', u'Maryland', u'Massachusetts', u'Michigan', u'Minnesota', u'Mississippi', u'Missouri', u'Montana']\n",
        "T 31491666 [u'Tennessee', u'Texas']\n",
        "I 23928371 [u'Idaho', u'Illinois', u'Indiana', u'Iowa']\n",
        "O 19118929 [u'Ohio', u'Oklahoma', u'Oregon']\n",
        "F 18801310 [u'Florida']\n",
        "W 14828146 [u'Washington', u'West Virginia', u'Wisconsin', u'Wyoming']\n",
        "A 14797902 [u'Alabama', u'Alaska', u'Arizona', u'Arkansas']\n",
        "P 12702379 [u'Pennsylvania']\n",
        "G 9687653 [u'Georgia']\n",
        "V 8626765 [u'Vermont', u'Virginia']\n",
        "K 7192485 [u'Kansas', u'Kentucky']\n",
        "S 5439544 [u'South Carolina', u'South Dakota']\n",
        "L 4533372 [u'Louisiana']\n",
        "U 2763885 [u'Utah']\n",
        "D 1499657 [u'Delaware', u'District of Columbia']\n",
        "H 1360301 [u'Hawaii']\n",
        "R 1052567 [u'Rhode Island']\n"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# let's try this again and try to do this in a more idiomatic Pandas fashion\n",
      "# ideally, generate a DataFrame with a NAME column that is a list of all states with the same first letter\n",
      "\n",
      "# It turns out that apply can come to the rescue\n",
      "# here' calculate the list of NAMEs\n",
      "\n",
      "# http://pandas.pydata.org/pandas-docs/dev/groupby.html#flexible-apply\n",
      "# http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc\n",
      "\n",
      "df.groupby(\"first_letter\").apply(lambda x: list(x['NAME']))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 19,
       "text": [
        "first_letter\n",
        "A                            [Alabama, Alaska, Arizona, Arkansas]\n",
        "C                             [California, Colorado, Connecticut]\n",
        "D                                [Delaware, District of Columbia]\n",
        "F                                                       [Florida]\n",
        "G                                                       [Georgia]\n",
        "H                                                        [Hawaii]\n",
        "I                                [Idaho, Illinois, Indiana, Iowa]\n",
        "K                                              [Kansas, Kentucky]\n",
        "L                                                     [Louisiana]\n",
        "M               [Maine, Maryland, Massachusetts, Michigan, Min...\n",
        "N               [Nebraska, Nevada, New Hampshire, New Jersey, ...\n",
        "O                                        [Ohio, Oklahoma, Oregon]\n",
        "P                                                  [Pennsylvania]\n",
        "R                                                  [Rhode Island]\n",
        "S                                  [South Carolina, South Dakota]\n",
        "T                                              [Tennessee, Texas]\n",
        "U                                                          [Utah]\n",
        "V                                             [Vermont, Virginia]\n",
        "W                 [Washington, West Virginia, Wisconsin, Wyoming]\n",
        "dtype: object"
       ]
      }
     ],
     "prompt_number": 19
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# apply can be used to add up the populations by group\n",
      "\n",
      "df.groupby(\"first_letter\").apply(lambda x: np.sum(x['P0010001']))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 20,
       "text": [
        "first_letter\n",
        "A               14797902\n",
        "C               45857249\n",
        "D                1499657\n",
        "F               18801310\n",
        "G                9687653\n",
        "H                1360301\n",
        "I               23928371\n",
        "K                7192485\n",
        "L                4533372\n",
        "M               38782746\n",
        "N               46280611\n",
        "O               19118929\n",
        "P               12702379\n",
        "R                1052567\n",
        "S                5439544\n",
        "T               31491666\n",
        "U                2763885\n",
        "V                8626765\n",
        "W               14828146\n",
        "dtype: int64"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# make a tuple out of the list of names and the population\n",
      "\n",
      "df.groupby(\"first_letter\").apply(lambda x:( list(x['NAME']), np.sum(x['P0010001'])))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 21,
       "text": [
        "first_letter\n",
        "A                ([Alabama, Alaska, Arizona, Arkansas], 14797902)\n",
        "C                 ([California, Colorado, Connecticut], 45857249)\n",
        "D                     ([Delaware, District of Columbia], 1499657)\n",
        "F                                           ([Florida], 18801310)\n",
        "G                                            ([Georgia], 9687653)\n",
        "H                                             ([Hawaii], 1360301)\n",
        "I                    ([Idaho, Illinois, Indiana, Iowa], 23928371)\n",
        "K                                   ([Kansas, Kentucky], 7192485)\n",
        "L                                          ([Louisiana], 4533372)\n",
        "M               ([Maine, Maryland, Massachusetts, Michigan, Mi...\n",
        "N               ([Nebraska, Nevada, New Hampshire, New Jersey,...\n",
        "O                            ([Ohio, Oklahoma, Oregon], 19118929)\n",
        "P                                      ([Pennsylvania], 12702379)\n",
        "R                                       ([Rhode Island], 1052567)\n",
        "S                       ([South Carolina, South Dakota], 5439544)\n",
        "T                                  ([Tennessee, Texas], 31491666)\n",
        "U                                               ([Utah], 2763885)\n",
        "V                                  ([Vermont, Virginia], 8626765)\n",
        "W               ([Washington, West Virginia, Wisconsin, Wyomin...\n",
        "dtype: object"
       ]
      }
     ],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# remind ourselves on how to turn a tuple into a Series with a small example\n",
      "\n",
      "Series(([1,2],2), index=['one','two'])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 22,
       "text": [
        "one    [1, 2]\n",
        "two         2\n",
        "dtype: object"
       ]
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# we're ready to make a new DataFrame\n",
      "\n",
      "df.groupby(\"first_letter\").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])), \n",
      "                                                 index=['states','total_pop']))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>states</th>\n",
        "      <th>total_pop</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>first_letter</th>\n",
        "      <th></th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>A</th>\n",
        "      <td>              [Alabama, Alaska, Arizona, Arkansas]</td>\n",
        "      <td> 14797902</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>C</th>\n",
        "      <td>               [California, Colorado, Connecticut]</td>\n",
        "      <td> 45857249</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>D</th>\n",
        "      <td>                  [Delaware, District of Columbia]</td>\n",
        "      <td>  1499657</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>F</th>\n",
        "      <td>                                         [Florida]</td>\n",
        "      <td> 18801310</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>G</th>\n",
        "      <td>                                         [Georgia]</td>\n",
        "      <td>  9687653</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>H</th>\n",
        "      <td>                                          [Hawaii]</td>\n",
        "      <td>  1360301</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>I</th>\n",
        "      <td>                  [Idaho, Illinois, Indiana, Iowa]</td>\n",
        "      <td> 23928371</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>K</th>\n",
        "      <td>                                [Kansas, Kentucky]</td>\n",
        "      <td>  7192485</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>L</th>\n",
        "      <td>                                       [Louisiana]</td>\n",
        "      <td>  4533372</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>M</th>\n",
        "      <td> [Maine, Maryland, Massachusetts, Michigan, Min...</td>\n",
        "      <td> 38782746</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>N</th>\n",
        "      <td> [Nebraska, Nevada, New Hampshire, New Jersey, ...</td>\n",
        "      <td> 46280611</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>O</th>\n",
        "      <td>                          [Ohio, Oklahoma, Oregon]</td>\n",
        "      <td> 19118929</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>P</th>\n",
        "      <td>                                    [Pennsylvania]</td>\n",
        "      <td> 12702379</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>R</th>\n",
        "      <td>                                    [Rhode Island]</td>\n",
        "      <td>  1052567</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>S</th>\n",
        "      <td>                    [South Carolina, South Dakota]</td>\n",
        "      <td>  5439544</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>T</th>\n",
        "      <td>                                [Tennessee, Texas]</td>\n",
        "      <td> 31491666</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>U</th>\n",
        "      <td>                                            [Utah]</td>\n",
        "      <td>  2763885</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>V</th>\n",
        "      <td>                               [Vermont, Virginia]</td>\n",
        "      <td>  8626765</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>W</th>\n",
        "      <td>   [Washington, West Virginia, Wisconsin, Wyoming]</td>\n",
        "      <td> 14828146</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>19 rows \u00d7 2 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 23,
       "text": [
        "                                                         states  total_pop\n",
        "first_letter                                                              \n",
        "A                          [Alabama, Alaska, Arizona, Arkansas]   14797902\n",
        "C                           [California, Colorado, Connecticut]   45857249\n",
        "D                              [Delaware, District of Columbia]    1499657\n",
        "F                                                     [Florida]   18801310\n",
        "G                                                     [Georgia]    9687653\n",
        "H                                                      [Hawaii]    1360301\n",
        "I                              [Idaho, Illinois, Indiana, Iowa]   23928371\n",
        "K                                            [Kansas, Kentucky]    7192485\n",
        "L                                                   [Louisiana]    4533372\n",
        "M             [Maine, Maryland, Massachusetts, Michigan, Min...   38782746\n",
        "N             [Nebraska, Nevada, New Hampshire, New Jersey, ...   46280611\n",
        "O                                      [Ohio, Oklahoma, Oregon]   19118929\n",
        "P                                                [Pennsylvania]   12702379\n",
        "R                                                [Rhode Island]    1052567\n",
        "S                                [South Carolina, South Dakota]    5439544\n",
        "T                                            [Tennessee, Texas]   31491666\n",
        "U                                                        [Utah]    2763885\n",
        "V                                           [Vermont, Virginia]    8626765\n",
        "W               [Washington, West Virginia, Wisconsin, Wyoming]   14828146\n",
        "\n",
        "[19 rows x 2 columns]"
       ]
      }
     ],
     "prompt_number": 23
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df2 = df.groupby(\"first_letter\").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])), \n",
      "                                                 index=['states','total_pop'])).sort_index(by='total_pop',ascending=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 24
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# make sure you understand the syntax here:\n",
      "# .ix: http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-indexing-with-ix\n",
      "# ability to grab columns by name to return a new DataFrame\n",
      "\n",
      "df2.ix['C'][['states','total_pop']]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 25,
       "text": [
        "states       [California, Colorado, Connecticut]\n",
        "total_pop                               45857249\n",
        "Name: C, dtype: object"
       ]
      }
     ],
     "prompt_number": 25
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"states that start with 'C'\", df2.ix['C']['states']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "states that start with 'C' [u'California', u'Colorado', u'Connecticut']\n"
       ]
      }
     ],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print \"total population of states that start with 'C'\", df2.ix['C']['total_pop']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "total population of states that start with 'C' 45857249\n"
       ]
      }
     ],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 27
    }
   ],
   "metadata": {}
  }
 ]
}