{ "metadata": { "name": "", "signature": "sha256:a8ab151540de39baa3706b4bfc39a6130a4caec520d0fc188221a1017c8e45e3" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "What this notebook is about" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Goal:\n", " \n", " * to learn about the basics of Pandas groupby operations using the census information about states\n", " \n", "References:\n", "\n", "* [Group By: split-apply-combine \u2014 pandas 0.13.1 documentation](http://pandas.pydata.org/pandas-docs/stable/groupby.html)\n", "\n", "* [Python for Data Analysis > 9. Data Aggregation and Group Operations > GroupBy Mechanics : Safari Books Online](http://my.safaribooksonline.com/book/programming/python/9781449323592/9dot-data-aggregation-and-group-operations/id2805988)\n", "\n", "Note the **split-apply-combine** framework of thinking\n", "\n", "\n", " " ] }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Version of pandas" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# note version of pandas\n", "import pandas\n", "pandas.__version__" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 1, "text": [ "'0.13.0'" ] } ], "prompt_number": 1 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Setup for grabbing state-related census data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import us\n", "import census\n", "import settings\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from pandas import DataFrame, Series\n", "from itertools import islice\n", "\n", "c = census.Census(settings.CENSUS_KEY)\n", "\n", "def states(variables='NAME'):\n", " geo={'for':'state:*'}\n", " \n", " states_fips = set([state.fips for state in us.states.STATES])\n", " # need to filter out non-states\n", " for r in c.sf1.get(variables, geo=geo, year=2010):\n", " if r['state'] in states_fips:\n", " yield r" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "# make a dataframe from the total populations of states in the 2010 Census\n", "\n", "df = DataFrame(states('NAME,P0010001'))\n", "df.P0010001 = df.P0010001.astype('int')\n", "df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | NAME | \n", "P0010001 | \n", "state | \n", "
---|---|---|---|
0 | \n", "Alabama | \n", "4779736 | \n", "01 | \n", "
1 | \n", "Alaska | \n", "710231 | \n", "02 | \n", "
2 | \n", "Arizona | \n", "6392017 | \n", "04 | \n", "
3 | \n", "Arkansas | \n", "2915918 | \n", "05 | \n", "
4 | \n", "California | \n", "37253956 | \n", "06 | \n", "
5 rows \u00d7 3 columns
\n", "\n", " | NAME | \n", "P0010001 | \n", "state | \n", "first_letter | \n", "
---|---|---|---|---|
0 | \n", "Alabama | \n", "4779736 | \n", "01 | \n", "A | \n", "
1 | \n", "Alaska | \n", "710231 | \n", "02 | \n", "A | \n", "
2 | \n", "Arizona | \n", "6392017 | \n", "04 | \n", "A | \n", "
3 | \n", "Arkansas | \n", "2915918 | \n", "05 | \n", "A | \n", "
4 | \n", "California | \n", "37253956 | \n", "06 | \n", "C | \n", "
5 rows \u00d7 4 columns
\n", "\n", " | NAME | \n", "P0010001 | \n", "state | \n", "first_letter | \n", "
---|---|---|---|---|
first_letter | \n", "\n", " | \n", " | \n", " | \n", " |
A | \n", "4 | \n", "4 | \n", "4 | \n", "4 | \n", "
C | \n", "3 | \n", "3 | \n", "3 | \n", "3 | \n", "
D | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "
F | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
G | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
5 rows \u00d7 4 columns
\n", "\n", " | NAME | \n", "P0010001 | \n", "state | \n", "first_letter | \n", "
---|---|---|---|---|
first_letter | \n", "\n", " | \n", " | \n", " | \n", " |
S | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "
T | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "
U | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "
V | \n", "2 | \n", "2 | \n", "2 | \n", "2 | \n", "
W | \n", "4 | \n", "4 | \n", "4 | \n", "4 | \n", "
5 rows \u00d7 4 columns
\n", "\n", " | P0010001 | \n", "
---|---|
NAME | \n", "\n", " |
A | \n", "14797902 | \n", "
C | \n", "45857249 | \n", "
D | \n", "1499657 | \n", "
F | \n", "18801310 | \n", "
G | \n", "9687653 | \n", "
H | \n", "1360301 | \n", "
I | \n", "23928371 | \n", "
K | \n", "7192485 | \n", "
L | \n", "4533372 | \n", "
M | \n", "38782746 | \n", "
N | \n", "46280611 | \n", "
O | \n", "19118929 | \n", "
P | \n", "12702379 | \n", "
R | \n", "1052567 | \n", "
S | \n", "5439544 | \n", "
T | \n", "31491666 | \n", "
U | \n", "2763885 | \n", "
V | \n", "8626765 | \n", "
W | \n", "14828146 | \n", "
19 rows \u00d7 1 columns
\n", "\n", " | NAME | \n", "P0010001 | \n", "state | \n", "first_letter | \n", "
---|---|---|---|---|
4 | \n", "California | \n", "37253956 | \n", "06 | \n", "C | \n", "
5 | \n", "Colorado | \n", "5029196 | \n", "08 | \n", "C | \n", "
6 | \n", "Connecticut | \n", "3574097 | \n", "09 | \n", "C | \n", "
3 rows \u00d7 4 columns
\n", "\n", " | states | \n", "total_pop | \n", "
---|---|---|
first_letter | \n", "\n", " | \n", " |
A | \n", "[Alabama, Alaska, Arizona, Arkansas] | \n", "14797902 | \n", "
C | \n", "[California, Colorado, Connecticut] | \n", "45857249 | \n", "
D | \n", "[Delaware, District of Columbia] | \n", "1499657 | \n", "
F | \n", "[Florida] | \n", "18801310 | \n", "
G | \n", "[Georgia] | \n", "9687653 | \n", "
H | \n", "[Hawaii] | \n", "1360301 | \n", "
I | \n", "[Idaho, Illinois, Indiana, Iowa] | \n", "23928371 | \n", "
K | \n", "[Kansas, Kentucky] | \n", "7192485 | \n", "
L | \n", "[Louisiana] | \n", "4533372 | \n", "
M | \n", "[Maine, Maryland, Massachusetts, Michigan, Min... | \n", "38782746 | \n", "
N | \n", "[Nebraska, Nevada, New Hampshire, New Jersey, ... | \n", "46280611 | \n", "
O | \n", "[Ohio, Oklahoma, Oregon] | \n", "19118929 | \n", "
P | \n", "[Pennsylvania] | \n", "12702379 | \n", "
R | \n", "[Rhode Island] | \n", "1052567 | \n", "
S | \n", "[South Carolina, South Dakota] | \n", "5439544 | \n", "
T | \n", "[Tennessee, Texas] | \n", "31491666 | \n", "
U | \n", "[Utah] | \n", "2763885 | \n", "
V | \n", "[Vermont, Virginia] | \n", "8626765 | \n", "
W | \n", "[Washington, West Virginia, Wisconsin, Wyoming] | \n", "14828146 | \n", "
19 rows \u00d7 2 columns
\n", "