{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"%pylab --no-import-all inline"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pandas import DataFrame, Series, Index\n",
"import pandas as pd"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# check that CENSUS_KEY is defined\n",
"\n",
"import settings\n",
"# fail early, with a readable message, if the key is missing\n",
"assert settings.CENSUS_KEY is not None, \"settings.CENSUS_KEY must be set to your Census API key\""
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Census API documentation provides example URLs, but they only work once your API key has been added. In this notebook, we'll use the IPython notebook's HTML display mechanism to render those examples as links that already include the key.\n"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# http://api.census.gov/data/2010/sf1/geo.html\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.core.display import HTML"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"HTML(\"\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"%%HTML\n",
"hi there"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import urlparse\n",
"import urllib\n",
"from IPython.core.display import HTML\n",
"\n",
"def add_census_key(url, api_key=settings.CENSUS_KEY):\n",
"    \"\"\"Return `url` with its `key` query parameter set to `api_key`.\n",
"\n",
"    The query string is parsed into a dict, the `key` entry is set\n",
"    (replacing any existing one), and the URL is put back together.\n",
"    \"\"\"\n",
"    # the query string is the 5th element of the parsed tuple (index 4)\n",
"    url_parts = list(urlparse.urlparse(url))\n",
"\n",
"    # convert the query from string to dict;\n",
"    # see http://stackoverflow.com/a/10233141/7782 for the meaning of doseq\n",
"    query_params = urlparse.parse_qs(url_parts[4])\n",
"    query_params[\"key\"] = api_key\n",
"    url_parts[4] = urllib.urlencode(query_params, doseq=True)\n",
"\n",
"    return urlparse.urlunparse(url_parts)\n",
"\n",
"\n",
"def c_url(url, title=None, api_key=settings.CENSUS_KEY):\n",
"    \"\"\"Return an IPython HTML link to `url` with the Census API key added.\n",
"\n",
"    url     -- example Census API URL, without a key\n",
"    title   -- link text; defaults to the original, key-free `url`\n",
"    api_key -- key handed on to add_census_key\n",
"    \"\"\"\n",
"    url_with_key = add_census_key(url, api_key)\n",
"    if title is None:\n",
"        title = url\n",
"    # the href carries the key; the visible link text does not\n",
"    return HTML('<a href=\"{url}\">{title}</a>'.format(url=url_with_key, title=title))\n",
"\n",
"#add_census_key(\"http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*\")\n",
"c_url(\"http://api.census.gov/data/2010/sf1?get=NAME,P0010001&for=state:*\")"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Scraping the examples"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import requests\n",
"from lxml.html import parse, fromstring\n",
"\n",
"# Fetch the page that lists example Census API calls and pull out its table rows.\n",
"url = \"http://api.census.gov/data/2010/sf1/geo.html\"\n",
"response = requests.get(url)\n",
"# fail loudly on an HTTP error instead of parsing an error page as if it were the table\n",
"response.raise_for_status()\n",
"r = response.content\n",
"doc = fromstring(r)\n",
"\n",
"rows = doc.xpath(\"//table/tr\")\n",
"\n",
"# first row is the header\n",
"\n",
"headers = [col.text for col in rows[0].findall('th')]\n",
"headers\n",
"\n",
"# next rows are the census URL examples\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Look at the first row after the header row.\n",
"row = rows[1]\n",
"cols = row.findall('td')\n",
"\n",
"# cols[0]: Summary Level\n",
"# cols[1]: Description\n",
"\n",
"for col_index in (0, 1):\n",
"    print(cols[col_index].text)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from itertools import islice\n",
"from lxml.html import parse\n",
"\n",
"# let's actually now decorate the urls\n",
"\n",
"def decorated_parse_examples(examples, api_key=settings.CENSUS_KEY):\n",
"    \"\"\"Yield a copy of each example row with its URLs rendered as HTML links.\n",
"\n",
"    examples -- iterable of dicts in which the value stored under\n",
"                headers[2] is an iterable of example URLs\n",
"    api_key  -- Census API key added to every link target\n",
"\n",
"    NOTE: reads the notebook-level `headers` list, which must already be defined.\n",
"    \"\"\"\n",
"    urls_header = headers[2]\n",
"    # the href carries the key; the visible link text is the key-free URL\n",
"    link_template = '<a href=\"{url_with_key}\">{url}</a>'\n",
"\n",
"    for row in examples:\n",
"        # work on a copy so the caller's row is left untouched\n",
"        new_row = row.copy()\n",
"        new_row[urls_header] = \"<br/>\".join(\n",
"            link_template.format(url=url, url_with_key=add_census_key(url, api_key))\n",
"            for url in new_row[urls_header]\n",
"        )\n",
"        yield new_row\n",
" \n",
"def parse_urls_col(col):\n",
"    \"\"\"Return, as a list, every item produced by `col.itertext()`.\"\"\"\n",
"    # approach taken from http://stackoverflow.com/a/15074386/7782\n",
"    return list(col.itertext())\n",
"\n",
"def parse_census_examples(url=\"http://api.census.gov/data/2010/sf1/geo.html\"):\n",
"    \"\"\"Yield one dict per example row of the table(s) found at `url`.\n",
"\n",
"    url -- page to parse; defaults to the 2010 SF1 geography examples page\n",
"\n",
"    Each dict maps the first three header texts to, respectively, the text\n",
"    of the row's first cell, the text of its second cell, and the list\n",
"    returned by parse_urls_col for its third cell.\n",
"    \"\"\"\n",
"    doc = parse(url)\n",
"\n",
"    rows = doc.xpath(\"//table/tr\")\n",
"    if not rows:\n",
"        # nothing to yield when the page has no table rows\n",
"        return\n",
"\n",
"    # first row is the header\n",
"    headers = [col.text for col in rows[0].findall('th')]\n",
"\n",
"    for row in rows[1:]:\n",
"        cols = row.findall('td')\n",
"        if len(cols) < 3:\n",
"            # skip rows lacking the three expected <td> cells rather than raising IndexError\n",
"            continue\n",
"        yield {\n",
"            headers[0]: cols[0].text,\n",
"            headers[1]: cols[1].text,\n",
"            headers[2]: parse_urls_col(cols[2]),\n",
"        }\n",
"\n",
"#parsed_examples = list(islice(parse_census_examples(),None))\n",
"parsed_examples = parse_census_examples()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# let's redisplay the table with the key-decorated example URLs\n",
"\n",
"from IPython.display import HTML\n",
"from jinja2 import Template\n",
"\n",
"URLS_TEMPLATE= \"\"\"\n",
"
{{header}} | \n", " {% endfor %}\n", "
---|
{{row[header]}} | \n", " {% endfor %}\n", "