{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%pylab --no-import-all inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "from pandas import DataFrame, Series, Index\n",
      "import pandas as pd"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# check that CENSUS_KEY is defined\n",
      "\n",
      "import settings\n",
      "assert settings.CENSUS_KEY is not None"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The Census API documentation provides example URLs, but they need your API key to work.  In this notebook, we'll use the IPython notebook HTML display mechanism to turn them into working links.\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# http://api.census.gov/data/2010/sf1/geo.html\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from IPython.core.display import HTML"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "HTML(\"<iframe src='http://api.census.gov/data/2010/sf1/geo.html' width='800px'/>\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%%HTML\n",
      "<b>hi there</b>"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import urlparse\n",
      "import urllib\n",
      "from IPython.core.display import HTML\n",
      "\n",
      "def add_census_key(url, api_key=settings.CENSUS_KEY):\n",
      "    \"\"\"Return `url` with the query parameter \"key\" set to `api_key`.\n",
      "\n",
      "    The query string is parsed into a dict, the \"key\" entry is assigned\n",
      "    (overwriting any existing one) and the result is re-encoded into the URL.\n",
      "    \"\"\"\n",
      "    url_parts = list(urlparse.urlparse(url))\n",
      "\n",
      "    # the query string sits at index 4 of the parsed URL components\n",
      "    query_params = urlparse.parse_qs(url_parts[4])\n",
      "    query_params[\"key\"] = api_key\n",
      "\n",
      "    # doseq=True: encode each item of a value sequence as its own name=value pair\n",
      "    # (see http://stackoverflow.com/a/10233141/7782)\n",
      "    url_parts[4] = urllib.urlencode(query_params, doseq=True)\n",
      "\n",
      "    return urlparse.urlunparse(url_parts)\n",
      "\n",
      "\n",
      "def c_url(url, title=None, api_key=settings.CENSUS_KEY):\n",
      "    \"\"\"Build an HTML link to `url` with the Census API key added.\n",
      "\n",
      "    The link text is `title`, or the original (key-less) `url` when no title is given.\n",
      "    \"\"\"\n",
      "    link_text = url if title is None else title\n",
      "    keyed_url = add_census_key(url, api_key)\n",
      "    anchor = \"\"\"<a href=\"{url}\">{title}</a>\"\"\".format(url=keyed_url, title=link_text)\n",
      "    return HTML(anchor)\n",
      "\n",
      "#add_census_key(\"http://api.census.gov/data/2010/sf1?get=P0010001&for=county:*\")\n",
      "c_url(\"http://api.census.gov/data/2010/sf1?get=NAME,P0010001&for=state:*\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "heading",
     "level": 1,
     "metadata": {},
     "source": [
      "Scraping the examples"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import requests\n",
      "from lxml.html import parse, fromstring\n",
      "\n",
      "url = \"http://api.census.gov/data/2010/sf1/geo.html\"\n",
      "# fail right away on an HTTP error instead of trying to parse an error page\n",
      "response = requests.get(url)\n",
      "response.raise_for_status()\n",
      "r = response.content\n",
      "doc = fromstring(r)\n",
      "\n",
      "rows = doc.xpath(\"//table/tr\")\n",
      "\n",
      "# first row is the header\n",
      "\n",
      "headers = [col.text for col in rows[0].findall('th')]\n",
      "headers\n",
      "\n",
      "# next rows are the census URL examples\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "row = rows[1]\n",
      "cols = row.findall('td')\n",
      "\n",
      "# cols[0]:  Summary Level\n",
      "\n",
      "print cols[0].text\n",
      "\n",
      "# cols[1]:  Description\n",
      "\n",
      "print cols[1].text"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from itertools import islice\n",
      "from lxml.html import parse\n",
      "\n",
      "# let's actually now decorate the urls\n",
      "\n",
      "def decorated_parse_examples(examples, api_key=settings.CENSUS_KEY):\n",
      "    \"\"\"Yield a copy of each example row with its URLs rendered as HTML links.\n",
      "\n",
      "    examples -- iterable of row dicts; the value stored under headers[2] is an\n",
      "                iterable of example URL strings\n",
      "    api_key  -- Census API key added to the target of every link\n",
      "\n",
      "    The visible link text stays the original URL; only the href carries the key.\n",
      "    Relies on the notebook-level `headers` list for the name of the URL column.\n",
      "    The input rows are not modified.\n",
      "    \"\"\"\n",
      "    link_template = \"\"\"<a href=\"{url_with_key}\">{url}</a>\"\"\"\n",
      "\n",
      "    for row in examples:\n",
      "        new_row = row.copy()\n",
      "        urls_header = headers[2]\n",
      "\n",
      "        # pass api_key through so that a caller-supplied key is actually used\n",
      "        new_row[urls_header] = \"<br/>\".join(\n",
      "            link_template.format(url=url, url_with_key=add_census_key(url, api_key))\n",
      "            for url in new_row[urls_header]\n",
      "        )\n",
      "\n",
      "        yield new_row\n",
      "        \n",
      "def parse_urls_col(col):\n",
      "    \"\"\"Return every text fragment yielded by `col.itertext()`, as a list.\"\"\"\n",
      "    # background on itertext(): http://stackoverflow.com/a/15074386/7782\n",
      "    return list(col.itertext())\n",
      "\n",
      "def parse_census_examples():\n",
      "    \"\"\"Yield one dict per example row of the table on the Census geo.html page.\n",
      "\n",
      "    The page is fetched and parsed when iteration starts.  Each dict is keyed by the\n",
      "    text of the first three header cells; the first two values are the text of the\n",
      "    matching cells and the third is the list returned by parse_urls_col.\n",
      "    \"\"\"\n",
      "    url = \"http://api.census.gov/data/2010/sf1/geo.html\"\n",
      "    table_rows = parse(url).xpath(\"//table/tr\")\n",
      "\n",
      "    # the header row comes first; every later row is an example\n",
      "    header_row, example_rows = table_rows[0], table_rows[1:]\n",
      "    headers = [th.text for th in header_row.findall('th')]\n",
      "\n",
      "    for example_row in example_rows:\n",
      "        cells = example_row.findall('td')\n",
      "        example = {\n",
      "            headers[0]: cells[0].text,\n",
      "            headers[1]: cells[1].text,\n",
      "            headers[2]: parse_urls_col(cells[2]),\n",
      "        }\n",
      "        yield example\n",
      "\n",
      "# materialize the rows: a bare generator would be exhausted after the first rendering,\n",
      "# leaving an empty table whenever the display cell is re-run\n",
      "parsed_examples = list(parse_census_examples())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# let's redisplay the table with the API key added to each example link\n",
      "\n",
      "from IPython.display import HTML\n",
      "from jinja2 import Template\n",
      "\n",
      "URLS_TEMPLATE= \"\"\"\n",
      " <table>\n",
      "   <tr>\n",
      "   {% for header in headers %}\n",
      "     <th>{{header}}</th>\n",
      "   {% endfor %}\n",
      "   </tr>\n",
      "   {% for row in rows %}\n",
      "   <tr>\n",
      "    {% for header in headers %}\n",
      "      <td>{{row[header]}}</td>\n",
      "    {% endfor %}\n",
      "   </tr>\n",
      "   {% endfor %}\n",
      " </table>\"\"\"\n",
      "    \n",
      "template = Template(URLS_TEMPLATE)\n",
      "HTML(template.render(headers=headers, rows=decorated_parse_examples(parsed_examples))) "
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}