{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Scrape the 2013 Indian election results\n",
      "\n",
      "The main portal that holds the results is <http://eciresults.nic.in/>. Among its pages, the constituency-wise results page <http://eciresults.nic.in/ConstituencywiseS2653.htm> appears to have the most detail, with the following fields:\n",
      "\n",
      "- State\n",
      "- Constituency\n",
      "- Candidate\n",
      "- Party\n",
      "- Votes\n",
      "\n",
      "So let's scrape that."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# I'll keep it to standard Python 2.7 libraries, except for lxml.\n",
      "import os\n",
      "import re\n",
      "import urllib\n",
      "from hashlib import sha256\n",
      "from lxml.html import parse"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get(url):\n",
      "    \"\"\"Retrieves a URL as an lxml tree, cached where possible.\n",
      "\n",
      "    The page is saved in the working directory as '.cache.<sha256 of url>'\n",
      "    and parsed from that file, so a URL is downloaded only when no cache\n",
      "    file for it exists yet.\n",
      "    \"\"\"\n",
      "    filename = '.cache.' + sha256(url).hexdigest()\n",
      "    if not os.path.exists(filename):\n",
      "        # Download under a temporary name and rename only on success, so an\n",
      "        # interrupted download never leaves a truncated file in the cache.\n",
      "        partial = filename + '.part'\n",
      "        urllib.urlretrieve(url, partial)\n",
      "        os.rename(partial, filename)\n",
      "    return parse(filename)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def constituencies(url):\n",
      "    \"\"\"Yields dicts with state, statecode, constituency, constituencycode.\n",
      "\n",
      "    State names are taken from the ids of the page's hidden 'HdnFld<State>'\n",
      "    inputs. statecode is None when no code for that state was found in the\n",
      "    page's Javascript.\n",
      "    \"\"\"\n",
      "    tree = get(url)\n",
      "\n",
      "    # States and codes are stored in Javascript, like this:\n",
      "    #     if (st.value == 'S26') {\n",
      "    #         strValues = document.getElementById('HdnFldChhattisgarh').value;\n",
      "    # This is a crude parsing of that code\n",
      "    pairs = re.findall('st.value *=+ *\\'([^\\']+).*?HdnFld([^\\']+)',\n",
      "                       tree.findall('.//script')[0].text, re.S)\n",
      "    statecode = {state: code for code, state in pairs}\n",
      "\n",
      "    # Constituency codes are in hidden input fields. Format is:\n",
      "    # code,constituency; code,constituency; ...\n",
      "    for el in tree.findall('.//input[@id]'):\n",
      "        # Named field_id so that the builtin id() is not shadowed\n",
      "        field_id = el.get('id', '').strip()\n",
      "        if not field_id.startswith('HdnFld'):\n",
      "            continue\n",
      "        state = field_id.replace('HdnFld', '')\n",
      "        # An input without a value attribute returns None; treat it as empty\n",
      "        for row in (el.get('value') or '').split(';'):\n",
      "            row = row.strip()\n",
      "            if row:\n",
      "                cells = row.split(',')\n",
      "                yield {\n",
      "                    'state': state,\n",
      "                    'statecode': statecode.get(state),\n",
      "                    'constituency': cells[1],\n",
      "                    'constituencycode': cells[0]\n",
      "                }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def results(url):\n",
      "    \"\"\"For a constituency URL, yields dicts with candidate, party, votes.\n",
      "\n",
      "    Each value is the stripped leading text of one of the first three <td>\n",
      "    cells of a row; a cell without leading text yields an empty string.\n",
      "    Rows with fewer than three <td> cells are skipped.\n",
      "    \"\"\"\n",
      "    tree = get(url)\n",
      "\n",
      "    # Results are inside a table in a <div id=\"div1\">\n",
      "    for row in tree.findall('.//*[@id=\"div1\"]//tr'):\n",
      "        cells = row.findall('td')\n",
      "        if len(cells) >= 3:\n",
      "            # .text is None for an empty cell or one that starts with a child\n",
      "            # element, so fall back to '' rather than fail on None.strip()\n",
      "            candidate, party, votes = [(cell.text or '').strip()\n",
      "                                       for cell in cells[:3]]\n",
      "            yield {\n",
      "                'candidate': candidate,\n",
      "                'party': party,\n",
      "                'votes': votes,\n",
      "            }"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": true,
     "input": [
      "# Scrape every constituency listed on the seed page into one flat list of\n",
      "# dicts: candidate, party and votes, plus the constituency's own fields.\n",
      "dataset = []\n",
      "for place in constituencies('http://eciresults.nic.in/ConstituencywiseS2653.htm'):\n",
      "    # Without a state code no valid results URL can be built; formatting\n",
      "    # None into the URL would only request a bogus page, so skip the entry.\n",
      "    if place['statecode'] is None:\n",
      "        continue\n",
      "    url = 'http://eciresults.nic.in/Constituencywise{:s}{:s}.htm?ac={:s}'.format(\n",
      "        place['statecode'], place['constituencycode'], place['constituencycode'])\n",
      "    for result in results(url):\n",
      "        result.update(place)\n",
      "        dataset.append(result)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Dump the scraped rows to a tab-delimited, UTF-8 encoded text file.\n",
      "# (The Python 2 csv module cannot write unicode text directly.)\n",
      "with open('2013-result.txt', 'wb') as out:\n",
      "    fields = ['state', 'constituency', 'votes', 'candidate', 'party']\n",
      "    header = '\\t'.join(fields)\n",
      "    out.write(header + '\\n')\n",
      "    for row in dataset:\n",
      "        line = '\\t'.join(row[f] for f in fields)\n",
      "        out.write(line.encode('utf-8') + '\\n')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    }
   ],
   "metadata": {}
  }
 ]
}