{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Scrape the 2013 Indian election results\n", "\n", "The main portal that holds the results is . Of these, the constituency-wise results appears to have the maximum detail, with the following fields:\n", "\n", "- State\n", "- Constituency\n", "- Candidate\n", "- Party\n", "- Votes\n", "\n", "So let's scrape that." ] }, { "cell_type": "code", "collapsed": false, "input": [ "# I'll keep it to standard Python 2.7 libraries, but for lxml.\n", "import os\n", "import re\n", "import urllib\n", "from hashlib import sha256\n", "from lxml.html import parse" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "def get(url):\n", " \"\"\"Retrieves a URL as an lxml tree, cached where possible\"\"\"\n", " filename = '.cache.' + sha256(url).hexdigest()\n", " if not os.path.exists(filename):\n", " html = urllib.urlretrieve(url, filename)\n", " return parse(filename)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "def constituencies(url):\n", " \"\"\"Yields dicts with state, state_code, constituency, constituency_code.\"\"\"\n", " tree = get(url)\n", "\n", " # States and codes are stored in Javascript, like this:\n", " # if (st.value == 'S26') {\n", " # strValues = document.getElementById('HdnFldChhattisgarh').value;\n", " # This is a crude parsing of that code\n", " statecode = re.findall('st.value *=+ *\\'([^\\']+).*?HdnFld([^\\']+)',\n", " tree.findall('.//script')[0].text, re.S)\n", " statecode = {state:code for code, state in statecode}\n", " \n", " # Constituency codes are in hidden input fields. Format is:\n", " # code,constituency; code,constituency; ...\n", " for el in tree.findall('.//input[@id]'):\n", " id = el.get('id', '').strip()\n", " if id.startswith('HdnFld'):\n", " state = id.replace('HdnFld', '')\n", " for row in el.get('value').split(';'):\n", " row = row.strip()\n", " if row:\n", " cells = row.split(',')\n", " yield {\n", " 'state': state,\n", " 'statecode': statecode.get(state),\n", " 'constituency': cells[1],\n", " 'constituencycode': cells[0]\n", " }" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "def results(url):\n", " \"\"\"For a constituency URL, yields dicts with candidate, party, votes.\"\"\"\n", " tree = get(url)\n", "\n", " # Results are inside a table in a
\n", " for row in tree.findall('.//*[@id=\"div1\"]//tr'):\n", " cells = row.findall('td')\n", " if len(cells) >= 3:\n", " yield {\n", " 'candidate': cells[0].text.strip(),\n", " 'party': cells[1].text.strip(),\n", " 'votes': cells[2].text.strip(),\n", " }" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": true, "input": [ "dataset = []\n", "for place in constituencies('http://eciresults.nic.in/ConstituencywiseS2653.htm'):\n", " url = 'http://eciresults.nic.in/Constituencywise{:s}{:s}.htm?ac={:s}'.format(\n", " place['statecode'], place['constituencycode'], place['constituencycode'])\n", " # print 'Debug: scraping', place['state'], place['constituency']\n", " for result in results(url):\n", " result.update(place)\n", " dataset.append(result)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "# Let's save this as tab-delimited UTF-8 file. (Sadly, csv doesn't do UTF-8)\n", "with open('2013-result.txt', 'wb') as out:\n", " fields = ['state', 'constituency', 'votes', 'candidate', 'party']\n", " out.write('\\t'.join(fields) + '\\n')\n", " for row in dataset:\n", " out.write('\\t'.join(row[f] for f in fields).encode('utf-8') + '\\n')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 } ], "metadata": {} } ] }