{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Crawl the ECI election statistics\n", "\n", " has PDFs of past election results. Let's first download them all and convert to text." ] }, { "cell_type": "code", "collapsed": false, "input": [ "import os\n", "from urllib import urlopen, urlretrieve\n", "from urlparse import urljoin\n", "from lxml.html import parse\n", "from os.path import exists\n", "from subprocess import call" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "# Use [xpdf](http://www.foolabs.com/xpdf/) to convert PDF to text\n", "PDF_TO_TEXT = 'D:/Apps/xpdf/pdftotext.exe'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "base = 'http://eci.nic.in/eci_main1/ElectionStatistics.aspx'\n", "tree = parse(urlopen(base))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "files = set()\n", "def download(year, link):\n", " '''Download a year's election results from link and convert to text'''\n", " pdf_file = os.path.join('raw', year + '.pdf')\n", " if not exists(pdf_file):\n", " urlretrieve(urljoin(base, link), pdf_file)\n", " text_file = pdf_file.replace('.pdf', '.txt')\n", " if not exists(text_file):\n", " call([PDF_TO_TEXT, '-layout', pdf_file, text_file])\n", " files.add(year + '.txt')\n", "\n", "# Get all rows from the first table in
\n", "# We pick only the first link, that has the\n", "# constituency-wise detailed results\n", "for td in tree.findall('//*[@id=\"c\"]/table[1]//td'):\n", " if td.text is None:\n", " continue\n", " year = td.text.strip().split(' ')[0]\n", " download(year, td.find('.//a').get('href'))\n", "\n", "# 2009 results are elsewhere. Hard code the link\n", "download('2009', 'http://eci.nic.in/eci_main/archiveofge2009/Stats/VOLI/25_ConstituencyWiseDetailedResult.pdf')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we'll convert these into a CSV file with the relevant data.\n", "\n", "Manual processing\n", "-----------------\n", "At this point, there's some manual munging of the text files. I'd ideally like to have avoided this, but it's just so much faster to manually process some of this content than write a program to do it.\n", "\n", "I'll document what I did at some point. But a few notes in the meantime:\n", "\n", "- See and similar URLs for corrections *NOT* available in the PDF\n", "\n", "Resume automated extraction\n", "---------------------------" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import re\n", "import logging" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "fieldlist = {\n", " '1951.txt': ['NAME', 'PARTY', 'VOTES', '%'],\n", " '1957.txt': ['NAME', 'PARTY', 'VOTES', '%'],\n", " '1962.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1967.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1971.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1977.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1980.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1984.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1985.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1989.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1991.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1992.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1996.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1998.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '1999.txt': ['NAME', 'SEX', 'PARTY', 'VOTES', '%'],\n", " '2004.txt': ['NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY', 'GENERAL VOTES', 'POSTAL VOTES', 'VOTES'],\n", " '2009.txt': ['#', 'NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY', 'GENERAL VOTES', 'POSTAL VOTES', 'VOTES', '% ELECTORS', '% VOTES'],\n", "}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "def old_text_parse(filename):\n", " if filename.startswith('1'):\n", " re_state = re.compile(r'^ {25,}[A-Za-z].*')\n", " re_electors = re.compile(r'ELECTORS *: *(\\d+)')\n", " else:\n", " re_state = re.compile(r'^[A-Z][A-Za-z& ]+$')\n", " re_electors = re.compile(r'Total Electors *(\\d+)(.*)')\n", " \n", " re_constituency = re.compile(r'Constituency *:? *(\\d+) *\\.? *(.*)', re.IGNORECASE)\n", " re_name = re.compile(r'^\\d+ *\\. *')\n", " re_scst = re.compile(r' *\\((SC|ST)\\)')\n", " \n", " fields = fieldlist[filename]\n", " results, electors = [], {}\n", " state, constituency = None, None\n", " for ln, line in enumerate(open(filename)):\n", " match = re_constituency.match(line)\n", " if match:\n", " constituency = match.group(2).split(' ')[0].upper()\n", " constituency = re_scst.sub('', constituency)\n", " continue\n", " \n", " match = re_state.match(line)\n", " if match:\n", " state = line.strip().upper()\n", " continue\n", "\n", " match = re_electors.match(line)\n", " if match:\n", " electors[state, constituency] = match.group(1)\n", " continue\n", " \n", " parts = re.split(r' +', line.strip())\n", " if len(parts) == len(fields):\n", " row = dict(zip(fields, parts))\n", " elif len(parts) == 1:\n", " row['NAME'] = row['NAME'] + ' ' + line.strip()\n", " continue\n", " else:\n", " logging.warn('%s:%d: %d parts, not %d: %s', \n", " filename, ln + 1, len(parts), len(fields), line)\n", " continue\n", " \n", " row['STATE'] = state\n", " row['PC'] = constituency\n", " row['NAME'] = re_name.sub('', row['NAME'])\n", " results.append(row)\n", " \n", " results = pd.DataFrame(results).set_index(['STATE', 'PC'])\n", " results['YEAR'] = filename.split('.')[0]\n", " results['ELECTORS'] = pd.Series(electors)\n", " if '%' in results:\n", " del results['%']\n", " return results.reset_index()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "code", "collapsed": true, "input": [ "# Parse the text files\n", "logging.basicConfig(level=logging.INFO)\n", "\n", "results = []\n", "for filename in sorted(fieldlist):\n", " results.append(old_text_parse(filename))\n", "\n", "results = pd.concat(results, ignore_index=True)['YEAR STATE PC NAME SEX PARTY AGE CATEGORY VOTES ELECTORS'.split(' ')]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Cleanse the results\n", "rename = pd.read_csv('rename.csv').set_index(['Field', 'Source'])['Target']\n", "for col in rename.index.get_level_values(0).unique():\n", " # print rename.ix[col]\n", " results[col].replace(rename.ix[col].to_dict(), inplace=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# Calculations\n", "results['VOTES'] = results['VOTES'].astype(float)\n", "results['#'] = results.groupby(['YEAR', 'STATE', 'PC'])['VOTES'].rank(method='min', ascending=False)\n", "results.sort(['YEAR', 'STATE', 'PC', 'VOTES'], ascending=(True, True, True, False), inplace=True)\n", "results.to_csv('parliament.csv', index=False, float_format='%.0f')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 } ], "metadata": {} } ] }