appears to have the maximum detail, with the following fields:\n",
"\n",
"- State\n",
"- Constituency\n",
"- Candidate\n",
"- Party\n",
"- Votes\n",
"\n",
"So let's scrape that."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# I'll keep to the Python 2.7 standard library, except for lxml.\n",
"import os\n",
"import re\n",
"import urllib\n",
"from hashlib import sha256\n",
"from lxml.html import parse"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def get(url):\n",
"    \"\"\"Retrieves a URL as an lxml tree, cached where possible.\n",
"\n",
"    The downloaded page is kept in the working directory in a file named\n",
"    '.cache.' followed by the SHA-256 hex digest of the URL, and that file\n",
"    is re-used for as long as it exists.\n",
"\n",
"    url -- address to fetch; it is hashed as given to name the cache file.\n",
"    Returns whatever lxml.html.parse() produces for the cached file.\n",
"    \"\"\"\n",
"    filename = '.cache.' + sha256(url).hexdigest()\n",
"    if not os.path.exists(filename):\n",
"        # Download under a temporary name and rename only after the transfer\n",
"        # has finished, so an interrupted download is never mistaken for a\n",
"        # complete cached page on a later run.\n",
"        partial = filename + '.part'\n",
"        urllib.urlretrieve(url, partial)\n",
"        os.rename(partial, filename)\n",
"    return parse(filename)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def constituencies(url):\n",
"    \"\"\"Yields one dict per constituency listed on the page at url.\n",
"\n",
"    Each dict has the keys 'state', 'statecode', 'constituency' and\n",
"    'constituencycode'. 'statecode' is None when the page's Javascript\n",
"    yielded no code for that state.\n",
"    \"\"\"\n",
"    tree = get(url)\n",
"\n",
"    # States and codes are stored in Javascript, like this:\n",
"    #   if (st.value == 'S26') {\n",
"    #       strValues = document.getElementById('HdnFldChhattisgarh').value;\n",
"    # This is a crude parsing of that code, applied to the first <script>.\n",
"    pairs = re.findall('st.value *=+ *\\'([^\\']+).*?HdnFld([^\\']+)',\n",
"                       tree.findall('.//script')[0].text, re.S)\n",
"    statecode = {state: code for code, state in pairs}\n",
"\n",
"    # Constituency codes are in input fields whose id starts with 'HdnFld'.\n",
"    # The value format is:\n",
"    #   code,constituency; code,constituency; ...\n",
"    prefix = 'HdnFld'\n",
"    for el in tree.findall('.//input[@id]'):\n",
"        field_id = el.get('id', '').strip()\n",
"        if not field_id.startswith(prefix):\n",
"            continue\n",
"        # Remove only the leading prefix; the rest of the id is the state name.\n",
"        state = field_id[len(prefix):]\n",
"        # An input without a value attribute simply lists no constituencies.\n",
"        for row in (el.get('value') or '').split(';'):\n",
"            row = row.strip()\n",
"            if row:\n",
"                cells = row.split(',')\n",
"                yield {\n",
"                    'state': state,\n",
"                    'statecode': statecode.get(state),\n",
"                    'constituency': cells[1],\n",
"                    'constituencycode': cells[0]\n",
"                }"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def results(url):\n",
"    \"\"\"For a constituency URL, yields dicts with candidate, party, votes.\n",
"\n",
"    Every table row under the element with id=\"div1\" that has at least\n",
"    three <td> cells produces one dict. The first three cells supply the\n",
"    values, with surrounding whitespace stripped; 'votes' stays a string.\n",
"    \"\"\"\n",
"    tree = get(url)\n",
"\n",
"    # Results are inside a table in the element with id=\"div1\"\n",
"    for row in tree.findall('.//*[@id=\"div1\"]//tr'):\n",
"        cells = row.findall('td')\n",
"        if len(cells) >= 3:\n",
"            # .text is None when a cell has no leading text (e.g. an empty\n",
"            # <td>), so fall back to '' rather than failing on .strip().\n",
"            candidate, party, votes = [\n",
"                (cell.text or '').strip() for cell in cells[:3]]\n",
"            yield {\n",
"                'candidate': candidate,\n",
"                'party': party,\n",
"                'votes': votes,\n",
"            }"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": true,
"input": [
"# Visit every constituency listed on the index page and collect its result\n",
"# rows, tagging each row with the constituency fields it belongs to.\n",
"dataset = []\n",
"for seat in constituencies('http://eciresults.nic.in/ConstituencywiseS2653.htm'):\n",
"    state_code = seat['statecode']\n",
"    seat_code = seat['constituencycode']\n",
"    # The constituency code appears twice: in the page name and in ?ac=\n",
"    page_url = 'http://eciresults.nic.in/Constituencywise{:s}{:s}.htm?ac={:s}'.format(\n",
"        state_code, seat_code, seat_code)\n",
"    # To trace progress, print seat['state'] and seat['constituency'] here.\n",
"    for record in results(page_url):\n",
"        # Merge the constituency fields into the result row before storing it.\n",
"        record.update(seat)\n",
"        dataset.append(record)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Save the dataset as a tab-delimited UTF-8 text file: a header line with\n",
"# the field names, then one line per result. (Sadly, csv doesn't do UTF-8)\n",
"with open('2013-result.txt', 'wb') as out:\n",
"    fields = ['state', 'constituency', 'votes', 'candidate', 'party']\n",
"    header = '\\t'.join(fields)\n",
"    out.write(header + '\\n')\n",
"    for record in dataset:\n",
"        line = '\\t'.join(record[name] for name in fields)\n",
"        # Encode just the joined text; the newline is appended afterwards.\n",
"        out.write(line.encode('utf-8') + '\\n')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
}
],
"metadata": {}
}
]
}