{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Scraping ADR's Myneta.info\n",
      "\n",
      "[Myneta.info](http://myneta.info) has analysed candidate affidavits for many candidates. This scraper converts that data into CSVs. (Of course, we could always ask ADR, and they'll probably happily provide it. But I find it faster to write a scraper than wait for people to arrive at office.)\n",
      "\n",
      "The pages are very structured. We'll begin with the [candidate summary page](http://myneta.info/ls2014/index.php?action=summary&subAction=candidates_analyzed&sort=candidate#summary)."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import os\n",
      "import time\n",
      "import urllib\n",
      "import hashlib\n",
      "import pandas as pd\n",
      "from lxml.html import parse\n",
      "\n",
      "if not os.path.exists('.cache'):\n",
      "    os.makedirs('.cache')\n",
      "    \n",
      "# If file is older than 5 days, download it again\n",
      "OLD = time.time() - 15 * 24 * 60 * 60\n",
      "\n",
      "yearkey = {\n",
      "    2014: 'ls2014',\n",
      "    2009: 'ls2009',\n",
      "    2004: 'loksabha2004',\n",
      "}\n",
      "\n",
      "def get(url):\n",
      "    path = os.path.join('.cache', hashlib.sha1(url).hexdigest()) + '.html'\n",
      "    if not os.path.exists(path) or os.stat(path).st_mtime < OLD:\n",
      "        print url\n",
      "        urllib.urlretrieve(url, path)\n",
      "    return parse(open(path))\n",
      "\n",
      "def candidates(year):\n",
      "    url = 'http://myneta.info/{:s}/index.php?action=summary&subAction=candidates_analyzed&sort=candidate'\n",
      "    tree = get(url.format(yearkey[year]))\n",
      "    results = []\n",
      "    for row in tree.findall('.//table')[-1].findall('tr'):\n",
      "        td = row.findall('td')\n",
      "        results.append({\n",
      "            'Year': year,\n",
      "            'Sno': td[0].text,\n",
      "            'ID': int(td[1].find('a').get('href').split('=')[-1]),\n",
      "            'Candidate': td[1].find('a').text,\n",
      "            'Constituency': td[2].text,\n",
      "            'Party': td[3].text,\n",
      "            'Criminal Cases': int(td[4].text_content()),\n",
      "            'Education': td[5].text,\n",
      "            'Total Assets': int(td[6].text.replace(u'Rs\\xa0', '').replace(',', '').replace('Nil', '0')),\n",
      "            'Total Liabilities': int(td[7].text.replace(u'Rs\\xa0', '').replace(',', '').replace('Nil', '0')),\n",
      "        })\n",
      "    return pd.DataFrame(results)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ls2014 = candidates(2014)\n",
      "\n",
      "# The constituency page does not provide the state and PC code\n",
      "# So let's introduce that, at least for 2014.\n",
      "pc2014 = pd.read_csv('pc2014.csv').set_index('Constituency')\n",
      "ls2014['ST_CODE'] = ls2014['Constituency'].apply(lambda v: pc2014['ST_CODE'].get(v, ''))\n",
      "ls2014['PC_CODE'] = ls2014['Constituency'].apply(lambda v: pc2014['PC_CODE'].get(v, ''))\n",
      "\n",
      "# However, some corrections are required for duplicate constituencies\n",
      "index = ls2014[(ls2014['Constituency'] == 'AURANGABAD') & (ls2014['ID'] > 5000)].index\n",
      "ls2014['ST_CODE'][index] = 'S13'\n",
      "ls2014['PC_CODE'][index] = 19\n",
      "\n",
      "index = ls2014[(ls2014['Constituency'] == 'MAHARAJGANJ') & (ls2014['ID'] > 9000)].index\n",
      "ls2014['ST_CODE'][index] = 'S24'\n",
      "ls2014['PC_CODE'][index] = 63\n",
      "\n",
      "index = ls2014[(ls2014['Constituency'] == 'HAMIRPUR') & (ls2014['ID'] < 7000)].index\n",
      "ls2014['ST_CODE'][index] = 'S24'\n",
      "ls2014['PC_CODE'][index] = 47\n",
      "\n",
      "# Save to disk\n",
      "ls2014.to_csv('myneta.2014.csv', index=False)\n",
      "ls2014.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Candidate</th>\n",
        "      <th>Constituency</th>\n",
        "      <th>Criminal Cases</th>\n",
        "      <th>Education</th>\n",
        "      <th>ID</th>\n",
        "      <th>Party</th>\n",
        "      <th>Sno</th>\n",
        "      <th>Total Assets</th>\n",
        "      <th>Total Liabilities</th>\n",
        "      <th>Year</th>\n",
        "      <th>ST_CODE</th>\n",
        "      <th>PC_CODE</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td>      Kaushal Yadav</td>\n",
        "      <td>        NAWADA</td>\n",
        "      <td> 8</td>\n",
        "      <td>         Post Graduate</td>\n",
        "      <td>  148</td>\n",
        "      <td>                        JD(U)</td>\n",
        "      <td> 1</td>\n",
        "      <td> 154566136</td>\n",
        "      <td> 2604969</td>\n",
        "      <td> 2014</td>\n",
        "      <td> S04</td>\n",
        "      <td> 39</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>       Kiran Sharma</td>\n",
        "      <td>      AZAMGARH</td>\n",
        "      <td> 0</td>\n",
        "      <td>              8th Pass</td>\n",
        "      <td> 9487</td>\n",
        "      <td> Bhartiya Shakti Chetna Party</td>\n",
        "      <td> 2</td>\n",
        "      <td>   3509407</td>\n",
        "      <td>  325000</td>\n",
        "      <td> 2014</td>\n",
        "      <td> S24</td>\n",
        "      <td> 69</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td>   M. Aamir Rashadi</td>\n",
        "      <td>      AZAMGARH</td>\n",
        "      <td> 1</td>\n",
        "      <td>                Others</td>\n",
        "      <td> 9496</td>\n",
        "      <td>      Rashtriya Ulama Council</td>\n",
        "      <td> 3</td>\n",
        "      <td>   2191523</td>\n",
        "      <td>       0</td>\n",
        "      <td> 2014</td>\n",
        "      <td> S24</td>\n",
        "      <td> 69</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td>  Rakesh Kumar Giri</td>\n",
        "      <td>   MAHARAJGANJ</td>\n",
        "      <td> 0</td>\n",
        "      <td> Graduate Professional</td>\n",
        "      <td> 9706</td>\n",
        "      <td>                          IND</td>\n",
        "      <td> 4</td>\n",
        "      <td>    306023</td>\n",
        "      <td>       0</td>\n",
        "      <td> 2014</td>\n",
        "      <td> S24</td>\n",
        "      <td> 63</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> (Kuppal)G.Devadoss</td>\n",
        "      <td> CHENNAI SOUTH</td>\n",
        "      <td> 0</td>\n",
        "      <td>              8th Pass</td>\n",
        "      <td> 6912</td>\n",
        "      <td>                          IND</td>\n",
        "      <td> 5</td>\n",
        "      <td>   3630000</td>\n",
        "      <td>  850000</td>\n",
        "      <td> 2014</td>\n",
        "      <td> S22</td>\n",
        "      <td>  3</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 12 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 2,
       "text": [
        "            Candidate   Constituency  Criminal Cases              Education  \\\n",
        "0       Kaushal Yadav         NAWADA               8          Post Graduate   \n",
        "1        Kiran Sharma       AZAMGARH               0               8th Pass   \n",
        "2    M. Aamir Rashadi       AZAMGARH               1                 Others   \n",
        "3   Rakesh Kumar Giri    MAHARAJGANJ               0  Graduate Professional   \n",
        "4  (Kuppal)G.Devadoss  CHENNAI SOUTH               0               8th Pass   \n",
        "\n",
        "     ID                         Party Sno  Total Assets  Total Liabilities  \\\n",
        "0   148                         JD(U)   1     154566136            2604969   \n",
        "1  9487  Bhartiya Shakti Chetna Party   2       3509407             325000   \n",
        "2  9496       Rashtriya Ulama Council   3       2191523                  0   \n",
        "3  9706                           IND   4        306023                  0   \n",
        "4  6912                           IND   5       3630000             850000   \n",
        "\n",
        "   Year ST_CODE  PC_CODE  \n",
        "0  2014     S04       39  \n",
        "1  2014     S24       69  \n",
        "2  2014     S24       69  \n",
        "3  2014     S24       63  \n",
        "4  2014     S22        3  \n",
        "\n",
        "[5 rows x 12 columns]"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ls2009 = candidates(2009)\n",
      "ls2009.to_csv('myneta.2009.csv', index=False)\n",
      "\n",
      "ls2004 = candidates(2004)\n",
      "ls2004.to_csv('myneta.2004.csv', index=False)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Candidate details\n",
      "\n",
      "Let's scraping the IPC sections and asset breakup.\n",
      "\n",
      "(As I supsected, writing the scraper took less time (40 min) than I think it'd have taken to get the information even from an organisation as friendly as ADR.)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import re\n",
      "\n",
      "re_ipcs = re.compile(r'(\\d+) charges related to .*?IPC Section\\-(\\d+)')\n",
      "\n",
      "def candidate(year, id):\n",
      "    url = 'http://myneta.info/{:s}/candidate.php?candidate_id={:d}'.format(yearkey[year], id)\n",
      "        \n",
      "    tree = get(url)\n",
      "    result = []\n",
      "    ipcs = tree.xpath(\".//h3[contains(text(), 'Brief Details of IPCs')]\")\n",
      "    if len(ipcs):\n",
      "        ipcs = ipcs[0].getparent().text_content()\n",
      "        for count, ipc_section in re_ipcs.findall(ipcs):\n",
      "            result.append({\n",
      "                'Type': 'IPC',\n",
      "                'Year': year,\n",
      "                'ID': id,\n",
      "                'Key': ipc_section,\n",
      "                'Value': int(count)\n",
      "            })\n",
      "\n",
      "    for heading in tree.xpath(\".//h3[contains(text(), 'ovable Assets')]\"):\n",
      "        # Ignore 1st header row, last total rows from table\n",
      "        for row in heading.getparent().getnext().findall('.//tr')[1:-1]:\n",
      "            cells = row.findall('.//td')\n",
      "            # Since rowspan is used for some cells, description is in 1st / 2nd col\n",
      "            key = cells[0].text_content()\n",
      "            if key[0].islower():\n",
      "                key = cells[1].text_content()\n",
      "            if 'Total' in key:\n",
      "                key = 'Total as per Affidavit'\n",
      "            result.append({\n",
      "                'Type': 'Assets',\n",
      "                'Year': year,\n",
      "                'ID': id,\n",
      "                'Key': key,\n",
      "                'Value': int(re.sub(r'\\D', '', cells[-1].find('.//b').text) or 0)\n",
      "            })\n",
      "            \n",
      "    return pd.DataFrame(result)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ls2014_details = []\n",
      "for index, row in ls2014.iterrows():\n",
      "    ls2014_details.append(candidate(2014, row['ID']))\n",
      "\n",
      "ls2014_details = pd.concat(ls2014_details)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ls2014_details.to_csv('myneta.details.2014.csv', index=False)\n",
      "ls2014_details.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>ID</th>\n",
        "      <th>Key</th>\n",
        "      <th>Type</th>\n",
        "      <th>Value</th>\n",
        "      <th>Year</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 148</td>\n",
        "      <td> 420</td>\n",
        "      <td> IPC</td>\n",
        "      <td> 3</td>\n",
        "      <td> 2014</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 148</td>\n",
        "      <td> 467</td>\n",
        "      <td> IPC</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2014</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 148</td>\n",
        "      <td> 468</td>\n",
        "      <td> IPC</td>\n",
        "      <td> 2</td>\n",
        "      <td> 2014</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 148</td>\n",
        "      <td> 307</td>\n",
        "      <td> IPC</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2014</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 148</td>\n",
        "      <td> 379</td>\n",
        "      <td> IPC</td>\n",
        "      <td> 1</td>\n",
        "      <td> 2014</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 5 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "    ID  Key Type  Value  Year\n",
        "0  148  420  IPC      3  2014\n",
        "1  148  467  IPC      2  2014\n",
        "2  148  468  IPC      2  2014\n",
        "3  148  307  IPC      1  2014\n",
        "4  148  379  IPC      1  2014\n",
        "\n",
        "[5 rows x 5 columns]"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Candidates with shares\n",
      "\n",
      "Some URLS have share information. Next step is to scrape those as well.\n",
      "\n",
      "- http://myneta.info/ls2014/candidate.php?candidate_id=214\n",
      "- http://myneta.info/ls2014/candidate.php?candidate_id=74\n",
      "- http://myneta.info/ls2014/candidate.php?candidate_id=1142\n",
      "- http://myneta.info/ls2014/candidate.php?candidate_id=4988"
     ]
    }
   ],
   "metadata": {}
  }
 ]
}