{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "import requests\n",
      "import re\n",
      "import pandas as pd\n",
      "from pandas import DataFrame"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Google Scholar `cites=` search URL; a PubMed id is appended to build one query per article\n",
      "scholar_search = 'https://scholar.google.com/scholar?cites=http://www.ncbi.nlm.nih.gov/pubmed/'\n",
      "\n",
      "# NCBI E-utilities endpoints (HTTPS).\n",
      "# esearch: ids of records whose publication type is \"retracted publication\" (at most retmax=10000)\n",
      "pubmed_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=10000&term=%22retracted%20publication%22[Publication%20Type]'\n",
      "# esummary: per-article summary; a PubMed id is appended\n",
      "pubmed_deets = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='\n",
      "\n",
      "# Path prefix of the downloaded Scholar result pages; a PubMed id is appended\n",
      "file_pattern = 'html/scholar?cites=http:%2F%2Fwww.ncbi.nlm.nih.gov%2Fpubmed%2F'\n",
      "\n",
      "# Matches the result-count banner, e.g. 'About 2,070 results'.\n",
      "# Any number of ',ddd' groups is accepted so counts of a million or more are not missed.\n",
      "regex = r'About\\s[0-9]{1,3}(?:,[0-9]{3})*\\sresults'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 276
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_retracted_article_ids():\n",
      "    \"\"\"\n",
      "    Return a list of Pubmed ids that are retracted.\n",
      "\n",
      "    Requests the `pubmed_search` esearch URL and returns the\n",
      "    ['esearchresult']['idlist'] entry of the JSON reply.\n",
      "\n",
      "    Raises requests.HTTPError if the server answers with an error status,\n",
      "    rather than failing later while decoding or indexing the reply.\n",
      "    \"\"\"\n",
      "    # a timeout keeps a stalled connection from hanging the notebook forever\n",
      "    response = requests.get(pubmed_search, timeout=60)\n",
      "    response.raise_for_status()\n",
      "    return response.json()['esearchresult']['idlist']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 277
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_pubmed_detail(pubmed_id):\n",
      "    \"\"\"\n",
      "    Use the Pubmed API to return the details of a pubmed article.\n",
      "\n",
      "    pubmed_id may be a string or an integer (e.g. an id read back from a CSV);\n",
      "    it is converted to text before being appended to `pubmed_deets` and used\n",
      "    as the lookup key in the reply's 'result' mapping.\n",
      "\n",
      "    Raises requests.HTTPError if the server answers with an error status.\n",
      "    \"\"\"\n",
      "    pubmed_id = str(pubmed_id)\n",
      "    # a timeout keeps a stalled connection from hanging the notebook forever\n",
      "    response = requests.get(pubmed_deets + pubmed_id, timeout=60)\n",
      "    response.raise_for_status()\n",
      "    return response.json()['result'][pubmed_id]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Make a list of the PubMed ids of all retracted papers (performs a network request)\n",
      "retraction_ids = get_retracted_article_ids() "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 150
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write one Google Scholar citation-search URL per retracted PubMed id to list.txt.\n",
      "# The with-block closes the file even if a write fails part-way through.\n",
      "with open(\"list.txt\", \"w\") as filelist:\n",
      "    for retraction in retraction_ids:\n",
      "        filelist.write(scholar_search + retraction + \"\\n\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 235
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "Outside the notebook: use wget to fetch every search URL in list.txt, saving the result pages under html/ (where the next cell reads them)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Go through our downloaded search results and extract the citation-count text.\n",
      "# Maps PubMed id -> matched banner (e.g. 'About 2,070 results'), or '0' when no banner is found.\n",
      "results = {}\n",
      "\n",
      "for retraction in retraction_ids:\n",
      "    f = file_pattern + retraction\n",
      "    # the with-block closes each page's handle as soon as it has been read\n",
      "    with open(f, 'r') as result:\n",
      "        reg_snip = re.search(regex, result.read())\n",
      "    results[retraction] = reg_snip.group() if reg_snip is not None else '0'\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 289
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Build a two-column frame from the {pmid: banner text} mapping.\n",
      "# DataFrame.from_dict expects a dict, so the (pmid, text) pairs go to the constructor directly;\n",
      "# list(...) keeps this working where dict.items() returns a view rather than a list.\n",
      "df = DataFrame(list(results.items()), columns=['pmid', 'citations'])\n",
      "\n",
      "# Strip the banner words and thousands separators, then convert what is left to a number\n",
      "df['citations'] = (df['citations']\n",
      "                   .str.replace('About', '')\n",
      "                   .str.replace('results', '')\n",
      "                   .str.replace(',', '')\n",
      "                   .astype(float))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 290
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Keep the 11 most-cited papers (the exported \"top 10\" table carries this extra row).\n",
      "# DataFrame.sort was removed in pandas 0.20, so use sort_values when the frame provides it.\n",
      "sort_by_column = df.sort_values if hasattr(df, 'sort_values') else df.sort\n",
      "df = sort_by_column('citations', ascending=False).head(11)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 309
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fetch each article's summary once and share it between the four derived columns,\n",
      "# so only one request is made per article.\n",
      "details = {pmid: get_pubmed_detail(pmid) for pmid in df['pmid']}\n",
      "\n",
      "df['title'] = df['pmid'].map(lambda pmid: details[pmid]['title'])\n",
      "df['author'] = df['pmid'].map(lambda pmid: details[pmid]['authors'][0]['name'])  # first listed author only\n",
      "df['journal'] = df['pmid'].map(lambda pmid: details[pmid]['fulljournalname'])\n",
      "df['pubdate'] = df['pmid'].map(lambda pmid: details[pmid]['pubdate'])\n",
      "# astype(str) keeps the concatenation valid even if the ids are not already strings\n",
      "df['scholar_link'] = scholar_search + df['pmid'].astype(str)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 328
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Persist the whole ranked frame, then just its first five rows.\n",
      "# Both files include the frame's index as their first column.\n",
      "df.to_csv('top10.csv')\n",
      "df.head(5).to_csv('top5.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 330
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Reload the exported table; the index column written by to_csv comes back as 'Unnamed: 0'\n",
      "df = pd.read_csv('top10.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Display the table loaded from top10.csv\n",
      "df"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Unnamed: 0</th>\n",
        "      <th>pmid</th>\n",
        "      <th>citations</th>\n",
        "      <th>title</th>\n",
        "      <th>author</th>\n",
        "      <th>journal</th>\n",
        "      <th>pubdate</th>\n",
        "      <th>scholar_link</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0 </th>\n",
        "      <td> 2892</td>\n",
        "      <td>  9500320</td>\n",
        "      <td> 2070</td>\n",
        "      <td> Ileal-lymphoid-nodular hyperplasia, non-specif...</td>\n",
        "      <td> Wakefield AJ</td>\n",
        "      <td>                   Lancet</td>\n",
        "      <td> 1998 Feb 28</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1 </th>\n",
        "      <td> 2391</td>\n",
        "      <td> 15604363</td>\n",
        "      <td> 2050</td>\n",
        "      <td> Visfatin: a protein secreted by visceral fat t...</td>\n",
        "      <td>   Fukuhara A</td>\n",
        "      <td> Science (New York, N.Y.)</td>\n",
        "      <td> 2005 Jan 21</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2 </th>\n",
        "      <td>  508</td>\n",
        "      <td> 11675329</td>\n",
        "      <td> 1550</td>\n",
        "      <td> Purification and ex vivo expansion of postnata...</td>\n",
        "      <td>      Reyes M</td>\n",
        "      <td>                    Blood</td>\n",
        "      <td>  2001 Nov 1</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3 </th>\n",
        "      <td> 3540</td>\n",
        "      <td> 12531578</td>\n",
        "      <td> 1250</td>\n",
        "      <td> Combination treatment of angiotensin-II recept...</td>\n",
        "      <td>      Nakao N</td>\n",
        "      <td>                   Lancet</td>\n",
        "      <td> 2003 Jan 11</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4 </th>\n",
        "      <td> 1010</td>\n",
        "      <td> 15833829</td>\n",
        "      <td> 1040</td>\n",
        "      <td> Spontaneous human adult stem cell transformation.</td>\n",
        "      <td>      Rubio D</td>\n",
        "      <td>          Cancer research</td>\n",
        "      <td> 2005 Apr 15</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5 </th>\n",
        "      <td> 2710</td>\n",
        "      <td> 10700237</td>\n",
        "      <td>  825</td>\n",
        "      <td> Regression of human metastatic renal cell carc...</td>\n",
        "      <td>     Kugler A</td>\n",
        "      <td>          Nature medicine</td>\n",
        "      <td>    2000 Mar</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6 </th>\n",
        "      <td>  477</td>\n",
        "      <td> 14963337</td>\n",
        "      <td>  805</td>\n",
        "      <td> Evidence of a pluripotent human embryonic stem...</td>\n",
        "      <td>     Hwang WS</td>\n",
        "      <td> Science (New York, N.Y.)</td>\n",
        "      <td> 2004 Mar 12</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7 </th>\n",
        "      <td> 2378</td>\n",
        "      <td> 12176951</td>\n",
        "      <td>  755</td>\n",
        "      <td> Multiple atherosclerotic plaque rupture in acu...</td>\n",
        "      <td>    Rioufol G</td>\n",
        "      <td>              Circulation</td>\n",
        "      <td> 2002 Aug 13</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8 </th>\n",
        "      <td>  350</td>\n",
        "      <td> 11546864</td>\n",
        "      <td>  732</td>\n",
        "      <td> Structure of MsbA from E. coli: a homolog of t...</td>\n",
        "      <td>      Chang G</td>\n",
        "      <td> Science (New York, N.Y.)</td>\n",
        "      <td>  2001 Sep 7</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9 </th>\n",
        "      <td> 3494</td>\n",
        "      <td>  8633243</td>\n",
        "      <td>  616</td>\n",
        "      <td> Synergistic activation of estrogen receptor wi...</td>\n",
        "      <td>    Arnold SF</td>\n",
        "      <td> Science (New York, N.Y.)</td>\n",
        "      <td>  1996 Jun 7</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
        "      <td>  665</td>\n",
        "      <td> 12351674</td>\n",
        "      <td>  607</td>\n",
        "      <td> Contribution of human alpha-defensin 1, 2, and...</td>\n",
        "      <td>      Zhang L</td>\n",
        "      <td> Science (New York, N.Y.)</td>\n",
        "      <td>  2002 Nov 1</td>\n",
        "      <td> https://scholar.google.com/scholar?cites=http:...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>11 rows \u00d7 8 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "    Unnamed: 0      pmid  citations  \\\n",
        "0         2892   9500320       2070   \n",
        "1         2391  15604363       2050   \n",
        "2          508  11675329       1550   \n",
        "3         3540  12531578       1250   \n",
        "4         1010  15833829       1040   \n",
        "5         2710  10700237        825   \n",
        "6          477  14963337        805   \n",
        "7         2378  12176951        755   \n",
        "8          350  11546864        732   \n",
        "9         3494   8633243        616   \n",
        "10         665  12351674        607   \n",
        "\n",
        "                                                title        author  \\\n",
        "0   Ileal-lymphoid-nodular hyperplasia, non-specif...  Wakefield AJ   \n",
        "1   Visfatin: a protein secreted by visceral fat t...    Fukuhara A   \n",
        "2   Purification and ex vivo expansion of postnata...       Reyes M   \n",
        "3   Combination treatment of angiotensin-II recept...       Nakao N   \n",
        "4   Spontaneous human adult stem cell transformation.       Rubio D   \n",
        "5   Regression of human metastatic renal cell carc...      Kugler A   \n",
        "6   Evidence of a pluripotent human embryonic stem...      Hwang WS   \n",
        "7   Multiple atherosclerotic plaque rupture in acu...     Rioufol G   \n",
        "8   Structure of MsbA from E. coli: a homolog of t...       Chang G   \n",
        "9   Synergistic activation of estrogen receptor wi...     Arnold SF   \n",
        "10  Contribution of human alpha-defensin 1, 2, and...       Zhang L   \n",
        "\n",
        "                     journal      pubdate  \\\n",
        "0                     Lancet  1998 Feb 28   \n",
        "1   Science (New York, N.Y.)  2005 Jan 21   \n",
        "2                      Blood   2001 Nov 1   \n",
        "3                     Lancet  2003 Jan 11   \n",
        "4            Cancer research  2005 Apr 15   \n",
        "5            Nature medicine     2000 Mar   \n",
        "6   Science (New York, N.Y.)  2004 Mar 12   \n",
        "7                Circulation  2002 Aug 13   \n",
        "8   Science (New York, N.Y.)   2001 Sep 7   \n",
        "9   Science (New York, N.Y.)   1996 Jun 7   \n",
        "10  Science (New York, N.Y.)   2002 Nov 1   \n",
        "\n",
        "                                         scholar_link  \n",
        "0   https://scholar.google.com/scholar?cites=http:...  \n",
        "1   https://scholar.google.com/scholar?cites=http:...  \n",
        "2   https://scholar.google.com/scholar?cites=http:...  \n",
        "3   https://scholar.google.com/scholar?cites=http:...  \n",
        "4   https://scholar.google.com/scholar?cites=http:...  \n",
        "5   https://scholar.google.com/scholar?cites=http:...  \n",
        "6   https://scholar.google.com/scholar?cites=http:...  \n",
        "7   https://scholar.google.com/scholar?cites=http:...  \n",
        "8   https://scholar.google.com/scholar?cites=http:...  \n",
        "9   https://scholar.google.com/scholar?cites=http:...  \n",
        "10  https://scholar.google.com/scholar?cites=http:...  \n",
        "\n",
        "[11 rows x 8 columns]"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}