{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import json\n", "import requests\n", "import re\n", "import pandas as pd\n", "from pandas import DataFrame" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "scholar_search = 'https://scholar.google.com/scholar?cites=http://www.ncbi.nlm.nih.gov/pubmed/'\n", "pubmed_search = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=10000&term=%22retracted%20publication%22[Publication%20Type]'\n", "pubmed_deets = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='\n", "\n", "file_pattern = 'html/scholar?cites=http:%2F%2Fwww.ncbi.nlm.nih.gov%2Fpubmed%2F'\n", "\n", "#regex = r'/[1-9](?:\\d{0,2})(?:,\\d{3})*(?:\\.\\d*[1-9])?|0?\\.\\d*[1-9]|0/ ' \n", "\n", "#regex = r'About\\s[0-9],[0-9]*\\sresults'\n", "\n", "#regex = r'About\\s[0-9]{1,3}(,[0-9]{3})*(\\.[0-9]+)?\\sresults'\n", "\n", "regex = r'About\\s[0-9]{1,3}(,[0-9]{3})?\\sresults'" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 276 }, { "cell_type": "code", "collapsed": false, "input": [ "def get_retracted_article_ids():\n", " \"\"\"\n", " Return a list of Pubmed ids that are retracted\n", " \"\"\"\n", " return json.loads(requests.get(pubmed_search).content)['esearchresult']['idlist']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 277 }, { "cell_type": "code", "collapsed": false, "input": [ "def get_pubmed_detail(pubmed_id):\n", " \"\"\"\n", " Use the Pubmed API to return the details of a pubmed article\n", " \"\"\"\n", " return json.loads (requests.get(pubmed_deets+pubmed_id).content)['result'][pubmed_id]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "#make a list of all of retracted paper pubmed ids\n", "retraction_ids = get_retracted_article_ids() " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 150 }, { "cell_type": "code", "collapsed": false, "input": [ "#make a list of google scholar citation searches for our retracted pubmed ids\n", "filelist = open(\"list.txt\", \"w\")\n", "for retraction in retraction_ids:\n", " filelist.write(scholar_search + retraction)\n", " filelist.write(\"\\n\")\n", "filelist.close()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 235 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Some wget kung fu to fetch the list of search results...." ] }, { "cell_type": "code", "collapsed": false, "input": [ "#go through our our downloaded search results and extract number of citations text\n", "results = {}\n", "\n", "for retraction in retraction_ids:\n", " f = file_pattern + retraction\n", " result = open(f, 'r')\n", " reg_snip = re.search(regex, result.read())\n", " if reg_snip != None:\n", " results[retraction] = reg_snip.group()\n", " else:\n", " results[retraction] = '0'\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 289 }, { "cell_type": "code", "collapsed": false, "input": [ "df = DataFrame.from_dict(results.items())\n", "df.columns = ['pmid', 'citations']\n", "\n", "df['citations'] = df['citations'].str.replace('About', '')\n", "df['citations'] = df['citations'].str.replace('results', '')\n", "df['citations'] = df['citations'].str.replace(',', '')\n", "df['citations'] = df['citations'].astype(float)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 290 }, { "cell_type": "code", "collapsed": false, "input": [ "df = df.sort('citations', ascending=False).head(11) #let's make a top 10 table" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 309 }, { "cell_type": "code", "collapsed": false, "input": [ "df['title'] = df['pmid'].map(lambda x: get_pubmed_detail(x)['title'])\n", "df['author'] = df['pmid'].map(lambda x: get_pubmed_detail(x)['authors'][0]['name'])\n", "df['journal'] = df['pmid'].map(lambda x: get_pubmed_detail(x)['fulljournalname'])\n", "df['pubdate'] = df['pmid'].map(lambda x: get_pubmed_detail(x)['pubdate'])\n", "df['scholar_link'] = scholar_search + df['pmid']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 328 }, { "cell_type": "code", "collapsed": false, "input": [ "df.head().to_csv('top5.csv')\n", "df.to_csv('top10.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 330 }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('top10.csv')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0pmidcitationstitleauthorjournalpubdatescholar_link
0 2892 9500320 2070 Ileal-lymphoid-nodular hyperplasia, non-specif... Wakefield AJ Lancet 1998 Feb 28 https://scholar.google.com/scholar?cites=http:...
1 2391 15604363 2050 Visfatin: a protein secreted by visceral fat t... Fukuhara A Science (New York, N.Y.) 2005 Jan 21 https://scholar.google.com/scholar?cites=http:...
2 508 11675329 1550 Purification and ex vivo expansion of postnata... Reyes M Blood 2001 Nov 1 https://scholar.google.com/scholar?cites=http:...
3 3540 12531578 1250 Combination treatment of angiotensin-II recept... Nakao N Lancet 2003 Jan 11 https://scholar.google.com/scholar?cites=http:...
4 1010 15833829 1040 Spontaneous human adult stem cell transformation. Rubio D Cancer research 2005 Apr 15 https://scholar.google.com/scholar?cites=http:...
5 2710 10700237 825 Regression of human metastatic renal cell carc... Kugler A Nature medicine 2000 Mar https://scholar.google.com/scholar?cites=http:...
6 477 14963337 805 Evidence of a pluripotent human embryonic stem... Hwang WS Science (New York, N.Y.) 2004 Mar 12 https://scholar.google.com/scholar?cites=http:...
7 2378 12176951 755 Multiple atherosclerotic plaque rupture in acu... Rioufol G Circulation 2002 Aug 13 https://scholar.google.com/scholar?cites=http:...
8 350 11546864 732 Structure of MsbA from E. coli: a homolog of t... Chang G Science (New York, N.Y.) 2001 Sep 7 https://scholar.google.com/scholar?cites=http:...
9 3494 8633243 616 Synergistic activation of estrogen receptor wi... Arnold SF Science (New York, N.Y.) 1996 Jun 7 https://scholar.google.com/scholar?cites=http:...
10 665 12351674 607 Contribution of human alpha-defensin 1, 2, and... Zhang L Science (New York, N.Y.) 2002 Nov 1 https://scholar.google.com/scholar?cites=http:...
\n", "

11 rows \u00d7 8 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ " Unnamed: 0 pmid citations \\\n", "0 2892 9500320 2070 \n", "1 2391 15604363 2050 \n", "2 508 11675329 1550 \n", "3 3540 12531578 1250 \n", "4 1010 15833829 1040 \n", "5 2710 10700237 825 \n", "6 477 14963337 805 \n", "7 2378 12176951 755 \n", "8 350 11546864 732 \n", "9 3494 8633243 616 \n", "10 665 12351674 607 \n", "\n", " title author \\\n", "0 Ileal-lymphoid-nodular hyperplasia, non-specif... Wakefield AJ \n", "1 Visfatin: a protein secreted by visceral fat t... Fukuhara A \n", "2 Purification and ex vivo expansion of postnata... Reyes M \n", "3 Combination treatment of angiotensin-II recept... Nakao N \n", "4 Spontaneous human adult stem cell transformation. Rubio D \n", "5 Regression of human metastatic renal cell carc... Kugler A \n", "6 Evidence of a pluripotent human embryonic stem... Hwang WS \n", "7 Multiple atherosclerotic plaque rupture in acu... Rioufol G \n", "8 Structure of MsbA from E. coli: a homolog of t... Chang G \n", "9 Synergistic activation of estrogen receptor wi... Arnold SF \n", "10 Contribution of human alpha-defensin 1, 2, and... Zhang L \n", "\n", " journal pubdate \\\n", "0 Lancet 1998 Feb 28 \n", "1 Science (New York, N.Y.) 2005 Jan 21 \n", "2 Blood 2001 Nov 1 \n", "3 Lancet 2003 Jan 11 \n", "4 Cancer research 2005 Apr 15 \n", "5 Nature medicine 2000 Mar \n", "6 Science (New York, N.Y.) 2004 Mar 12 \n", "7 Circulation 2002 Aug 13 \n", "8 Science (New York, N.Y.) 2001 Sep 7 \n", "9 Science (New York, N.Y.) 1996 Jun 7 \n", "10 Science (New York, N.Y.) 2002 Nov 1 \n", "\n", " scholar_link \n", "0 https://scholar.google.com/scholar?cites=http:... \n", "1 https://scholar.google.com/scholar?cites=http:... \n", "2 https://scholar.google.com/scholar?cites=http:... \n", "3 https://scholar.google.com/scholar?cites=http:... \n", "4 https://scholar.google.com/scholar?cites=http:... \n", "5 https://scholar.google.com/scholar?cites=http:... \n", "6 https://scholar.google.com/scholar?cites=http:... \n", "7 https://scholar.google.com/scholar?cites=http:... \n", "8 https://scholar.google.com/scholar?cites=http:... \n", "9 https://scholar.google.com/scholar?cites=http:... \n", "10 https://scholar.google.com/scholar?cites=http:... \n", "\n", "[11 rows x 8 columns]" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }