# %%
import json
import re

import pandas as pd
import requests
from pandas import DataFrame

# %%
# Google Scholar "cited by" search, keyed by the article's PubMed URL.
scholar_search = 'https://scholar.google.com/scholar?cites=http://www.ncbi.nlm.nih.gov/pubmed/'
# E-utilities query for every record typed "retracted publication" (up to 10000 ids).
pubmed_search = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=10000&term=%22retracted%20publication%22[Publication%20Type]'
# E-utilities summary endpoint; the PubMed id is appended to this prefix.
pubmed_deets = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='

# Local path prefix of the downloaded Scholar result pages; the PubMed id is
# appended to obtain the file name of one page.
file_pattern = 'html/scholar?cites=http:%2F%2Fwww.ncbi.nlm.nih.gov%2Fpubmed%2F'

# Matches the "About N results" banner for N up to 999,999.
# NOTE(review): a page whose banner lacks the word "About" will not match and
# is recorded below as 0 citations -- confirm that is acceptable.
regex = r'About\s[0-9]{1,3}(,[0-9]{3})?\sresults'

# Seconds to wait for the PubMed API before giving up instead of hanging.
REQUEST_TIMEOUT = 30


# %%
def _fetch_json(url):
    """
    GET `url` and return the decoded JSON body.

    Raises requests.HTTPError on a 4xx/5xx response so that an error page is
    reported as such instead of as a confusing JSON/KeyError further down.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return json.loads(response.content)


# %%
def get_retracted_article_ids():
    """
    Return a list of Pubmed ids that are retracted

    The ids are read from ['esearchresult']['idlist'] of the `pubmed_search`
    response.
    """
    return _fetch_json(pubmed_search)['esearchresult']['idlist']


# %%
def get_pubmed_detail(pubmed_id):
    """
    Use the Pubmed API to return the details of a pubmed article

    `pubmed_id` is the id as a string; it is appended to `pubmed_deets` and is
    also the key looked up under ['result'] in the response.
    """
    return _fetch_json(pubmed_deets + pubmed_id)['result'][pubmed_id]


# %%
# make a list of all of retracted paper pubmed ids
retraction_ids = get_retracted_article_ids()

# %%
# make a list of google scholar citation searches for our retracted pubmed ids
# (one URL per line); `with` closes the file even if a write fails
with open("list.txt", "w") as filelist:
    for retraction in retraction_ids:
        filelist.write(scholar_search + retraction + "\n")

# %% [markdown]
# ### Some wget kung fu to fetch the list of search results....

# %%
# go through our downloaded search results and extract number of citations text
# Each page is expected at file_pattern + <pubmed id>; a missing file raises.
results = {}

for retraction in retraction_ids:
    # `with` releases the handle per page rather than leaking one per article
    with open(file_pattern + retraction, 'r') as result:
        reg_snip = re.search(regex, result.read())
    # Keep the raw banner text ("About 2,070 results"); '0' when no banner found
    results[retraction] = reg_snip.group() if reg_snip is not None else '0'

# %%
# One row per article: pmid plus the banner text, reduced to a bare number.
df = DataFrame(list(results.items()), columns=['pmid', 'citations'])

df['citations'] = (
    df['citations']
    .str.replace('About', '')
    .str.replace('results', '')
    .str.replace(',', '')
    .astype(float)  # surrounding whitespace is tolerated by the float conversion
)

# %%
# let's make a top 10 table
# NOTE(review): head(11) keeps 11 rows although the comment and the CSV name
# say "top 10" -- left as-is to match the saved output; confirm intent.
df = df.sort_values('citations', ascending=False).head(11)

# %%
# Fetch each article's PubMed record once and reuse it for every column,
# instead of issuing one HTTP request per column per article.
details = {pmid: get_pubmed_detail(pmid) for pmid in df['pmid']}

df['title'] = df['pmid'].map(lambda pmid: details[pmid]['title'])
df['author'] = df['pmid'].map(lambda pmid: details[pmid]['authors'][0]['name'])  # first listed author only
df['journal'] = df['pmid'].map(lambda pmid: details[pmid]['fulljournalname'])
df['pubdate'] = df['pmid'].map(lambda pmid: details[pmid]['pubdate'])
df['scholar_link'] = scholar_search + df['pmid']

# %%
# The index is written too, so reading the file back yields an extra
# "Unnamed: 0" column.
df.head().to_csv('top5.csv')
df.to_csv('top10.csv')

# %%
# Reload the saved table so it can be shown without re-running the fetches above.
df = pd.read_csv('top10.csv')

# %%
df
\n", " | Unnamed: 0 | \n", "pmid | \n", "citations | \n", "title | \n", "author | \n", "journal | \n", "pubdate | \n", "scholar_link | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "2892 | \n", "9500320 | \n", "2070 | \n", "Ileal-lymphoid-nodular hyperplasia, non-specif... | \n", "Wakefield AJ | \n", "Lancet | \n", "1998 Feb 28 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
1 | \n", "2391 | \n", "15604363 | \n", "2050 | \n", "Visfatin: a protein secreted by visceral fat t... | \n", "Fukuhara A | \n", "Science (New York, N.Y.) | \n", "2005 Jan 21 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
2 | \n", "508 | \n", "11675329 | \n", "1550 | \n", "Purification and ex vivo expansion of postnata... | \n", "Reyes M | \n", "Blood | \n", "2001 Nov 1 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
3 | \n", "3540 | \n", "12531578 | \n", "1250 | \n", "Combination treatment of angiotensin-II recept... | \n", "Nakao N | \n", "Lancet | \n", "2003 Jan 11 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
4 | \n", "1010 | \n", "15833829 | \n", "1040 | \n", "Spontaneous human adult stem cell transformation. | \n", "Rubio D | \n", "Cancer research | \n", "2005 Apr 15 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
5 | \n", "2710 | \n", "10700237 | \n", "825 | \n", "Regression of human metastatic renal cell carc... | \n", "Kugler A | \n", "Nature medicine | \n", "2000 Mar | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
6 | \n", "477 | \n", "14963337 | \n", "805 | \n", "Evidence of a pluripotent human embryonic stem... | \n", "Hwang WS | \n", "Science (New York, N.Y.) | \n", "2004 Mar 12 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
7 | \n", "2378 | \n", "12176951 | \n", "755 | \n", "Multiple atherosclerotic plaque rupture in acu... | \n", "Rioufol G | \n", "Circulation | \n", "2002 Aug 13 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
8 | \n", "350 | \n", "11546864 | \n", "732 | \n", "Structure of MsbA from E. coli: a homolog of t... | \n", "Chang G | \n", "Science (New York, N.Y.) | \n", "2001 Sep 7 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
9 | \n", "3494 | \n", "8633243 | \n", "616 | \n", "Synergistic activation of estrogen receptor wi... | \n", "Arnold SF | \n", "Science (New York, N.Y.) | \n", "1996 Jun 7 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
10 | \n", "665 | \n", "12351674 | \n", "607 | \n", "Contribution of human alpha-defensin 1, 2, and... | \n", "Zhang L | \n", "Science (New York, N.Y.) | \n", "2002 Nov 1 | \n", "https://scholar.google.com/scholar?cites=http:... | \n", "
11 rows \u00d7 8 columns
\n", "