{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Process PubMed journal catalog\n",
    "\n",
    "Download and process the PubMed/NLM [journal catalog](http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.journal_lists/). The catalog (`J_Medline.txt`) is parsed into a table with one row per journal and saved to `data/pubmed-journals.tsv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2016-01-20 17:30:58 URL: ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt [1758] -> \"download/.listing\" [1]\r\n"
     ]
    }
   ],
   "source": [
    "# Download PubMed Journals into the 'download' directory.\n",
    "# --timestamping makes wget skip the transfer when the local copy is\n",
    "# already up to date, so re-running this cell is cheap.\n",
    "url = 'ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt'\n",
    "! wget --no-verbose --directory-prefix download --timestamping {url}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Read PubMed journals\n",
    "path = os.path.join('download', 'J_Medline.txt')\n",
    "# Decode explicitly so the result does not depend on the platform locale\n",
    "with open(path, encoding='utf-8') as read_file:\n",
    "    text = read_file.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Create a dataframe of PubMed journals.\n",
    "# The text is split into stanzas on lines made only of dashes; every\n",
    "# stanza describes one journal as a series of 'Key: value' lines.\n",
    "rows = []\n",
    "pattern = re.compile('^-+$', re.MULTILINE)\n",
    "for stanza in pattern.split(text):\n",
    "    stanza = stanza.strip()\n",
    "    if not stanza:\n",
    "        continue\n",
    "    row = {}\n",
    "    for line in stanza.split('\\n'):\n",
    "        if not line.strip():\n",
    "            # Tolerate blank lines inside a stanza\n",
    "            continue\n",
    "        # Partition on the first colon (not ': ') so that an empty field\n",
    "        # whose trailing space was stripped, e.g. 'Key:', still parses.\n",
    "        key, separator, value = line.partition(':')\n",
    "        if not separator:\n",
    "            raise ValueError(\n",
    "                'Malformed line in J_Medline.txt: {!r}'.format(line))\n",
    "        # Empty values are stored as None so pandas treats them as missing\n",
    "        row[key.strip()] = value.strip() or None\n",
    "    rows.append(row)\n",
    "\n",
    "journal_df = pandas.DataFrame(rows)\n",
    "journal_df = journal_df.sort_values(by='NlmId')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "JournalTitle 0.000000\n",
       "JrId 0.000000\n",
       "NlmId 0.000000\n",
       "IsoAbbr 0.000346\n",
       "MedAbbr 0.002869\n",
       "ISSN (Print) 0.194760\n",
       "ISSN (Online) 0.626205\n",
       "dtype: float64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Order columns by the fraction of missing values, most complete first.\n",
    "# Note: the values are fractions in [0, 1], not percentages.\n",
    "missing_pct = journal_df.isnull().mean().sort_values()\n",
    "journal_df = journal_df[missing_pct.index]\n",
    "missing_pct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Save journal dataframe as a TSV\n",
    "path = os.path.join('data', 'pubmed-journals.tsv')\n",
    "# to_csv does not create missing directories, so make sure it exists\n",
    "os.makedirs(os.path.dirname(path), exist_ok=True)\n",
    "journal_df.to_csv(path, sep='\\t', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}