# Process PubMed journal catalog

Download the PubMed/NLM [journal catalog](http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.journal_lists/), parse it into a dataframe with one row per journal, and save the result as a TSV file.

In [1]:
import os
import re

import pandas

In [2]:
# Download PubMed Journals
# Fetch the J_Medline.txt catalog over FTP into the local `download` directory
# (--directory-prefix). --timestamping asks wget to re-retrieve the file only
# when the remote copy is newer than the local one. `{url}` is IPython's
# interpolation of the Python variable into the shell command.
url = 'ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt'
! wget --no-verbose --directory-prefix download --timestamping {url}

2016-01-20 17:30:58 URL: ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt [1758] -> "download/.listing" [1]


In [3]:
# Load the whole downloaded catalog into memory as one string;
# the parsing cell below splits it into per-journal stanzas
path = os.path.join('download', 'J_Medline.txt')
with open(path, mode='r') as read_file:
    text = read_file.read()

In [4]:
# Create a dataframe of PubMed journals
# The catalog text is a sequence of stanzas separated by lines consisting only
# of dashes. Each stanza describes one journal with one "Key: value" field per
# line; a field with nothing after the colon is recorded as missing (None).
rows = list()
pattern = re.compile('^-+$', re.MULTILINE)
for stanza in pattern.split(text):
    stanza = stanza.strip()
    if not stanza:
        # Splitting leaves empty chunks around the first/last separator line
        continue
    row = dict()
    for line in stanza.split('\n'):
        if not line.strip():
            # Tolerate stray blank lines inside a stanza
            continue
        # partition on the first colon so that an empty field whose trailing
        # space was lost ("Key:") still parses; values may themselves contain
        # colons because only the first one is treated as the separator
        key, separator, value = line.partition(':')
        if not separator:
            raise ValueError(
                'Malformed catalog line (expected "Key: value"): {!r}'.format(line))
        row[key.strip()] = value.strip() or None
    rows.append(row)

# One row per journal; columns are the union of keys seen across stanzas
journal_df = pandas.DataFrame(rows)
# NlmId values are parsed as strings, so this ordering is lexicographic
journal_df = journal_df.sort_values(by='NlmId')

In [5]:
# Rank columns from most to least complete and reorder the dataframe to match.
# The values shown are the fraction (0 to 1) of null entries per column.
null_mask = journal_df.isnull()
missing_pct = null_mask.mean().sort_values()
journal_df = journal_df.loc[:, missing_pct.index]
missing_pct

JournalTitle 0.000000
JrId 0.000000
NlmId 0.000000
IsoAbbr 0.000346
MedAbbr 0.002869
ISSN (Print) 0.194760
ISSN (Online) 0.626205
dtype: float64

In [6]:
# Save journal dataframe as a TSV
path = 'data/pubmed-journals.tsv'
# to_csv does not create missing parent directories, so make sure the output
# directory exists; otherwise this cell fails on a fresh checkout
directory = os.path.dirname(path)
if not os.path.isdir(directory):
    os.makedirs(directory)
# Tab-separated, without the dataframe's integer index column
journal_df.to_csv(path, sep='\t', index=False)