# Process Unpaywall data to determine the access status of articles

In [10]:
import collections
import csv
import gzip
import itertools
import json
import lzma

import pandas

## Read and process jsonl snapshot

In [11]:
def _process_unpaywall_record(row):
    row['doi'] = row['doi'].lower()
    row['journal_access'] = False
    row['journal_access_license'] = None
    row['journal_access_evidence'] = None
    for location in row['oa_locations']:
        if location['host_type'] == 'publisher':
            row['journal_access'] = True
            row['journal_access_license'] = location.get('license')
            row['journal_access_evidence'] = location.get('evidence')
            break
    row['journal_date'] = row.get('published_date') or row.get('year')
    journal_date = row.get('published_date')
    journal_issns = row.get('journal_issns')
    row['journal_issns'] = journal_issns.split(',') if journal_issns else []

def read_unpaywall_snapshot(path, doi_subset=None):
    """
    Lazily yield processed records from an Unpaywall JSON Lines snapshot.

    https://unpaywall.org/data-format
    https://unpaywall.org/products/snapshot

    Parameters
    ----------
    path : str or path-like
        Snapshot location. Paths ending in '.gz' are read through gzip,
        anything else is opened as plain text.
    doi_subset : container of str, optional
        When given, only records whose lowercased DOI is in this container
        are yielded, so its entries should themselves be lowercase.
        When None, every record is yielded.

    Yields
    ------
    dict
        The parsed record after _process_unpaywall_record has updated it
        in place.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    # Decode explicitly as UTF-8 rather than with the locale's default encoding
    with opener(path, 'rt', encoding='utf-8') as read_file:
        for line in read_file:
            # Skip blank lines (e.g. a trailing newline) instead of failing to parse them
            if not line.strip():
                continue
            row = json.loads(line)
            if doi_subset is None or row['doi'].lower() in doi_subset:
                _process_unpaywall_record(row)
                yield row

# Pairs of (key in the processed Unpaywall record, column name in the access table).
# The order of the pairs is the order in which the mapping is iterated.
record_renamer = dict((
    ('doi', 'doi'),
    ('genre', 'crossref_type'),
    ('journal_date', 'journal_date'),
    ('is_oa', 'unpaywall_access'),
    ('journal_access', 'journal_access'),
    ('journal_access_evidence', 'journal_access_evidence'),
    ('journal_access_license', 'journal_access_license'),
    ('journal_is_oa', 'journal_fully_oa'),
))

def _reduce_unpaywall_record(row):
    """
    Project a processed record onto the fields listed in record_renamer.

    Returns an OrderedDict keyed by the renamed field names, in the
    iteration order of record_renamer. Boolean values are converted to
    0/1 integers; every other value is passed through unchanged.
    Raises KeyError if the record lacks one of the source keys.
    """
    def _as_cell(value):
        # Booleans become integers; everything else is kept as-is
        return int(value) if isinstance(value, bool) else value

    return collections.OrderedDict(
        (column, _as_cell(row[field]))
        for field, column in record_renamer.items()
    )

In [12]:
# Input path
path_jsonl = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615.jsonl.gz'
# Output paths
path_access_tsv = 'data/02.unpaywall-access.tsv.xz'
path_issn_tsv = 'data/02.unpaywall-issns.tsv.xz'

articles = read_unpaywall_snapshot(path_jsonl)
# Uncomment following line for development
# articles = itertools.islice(articles, 100)

# Stream every record into two xz-compressed TSV files: one row per article
# in the access table and one row per (doi, issn) pair in the ISSN table.
# The csv module asks for files opened with newline='' so that the writer's
# own line terminator is not translated a second time by the text layer.
# The encoding is pinned so the output does not depend on the locale.
with lzma.open(path_access_tsv, 'wt', encoding='utf-8', newline='') as access_file, \
        lzma.open(path_issn_tsv, 'wt', encoding='utf-8', newline='') as issn_file:
    access_writer = csv.DictWriter(access_file, delimiter='\t', fieldnames=list(record_renamer.values()))
    access_writer.writeheader()
    issn_writer = csv.writer(issn_file, delimiter='\t')
    issn_writer.writerow(('doi', 'issn'))
    # `article` stays bound to the last record once the loop has finished
    for article in articles:
        doi = article['doi']
        access_writer.writerow(_reduce_unpaywall_record(article))
        issn_writer.writerows((doi, issn) for issn in article['journal_issns'])

In [13]:
# Display the record left bound to `article` by the export loop, i.e. the last one processed
article

{'doi': '10.1002/asi.20570',
 'year': 2007,
 'genre': 'journal-article',
 'is_oa': True,
 'title': 'Towards memory supporting personal information management tools',
 'doi_url': 'https://doi.org/10.1002/asi.20570',
 'updated': '2018-06-21T04:54:17.294334',
 'publisher': 'Wiley-Blackwell',
 'z_authors': [{'given': 'David', 'family': 'Elsweiler'},
  {'given': 'Ian', 'family': 'Ruthven'},
  {'given': 'Christopher', 'family': 'Jones'}],
 'journal_name': 'Journal of the American Society for Information Science and Technology',
 'oa_locations': [{'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',
   'pmh_id': 'oai:epub.uni-regensburg.de:22679',
   'is_best': True,
   'license': None,
   'updated': '2018-06-20T18:33:20.945686',
   'version': 'submittedVersion',
   'evidence': 'oa repository (via OAI-PMH title and first author match)',
   'host_type': 'repository',
   'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',
   'url_for_

## Test reading `unpaywall-access.tsv`

In [14]:
# Read the tab-separated access table at path_access_tsv into a DataFrame
# and display its last rows as a check that the file is readable.
article_df = pandas.read_csv(path_access_tsv, sep='\t')
article_df.tail()

Unnamed: 0,doi,crossref_type,journal_date,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
99940224,10.1002/nadc.19970450625,journal-article,1997-06-01,0,0,,,0
99940225,10.1371/journal.pbio.1001712.g004,component,,1,1,oa journal (via publisher name),,1
99940226,10.1364/opex.12.002220.m005,component,,0,0,,,0
99940227,10.2105/ajph.10.6.536,journal-article,1920-06-01,1,1,open (via free pdf),,0
99940228,10.1002/asi.20570,journal-article,2007-01-01,1,0,,,0


In [15]:
# Green OA only articles: flagged open by Unpaywall (unpaywall_access==1)
# but without a publisher-hosted OA location (journal_access==0)
article_df.query("journal_access==0 and unpaywall_access==1").head(3)

Unnamed: 0,doi,crossref_type,journal_date,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
0,10.1080/21645515.2017.1330236,journal-article,2017-06-12,1,0,,,0
49,10.1016/j.drugalcdep.2016.08.636,journal-article,2016-11-01,1,0,,,0
54,10.1109/icecs.2001.957596,proceedings-article,,1,0,,,0


In [16]:
# Hybrid/Bronze OA articles: a publisher-hosted OA location exists (journal_access==1)
# while the journal itself is not flagged as fully OA (journal_fully_oa==0)
article_df.query("journal_access==1 and journal_fully_oa==0").head(3)

Unnamed: 0,doi,crossref_type,journal_date,unpaywall_access,journal_access,journal_access_evidence,journal_access_license,journal_fully_oa
3,10.1088/0004-6256/135/4/1201,journal-article,2008-03-04,1,1,open (via free pdf),,0
5,10.2478/v10172-012-0058-8,journal-article,2012-01-01,1,1,open (via free pdf),,0
13,10.1038/313176c0,journal-article,1985-01-01,1,1,open (via free pdf),,0


## Test reading `unpaywall-issns.tsv`

In [17]:
# Read the tab-separated (doi, issn) table at path_issn_tsv into a DataFrame
# and display its last rows as a check that the file is readable.
issn_df = pandas.read_csv(path_issn_tsv, sep='\t')
issn_df.tail()

Unnamed: 0,doi,issn
112751994,10.1006/bbrc.1997.6706,0006-291X
112751995,10.1002/nadc.19970450625,0341-5163
112751996,10.2105/ajph.10.6.536,0271-4353
112751997,10.1002/asi.20570,1532-2882
112751998,10.1002/asi.20570,1532-2890


In [18]:
# Sanity check: every ISSN is expected to be nine characters long,
# i.e. XXXX-XXXX formatted, so the frame shown below should be empty.
issn_lengths = issn_df['issn'].str.len()
invalid_issn_df = issn_df.loc[issn_lengths != 9]
invalid_issn_df.head()

Unnamed: 0,doi,issn
