{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Process Unpaywall data for access status to articles"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"import csv\n",
"import gzip\n",
"import itertools\n",
"import json\n",
"import lzma\n",
"\n",
"import pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read and process jsonl snapshot"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def _process_unpaywall_record(row):\n",
" row['doi'] = row['doi'].lower()\n",
" row['journal_access'] = False\n",
" row['journal_access_license'] = None\n",
" row['journal_access_evidence'] = None\n",
" for location in row['oa_locations']:\n",
" if location['host_type'] == 'publisher':\n",
" row['journal_access'] = True\n",
" row['journal_access_license'] = location.get('license')\n",
" row['journal_access_evidence'] = location.get('evidence')\n",
" break\n",
" row['journal_date'] = row.get('published_date') or row.get('year')\n",
" journal_date = row.get('published_date')\n",
" journal_issns = row.get('journal_issns')\n",
" row['journal_issns'] = journal_issns.split(',') if journal_issns else []\n",
"\n",
"def read_unpaywall_snapshot(path, doi_subset=None):\n",
" \"\"\"\n",
" https://unpaywall.org/data-format\n",
" https://unpaywall.org/products/snapshot\n",
" \"\"\"\n",
" opener = gzip.open if str(path).endswith('.gz') else open\n",
" with opener(path, 'rt') as read_file:\n",
" for line in read_file:\n",
" row = json.loads(line)\n",
" if doi_subset is None or row['doi'].lower() in doi_subset:\n",
" _process_unpaywall_record(row)\n",
" yield row\n",
"\n",
"record_renamer = {\n",
" 'doi': 'doi',\n",
" 'genre': 'crossref_type',\n",
" 'journal_date': 'journal_date',\n",
" 'is_oa': 'unpaywall_access',\n",
" 'journal_access': 'journal_access',\n",
" 'journal_access_evidence': 'journal_access_evidence',\n",
" 'journal_access_license': 'journal_access_license',\n",
" 'journal_is_oa': 'journal_fully_oa',\n",
"}\n",
"\n",
"def _reduce_unpaywall_record(row):\n",
" reduced_row = collections.OrderedDict()\n",
" for key, renamed_key in record_renamer.items():\n",
" value = row[key]\n",
" if isinstance(value, bool):\n",
" value = int(value)\n",
" reduced_row[renamed_key] = value\n",
" return reduced_row"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Input path\n",
"path_jsonl = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615.jsonl.gz'\n",
"# Output paths\n",
"path_access_tsv = 'data/02.unpaywall-access.tsv.xz'\n",
"path_issn_tsv = 'data/02.unpaywall-issns.tsv.xz'\n",
"\n",
"articles = read_unpaywall_snapshot(path_jsonl)\n",
"# Uncomment following line for development\n",
"# articles = itertools.islice(articles, 100)\n",
"with lzma.open(path_access_tsv, 'wt') as access_file, lzma.open(path_issn_tsv, 'wt') as issn_file:\n",
" access_writer = csv.DictWriter(access_file, delimiter='\\t', fieldnames=list(record_renamer.values()))\n",
" access_writer.writeheader()\n",
" issn_writer = csv.writer(issn_file, delimiter='\\t')\n",
" issn_writer.writerow(('doi', 'issn'))\n",
" for article in articles:\n",
" doi = article['doi']\n",
" access_writer.writerow(_reduce_unpaywall_record(article))\n",
" for issn in article['journal_issns']:\n",
" issn_writer.writerow((doi, issn))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'doi': '10.1002/asi.20570',\n",
" 'year': 2007,\n",
" 'genre': 'journal-article',\n",
" 'is_oa': True,\n",
" 'title': 'Towards memory supporting personal information management tools',\n",
" 'doi_url': 'https://doi.org/10.1002/asi.20570',\n",
" 'updated': '2018-06-21T04:54:17.294334',\n",
" 'publisher': 'Wiley-Blackwell',\n",
" 'z_authors': [{'given': 'David', 'family': 'Elsweiler'},\n",
" {'given': 'Ian', 'family': 'Ruthven'},\n",
" {'given': 'Christopher', 'family': 'Jones'}],\n",
" 'journal_name': 'Journal of the American Society for Information Science and Technology',\n",
" 'oa_locations': [{'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n",
" 'pmh_id': 'oai:epub.uni-regensburg.de:22679',\n",
" 'is_best': True,\n",
" 'license': None,\n",
" 'updated': '2018-06-20T18:33:20.945686',\n",
" 'version': 'submittedVersion',\n",
" 'evidence': 'oa repository (via OAI-PMH title and first author match)',\n",
" 'host_type': 'repository',\n",
" 'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n",
" 'url_for_landing_page': None},\n",
" {'url': 'https://strathprints.strath.ac.uk/2395/6/strathprints002395.pdf',\n",
" 'pmh_id': 'oai:strathprints.strath.ac.uk:2395',\n",
" 'is_best': False,\n",
" 'license': None,\n",
" 'updated': '2018-09-22T09:24:03.567239',\n",
" 'version': 'submittedVersion',\n",
" 'evidence': 'oa repository (via OAI-PMH title and first author match)',\n",
" 'host_type': 'repository',\n",
" 'url_for_pdf': 'https://strathprints.strath.ac.uk/2395/6/strathprints002395.pdf',\n",
" 'url_for_landing_page': None}],\n",
" 'data_standard': 2,\n",
" 'journal_is_oa': False,\n",
" 'journal_issns': ['1532-2882', '1532-2890'],\n",
" 'published_date': '2007-01-01',\n",
" 'best_oa_location': {'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n",
" 'pmh_id': 'oai:epub.uni-regensburg.de:22679',\n",
" 'is_best': True,\n",
" 'license': None,\n",
" 'updated': '2018-06-20T18:33:20.945686',\n",
" 'version': 'submittedVersion',\n",
" 'evidence': 'oa repository (via OAI-PMH title and first author match)',\n",
" 'host_type': 'repository',\n",
" 'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n",
" 'url_for_landing_page': None},\n",
" 'journal_is_in_doaj': False,\n",
" 'journal_access': False,\n",
" 'journal_access_license': None,\n",
" 'journal_access_evidence': None,\n",
" 'journal_date': '2007-01-01'}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"article"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test reading `unpaywall-access.tsv`"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doi | \n",
" crossref_type | \n",
" journal_date | \n",
" unpaywall_access | \n",
" journal_access | \n",
" journal_access_evidence | \n",
" journal_access_license | \n",
" journal_fully_oa | \n",
"
\n",
" \n",
" \n",
" \n",
" 99940224 | \n",
" 10.1002/nadc.19970450625 | \n",
" journal-article | \n",
" 1997-06-01 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 99940225 | \n",
" 10.1371/journal.pbio.1001712.g004 | \n",
" component | \n",
" NaN | \n",
" 1 | \n",
" 1 | \n",
" oa journal (via publisher name) | \n",
" NaN | \n",
" 1 | \n",
"
\n",
" \n",
" 99940226 | \n",
" 10.1364/opex.12.002220.m005 | \n",
" component | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 99940227 | \n",
" 10.2105/ajph.10.6.536 | \n",
" journal-article | \n",
" 1920-06-01 | \n",
" 1 | \n",
" 1 | \n",
" open (via free pdf) | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 99940228 | \n",
" 10.1002/asi.20570 | \n",
" journal-article | \n",
" 2007-01-01 | \n",
" 1 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doi crossref_type journal_date \\\n",
"99940224 10.1002/nadc.19970450625 journal-article 1997-06-01 \n",
"99940225 10.1371/journal.pbio.1001712.g004 component NaN \n",
"99940226 10.1364/opex.12.002220.m005 component NaN \n",
"99940227 10.2105/ajph.10.6.536 journal-article 1920-06-01 \n",
"99940228 10.1002/asi.20570 journal-article 2007-01-01 \n",
"\n",
" unpaywall_access journal_access journal_access_evidence \\\n",
"99940224 0 0 NaN \n",
"99940225 1 1 oa journal (via publisher name) \n",
"99940226 0 0 NaN \n",
"99940227 1 1 open (via free pdf) \n",
"99940228 1 0 NaN \n",
"\n",
" journal_access_license journal_fully_oa \n",
"99940224 NaN 0 \n",
"99940225 NaN 1 \n",
"99940226 NaN 0 \n",
"99940227 NaN 0 \n",
"99940228 NaN 0 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"article_df = pandas.read_csv(path_access_tsv, sep='\\t')\n",
"article_df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doi | \n",
" crossref_type | \n",
" journal_date | \n",
" unpaywall_access | \n",
" journal_access | \n",
" journal_access_evidence | \n",
" journal_access_license | \n",
" journal_fully_oa | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 10.1080/21645515.2017.1330236 | \n",
" journal-article | \n",
" 2017-06-12 | \n",
" 1 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 49 | \n",
" 10.1016/j.drugalcdep.2016.08.636 | \n",
" journal-article | \n",
" 2016-11-01 | \n",
" 1 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 54 | \n",
" 10.1109/icecs.2001.957596 | \n",
" proceedings-article | \n",
" NaN | \n",
" 1 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doi crossref_type journal_date \\\n",
"0 10.1080/21645515.2017.1330236 journal-article 2017-06-12 \n",
"49 10.1016/j.drugalcdep.2016.08.636 journal-article 2016-11-01 \n",
"54 10.1109/icecs.2001.957596 proceedings-article NaN \n",
"\n",
" unpaywall_access journal_access journal_access_evidence \\\n",
"0 1 0 NaN \n",
"49 1 0 NaN \n",
"54 1 0 NaN \n",
"\n",
" journal_access_license journal_fully_oa \n",
"0 NaN 0 \n",
"49 NaN 0 \n",
"54 NaN 0 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Green OA only articles\n",
"article_df.query(\"journal_access==0 and unpaywall_access==1\").head(3)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doi | \n",
" crossref_type | \n",
" journal_date | \n",
" unpaywall_access | \n",
" journal_access | \n",
" journal_access_evidence | \n",
" journal_access_license | \n",
" journal_fully_oa | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" 10.1088/0004-6256/135/4/1201 | \n",
" journal-article | \n",
" 2008-03-04 | \n",
" 1 | \n",
" 1 | \n",
" open (via free pdf) | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 10.2478/v10172-012-0058-8 | \n",
" journal-article | \n",
" 2012-01-01 | \n",
" 1 | \n",
" 1 | \n",
" open (via free pdf) | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
" 13 | \n",
" 10.1038/313176c0 | \n",
" journal-article | \n",
" 1985-01-01 | \n",
" 1 | \n",
" 1 | \n",
" open (via free pdf) | \n",
" NaN | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doi crossref_type journal_date \\\n",
"3 10.1088/0004-6256/135/4/1201 journal-article 2008-03-04 \n",
"5 10.2478/v10172-012-0058-8 journal-article 2012-01-01 \n",
"13 10.1038/313176c0 journal-article 1985-01-01 \n",
"\n",
" unpaywall_access journal_access journal_access_evidence \\\n",
"3 1 1 open (via free pdf) \n",
"5 1 1 open (via free pdf) \n",
"13 1 1 open (via free pdf) \n",
"\n",
" journal_access_license journal_fully_oa \n",
"3 NaN 0 \n",
"5 NaN 0 \n",
"13 NaN 0 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hybrid/Bronze OA articles\n",
"article_df.query(\"journal_access==1 and journal_fully_oa==0\").head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test reading `unpaywall-issns.tsv`"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doi | \n",
" issn | \n",
"
\n",
" \n",
" \n",
" \n",
" 112751994 | \n",
" 10.1006/bbrc.1997.6706 | \n",
" 0006-291X | \n",
"
\n",
" \n",
" 112751995 | \n",
" 10.1002/nadc.19970450625 | \n",
" 0341-5163 | \n",
"
\n",
" \n",
" 112751996 | \n",
" 10.2105/ajph.10.6.536 | \n",
" 0271-4353 | \n",
"
\n",
" \n",
" 112751997 | \n",
" 10.1002/asi.20570 | \n",
" 1532-2882 | \n",
"
\n",
" \n",
" 112751998 | \n",
" 10.1002/asi.20570 | \n",
" 1532-2890 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" doi issn\n",
"112751994 10.1006/bbrc.1997.6706 0006-291X\n",
"112751995 10.1002/nadc.19970450625 0341-5163\n",
"112751996 10.2105/ajph.10.6.536 0271-4353\n",
"112751997 10.1002/asi.20570 1532-2882\n",
"112751998 10.1002/asi.20570 1532-2890"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"issn_df = pandas.read_csv(path_issn_tsv, sep='\\t')\n",
"issn_df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" doi | \n",
" issn | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [doi, issn]\n",
"Index: []"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Make sure this dataframe is empty\n",
"# all ISSNs should be length nine, i.e. XXXX-XXXX formatted\n",
"invalid_issn_df = issn_df[issn_df.issn.str.len() != 9]\n",
"invalid_issn_df.head()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:greenblack]",
"language": "python",
"name": "conda-env-greenblack-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}