{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Process Unpaywall data for access status to articles" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import collections\n", "import csv\n", "import gzip\n", "import itertools\n", "import json\n", "import lzma\n", "\n", "import pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read and process jsonl snapshot" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def _process_unpaywall_record(row):\n", " row['doi'] = row['doi'].lower()\n", " row['journal_access'] = False\n", " row['journal_access_license'] = None\n", " row['journal_access_evidence'] = None\n", " for location in row['oa_locations']:\n", " if location['host_type'] == 'publisher':\n", " row['journal_access'] = True\n", " row['journal_access_license'] = location.get('license')\n", " row['journal_access_evidence'] = location.get('evidence')\n", " break\n", " row['journal_date'] = row.get('published_date') or row.get('year')\n", " journal_date = row.get('published_date')\n", " journal_issns = row.get('journal_issns')\n", " row['journal_issns'] = journal_issns.split(',') if journal_issns else []\n", "\n", "def read_unpaywall_snapshot(path, doi_subset=None):\n", " \"\"\"\n", " https://unpaywall.org/data-format\n", " https://unpaywall.org/products/snapshot\n", " \"\"\"\n", " opener = gzip.open if str(path).endswith('.gz') else open\n", " with opener(path, 'rt') as read_file:\n", " for line in read_file:\n", " row = json.loads(line)\n", " if doi_subset is None or row['doi'].lower() in doi_subset:\n", " _process_unpaywall_record(row)\n", " yield row\n", "\n", "record_renamer = {\n", " 'doi': 'doi',\n", " 'genre': 'crossref_type',\n", " 'journal_date': 'journal_date',\n", " 'is_oa': 'unpaywall_access',\n", " 'journal_access': 'journal_access',\n", " 'journal_access_evidence': 'journal_access_evidence',\n", " 'journal_access_license': 'journal_access_license',\n", " 'journal_is_oa': 'journal_fully_oa',\n", "}\n", "\n", "def _reduce_unpaywall_record(row):\n", " reduced_row = collections.OrderedDict()\n", " for key, renamed_key in record_renamer.items():\n", " value = row[key]\n", " if isinstance(value, bool):\n", " value = int(value)\n", " reduced_row[renamed_key] = value\n", " return reduced_row" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Input path\n", "path_jsonl = 'downloads/unpaywall/unpaywall_snapshot_2018-09-24T232615.jsonl.gz'\n", "# Output paths\n", "path_access_tsv = 'data/02.unpaywall-access.tsv.xz'\n", "path_issn_tsv = 'data/02.unpaywall-issns.tsv.xz'\n", "\n", "articles = read_unpaywall_snapshot(path_jsonl)\n", "# Uncomment following line for development\n", "# articles = itertools.islice(articles, 100)\n", "with lzma.open(path_access_tsv, 'wt') as access_file, lzma.open(path_issn_tsv, 'wt') as issn_file:\n", " access_writer = csv.DictWriter(access_file, delimiter='\\t', fieldnames=list(record_renamer.values()))\n", " access_writer.writeheader()\n", " issn_writer = csv.writer(issn_file, delimiter='\\t')\n", " issn_writer.writerow(('doi', 'issn'))\n", " for article in articles:\n", " doi = article['doi']\n", " access_writer.writerow(_reduce_unpaywall_record(article))\n", " for issn in article['journal_issns']:\n", " issn_writer.writerow((doi, issn))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'doi': '10.1002/asi.20570',\n", " 'year': 2007,\n", " 'genre': 'journal-article',\n", " 'is_oa': True,\n", " 'title': 'Towards memory supporting personal information management tools',\n", " 'doi_url': 'https://doi.org/10.1002/asi.20570',\n", " 'updated': '2018-06-21T04:54:17.294334',\n", " 'publisher': 'Wiley-Blackwell',\n", " 'z_authors': [{'given': 'David', 'family': 'Elsweiler'},\n", " {'given': 'Ian', 'family': 'Ruthven'},\n", " {'given': 'Christopher', 'family': 'Jones'}],\n", " 'journal_name': 'Journal of the American Society for Information Science and Technology',\n", " 'oa_locations': [{'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n", " 'pmh_id': 'oai:epub.uni-regensburg.de:22679',\n", " 'is_best': True,\n", " 'license': None,\n", " 'updated': '2018-06-20T18:33:20.945686',\n", " 'version': 'submittedVersion',\n", " 'evidence': 'oa repository (via OAI-PMH title and first author match)',\n", " 'host_type': 'repository',\n", " 'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n", " 'url_for_landing_page': None},\n", " {'url': 'https://strathprints.strath.ac.uk/2395/6/strathprints002395.pdf',\n", " 'pmh_id': 'oai:strathprints.strath.ac.uk:2395',\n", " 'is_best': False,\n", " 'license': None,\n", " 'updated': '2018-09-22T09:24:03.567239',\n", " 'version': 'submittedVersion',\n", " 'evidence': 'oa repository (via OAI-PMH title and first author match)',\n", " 'host_type': 'repository',\n", " 'url_for_pdf': 'https://strathprints.strath.ac.uk/2395/6/strathprints002395.pdf',\n", " 'url_for_landing_page': None}],\n", " 'data_standard': 2,\n", " 'journal_is_oa': False,\n", " 'journal_issns': ['1532-2882', '1532-2890'],\n", " 'published_date': '2007-01-01',\n", " 'best_oa_location': {'url': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n", " 'pmh_id': 'oai:epub.uni-regensburg.de:22679',\n", " 'is_best': True,\n", " 'license': None,\n", " 'updated': '2018-06-20T18:33:20.945686',\n", " 'version': 'submittedVersion',\n", " 'evidence': 'oa repository (via OAI-PMH title and first author match)',\n", " 'host_type': 'repository',\n", " 'url_for_pdf': 'https://epub.uni-regensburg.de/22679/1/jasist2007-towards_memory.pdf',\n", " 'url_for_landing_page': None},\n", " 'journal_is_in_doaj': False,\n", " 'journal_access': False,\n", " 'journal_access_license': None,\n", " 'journal_access_evidence': None,\n", " 'journal_date': '2007-01-01'}" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "article" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test reading `unpaywall-access.tsv`" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doicrossref_typejournal_dateunpaywall_accessjournal_accessjournal_access_evidencejournal_access_licensejournal_fully_oa
9994022410.1002/nadc.19970450625journal-article1997-06-0100NaNNaN0
9994022510.1371/journal.pbio.1001712.g004componentNaN11oa journal (via publisher name)NaN1
9994022610.1364/opex.12.002220.m005componentNaN00NaNNaN0
9994022710.2105/ajph.10.6.536journal-article1920-06-0111open (via free pdf)NaN0
9994022810.1002/asi.20570journal-article2007-01-0110NaNNaN0
\n", "
" ], "text/plain": [ " doi crossref_type journal_date \\\n", "99940224 10.1002/nadc.19970450625 journal-article 1997-06-01 \n", "99940225 10.1371/journal.pbio.1001712.g004 component NaN \n", "99940226 10.1364/opex.12.002220.m005 component NaN \n", "99940227 10.2105/ajph.10.6.536 journal-article 1920-06-01 \n", "99940228 10.1002/asi.20570 journal-article 2007-01-01 \n", "\n", " unpaywall_access journal_access journal_access_evidence \\\n", "99940224 0 0 NaN \n", "99940225 1 1 oa journal (via publisher name) \n", "99940226 0 0 NaN \n", "99940227 1 1 open (via free pdf) \n", "99940228 1 0 NaN \n", "\n", " journal_access_license journal_fully_oa \n", "99940224 NaN 0 \n", "99940225 NaN 1 \n", "99940226 NaN 0 \n", "99940227 NaN 0 \n", "99940228 NaN 0 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "article_df = pandas.read_csv(path_access_tsv, sep='\\t')\n", "article_df.tail()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doicrossref_typejournal_dateunpaywall_accessjournal_accessjournal_access_evidencejournal_access_licensejournal_fully_oa
010.1080/21645515.2017.1330236journal-article2017-06-1210NaNNaN0
4910.1016/j.drugalcdep.2016.08.636journal-article2016-11-0110NaNNaN0
5410.1109/icecs.2001.957596proceedings-articleNaN10NaNNaN0
\n", "
" ], "text/plain": [ " doi crossref_type journal_date \\\n", "0 10.1080/21645515.2017.1330236 journal-article 2017-06-12 \n", "49 10.1016/j.drugalcdep.2016.08.636 journal-article 2016-11-01 \n", "54 10.1109/icecs.2001.957596 proceedings-article NaN \n", "\n", " unpaywall_access journal_access journal_access_evidence \\\n", "0 1 0 NaN \n", "49 1 0 NaN \n", "54 1 0 NaN \n", "\n", " journal_access_license journal_fully_oa \n", "0 NaN 0 \n", "49 NaN 0 \n", "54 NaN 0 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Green OA only articles\n", "article_df.query(\"journal_access==0 and unpaywall_access==1\").head(3)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doicrossref_typejournal_dateunpaywall_accessjournal_accessjournal_access_evidencejournal_access_licensejournal_fully_oa
310.1088/0004-6256/135/4/1201journal-article2008-03-0411open (via free pdf)NaN0
510.2478/v10172-012-0058-8journal-article2012-01-0111open (via free pdf)NaN0
1310.1038/313176c0journal-article1985-01-0111open (via free pdf)NaN0
\n", "
" ], "text/plain": [ " doi crossref_type journal_date \\\n", "3 10.1088/0004-6256/135/4/1201 journal-article 2008-03-04 \n", "5 10.2478/v10172-012-0058-8 journal-article 2012-01-01 \n", "13 10.1038/313176c0 journal-article 1985-01-01 \n", "\n", " unpaywall_access journal_access journal_access_evidence \\\n", "3 1 1 open (via free pdf) \n", "5 1 1 open (via free pdf) \n", "13 1 1 open (via free pdf) \n", "\n", " journal_access_license journal_fully_oa \n", "3 NaN 0 \n", "5 NaN 0 \n", "13 NaN 0 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Hybrid/Bronze OA articles\n", "article_df.query(\"journal_access==1 and journal_fully_oa==0\").head(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Test reading `unpaywall-issns.tsv`" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doiissn
11275199410.1006/bbrc.1997.67060006-291X
11275199510.1002/nadc.199704506250341-5163
11275199610.2105/ajph.10.6.5360271-4353
11275199710.1002/asi.205701532-2882
11275199810.1002/asi.205701532-2890
\n", "
" ], "text/plain": [ " doi issn\n", "112751994 10.1006/bbrc.1997.6706 0006-291X\n", "112751995 10.1002/nadc.19970450625 0341-5163\n", "112751996 10.2105/ajph.10.6.536 0271-4353\n", "112751997 10.1002/asi.20570 1532-2882\n", "112751998 10.1002/asi.20570 1532-2890" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "issn_df = pandas.read_csv(path_issn_tsv, sep='\\t')\n", "issn_df.tail()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
doiissn
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [doi, issn]\n", "Index: []" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make sure this dataframe is empty\n", "# all ISSNs should be length nine, i.e. XXXX-XXXX formatted\n", "invalid_issn_df = issn_df[issn_df.issn.str.len() != 9]\n", "invalid_issn_df.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:greenblack]", "language": "python", "name": "conda-env-greenblack-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }