{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Extract bioRxiv preprint and traffic data from the Rxivist database" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas\n", "import psycopg2" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def months_between_datetimes(date_from, date_to):\n", " \"\"\"\n", " Compute the number of calendar months (rather than 30 day units)\n", " between two dates.\n", "\n", " date_from and date_to are pandas datetime series .dt attributes\n", "\n", " See http://stackoverflow.com/a/4040338/4651668.\n", " \"\"\"\n", " date_from = pandas.to_datetime(date_from)\n", " date_to = pandas.to_datetime(date_to)\n", " if pandas.isna(date_from) or pandas.isna(date_to):\n", " return None\n", " delta_years = 12 * (date_to.year - date_from.year)\n", " delta_months = date_to.month - date_from.month\n", " return delta_years + delta_months" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "connection = psycopg2.connect(host='localhost', dbname=\"rxdb\", user='postgres', port=65500)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table of bioRxiv preprints" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rxivist_preprint_idpreprint_datepreprint_doijournal_datejournal_doi
03862018-05-2110.1101/327015NaTNone
13872018-08-0310.1101/3841232018-11-1910.1128/aac.01646-18
23882018-08-0310.1101/383109NaTNone
33892018-07-3110.1101/3816402018-11-0110.1099/mgen.0.000234
\n", "
" ], "text/plain": [ " rxivist_preprint_id preprint_date preprint_doi journal_date \\\n", "0 386 2018-05-21 10.1101/327015 NaT \n", "1 387 2018-08-03 10.1101/384123 2018-11-19 \n", "2 388 2018-08-03 10.1101/383109 NaT \n", "3 389 2018-07-31 10.1101/381640 2018-11-01 \n", "\n", " journal_doi \n", "0 None \n", "1 10.1128/aac.01646-18 \n", "2 None \n", "3 10.1099/mgen.0.000234 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = '''\n", "SELECT\n", " id AS rxivist_preprint_id,\n", " paper.articles.posted AS preprint_date,\n", " paper.articles.doi AS preprint_doi,\n", " paper.publication_dates.date AS journal_date,\n", " LOWER(paper.article_publications.doi) AS journal_doi\n", "FROM paper.articles\n", "LEFT JOIN paper.article_publications\n", " ON paper.articles.id=paper.article_publications.article\n", "LEFT JOIN paper.publication_dates\n", " ON paper.articles.id=paper.publication_dates.article\n", "ORDER BY rxivist_preprint_id \n", ";\n", "'''\n", "preprint_df = pandas.read_sql(sql=query, con=connection, parse_dates=['preprint_date', 'journal_date'])\n", "assert not preprint_df.rxivist_preprint_id.duplicated().any()\n", "preprint_df.head(4)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "preprint_df.to_csv('data/01.preprints.tsv', sep='\\t', index=False)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "37,648 preprints\n", "15,797 preprints with a journal date\n", "15,797 preprints with a journal DOI\n", "\n" ] } ], "source": [ "print(f'''\\\n", "{len(preprint_df):,} preprints\n", "{sum(preprint_df.journal_date.notna()):,} preprints with a journal date\n", "{sum(preprint_df.journal_doi.notna()):,} preprints with a journal DOI\n", "''')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Table of bioRxiv traffic by year-month period" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rxivist_preprint_idpreprint_datejournal_dateperiodpreprint_viewspreprint_downloads
03862018-05-21None2018-0556964
13862018-05-21None2018-0626549
23862018-05-21None2018-0723129
\n", "
" ], "text/plain": [ " rxivist_preprint_id preprint_date journal_date period preprint_views \\\n", "0 386 2018-05-21 None 2018-05 569 \n", "1 386 2018-05-21 None 2018-06 265 \n", "2 386 2018-05-21 None 2018-07 231 \n", "\n", " preprint_downloads \n", "0 64 \n", "1 49 \n", "2 29 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = '''\n", "SELECT\n", " paper.articles.id AS rxivist_preprint_id,\n", " paper.articles.posted AS preprint_date,\n", " paper.publication_dates.date AS journal_date,\n", " CONCAT(paper.article_traffic.year, '-', TO_CHAR(paper.article_traffic.month, 'fm00')) AS period,\n", " paper.article_traffic.abstract AS preprint_views,\n", " paper.article_traffic.pdf AS preprint_downloads\n", "FROM paper.article_traffic\n", "JOIN paper.articles\n", " ON paper.articles.id=paper.article_traffic.article\n", "LEFT JOIN paper.publication_dates\n", " ON paper.articles.id=paper.publication_dates.article\n", "ORDER BY rxivist_preprint_id, period\n", ";\n", "'''\n", "traffic_df = pandas.read_sql(sql=query, con=connection, parse_dates='preprint_date')\n", "traffic_df.head(3)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Compute the number of calendar months from preprint / journal publication for each traffic period\n", "traffic_df['months_since_preprint'] = [\n", " months_between_datetimes(*pair) for pair in\n", " zip(traffic_df.pop('preprint_date'), pandas.to_datetime(traffic_df.period))\n", "]\n", "traffic_df['months_since_journal'] = pandas.Series([\n", " months_between_datetimes(*pair) for pair in\n", " zip(traffic_df.pop('journal_date'), pandas.to_datetime(traffic_df.period))\n", "], dtype='Int64')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rxivist_preprint_idperiodpreprint_viewspreprint_downloadsmonths_since_preprintmonths_since_journal
03862018-05569640NaN
13862018-06265491NaN
\n", "
" ], "text/plain": [ " rxivist_preprint_id period preprint_views preprint_downloads \\\n", "0 386 2018-05 569 64 \n", "1 386 2018-06 265 49 \n", "\n", " months_since_preprint months_since_journal \n", "0 0 NaN \n", "1 1 NaN " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "traffic_df.head(2)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "traffic_df.to_csv('data/01.preprint-traffic.tsv.xz', sep='\\t', index=False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rxivist_preprint_idpreprint_datepreprint_doijournal_datejournal_doi
359039762016-11-1410.1101/0876192017-09-2210.7554/elife.26726
\n", "
" ], "text/plain": [ " rxivist_preprint_id preprint_date preprint_doi journal_date \\\n", "3590 3976 2016-11-14 10.1101/087619 2017-09-22 \n", "\n", " journal_doi \n", "3590 10.7554/elife.26726 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Locate the Project Rephetio Preprint by DOI\n", "rephetio_df = preprint_df.query('preprint_doi == \"10.1101/087619\"')\n", "rephetio_df" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rxivist_preprint_idperiodpreprint_viewspreprint_downloadsmonths_since_preprintmonths_since_journal
039762016-1111581520-10
139762016-121791121-9
239762017-01114302-8
339762017-02129433-7
439762017-033551424-6
539762017-04111495-5
639762017-05204816-4
739762017-06162737-3
839762017-07137608-2
939762017-08200629-1
1039762017-09492168100
1139762017-1013061111
1239762017-1111634122
1339762017-127137133
1439762018-014719144
1539762018-026019155
1639762018-037218166
1739762018-04388177
1839762018-056810188
1939762018-066216199
2039762018-0763182010
2139762018-0891162111
2239762018-0963272212
2339762018-1024512313
2439762018-1143192414
\n", "
" ], "text/plain": [ " rxivist_preprint_id period preprint_views preprint_downloads \\\n", "0 3976 2016-11 1158 152 \n", "1 3976 2016-12 179 112 \n", "2 3976 2017-01 114 30 \n", "3 3976 2017-02 129 43 \n", "4 3976 2017-03 355 142 \n", "5 3976 2017-04 111 49 \n", "6 3976 2017-05 204 81 \n", "7 3976 2017-06 162 73 \n", "8 3976 2017-07 137 60 \n", "9 3976 2017-08 200 62 \n", "10 3976 2017-09 492 168 \n", "11 3976 2017-10 130 61 \n", "12 3976 2017-11 116 34 \n", "13 3976 2017-12 71 37 \n", "14 3976 2018-01 47 19 \n", "15 3976 2018-02 60 19 \n", "16 3976 2018-03 72 18 \n", "17 3976 2018-04 38 8 \n", "18 3976 2018-05 68 10 \n", "19 3976 2018-06 62 16 \n", "20 3976 2018-07 63 18 \n", "21 3976 2018-08 91 16 \n", "22 3976 2018-09 63 27 \n", "23 3976 2018-10 24 51 \n", "24 3976 2018-11 43 19 \n", "\n", " months_since_preprint months_since_journal \n", "0 0 -10 \n", "1 1 -9 \n", "2 2 -8 \n", "3 3 -7 \n", "4 4 -6 \n", "5 5 -5 \n", "6 6 -4 \n", "7 7 -3 \n", "8 8 -2 \n", "9 9 -1 \n", "10 10 0 \n", "11 11 1 \n", "12 12 2 \n", "13 13 3 \n", "14 14 4 \n", "15 15 5 \n", "16 16 6 \n", "17 17 7 \n", "18 18 8 \n", "19 19 9 \n", "20 20 10 \n", "21 21 11 \n", "22 22 12 \n", "23 23 13 \n", "24 24 14 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Traffic for Project Rephetio Preprint\n", "rephetio_df[['rxivist_preprint_id']].merge(traffic_df)" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:greenblack]", "language": "python", "name": "conda-env-greenblack-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }