{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract bioRxiv preprint and traffic data from the Rxivist database"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"import psycopg2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def months_between_datetimes(date_from, date_to):\n",
" \"\"\"\n",
" Compute the number of calendar months (rather than 30 day units)\n",
" between two dates.\n",
"\n",
" date_from and date_to are pandas datetime series .dt attributes\n",
"\n",
" See http://stackoverflow.com/a/4040338/4651668.\n",
" \"\"\"\n",
" date_from = pandas.to_datetime(date_from)\n",
" date_to = pandas.to_datetime(date_to)\n",
" if pandas.isna(date_from) or pandas.isna(date_to):\n",
" return None\n",
" delta_years = 12 * (date_to.year - date_from.year)\n",
" delta_months = date_to.month - date_from.month\n",
" return delta_years + delta_months"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"connection = psycopg2.connect(host='localhost', dbname=\"rxdb\", user='postgres', port=65500)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table of bioRxiv preprints"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rxivist_preprint_id | \n",
" preprint_date | \n",
" preprint_doi | \n",
" journal_date | \n",
" journal_doi | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 386 | \n",
" 2018-05-21 | \n",
" 10.1101/327015 | \n",
" NaT | \n",
" None | \n",
"
\n",
" \n",
" 1 | \n",
" 387 | \n",
" 2018-08-03 | \n",
" 10.1101/384123 | \n",
" 2018-11-19 | \n",
" 10.1128/aac.01646-18 | \n",
"
\n",
" \n",
" 2 | \n",
" 388 | \n",
" 2018-08-03 | \n",
" 10.1101/383109 | \n",
" NaT | \n",
" None | \n",
"
\n",
" \n",
" 3 | \n",
" 389 | \n",
" 2018-07-31 | \n",
" 10.1101/381640 | \n",
" 2018-11-01 | \n",
" 10.1099/mgen.0.000234 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rxivist_preprint_id preprint_date preprint_doi journal_date \\\n",
"0 386 2018-05-21 10.1101/327015 NaT \n",
"1 387 2018-08-03 10.1101/384123 2018-11-19 \n",
"2 388 2018-08-03 10.1101/383109 NaT \n",
"3 389 2018-07-31 10.1101/381640 2018-11-01 \n",
"\n",
" journal_doi \n",
"0 None \n",
"1 10.1128/aac.01646-18 \n",
"2 None \n",
"3 10.1099/mgen.0.000234 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = '''\n",
"SELECT\n",
" id AS rxivist_preprint_id,\n",
" paper.articles.posted AS preprint_date,\n",
" paper.articles.doi AS preprint_doi,\n",
" paper.publication_dates.date AS journal_date,\n",
" LOWER(paper.article_publications.doi) AS journal_doi\n",
"FROM paper.articles\n",
"LEFT JOIN paper.article_publications\n",
" ON paper.articles.id=paper.article_publications.article\n",
"LEFT JOIN paper.publication_dates\n",
" ON paper.articles.id=paper.publication_dates.article\n",
"ORDER BY rxivist_preprint_id \n",
";\n",
"'''\n",
"preprint_df = pandas.read_sql(sql=query, con=connection, parse_dates=['preprint_date', 'journal_date'])\n",
"assert not preprint_df.rxivist_preprint_id.duplicated().any()\n",
"preprint_df.head(4)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"preprint_df.to_csv('data/01.preprints.tsv', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"37,648 preprints\n",
"15,797 preprints with a journal date\n",
"15,797 preprints with a journal DOI\n",
"\n"
]
}
],
"source": [
"print(f'''\\\n",
"{len(preprint_df):,} preprints\n",
"{sum(preprint_df.journal_date.notna()):,} preprints with a journal date\n",
"{sum(preprint_df.journal_doi.notna()):,} preprints with a journal DOI\n",
"''')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table of bioRxiv traffic by year-month period"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rxivist_preprint_id | \n",
" preprint_date | \n",
" journal_date | \n",
" period | \n",
" preprint_views | \n",
" preprint_downloads | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 386 | \n",
" 2018-05-21 | \n",
" None | \n",
" 2018-05 | \n",
" 569 | \n",
" 64 | \n",
"
\n",
" \n",
" 1 | \n",
" 386 | \n",
" 2018-05-21 | \n",
" None | \n",
" 2018-06 | \n",
" 265 | \n",
" 49 | \n",
"
\n",
" \n",
" 2 | \n",
" 386 | \n",
" 2018-05-21 | \n",
" None | \n",
" 2018-07 | \n",
" 231 | \n",
" 29 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rxivist_preprint_id preprint_date journal_date period preprint_views \\\n",
"0 386 2018-05-21 None 2018-05 569 \n",
"1 386 2018-05-21 None 2018-06 265 \n",
"2 386 2018-05-21 None 2018-07 231 \n",
"\n",
" preprint_downloads \n",
"0 64 \n",
"1 49 \n",
"2 29 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = '''\n",
"SELECT\n",
" paper.articles.id AS rxivist_preprint_id,\n",
" paper.articles.posted AS preprint_date,\n",
" paper.publication_dates.date AS journal_date,\n",
" CONCAT(paper.article_traffic.year, '-', TO_CHAR(paper.article_traffic.month, 'fm00')) AS period,\n",
" paper.article_traffic.abstract AS preprint_views,\n",
" paper.article_traffic.pdf AS preprint_downloads\n",
"FROM paper.article_traffic\n",
"JOIN paper.articles\n",
" ON paper.articles.id=paper.article_traffic.article\n",
"LEFT JOIN paper.publication_dates\n",
" ON paper.articles.id=paper.publication_dates.article\n",
"ORDER BY rxivist_preprint_id, period\n",
";\n",
"'''\n",
"traffic_df = pandas.read_sql(sql=query, con=connection, parse_dates='preprint_date')\n",
"traffic_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Compute the number of calendar months from preprint / journal publication for each traffic period\n",
"traffic_df['months_since_preprint'] = [\n",
" months_between_datetimes(*pair) for pair in\n",
" zip(traffic_df.pop('preprint_date'), pandas.to_datetime(traffic_df.period))\n",
"]\n",
"traffic_df['months_since_journal'] = pandas.Series([\n",
" months_between_datetimes(*pair) for pair in\n",
" zip(traffic_df.pop('journal_date'), pandas.to_datetime(traffic_df.period))\n",
"], dtype='Int64')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rxivist_preprint_id | \n",
" period | \n",
" preprint_views | \n",
" preprint_downloads | \n",
" months_since_preprint | \n",
" months_since_journal | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 386 | \n",
" 2018-05 | \n",
" 569 | \n",
" 64 | \n",
" 0 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 386 | \n",
" 2018-06 | \n",
" 265 | \n",
" 49 | \n",
" 1 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rxivist_preprint_id period preprint_views preprint_downloads \\\n",
"0 386 2018-05 569 64 \n",
"1 386 2018-06 265 49 \n",
"\n",
" months_since_preprint months_since_journal \n",
"0 0 NaN \n",
"1 1 NaN "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"traffic_df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"traffic_df.to_csv('data/01.preprint-traffic.tsv.xz', sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rxivist_preprint_id | \n",
" preprint_date | \n",
" preprint_doi | \n",
" journal_date | \n",
" journal_doi | \n",
"
\n",
" \n",
" \n",
" \n",
" 3590 | \n",
" 3976 | \n",
" 2016-11-14 | \n",
" 10.1101/087619 | \n",
" 2017-09-22 | \n",
" 10.7554/elife.26726 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rxivist_preprint_id preprint_date preprint_doi journal_date \\\n",
"3590 3976 2016-11-14 10.1101/087619 2017-09-22 \n",
"\n",
" journal_doi \n",
"3590 10.7554/elife.26726 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Locate the Project Rephetio Preprint by DOI\n",
"rephetio_df = preprint_df.query('preprint_doi == \"10.1101/087619\"')\n",
"rephetio_df"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rxivist_preprint_id | \n",
" period | \n",
" preprint_views | \n",
" preprint_downloads | \n",
" months_since_preprint | \n",
" months_since_journal | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 3976 | \n",
" 2016-11 | \n",
" 1158 | \n",
" 152 | \n",
" 0 | \n",
" -10 | \n",
"
\n",
" \n",
" 1 | \n",
" 3976 | \n",
" 2016-12 | \n",
" 179 | \n",
" 112 | \n",
" 1 | \n",
" -9 | \n",
"
\n",
" \n",
" 2 | \n",
" 3976 | \n",
" 2017-01 | \n",
" 114 | \n",
" 30 | \n",
" 2 | \n",
" -8 | \n",
"
\n",
" \n",
" 3 | \n",
" 3976 | \n",
" 2017-02 | \n",
" 129 | \n",
" 43 | \n",
" 3 | \n",
" -7 | \n",
"
\n",
" \n",
" 4 | \n",
" 3976 | \n",
" 2017-03 | \n",
" 355 | \n",
" 142 | \n",
" 4 | \n",
" -6 | \n",
"
\n",
" \n",
" 5 | \n",
" 3976 | \n",
" 2017-04 | \n",
" 111 | \n",
" 49 | \n",
" 5 | \n",
" -5 | \n",
"
\n",
" \n",
" 6 | \n",
" 3976 | \n",
" 2017-05 | \n",
" 204 | \n",
" 81 | \n",
" 6 | \n",
" -4 | \n",
"
\n",
" \n",
" 7 | \n",
" 3976 | \n",
" 2017-06 | \n",
" 162 | \n",
" 73 | \n",
" 7 | \n",
" -3 | \n",
"
\n",
" \n",
" 8 | \n",
" 3976 | \n",
" 2017-07 | \n",
" 137 | \n",
" 60 | \n",
" 8 | \n",
" -2 | \n",
"
\n",
" \n",
" 9 | \n",
" 3976 | \n",
" 2017-08 | \n",
" 200 | \n",
" 62 | \n",
" 9 | \n",
" -1 | \n",
"
\n",
" \n",
" 10 | \n",
" 3976 | \n",
" 2017-09 | \n",
" 492 | \n",
" 168 | \n",
" 10 | \n",
" 0 | \n",
"
\n",
" \n",
" 11 | \n",
" 3976 | \n",
" 2017-10 | \n",
" 130 | \n",
" 61 | \n",
" 11 | \n",
" 1 | \n",
"
\n",
" \n",
" 12 | \n",
" 3976 | \n",
" 2017-11 | \n",
" 116 | \n",
" 34 | \n",
" 12 | \n",
" 2 | \n",
"
\n",
" \n",
" 13 | \n",
" 3976 | \n",
" 2017-12 | \n",
" 71 | \n",
" 37 | \n",
" 13 | \n",
" 3 | \n",
"
\n",
" \n",
" 14 | \n",
" 3976 | \n",
" 2018-01 | \n",
" 47 | \n",
" 19 | \n",
" 14 | \n",
" 4 | \n",
"
\n",
" \n",
" 15 | \n",
" 3976 | \n",
" 2018-02 | \n",
" 60 | \n",
" 19 | \n",
" 15 | \n",
" 5 | \n",
"
\n",
" \n",
" 16 | \n",
" 3976 | \n",
" 2018-03 | \n",
" 72 | \n",
" 18 | \n",
" 16 | \n",
" 6 | \n",
"
\n",
" \n",
" 17 | \n",
" 3976 | \n",
" 2018-04 | \n",
" 38 | \n",
" 8 | \n",
" 17 | \n",
" 7 | \n",
"
\n",
" \n",
" 18 | \n",
" 3976 | \n",
" 2018-05 | \n",
" 68 | \n",
" 10 | \n",
" 18 | \n",
" 8 | \n",
"
\n",
" \n",
" 19 | \n",
" 3976 | \n",
" 2018-06 | \n",
" 62 | \n",
" 16 | \n",
" 19 | \n",
" 9 | \n",
"
\n",
" \n",
" 20 | \n",
" 3976 | \n",
" 2018-07 | \n",
" 63 | \n",
" 18 | \n",
" 20 | \n",
" 10 | \n",
"
\n",
" \n",
" 21 | \n",
" 3976 | \n",
" 2018-08 | \n",
" 91 | \n",
" 16 | \n",
" 21 | \n",
" 11 | \n",
"
\n",
" \n",
" 22 | \n",
" 3976 | \n",
" 2018-09 | \n",
" 63 | \n",
" 27 | \n",
" 22 | \n",
" 12 | \n",
"
\n",
" \n",
" 23 | \n",
" 3976 | \n",
" 2018-10 | \n",
" 24 | \n",
" 51 | \n",
" 23 | \n",
" 13 | \n",
"
\n",
" \n",
" 24 | \n",
" 3976 | \n",
" 2018-11 | \n",
" 43 | \n",
" 19 | \n",
" 24 | \n",
" 14 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rxivist_preprint_id period preprint_views preprint_downloads \\\n",
"0 3976 2016-11 1158 152 \n",
"1 3976 2016-12 179 112 \n",
"2 3976 2017-01 114 30 \n",
"3 3976 2017-02 129 43 \n",
"4 3976 2017-03 355 142 \n",
"5 3976 2017-04 111 49 \n",
"6 3976 2017-05 204 81 \n",
"7 3976 2017-06 162 73 \n",
"8 3976 2017-07 137 60 \n",
"9 3976 2017-08 200 62 \n",
"10 3976 2017-09 492 168 \n",
"11 3976 2017-10 130 61 \n",
"12 3976 2017-11 116 34 \n",
"13 3976 2017-12 71 37 \n",
"14 3976 2018-01 47 19 \n",
"15 3976 2018-02 60 19 \n",
"16 3976 2018-03 72 18 \n",
"17 3976 2018-04 38 8 \n",
"18 3976 2018-05 68 10 \n",
"19 3976 2018-06 62 16 \n",
"20 3976 2018-07 63 18 \n",
"21 3976 2018-08 91 16 \n",
"22 3976 2018-09 63 27 \n",
"23 3976 2018-10 24 51 \n",
"24 3976 2018-11 43 19 \n",
"\n",
" months_since_preprint months_since_journal \n",
"0 0 -10 \n",
"1 1 -9 \n",
"2 2 -8 \n",
"3 3 -7 \n",
"4 4 -6 \n",
"5 5 -5 \n",
"6 6 -4 \n",
"7 7 -3 \n",
"8 8 -2 \n",
"9 9 -1 \n",
"10 10 0 \n",
"11 11 1 \n",
"12 12 2 \n",
"13 13 3 \n",
"14 14 4 \n",
"15 15 5 \n",
"16 16 6 \n",
"17 17 7 \n",
"18 18 8 \n",
"19 19 9 \n",
"20 20 10 \n",
"21 21 11 \n",
"22 22 12 \n",
"23 23 13 \n",
"24 24 14 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Traffic for Project Rephetio Preprint\n",
"rephetio_df[['rxivist_preprint_id']].merge(traffic_df)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:greenblack]",
"language": "python",
"name": "conda-env-greenblack-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}