{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# The history of publishing delays by Daniel Himmelstein" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "library(dplyr, warn=F)\n", "library(ggplot2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Read history dates" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
journal_nlm_idpubmed_iddelay_typedatedelayyeardate_decimal
1000102722221113Acceptance2011-11-1511120112011.871
2000102722221113Publication2012-01-055120122012.011
\n" ], "text/latex": [ "\\begin{tabular}{r|lllllll}\n", " & journal_nlm_id & pubmed_id & delay_type & date & delay & year & date_decimal\\\\\n", "\\hline\n", "\t1 & 0001027 & 22221113 & Acceptance & 2011-11-15 & 111 & 2011 & 2011.871\\\\\n", "\t2 & 0001027 & 22221113 & Publication & 2012-01-05 & 51 & 2012 & 2012.011\\\\\n", "\\end{tabular}\n" ], "text/plain": [ "Source: local data frame [2 x 7]\n", "\n", " journal_nlm_id pubmed_id delay_type date delay year date_decimal\n", " (chr) (int) (chr) (date) (int) (dbl) (dbl)\n", "1 0001027 22221113 Acceptance 2011-11-15 111 2011 2011.871\n", "2 0001027 22221113 Publication 2012-01-05 51 2012 2012.011" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Load the history dates of every article, parsing `date` as a Date, then\n",
 "# derive the calendar year and a decimal-year version of that date.\n",
 "delay_df = readr::read_tsv(\n",
 "    file.path('data', 'delays.tsv.gz'),\n",
 "    col_types = list(date = readr::col_date())\n",
 "  ) %>%\n",
 "  dplyr::mutate(\n",
 "    year = lubridate::year(date),\n",
 "    date_decimal = lubridate::decimal_date(date)\n",
 "  )\n",
 "\n",
 "head(delay_df, 2)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare journal dataframe" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Joining by: \"journal_nlm_id\"\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
journal_nlm_idjournal_abbrevjournaldelay_typen_articlesmedian_delaymean_delayn_unique_datesn_unique_yearsn_unique_delays
10001027Aust N Z J Obstet GynaecolThe Australian & New Zealand journal of obstetrics & gynaecologyAcceptance397111122.4532585183
20001027Aust N Z J Obstet GynaecolThe Australian & New Zealand journal of obstetrics & gynaecologyPublication3255668.30151384107
\n" ], "text/latex": [ "\\begin{tabular}{r|llllllllll}\n", " & journal_nlm_id & journal_abbrev & journal & delay_type & n_articles & median_delay & mean_delay & n_unique_dates & n_unique_years & n_unique_delays\\\\\n", "\\hline\n", "\t1 & 0001027 & Aust N Z J Obstet Gynaecol & The Australian & New Zealand journal of obstetrics & gynaecology & Acceptance & 397 & 111 & 122.453 & 258 & 5 & 183\\\\\n", "\t2 & 0001027 & Aust N Z J Obstet Gynaecol & The Australian & New Zealand journal of obstetrics & gynaecology & Publication & 325 & 56 & 68.3015 & 138 & 4 & 107\\\\\n", "\\end{tabular}\n" ], "text/plain": [ "Source: local data frame [2 x 10]\n", "\n", " journal_nlm_id journal_abbrev\n", " (chr) (chr)\n", "1 0001027 Aust N Z J Obstet Gynaecol\n", "2 0001027 Aust N Z J Obstet Gynaecol\n", "Variables not shown: journal (chr), delay_type (chr), n_articles (int),\n", " median_delay (dbl), mean_delay (dbl), n_unique_dates (int), n_unique_years\n", " (int), n_unique_delays (int)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# One row per journal and delay_type: journal names from the PubMed journal\n",
 "# table joined to summary statistics of that journal's delays.\n",
 "journal_df = file.path('data', 'pubmed-journals.tsv') %>%\n",
 "  readr::read_tsv() %>%\n",
 "  dplyr::transmute(journal_nlm_id = NlmId, journal_abbrev = MedAbbr, journal = JournalTitle) %>%\n",
 "  dplyr::inner_join(\n",
 "    delay_df %>%\n",
 "      dplyr::group_by(journal_nlm_id, delay_type) %>%\n",
 "      dplyr::summarize(\n",
 "        n_articles = n(),\n",
 "        median_delay = median(delay),\n",
 "        mean_delay = signif(mean(delay), 6),\n",
 "        n_unique_dates = n_distinct(date),\n",
 "        n_unique_years = n_distinct(year),\n",
 "        n_unique_delays = n_distinct(delay)\n",
 "      ),\n",
 "    # Name the key explicitly so that a column the two tables might share in\n",
 "    # the future cannot silently become part of the join\n",
 "    by = 'journal_nlm_id'\n",
 "  )\n",
 "\n",
 "head(journal_df, 2)"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Save the per-journal summaries\n",
 "journal_df %>%\n",
 "  readr::write_tsv(file.path('data', 'journal-summaries.tsv'))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summarize each year" ] }, { "cell_type": "code",
"execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
delay_typeyearn_journalsn_articlesmedian_delaymean_delay
1Acceptance196518591114.788
2Acceptance196611916087.0785
\n" ], "text/latex": [ "\\begin{tabular}{r|llllll}\n", " & delay_type & year & n_journals & n_articles & median_delay & mean_delay\\\\\n", "\\hline\n", "\t1 & Acceptance & 1965 & 1 & 85 & 91 & 114.788\\\\\n", "\t2 & Acceptance & 1966 & 1 & 191 & 60 & 87.0785\\\\\n", "\\end{tabular}\n" ], "text/plain": [ "Source: local data frame [2 x 6]\n", "Groups: delay_type [1]\n", "\n", " delay_type year n_journals n_articles median_delay mean_delay\n", " (chr) (dbl) (int) (int) (dbl) (dbl)\n", "1 Acceptance 1965 1 85 91 114.7880\n", "2 Acceptance 1966 1 191 60 87.0785" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# For each delay_type and calendar year: number of journals, number of\n",
 "# articles, and the median and mean delay (mean kept to 6 significant digits).\n",
 "year_summary_df = dplyr::summarize(\n",
 "  dplyr::group_by(delay_df, delay_type, year),\n",
 "  n_journals = n_distinct(journal_nlm_id),\n",
 "  n_articles = n(),\n",
 "  median_delay = median(delay),\n",
 "  mean_delay = signif(mean(delay), 6)\n",
 ")\n",
 "\n",
 "# Save the yearly summaries\n",
 "year_summary_df %>%\n",
 "  readr::write_tsv(file.path('data', 'yearly-summaries.tsv'))\n",
 "\n",
 "head(year_summary_df, 2)"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
delay_typeyearpercentiledelay
1Acceptance1965016
2Acceptance19652.523.3
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & delay_type & year & percentile & delay\\\\\n", "\\hline\n", "\t1 & Acceptance & 1965 & 0 & 16\\\\\n", "\t2 & Acceptance & 1965 & 2.5 & 23.3\\\\\n", "\\end{tabular}\n" ], "text/plain": [ "Source: local data frame [2 x 4]\n", "Groups: delay_type, year [1]\n", "\n", " delay_type year percentile delay\n", " (chr) (dbl) (dbl) (dbl)\n", "1 Acceptance 1965 0.0 16.0\n", "2 Acceptance 1965 2.5 23.3" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "summarize_year <- function(df) {\n",
 "  # Return a data frame with the delay at every 2.5th percentile (0 to 100)\n",
 "  # of df$delay, one row per percentile.\n",
 "  probs = seq(0, 1, 0.025)\n",
 "  dplyr::data_frame(\n",
 "    percentile = 100 * probs,\n",
 "    delay = quantile(df$delay, probs=probs)\n",
 "  )\n",
 "}\n",
 "\n",
 "# Delay percentiles for every delay_type and year\n",
 "year_percentile_df = delay_df %>%\n",
 "  dplyr::group_by(delay_type, year) %>%\n",
 "  dplyr::do(summarize_year(.))\n",
 "\n",
 "# format() turns the columns into 3-significant-digit text before writing\n",
 "year_percentile_df %>%\n",
 "  format(digits=3) %>%\n",
 "  readr::write_tsv(file.path('data', 'yearly-percentiles.tsv'))\n",
 "\n",
 "head(year_percentile_df, 2)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Global plotting settings" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "# Axis labels shared by the by-article and the per-journal plots\n",
 "xlab_accept = 'Date accepted'\n",
 "xlab_publish = 'Date published online'\n",
 "ylab_accept = 'Acceptance delay (days)'\n",
 "ylab_publish = 'Publication delay (days)'"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot delays by year for all articles" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "plot_years <- function(percentile_df, delay_df, min_year, max_delay, xlab, ylab, max_year = 2015) {\n",
 "  # Plot yearly delay percentiles on top of a smoothed fit of the raw delays.\n",
 "  #\n",
 "  # percentile_df: columns year, percentile and delay (as built by summarize_year)\n",
 "  # delay_df: article-level rows with year and delay, used for the grey smoother\n",
 "  # min_year, max_year: x-axis limits; max_year defaults to 2015, the value that\n",
 "  #   was previously fixed inside the function\n",
 "  # max_delay: upper y-axis limit (the lower limit is 0)\n",
 "  # xlab, ylab: axis labels\n",
 "  #\n",
 "  # Returns the ggplot object.\n",
 "  pctl_col = '#00693E'\n",
 "  gg = percentile_df %>%\n",
 "    ggplot2::ggplot(aes(x = year, y = delay, group = percentile)) +\n",
 "    # group=NULL: fit one smoother to all rows of delay_df instead of one per percentile\n",
 "    ggplot2::geom_smooth(aes(group=NULL), data=delay_df, color='#B4B3B3', fill = '#B4B3B3', alpha=1) +\n",
 "    # A hairline for every percentile, then heavier lines for the median and the quartiles\n",
 "    ggplot2::geom_line(size=0.1, color = pctl_col) +\n",
 "    ggplot2::geom_line(data = dplyr::filter(percentile_df, percentile == 50), size=0.8, color = pctl_col) +\n",
 "    ggplot2::geom_line(data = dplyr::filter(percentile_df, percentile %in% c(25, 75)), size=0.5, color = pctl_col) +\n",
 "    # coord_cartesian zooms the view without dropping rows from the layers\n",
 "    ggplot2::coord_cartesian(xlim = c(min_year, max_year), ylim = c(0, max_delay), expand = FALSE) +\n",
 "    ggplot2::theme_bw() +\n",
 "    ggplot2::xlab(xlab) +\n",
 "    ggplot2::ylab(ylab) +\n",
 "    ggplot2::theme(plot.margin=grid::unit(c(2, 10, 2, 2), 'points'))\n",
 "  return(gg)\n",
 "}"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Acceptance delay plot\n",
 "gg = plot_years(\n",
 "  percentile_df = year_percentile_df %>% dplyr::filter(delay_type == 'Acceptance'),\n",
 "  delay_df = delay_df %>% dplyr::filter(delay_type == 'Acceptance'),\n",
 "  min_year = 1981,\n",
 "  max_delay = 205,\n",
 "  xlab = xlab_accept,\n",
 "  ylab = ylab_accept\n",
 ")\n",
 "\n",
 "file.path('viz', 'acceptance-by-article.png') %>%\n",
 "  ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)\n",
 "file.path('viz', 'acceptance-by-article.pdf') %>%\n",
 "  ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Publication delay plot\n",
 "gg = plot_years(\n",
 "  percentile_df = year_percentile_df %>% dplyr::filter(delay_type == 'Publication'),\n",
 "  delay_df = delay_df %>% dplyr::filter(delay_type == 'Publication'),\n",
 "  min_year = 2000,\n",
 "  max_delay = 105,\n",
 "  xlab = xlab_publish,\n",
 "  ylab = ylab_publish\n",
 ")\n",
 "\n",
 "file.path('viz', 'publication-by-article.png') %>%\n",
 "  ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)\n",
 "file.path('viz', 'publication-by-article.pdf') %>%\n",
 "  ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Journal-specific publication delay histories" ]
}, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", " Acceptance Publication \n", " 3086 2756 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Keep journal/delay_type pairs with at least 100 articles, 5 distinct dates and\n",
 "# 5 distinct delays; retain only the identifying columns.\n",
 "journal_subset_df = journal_df %>%\n",
 "  dplyr::filter(n_articles >= 100) %>%\n",
 "  dplyr::filter(n_unique_dates >= 5) %>%\n",
 "  dplyr::filter(n_unique_delays >= 5) %>%\n",
 "  dplyr::select(journal_nlm_id:delay_type)\n",
 "\n",
 "table(journal_subset_df$delay_type)"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "plot_journal_history <- function(delay_df, journal_abbrev, xlab, ylab) {\n",
 "  # Plot all delays for a single journal and delay_type.\n",
 "  #\n",
 "  # delay_df: rows with date_decimal and delay for one journal and delay_type\n",
 "  # journal_abbrev: label drawn in the top-right corner of the panel\n",
 "  # xlab, ylab: axis labels\n",
 "  #\n",
 "  # Returns the ggplot object.\n",
 "\n",
 "  # x limits are the boxplot whiskers of the dates; when 35 or more dates fall\n",
 "  # outside the 1.5-coef whiskers, use the wider 2.75-coef whiskers instead\n",
 "  x_box = boxplot.stats(delay_df$date_decimal, coef = 1.5)\n",
 "  xlimits = x_box$stats[c(1, 5)]\n",
 "  if(length(x_box$out) >= 35) {\n",
 "    xlimits = boxplot.stats(delay_df$date_decimal, coef = 2.75)$stats[c(1, 5)]\n",
 "  }\n",
 "  # y axis runs from 0 to the upper 2.8-coef whisker of the delays\n",
 "  ylimits = c(0, boxplot.stats(delay_df$delay, coef = 2.8)$stats[5])\n",
 "  # Point opacity shrinks as the number of plotted articles grows\n",
 "  alpha = log(nrow(delay_df)) ^ -1.03 - 0.02\n",
 "  gg = delay_df %>%\n",
 "    ggplot2::ggplot(aes(x = date_decimal, y = delay)) +\n",
 "    ggplot2::geom_point(alpha=alpha, color='#e60000', size=0.8) +\n",
 "    # linetype 0 is blank, so only the smoother's shaded band is drawn\n",
 "    ggplot2::geom_smooth(linetype=0, fill='#454343', alpha=0.26) +\n",
 "    ggplot2::coord_cartesian(xlim = xlimits, ylim = ylimits) +\n",
 "    ggplot2::theme_bw() +\n",
 "    ggplot2::xlab(xlab) +\n",
 "    ggplot2::ylab(ylab) +\n",
 "    ggplot2::theme(plot.margin=grid::unit(c(5, 12, 2, 2), 'points')) +\n",
 "    # x = y = Inf anchors the label at the top-right corner; hjust/vjust pull it inside\n",
 "    ggplot2::annotate('text', x = Inf, y = Inf, label = journal_abbrev, fontface='italic', hjust=1.12, vjust=2)\n",
 "  return(gg)\n",
 "}"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Output subdirectory and axis labels for each delay_type\n",
 "property_map = list(\n",
 "  Acceptance = list(directory = 'accept', xlab = xlab_accept, ylab = ylab_accept),\n",
 "  Publication = list(directory = 'publish', xlab = xlab_publish, ylab = ylab_publish)\n",
 ")\n",
 "\n",
 "# Make sure the output folders exist before any figure is saved into them\n",
 "for (props in property_map) {\n",
 "  dir.create(file.path('viz', 'journal', props$directory), recursive = TRUE, showWarnings = FALSE)\n",
 "}\n",
 "\n",
 "# seq_len() gives an empty sequence when no journal passes the filters,\n",
 "# whereas 1:nrow() would iterate over c(1, 0)\n",
 "for (i in seq_len(nrow(journal_subset_df))) {\n",
 "  # Iterate through journal-delay_type pairs\n",
 "  delay_type_ = journal_subset_df$delay_type[i]\n",
 "  props = property_map[[delay_type_]]\n",
 "  nlm_id = journal_subset_df$journal_nlm_id[i]\n",
 "  journal_abbrev = journal_subset_df$journal_abbrev[i]\n",
 "\n",
 "  gg = delay_df %>%\n",
 "    dplyr::filter(delay_type == delay_type_) %>%\n",
 "    dplyr::filter(journal_nlm_id == nlm_id) %>%\n",
 "    plot_journal_history(\n",
 "      journal_abbrev = journal_abbrev,\n",
 "      xlab = props$xlab,\n",
 "      ylab = props$ylab\n",
 "    )\n",
 "\n",
 "  path = file.path('viz', 'journal', props$directory, sprintf('%s.png', nlm_id))\n",
 "  ggplot2::ggsave(filename = path, plot = gg, width = 5.5, height = 3.8, dpi = 150)\n",
 "}"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Journal-specific delay slopes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "get_beta = function(df) {\n",
 "  # Regress delay against date and return model information.\n",
 "  #\n",
 "  # df: rows with delay and date_decimal\n",
 "  # Returns a one-row data frame with\n",
 "  #   slope: the fitted coefficient on date_decimal\n",
 "  #   slope_as_percent: that slope as a percentage of the mean delay in df\n",
 "  #   p_value: the p-value reported for that coefficient\n",
 "  model = lm(delay ~ date_decimal, data = df)\n",
 "  model_df = broom::tidy(model)\n",
 "  # Row 1 of the tidy output is the intercept; row 2 is the date_decimal term\n",
 "  row_df = dplyr::data_frame(\n",
 "    slope = model_df$estimate[2],\n",
 "    slope_as_percent = 100 * model_df$estimate[2] / mean(df$delay),\n",
 "    p_value = model_df$p.value[2]\n",
 "  )\n",
 "  return(row_df)\n",
 "}\n",
 "\n",
 "# Fit one regression per journal and delay_type in the subset; the join keys\n",
 "# are spelled out rather than inferred from the shared column names\n",
 "slope_df = journal_subset_df %>%\n",
 "  dplyr::group_by(journal_nlm_id, delay_type) %>%\n",
 "  dplyr::inner_join(delay_df, by = c('journal_nlm_id', 'delay_type')) %>%\n",
 "  dplyr::do(get_beta(.))\n",
 "\n",
 "slope_df %>%\n",
 "  format(digits=3, scientific=3) %>%\n",
 "  readr::write_tsv(file.path('data', 'slopes.tsv'))"
 ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
journal_nlm_iddelay_typeslopeslope_as_percentp_value
10001027Acceptance2.380851.944290.385
20001027Publication14.7563421.604694.01e-20
\n" ], "text/latex": [ "\\begin{tabular}{r|lllll}\n", " & journal_nlm_id & delay_type & slope & slope_as_percent & p_value\\\\\n", "\\hline\n", "\t1 & 0001027 & Acceptance & 2.38085 & 1.94429 & 0.385\\\\\n", "\t2 & 0001027 & Publication & 14.75634 & 21.60469 & 4.01e-20\\\\\n", "\\end{tabular}\n" ], "text/plain": [ "Source: local data frame [2 x 5]\n", "\n", " journal_nlm_id delay_type slope slope_as_percent p_value\n", " (chr) (chr) (dbl) (dbl) (dbl)\n", "1 0001027 Acceptance 2.38085 1.94429 3.85e-01\n", "2 0001027 Publication 14.75634 21.60469 4.01e-20" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "head(slope_df, 2)"
 ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Violin plots of per-journal change in delays, one violin per delay_type\n",
 "gg = ggplot2::ggplot(slope_df, aes(x = delay_type, y = slope)) +\n",
 "  ggplot2::geom_violin(linetype=0, fill='#80a5f9') +\n",
 "  # White '+' at the mean slope, white '|' at each value returned by quantile()\n",
 "  ggplot2::stat_summary(fun.y=mean, geom='point', size=6, color='white', shape='+') +\n",
 "  ggplot2::stat_summary(fun.y=quantile, geom='point', size=6, color='white', shape='|') +\n",
 "  # Dashed reference line at a slope of zero\n",
 "  ggplot2::geom_hline(yintercept = 0, linetype = 'dashed') +\n",
 "  ggplot2::theme_bw() +\n",
 "  ggplot2::ylab('Δ days of delay per year') +\n",
 "  ggplot2::scale_x_discrete(name = NULL) +\n",
 "  # Flip so the violins lie horizontally and show slopes between -20 and 20\n",
 "  ggplot2::coord_flip(ylim = c(-20, 20)) +\n",
 "  ggplot2::theme(plot.margin=grid::unit(c(2, 2, 2, 2), 'points'))\n",
 "\n",
 "ggplot2::ggsave(file.path('viz', 'slope-distributions.png'), plot = gg, width = 5.5, height = 2.3)\n",
 "ggplot2::ggsave(file.path('viz', 'slope-distributions.pdf'), plot = gg, width = 5.5, height = 2.3, device = cairo_pdf)"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\t\n", "\t\n", "\n", "
delay_typepercent_decreasinglowerupper
1Acceptance0.46565130.026695771.379158
2Publication0.6901306-5.369522-3.420653
\n" ], "text/latex": [ "\\begin{tabular}{r|llll}\n", " & delay_type & percent_decreasing & lower & upper\\\\\n", "\\hline\n", "\t1 & Acceptance & 0.4656513 & 0.02669577 & 1.379158\\\\\n", "\t2 & Publication & 0.6901306 & -5.369522 & -3.420653\\\\\n", "\\end{tabular}\n" ], "text/plain": [ "Source: local data frame [2 x 4]\n", "\n", " delay_type percent_decreasing lower upper\n", " (chr) (dbl) (dbl) (dbl)\n", "1 Acceptance 0.4656513 0.02669577 1.379158\n", "2 Publication 0.6901306 -5.36952170 -3.420653" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Summary of the per-journal slopes for each delay_type:\n",
 "#   percent_decreasing: share of journals with a negative slope, expressed as a\n",
 "#     proportion between 0 and 1 (despite the column name)\n",
 "#   lower, upper: bounds of the 99% t-test confidence interval for the mean slope\n",
 "dplyr::summarize(\n",
 "  dplyr::group_by(slope_df, delay_type),\n",
 "  percent_decreasing = mean(slope < 0),\n",
 "  lower = t.test(slope, conf.level = 0.99)$conf.int[1],\n",
 "  upper = t.test(slope, conf.level = 0.99)$conf.int[2]\n",
 ")"
 ] } ], "metadata": { "kernelspec": { "display_name": "R", "language": "R", "name": "ir" }, "language_info": { "codemirror_mode": "r", "file_extension": ".r", "mimetype": "text/x-r-source", "name": "R", "pygments_lexer": "r", "version": "3.2.2" } }, "nbformat": 4, "nbformat_minor": 0 }