{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# The history of publishing delays by Daniel Himmelstein"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"library(dplyr, warn=F)\n",
"library(ggplot2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read history dates"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
" | journal_nlm_id | pubmed_id | delay_type | date | delay | year | date_decimal |
\n",
"\n",
"\t1 | 0001027 | 22221113 | Acceptance | 2011-11-15 | 111 | 2011 | 2011.871 |
\n",
"\t2 | 0001027 | 22221113 | Publication | 2012-01-05 | 51 | 2012 | 2012.011 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllllll}\n",
" & journal_nlm_id & pubmed_id & delay_type & date & delay & year & date_decimal\\\\\n",
"\\hline\n",
"\t1 & 0001027 & 22221113 & Acceptance & 2011-11-15 & 111 & 2011 & 2011.871\\\\\n",
"\t2 & 0001027 & 22221113 & Publication & 2012-01-05 & 51 & 2012 & 2012.011\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"Source: local data frame [2 x 7]\n",
"\n",
" journal_nlm_id pubmed_id delay_type date delay year date_decimal\n",
" (chr) (int) (chr) (date) (int) (dbl) (dbl)\n",
"1 0001027 22221113 Acceptance 2011-11-15 111 2011 2011.871\n",
"2 0001027 22221113 Publication 2012-01-05 51 2012 2012.011"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read history dates for all articles\n",
"delay_df = file.path('data', 'delays.tsv.gz') %>%\n",
" readr::read_tsv(col_types = list(date = readr::col_date())) %>%\n",
" dplyr::mutate(year = lubridate::year(date)) %>%\n",
" dplyr::mutate(date_decimal = lubridate::decimal_date(date))\n",
"\n",
"head(delay_df, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare journal dataframe"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Joining by: \"journal_nlm_id\"\n"
]
},
{
"data": {
"text/html": [
"\n",
" | journal_nlm_id | journal_abbrev | journal | delay_type | n_articles | median_delay | mean_delay | n_unique_dates | n_unique_years | n_unique_delays |
\n",
"\n",
"\t1 | 0001027 | Aust N Z J Obstet Gynaecol | The Australian & New Zealand journal of obstetrics & gynaecology | Acceptance | 397 | 111 | 122.453 | 258 | 5 | 183 |
\n",
"\t2 | 0001027 | Aust N Z J Obstet Gynaecol | The Australian & New Zealand journal of obstetrics & gynaecology | Publication | 325 | 56 | 68.3015 | 138 | 4 | 107 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llllllllll}\n",
" & journal_nlm_id & journal_abbrev & journal & delay_type & n_articles & median_delay & mean_delay & n_unique_dates & n_unique_years & n_unique_delays\\\\\n",
"\\hline\n",
"\t1 & 0001027 & Aust N Z J Obstet Gynaecol & The Australian & New Zealand journal of obstetrics & gynaecology & Acceptance & 397 & 111 & 122.453 & 258 & 5 & 183\\\\\n",
"\t2 & 0001027 & Aust N Z J Obstet Gynaecol & The Australian & New Zealand journal of obstetrics & gynaecology & Publication & 325 & 56 & 68.3015 & 138 & 4 & 107\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"Source: local data frame [2 x 10]\n",
"\n",
" journal_nlm_id journal_abbrev\n",
" (chr) (chr)\n",
"1 0001027 Aust N Z J Obstet Gynaecol\n",
"2 0001027 Aust N Z J Obstet Gynaecol\n",
"Variables not shown: journal (chr), delay_type (chr), n_articles (int),\n",
" median_delay (dbl), mean_delay (dbl), n_unique_dates (int), n_unique_years\n",
" (int), n_unique_delays (int)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"journal_df = file.path('data', 'pubmed-journals.tsv') %>%\n",
" readr::read_tsv() %>%\n",
" dplyr::transmute(journal_nlm_id = NlmId, journal_abbrev = MedAbbr, journal = JournalTitle) %>%\n",
" dplyr::inner_join(\n",
" delay_df %>%\n",
" dplyr::group_by(journal_nlm_id, delay_type) %>%\n",
" dplyr::summarize(\n",
" n_articles = n(),\n",
" median_delay = median(delay),\n",
" mean_delay = signif(mean(delay), 6),\n",
" n_unique_dates = length(unique(date)),\n",
" n_unique_years = length(unique(year)),\n",
" n_unique_delays = length(unique(delay))\n",
" )\n",
" )\n",
"\n",
"head(journal_df, 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"journal_df %>%\n",
" readr::write_tsv(file.path('data', 'journal-summaries.tsv'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summarize each year"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | delay_type | year | n_journals | n_articles | median_delay | mean_delay |
\n",
"\n",
"\t1 | Acceptance | 1965 | 1 | 85 | 91 | 114.788 |
\n",
"\t2 | Acceptance | 1966 | 1 | 191 | 60 | 87.0785 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llllll}\n",
" & delay_type & year & n_journals & n_articles & median_delay & mean_delay\\\\\n",
"\\hline\n",
"\t1 & Acceptance & 1965 & 1 & 85 & 91 & 114.788\\\\\n",
"\t2 & Acceptance & 1966 & 1 & 191 & 60 & 87.0785\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"Source: local data frame [2 x 6]\n",
"Groups: delay_type [1]\n",
"\n",
" delay_type year n_journals n_articles median_delay mean_delay\n",
" (chr) (dbl) (int) (int) (dbl) (dbl)\n",
"1 Acceptance 1965 1 85 91 114.7880\n",
"2 Acceptance 1966 1 191 60 87.0785"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"year_summary_df = delay_df %>%\n",
" dplyr::group_by(delay_type, year) %>%\n",
" dplyr::summarize(\n",
" n_journals = n_distinct(journal_nlm_id),\n",
" n_articles = n(),\n",
" median_delay = median(delay),\n",
" mean_delay = signif(mean(delay), 6)\n",
" )\n",
"\n",
"year_summary_df %>%\n",
" readr::write_tsv(file.path('data', 'yearly-summaries.tsv'))\n",
"\n",
"head(year_summary_df, 2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | delay_type | year | percentile | delay |
\n",
"\n",
"\t1 | Acceptance | 1965 | 0 | 16 |
\n",
"\t2 | Acceptance | 1965 | 2.5 | 23.3 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & delay_type & year & percentile & delay\\\\\n",
"\\hline\n",
"\t1 & Acceptance & 1965 & 0 & 16\\\\\n",
"\t2 & Acceptance & 1965 & 2.5 & 23.3\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"Source: local data frame [2 x 4]\n",
"Groups: delay_type, year [1]\n",
"\n",
" delay_type year percentile delay\n",
" (chr) (dbl) (dbl) (dbl)\n",
"1 Acceptance 1965 0.0 16.0\n",
"2 Acceptance 1965 2.5 23.3"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summarize_year <- function(df) {\n",
" # Return delay percentiles from df\n",
" probs = seq(0, 1, 0.025)\n",
" dplyr::data_frame(\n",
" percentile = 100 * probs,\n",
" delay = quantile(df$delay, probs=probs)\n",
" )\n",
"}\n",
"\n",
"year_percentile_df = delay_df %>%\n",
" dplyr::group_by(delay_type, year) %>%\n",
" dplyr::do(summarize_year(.))\n",
"\n",
"year_percentile_df %>%\n",
" format(digits=3) %>%\n",
" readr::write_tsv(file.path('data', 'yearly-percentiles.tsv')) \n",
"\n",
"head(year_percentile_df, 2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Global plotting settings"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"xlab_accept = 'Date accepted'\n",
"xlab_publish = 'Date published online'\n",
"ylab_accept = 'Acceptance delay (days)'\n",
"ylab_publish = 'Publication delay (days)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot delays by year for all articles"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"plot_years <- function(percentile_df, delay_df, min_year, max_delay, xlab, ylab) {\n",
" pctl_col = '#00693E'\n",
" gg = percentile_df %>%\n",
" ggplot2::ggplot(aes(x = year, y = delay, group = percentile)) +\n",
" ggplot2::geom_smooth(aes(group=NULL), data=delay_df, color='#B4B3B3', fill = '#B4B3B3', alpha=1) +\n",
" ggplot2::geom_line(size=0.1, color = pctl_col) +\n",
" ggplot2::geom_line(data = dplyr::filter(percentile_df, percentile == 50), size=0.8, color = pctl_col) +\n",
" ggplot2::geom_line(data = dplyr::filter(percentile_df, percentile %in% c(25, 75)), size=0.5, color = pctl_col) +\n",
" ggplot2::coord_cartesian(xlim = c(min_year, 2015), ylim = c(0, max_delay), expand = FALSE) +\n",
" ggplot2::theme_bw() +\n",
" ggplot2::xlab(xlab) +\n",
" ggplot2::ylab(ylab) +\n",
" ggplot2::theme(plot.margin=grid::unit(c(2, 10, 2, 2), 'points'))\n",
" return(gg)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Acceptance delay plot\n",
"gg = plot_years(\n",
" percentile_df = year_percentile_df %>% dplyr::filter(delay_type == 'Acceptance'),\n",
" delay_df = delay_df %>% dplyr::filter(delay_type == 'Acceptance'),\n",
" min_year = 1981,\n",
" max_delay = 205,\n",
" xlab = xlab_accept,\n",
" ylab = ylab_accept\n",
")\n",
"\n",
"file.path('viz', 'acceptance-by-article.png') %>%\n",
" ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)\n",
"file.path('viz', 'acceptance-by-article.pdf') %>%\n",
" ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Publication delay plot\n",
"gg = plot_years(\n",
" percentile_df = year_percentile_df %>% dplyr::filter(delay_type == 'Publication'),\n",
" delay_df = delay_df %>% dplyr::filter(delay_type == 'Publication'),\n",
" min_year = 2000,\n",
" max_delay = 105,\n",
" xlab = xlab_publish,\n",
" ylab = ylab_publish\n",
")\n",
"\n",
"file.path('viz', 'publication-by-article.png') %>%\n",
" ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)\n",
"file.path('viz', 'publication-by-article.pdf') %>%\n",
" ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Journal-specific publication delay histories"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"\n",
" Acceptance Publication \n",
" 3086 2756 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"journal_subset_df = journal_df %>%\n",
" dplyr::filter(n_articles >= 100) %>%\n",
" dplyr::filter(n_unique_dates >= 5) %>%\n",
" dplyr::filter(n_unique_delays >= 5) %>%\n",
" dplyr::select(journal_nlm_id:delay_type)\n",
"\n",
"table(journal_subset_df$delay_type)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plot_journal_history <- function(delay_df, journal_abbrev, xlab, ylab) {\n",
" # Plot all delays for a single journal and delay_type\n",
" x_box = boxplot.stats(delay_df$date_decimal, coef = 1.5)\n",
" xlimits = x_box$stats[c(1, 5)]\n",
" if(length(x_box$out) >= 35) {\n",
" xlimits = boxplot.stats(delay_df$date_decimal, coef = 2.75)$stats[c(1, 5)]\n",
" }\n",
" ylimits = c(0, boxplot.stats(delay_df$delay, coef = 2.8)$stats[5])\n",
" alpha = log(nrow(delay_df)) ^ -1.03 - 0.02\n",
" gg = delay_df %>%\n",
" ggplot2::ggplot(aes(x = date_decimal, y = delay)) +\n",
" ggplot2::geom_point(alpha=alpha, color='#e60000', size=0.8) +\n",
" ggplot2::geom_smooth(linetype=0, fill='#454343', alpha=0.26) +\n",
" ggplot2::coord_cartesian(xlim = xlimits, ylim = ylimits) +\n",
" ggplot2::theme_bw() +\n",
" ggplot2::xlab(xlab) +\n",
" ggplot2::ylab(ylab) +\n",
" ggplot2::theme(plot.margin=grid::unit(c(5, 12, 2, 2), 'points')) +\n",
" ggplot2::annotate('text', x = Inf, y = Inf, label = journal_abbrev, fontface='italic', hjust=1.12, vjust=2)\n",
" return(gg)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"property_map = list(\n",
" Acceptance = list(directory = 'accept', xlab = xlab_accept, ylab = ylab_accept),\n",
" Publication = list(directory = 'publish', xlab = xlab_publish, ylab = ylab_publish)\n",
")\n",
"\n",
"for (i in 1:nrow(journal_subset_df)) {\n",
" # Iterate through journal-delay_type pairs\n",
" delay_type_ = journal_subset_df$delay_type[i]\n",
" props = property_map[[delay_type_]]\n",
" nlm_id = journal_subset_df$journal_nlm_id[i]\n",
" journal_abbrev = journal_subset_df$journal_abbrev[i]\n",
" \n",
" gg = delay_df %>%\n",
" dplyr::filter(delay_type == delay_type_) %>%\n",
" dplyr::filter(journal_nlm_id == nlm_id) %>%\n",
" plot_journal_history(\n",
" journal_abbrev = journal_abbrev,\n",
" xlab = props$xlab,\n",
" ylab = props$ylab\n",
" )\n",
" \n",
" path = file.path('viz', 'journal', props$directory, sprintf('%s.png', nlm_id))\n",
" ggplot2::ggsave(filename = path, plot = gg, width = 5.5, height = 3.8, dpi = 150)\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Journal-specific delay slopes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"get_beta = function(df) {\n",
" # Regress delay against date and return model information\n",
" model = lm(delay ~ date_decimal, data = df)\n",
" model_df = broom::tidy(model)\n",
" row_df = dplyr::data_frame(\n",
" slope = model_df$estimate[2],\n",
" slope_as_percent = 100 * model_df$estimate[2] / mean(df$delay),\n",
" p_value = model_df$p.value[2]\n",
" )\n",
" return(row_df)\n",
"}\n",
"\n",
"slope_df = journal_subset_df %>%\n",
" dplyr::group_by(journal_nlm_id, delay_type) %>%\n",
" dplyr::inner_join(delay_df) %>%\n",
" dplyr::do(get_beta(.))\n",
"\n",
"slope_df %>%\n",
" format(digits=3, scientific=3) %>%\n",
" readr::write_tsv(file.path('data', 'slopes.tsv'))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | journal_nlm_id | delay_type | slope | slope_as_percent | p_value |
\n",
"\n",
"\t1 | 0001027 | Acceptance | 2.38085 | 1.94429 | 0.385 |
\n",
"\t2 | 0001027 | Publication | 14.75634 | 21.60469 | 4.01e-20 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|lllll}\n",
" & journal_nlm_id & delay_type & slope & slope_as_percent & p_value\\\\\n",
"\\hline\n",
"\t1 & 0001027 & Acceptance & 2.38085 & 1.94429 & 0.385\\\\\n",
"\t2 & 0001027 & Publication & 14.75634 & 21.60469 & 4.01e-20\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"Source: local data frame [2 x 5]\n",
"\n",
" journal_nlm_id delay_type slope slope_as_percent p_value\n",
" (chr) (chr) (dbl) (dbl) (dbl)\n",
"1 0001027 Acceptance 2.38085 1.94429 3.85e-01\n",
"2 0001027 Publication 14.75634 21.60469 4.01e-20"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head(slope_df, 2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Violin plots of per-journal change in delays\n",
"gg = slope_df %>%\n",
" ggplot2::ggplot(aes(x = delay_type, y = slope)) +\n",
" ggplot2::geom_violin(linetype=0, fill='#80a5f9') +\n",
" ggplot2::stat_summary(fun.y=mean, geom='point', size=6, color=\"white\", shape='+') +\n",
" ggplot2::stat_summary(fun.y=quantile, geom='point', size=6, color=\"white\", shape='|') +\n",
" ggplot2::geom_hline(yintercept = 0, linetype = 'dashed') + \n",
" ggplot2::theme_bw() +\n",
" ggplot2::ylab('Δ days of delay per year') +\n",
" ggplot2::scale_x_discrete(name = NULL) +\n",
" ggplot2::coord_flip(ylim = c(-20, 20)) +\n",
" ggplot2::theme(plot.margin=grid::unit(c(2, 2, 2, 2), 'points'))\n",
"\n",
"file.path('viz', 'slope-distributions.png') %>%\n",
" ggplot2::ggsave(plot = gg, width = 5.5, height = 2.3)\n",
"file.path('viz', 'slope-distributions.pdf') %>%\n",
" ggplot2::ggsave(plot = gg, width = 5.5, height = 2.3, device = cairo_pdf)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" | delay_type | percent_decreasing | lower | upper |
\n",
"\n",
"\t1 | Acceptance | 0.4656513 | 0.02669577 | 1.379158 |
\n",
"\t2 | Publication | 0.6901306 | -5.369522 | -3.420653 |
\n",
"\n",
"
\n"
],
"text/latex": [
"\\begin{tabular}{r|llll}\n",
" & delay_type & percent_decreasing & lower & upper\\\\\n",
"\\hline\n",
"\t1 & Acceptance & 0.4656513 & 0.02669577 & 1.379158\\\\\n",
"\t2 & Publication & 0.6901306 & -5.369522 & -3.420653\\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
"Source: local data frame [2 x 4]\n",
"\n",
" delay_type percent_decreasing lower upper\n",
" (chr) (dbl) (dbl) (dbl)\n",
"1 Acceptance 0.4656513 0.02669577 1.379158\n",
"2 Publication 0.6901306 -5.36952170 -3.420653"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Medians of violins\n",
"slope_df %>%\n",
" dplyr::group_by(delay_type) %>%\n",
" dplyr::summarize(\n",
" percent_decreasing = mean(slope < 0),\n",
" lower = t.test(slope, conf.level = 0.99)$conf.int[1],\n",
" upper = t.test(slope, conf.level = 0.99)$conf.int[2]\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "3.2.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}