{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Extract authors from PMC-OAI frontmatter `
` records" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pathlib\n", "\n", "import pandas\n", "\n", "from pubmedpy.xml import yield_etrees_from_zip\n", "from pubmedpy.pmc_oai import extract_authors_from_article" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'),\n", " PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'),\n", " PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "zip_paths = sorted(pathlib.Path('data/pmc/oai/pmc_fm').glob('*.zip'))\n", "zip_paths" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pmcidpositionfore_namelast_namecorrespondingreverse_position
24041PMC773942Ferdinando DiCunto02
24042PMC773943PaoloProvero11
24043PMC901871Jonas SAlmeida12
24044PMC901872SusanaVinga01
24045PMC990491Harry JMangalam11
\n", "
" ], "text/plain": [ " pmcid position fore_name last_name corresponding \\\n", "24041 PMC77394 2 Ferdinando Di Cunto 0 \n", "24042 PMC77394 3 Paolo Provero 1 \n", "24043 PMC90187 1 Jonas S Almeida 1 \n", "24044 PMC90187 2 Susana Vinga 0 \n", "24045 PMC99049 1 Harry J Mangalam 1 \n", "\n", " reverse_position \n", "24041 2 \n", "24042 1 \n", "24043 2 \n", "24044 1 \n", "24045 1 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "authors = list()\n", "for zip_path in zip_paths:\n", " for name, article in yield_etrees_from_zip(zip_path):\n", " authors.extend(extract_authors_from_article(article))\n", "author_df = pandas.DataFrame(authors)\n", "author_df = author_df.sort_values(['pmcid', 'position'])\n", "affiliation_df = author_df[[\"pmcid\", \"position\", \"affiliations\"]]\n", "author_df = author_df.drop(columns=['affiliations'])\n", "author_df.tail()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pmcidpositionaffiliation
24046PMC10032111 University of Cologne, Institute of Genetics...
24047PMC10032121 University of Cologne, Institute of Genetics...
\n", "
" ], "text/plain": [ " pmcid position affiliation\n", "24046 PMC100321 1 1 University of Cologne, Institute of Genetics...\n", "24047 PMC100321 2 1 University of Cologne, Institute of Genetics..." ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create affiliations table\n", "affiliation_df = (\n", " affiliation_df\n", " .explode('affiliations')\n", " .rename(columns={\"affiliations\": \"affiliation\"})\n", " [[\"pmcid\", \"position\", \"affiliation\"]]\n", " .dropna(subset=[\"affiliation\"])\n", ")\n", "affiliation_df.head(2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6 Commissariat à l'énergie atomique, iBiTecS, Gif-sur-Yvette, France\n", "1 Department of Computer Science, Princeton University, Princeton, NJ 08544, USA and 2 Lewis-Sigler Institute for Integrative Genomics, Princeton University, Princeton, NJ 08540, USA\n", "1 Bioinformatics Institute (BII), Agency for Science Technology and Research (A*STAR), 30 Biopolis Street, #07-01, Matrix, 138671, 2 Institute of High Performance Computing (IHPC), Agency for Science Technology and Research (A*STAR), 1 Fusionopolis Way, #16-16 Connexis, 138632, 3 Department of Biological Sciences (DBS), National University of Singapore (NUS), 8 Medical Drive 4, 117597, 4 School of Computer Engineering (SCE), Nanyang Technological University (NTU), 50 Nanyang Drive, 637553 and 5 School of Biological Sciences (SBS), Nanyang Technological University (NTU), 60 Nanyang Drive, 637551, Singapore\n", "2 Fogarty International Center, National Institutes of Health, Bethesda, MD, United States of America\n", "2 Department of Mathematics, Rowland Hall, University of California, Irvine, California, United States of America\n", "2 Center for Medical Informatics, Yale University, New Haven, CT 06520, USA\n", "Department of Biology, Carleton University, Ottawa, ON, Canada\n", "1 0000 0001 2106 9910 
grid.65499.37 Department of Biostatistics and Computational Biology, Dana-Farber Cancer Institute, Boston, MA 02215 USA\n", "5 Grossman Center for the Statistics of Mind and Center for Theoretical Neuroscience, Columbia University, New York, New York, United States of America\n", "1 Computer Science Division, University of California, Berkeley, Berkeley, California, United States of America\n" ] } ], "source": [ "# Show 10 random affiliations\n", "print(*affiliation_df.sample(10, random_state=0).affiliation, sep='\\n')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "52939" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# number of unique affiliations\n", "affiliation_df.affiliation.nunique()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "21587" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Total number of articles\n", "author_df.pmcid.nunique()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# number of corresponding authors per paper\n", "n_corresponding = author_df.groupby(\"pmcid\").corresponding.sum()\n", "pmcids_without_corresponding = set(n_corresponding[n_corresponding == 0].index)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "position\n", "1 42.9%\n", "2 7.6%\n", "3 4.6%\n", "4 4.6%\n", "5 5.3%\n", "Name: corresponding, dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Probability of author position being corresponding,\n", "# given that there's at least one corresponding author\n", "# and the author is not the last author\n", "(\n", " author_df\n", " .query(\"pmcid not in @pmcids_without_corresponding\")\n", " .query(\"reverse_position > 1\")\n", " .groupby(\"position\")\n", 
" .corresponding\n", " .mean()\n", " .map(\"{:.1%}\".format)\n", " .head()\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "reverse_position\n", "1 61.9%\n", "2 12.4%\n", "3 4.1%\n", "4 3.0%\n", "5 3.5%\n", "Name: corresponding, dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Probability of author reverse position being corresponding,\n", "# given that there's at least one corresponding author\n", "# and the author is not the first author\n", "(\n", " author_df\n", " .query(\"pmcid not in @pmcids_without_corresponding\")\n", " .query(\"position > 1\")\n", " .groupby(\"reverse_position\")\n", " .corresponding\n", " .mean()\n", " .map(\"{:.1%}\".format)\n", " .head()\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 371\n", "1 17529\n", "2 3267\n", "3 314\n", "4 62\n", "5 19\n", "6 7\n", "7 2\n", "8 2\n", "9 6\n", "10 2\n", "11 1\n", "14 2\n", "15 1\n", "17 1\n", "21 1\n", "Name: corresponding, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Corresponding author counts\n", "n_corresponding.value_counts().sort_index()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pmcidcorresponding
105PMC11835100
106PMC11835110
107PMC11835120
119PMC11856440
160PMC11939920
\n", "
" ], "text/plain": [ " pmcid corresponding\n", "105 PMC1183510 0\n", "106 PMC1183511 0\n", "107 PMC1183512 0\n", "119 PMC1185644 0\n", "160 PMC1193992 0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Testing: show some articles without any corresponding authors\n", "n_corresponding.reset_index().query(\"corresponding == 0\").head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pmcidcorresponding
9078PMC346311515
9349PMC350949514
9393PMC351946117
9583PMC354679710
9719PMC357020711
10363PMC369465910
15564PMC500120821
17344PMC564755614
\n", "
" ], "text/plain": [ " pmcid corresponding\n", "9078 PMC3463115 15\n", "9349 PMC3509495 14\n", "9393 PMC3519461 17\n", "9583 PMC3546797 10\n", "9719 PMC3570207 11\n", "10363 PMC3694659 10\n", "15564 PMC5001208 21\n", "17344 PMC5647556 14" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Testing: show articles with >=10 corresponding authors\n", "n_corresponding.reset_index().query(\"corresponding >= 10\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Write author dataframe to a TSV\n", "author_df.to_csv('data/pmc/authors.tsv.xz', index=False, sep='\\t')\n", "\n", "# Write affiliation dataframe to a TSV\n", "affiliation_df.to_csv('data/pmc/affiliations.tsv.xz', index=False, sep='\\t')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }