{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract authors from PMC-OAI frontmatter `<article>` records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pathlib\n",
    "\n",
    "import pandas\n",
    "\n",
    "from pubmedpy.xml import yield_etrees_from_zip\n",
    "from pubmedpy.pmc_oai import extract_authors_from_article"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'),\n",
       " PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'),\n",
       " PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "zip_paths = sorted(pathlib.Path('data/pmc/oai/pmc_fm').glob('*.zip'))\n",
    "zip_paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pmcid</th>\n",
       "      <th>position</th>\n",
       "      <th>fore_name</th>\n",
       "      <th>last_name</th>\n",
       "      <th>corresponding</th>\n",
       "      <th>reverse_position</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>24041</th>\n",
       "      <td>PMC77394</td>\n",
       "      <td>2</td>\n",
       "      <td>Ferdinando Di</td>\n",
       "      <td>Cunto</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24042</th>\n",
       "      <td>PMC77394</td>\n",
       "      <td>3</td>\n",
       "      <td>Paolo</td>\n",
       "      <td>Provero</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24043</th>\n",
       "      <td>PMC90187</td>\n",
       "      <td>1</td>\n",
       "      <td>Jonas S</td>\n",
       "      <td>Almeida</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24044</th>\n",
       "      <td>PMC90187</td>\n",
       "      <td>2</td>\n",
       "      <td>Susana</td>\n",
       "      <td>Vinga</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24045</th>\n",
       "      <td>PMC99049</td>\n",
       "      <td>1</td>\n",
       "      <td>Harry J</td>\n",
       "      <td>Mangalam</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          pmcid  position      fore_name last_name  corresponding  \\\n",
       "24041  PMC77394         2  Ferdinando Di     Cunto              0   \n",
       "24042  PMC77394         3          Paolo   Provero              1   \n",
       "24043  PMC90187         1        Jonas S   Almeida              1   \n",
       "24044  PMC90187         2         Susana     Vinga              0   \n",
       "24045  PMC99049         1        Harry J  Mangalam              1   \n",
       "\n",
       "       reverse_position  \n",
       "24041                 2  \n",
       "24042                 1  \n",
       "24043                 2  \n",
       "24044                 1  \n",
       "24045                 1  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "authors = list()\n",
    "for zip_path in zip_paths:\n",
    "    for name, article in yield_etrees_from_zip(zip_path):\n",
    "        authors.extend(extract_authors_from_article(article))\n",
    "author_df = pandas.DataFrame(authors)\n",
    "author_df = author_df.sort_values(['pmcid', 'position'])\n",
    "affiliation_df = author_df[[\"pmcid\", \"position\", \"affiliations\"]]\n",
    "author_df = author_df.drop(columns=['affiliations'])\n",
    "author_df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pmcid</th>\n",
       "      <th>position</th>\n",
       "      <th>affiliation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>24046</th>\n",
       "      <td>PMC100321</td>\n",
       "      <td>1</td>\n",
       "      <td>1 University of Cologne, Institute of Genetics...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24047</th>\n",
       "      <td>PMC100321</td>\n",
       "      <td>2</td>\n",
       "      <td>1 University of Cologne, Institute of Genetics...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           pmcid  position                                        affiliation\n",
       "24046  PMC100321         1  1 University of Cologne, Institute of Genetics...\n",
       "24047  PMC100321         2  1 University of Cologne, Institute of Genetics..."
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create affiliations table\n",
    "affiliation_df = (\n",
    "    affiliation_df\n",
    "    .explode('affiliations')\n",
    "    .rename(columns={\"affiliations\": \"affiliation\"})\n",
    "    [[\"pmcid\", \"position\", \"affiliation\"]]\n",
    "    .dropna(subset=[\"affiliation\"])\n",
    ")\n",
    "affiliation_df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6 Commissariat à l'énergie atomique, iBiTecS, Gif-sur-Yvette, France\n",
      "1 Department of Computer Science, Princeton University, Princeton, NJ 08544, USA and 2 Lewis-Sigler Institute for Integrative Genomics, Princeton University, Princeton, NJ 08540, USA\n",
      "1 Bioinformatics Institute (BII), Agency for Science Technology and Research (A*STAR), 30 Biopolis Street, #07-01, Matrix, 138671, 2 Institute of High Performance Computing (IHPC), Agency for Science Technology and Research (A*STAR), 1 Fusionopolis Way, #16-16 Connexis, 138632, 3 Department of Biological Sciences (DBS), National University of Singapore (NUS), 8 Medical Drive 4, 117597, 4 School of Computer Engineering (SCE), Nanyang Technological University (NTU), 50 Nanyang Drive, 637553 and 5 School of Biological Sciences (SBS), Nanyang Technological University (NTU), 60 Nanyang Drive, 637551, Singapore\n",
      "2 Fogarty International Center, National Institutes of Health, Bethesda, MD, United States of America\n",
      "2 Department of Mathematics, Rowland Hall, University of California, Irvine, California, United States of America\n",
      "2 Center for Medical Informatics, Yale University, New Haven, CT 06520, USA\n",
      "Department of Biology, Carleton University, Ottawa, ON, Canada\n",
      "1 0000 0001 2106 9910 grid.65499.37 Department of Biostatistics and Computational Biology, Dana-Farber Cancer Institute, Boston, MA 02215 USA\n",
      "5 Grossman Center for the Statistics of Mind and Center for Theoretical Neuroscience, Columbia University, New York, New York, United States of America\n",
      "1 Computer Science Division, University of California, Berkeley, Berkeley, California, United States of America\n"
     ]
    }
   ],
   "source": [
    "# Show 10 random affiliations\n",
    "print(*affiliation_df.sample(10, random_state=0).affiliation, sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "52939"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of unique affiliations\n",
    "affiliation_df.affiliation.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21587"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of articles\n",
    "author_df.pmcid.nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of corresponding authors per paper\n",
    "n_corresponding = author_df.groupby(\"pmcid\").corresponding.sum()\n",
    "pmcids_without_corresponding = set(n_corresponding[n_corresponding == 0].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "position\n",
       "1    42.9%\n",
       "2     7.6%\n",
       "3     4.6%\n",
       "4     4.6%\n",
       "5     5.3%\n",
       "Name: corresponding, dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of authors at each position (counted from the front) that are\n",
    "# corresponding, among articles with at least one corresponding author,\n",
    "# excluding last authors (reverse_position == 1). Shows the first 5 positions.\n",
    "(\n",
    "    author_df\n",
    "    .query(\"pmcid not in @pmcids_without_corresponding\")\n",
    "    .query(\"reverse_position > 1\")\n",
    "    .groupby(\"position\")\n",
    "    .corresponding\n",
    "    .mean()\n",
    "    .map(\"{:.1%}\".format)\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "reverse_position\n",
       "1    61.9%\n",
       "2    12.4%\n",
       "3     4.1%\n",
       "4     3.0%\n",
       "5     3.5%\n",
       "Name: corresponding, dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of authors at each reverse position (counted from the end) that are\n",
    "# corresponding, among articles with at least one corresponding author,\n",
    "# excluding first authors (position == 1). Shows the first 5 reverse positions.\n",
    "(\n",
    "    author_df\n",
    "    .query(\"pmcid not in @pmcids_without_corresponding\")\n",
    "    .query(\"position > 1\")\n",
    "    .groupby(\"reverse_position\")\n",
    "    .corresponding\n",
    "    .mean()\n",
    "    .map(\"{:.1%}\".format)\n",
    "    .head()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       371\n",
       "1     17529\n",
       "2      3267\n",
       "3       314\n",
       "4        62\n",
       "5        19\n",
       "6         7\n",
       "7         2\n",
       "8         2\n",
       "9         6\n",
       "10        2\n",
       "11        1\n",
       "14        2\n",
       "15        1\n",
       "17        1\n",
       "21        1\n",
       "Name: corresponding, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of articles for each count of corresponding authors\n",
    "n_corresponding.value_counts().sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pmcid</th>\n",
       "      <th>corresponding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>PMC1183510</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>PMC1183511</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>107</th>\n",
       "      <td>PMC1183512</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>PMC1185644</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>PMC1193992</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          pmcid  corresponding\n",
       "105  PMC1183510              0\n",
       "106  PMC1183511              0\n",
       "107  PMC1183512              0\n",
       "119  PMC1185644              0\n",
       "160  PMC1193992              0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Testing: show some articles without any corresponding authors\n",
    "n_corresponding.reset_index().query(\"corresponding == 0\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pmcid</th>\n",
       "      <th>corresponding</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9078</th>\n",
       "      <td>PMC3463115</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9349</th>\n",
       "      <td>PMC3509495</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9393</th>\n",
       "      <td>PMC3519461</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9583</th>\n",
       "      <td>PMC3546797</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9719</th>\n",
       "      <td>PMC3570207</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10363</th>\n",
       "      <td>PMC3694659</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15564</th>\n",
       "      <td>PMC5001208</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17344</th>\n",
       "      <td>PMC5647556</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            pmcid  corresponding\n",
       "9078   PMC3463115             15\n",
       "9349   PMC3509495             14\n",
       "9393   PMC3519461             17\n",
       "9583   PMC3546797             10\n",
       "9719   PMC3570207             11\n",
       "10363  PMC3694659             10\n",
       "15564  PMC5001208             21\n",
       "17344  PMC5647556             14"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Testing: show articles with >=10 corresponding authors\n",
    "n_corresponding.reset_index().query(\"corresponding >= 10\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write author dataframe to a TSV\n",
    "author_df.to_csv('data/pmc/authors.tsv.xz', index=False, sep='\\t')\n",
    "\n",
    "# Write affiliation dataframe to a TSV\n",
    "affiliation_df.to_csv('data/pmc/affiliations.tsv.xz', index=False, sep='\\t')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}