# Extract authors from PMC-OAI frontmatter `<article>` records

In [1]:
import pathlib

import pandas

from pubmedpy.xml import yield_etrees_from_zip
from pubmedpy.pmc_oai import extract_authors_from_article

In [2]:
# Locate every PMC-OAI frontmatter zip archive; sorting keeps the processing order stable
zip_paths = list(pathlib.Path('data/pmc/oai/pmc_fm').glob('*.zip'))
zip_paths.sort()
zip_paths

[PosixPath('data/pmc/oai/pmc_fm/bioinfo.zip'),
 PosixPath('data/pmc/oai/pmc_fm/bmcbioi.zip'),
 PosixPath('data/pmc/oai/pmc_fm/ploscomp.zip')]

In [3]:
# Collect the author records of every article in every archive into one flat list
authors = [
    author
    for zip_path in zip_paths
    for _, article in yield_etrees_from_zip(zip_path)
    for author in extract_authors_from_article(article)
]

# One row per author, ordered by article and then by position within the article
author_df = pandas.DataFrame(authors).sort_values(['pmcid', 'position'])

# Keep the affiliations in their own frame (keyed by pmcid + position) and
# remove that column from the author table
affiliation_df = author_df.loc[:, ["pmcid", "position", "affiliations"]]
author_df = author_df.drop(columns='affiliations')
author_df.tail()

Unnamed: 0,pmcid,position,fore_name,last_name,corresponding,reverse_position
24041,PMC77394,2,Ferdinando Di,Cunto,0,2
24042,PMC77394,3,Paolo,Provero,1,1
24043,PMC90187,1,Jonas S,Almeida,1,2
24044,PMC90187,2,Susana,Vinga,0,1
24045,PMC99049,1,Harry J,Mangalam,1,1


In [4]:
# Build the long-format affiliations table: one row per (pmcid, position, affiliation).
# Rows whose affiliation is missing after exploding are discarded.
# NOTE: the result is assigned back to `affiliation_df` with the column renamed, so
# re-running this cell on its own output raises a KeyError for 'affiliations'.
affiliation_df = (
    affiliation_df
    .explode('affiliations')
    .dropna(subset=['affiliations'])
    .rename(columns={'affiliations': 'affiliation'})
    .loc[:, ['pmcid', 'position', 'affiliation']]
)
affiliation_df.head(2)

Unnamed: 0,pmcid,position,affiliation
24046,PMC100321,1,"1 University of Cologne, Institute of Genetics..."
24047,PMC100321,2,"1 University of Cologne, Institute of Genetics..."


In [5]:
# Print 10 randomly sampled affiliations, one per line (seeded so the sample is reproducible)
print(*affiliation_df["affiliation"].sample(n=10, random_state=0), sep='\n')

6 Commissariat à l'énergie atomique, iBiTecS, Gif-sur-Yvette, France
1 Department of Computer Science, Princeton University, Princeton, NJ 08544, USA and 2 Lewis-Sigler Institute for Integrative Genomics, Princeton University, Princeton, NJ 08540, USA
1 Bioinformatics Institute (BII), Agency for Science Technology and Research (A*STAR), 30 Biopolis Street, #07-01, Matrix, 138671, 2 Institute of High Performance Computing (IHPC), Agency for Science Technology and Research (A*STAR), 1 Fusionopolis Way, #16-16 Connexis, 138632, 3 Department of Biological Sciences (DBS), National University of Singapore (NUS), 8 Medical Drive 4, 117597, 4 School of Computer Engineering (SCE), Nanyang Technological University (NTU), 50 Nanyang Drive, 637553 and 5 School of Biological Sciences (SBS), Nanyang Technological University (NTU), 60 Nanyang Drive, 637551, Singapore
2 Fogarty International Center, National Institutes of Health, Bethesda, MD, United States of America
2 Department of Mathematics, Rowl

In [6]:
# How many distinct affiliation values appear in the table
affiliation_df["affiliation"].nunique()

52939

In [7]:
# How many distinct articles (pmcid values) have at least one author row
author_df["pmcid"].nunique()

21587

In [8]:
# Per-article total of the `corresponding` column, i.e. corresponding authors per paper
n_corresponding = author_df.groupby("pmcid")["corresponding"].sum()

# Articles whose total is zero: no author is marked as corresponding
pmcids_without_corresponding = set(n_corresponding.loc[n_corresponding.eq(0)].index)

In [9]:
# Share of authors marked corresponding at each position (counted from the front),
# restricted to articles with at least one corresponding author and
# excluding each article's last author (reverse_position == 1).
# Only the first five positions are shown, formatted as percentages.
(
    author_df
    .query("pmcid not in @pmcids_without_corresponding")
    .query("reverse_position > 1")
    .groupby("position")["corresponding"]
    .mean()
    .head()
    .map("{:.1%}".format)
)

position
1    42.9%
2     7.6%
3     4.6%
4     4.6%
5     5.3%
Name: corresponding, dtype: object

In [10]:
# Share of authors marked corresponding at each reverse position (counted from the end),
# restricted to articles with at least one corresponding author and
# excluding each article's first author (position == 1).
# Only the first five reverse positions are shown, formatted as percentages.
(
    author_df
    .query("pmcid not in @pmcids_without_corresponding")
    .query("position > 1")
    .groupby("reverse_position")["corresponding"]
    .mean()
    .head()
    .map("{:.1%}".format)
)

reverse_position
1    61.9%
2    12.4%
3     4.1%
4     3.0%
5     3.5%
Name: corresponding, dtype: object

In [11]:
# Distribution of articles by their number of corresponding authors,
# ordered by that number rather than by frequency
n_corresponding.value_counts(sort=False).sort_index()

0       371
1     17529
2      3267
3       314
4        62
5        19
6         7
7         2
8         2
9         6
10        2
11        1
14        2
15        1
17        1
21        1
Name: corresponding, dtype: int64

In [12]:
# Spot check: the first few articles that have no corresponding author at all
(
    n_corresponding
    .reset_index()
    .query("corresponding == 0")
    .head()
)

Unnamed: 0,pmcid,corresponding
105,PMC1183510,0
106,PMC1183511,0
107,PMC1183512,0
119,PMC1185644,0
160,PMC1193992,0


In [13]:
# Spot check: every article with 10 or more corresponding authors
(
    n_corresponding
    .reset_index()
    .query("corresponding >= 10")
)

Unnamed: 0,pmcid,corresponding
9078,PMC3463115,15
9349,PMC3509495,14
9393,PMC3519461,17
9583,PMC3546797,10
9719,PMC3570207,11
10363,PMC3694659,10
15564,PMC5001208,21
17344,PMC5647556,14


In [14]:
# Export the author table as a tab-separated file without the index column
author_df.to_csv(path_or_buf='data/pmc/authors.tsv.xz', sep='\t', index=False)

# Export the affiliation table the same way
affiliation_df.to_csv(path_or_buf='data/pmc/affiliations.tsv.xz', sep='\t', index=False)