# Data exploration of the articles (Automed data)

## Articles found (eliminate duplicates)

In [None]:
import pandas as pd

In [None]:
# Load the CSV of "found" articles into a DataFrame.
df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found.csv')

In [None]:
# Show (rows, columns); the author recorded 831000 articles at this point.
df.shape

In [None]:
# Keep just the PubMed id column (as a Series).
df_p = df['AKE_pubmed_id']

In [None]:
# Flag every row whose PubMed id occurs more than once (keep=False marks all
# copies, not only the later ones); the author counted 44 such rows.
is_repeated = df_p.duplicated(keep=False)
duplicated_rows = df_p.loc[is_repeated]
duplicated_rows.index

In [None]:
# Occurrences per PubMed id; the author observed that no id appears more
# than twice.
df_p.value_counts()

In [None]:
# Show the full records of the duplicated articles so that each one can be
# checked by hand against the PubMed website.
# duplicated_rows.index holds index *labels*, so select with .loc; .iloc is
# positional and only gives the same rows while df keeps its default
# 0..n-1 index.
df.loc[duplicated_rows.index]

In [None]:
# Index labels of the 22 rows that will be dropped from df as duplicates.
# Problems noted by the author for these rows: same pmcid, erratum, retracted.
# NOTE(review): the labels are hard-coded, so they presumably only match the
# exact CSV loaded earlier — re-check them if the input file changes.
mask = [7016, 28652, 34115, 36974, 101609, 134736, 209387, 237868, 270683, 302438, 308649, 349159, 427168, 444565, 481005, 523773, 527998, 581868, 719666, 726571, 773107, 817015]


In [None]:
# Remove the rows whose index labels are listed in mask; the result is
# written to disk in a later cell.
df = df.drop(index=mask)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Write df (duplicates removed) to the "found" folder, without the index column.
df.to_csv('./Dataset/Found/Complete/data_pubmed_found_final.csv', index=False)

In [None]:
# Write the same DataFrame (duplicates removed) a second time, under
# Dataset/Complete, again without the index column.
df.to_csv('./Dataset/Complete/data_pubmed_all.csv', index=False)

## Articles not found

In [None]:
# Load the CSV of "not found" articles into its own DataFrame.
df_not_found = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_not_found.csv')

In [None]:
# (rows, columns) of the not-found table.
df_not_found.shape

In [None]:
# Preview the first rows of the not-found table.
df_not_found.head()

In [None]:
# Rows of the not-found table whose 'pcmid_AKE' value occurs more than once
# (keep=False marks every copy); the author counted 11372 such rows.
repeated_id = df_not_found['pcmid_AKE'].duplicated(keep=False)
duplicated_rows = df_not_found.loc[repeated_id]
duplicated_rows.index

## Preprocessing

In [None]:
import pandas as pd

In [None]:
# Reload the de-duplicated "found" articles written by the previous section.
df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found_final.csv')

In [None]:
# Preview the first rows of the reloaded data.
df.head()

In [None]:
# (rows, columns) of the reloaded data.
df.shape

In [None]:
# Columns kept for preprocessing: publication date, PubMed id, title,
# abstract, keywords and journal.
selected_columns = [
    'publication_date',
    'AKE_pubmed_id',
    'AKE_pubmed_title',
    'AKE_abstract',
    'AKE_keywords',
    'journal',
]
df_p = df[selected_columns]

In [None]:
# Preview the first rows of the column subset.
df_p.head()

In [None]:
# (rows, columns) of the column subset.
df_p.shape

In [None]:
# Number of articles per journal.
df_p['journal'].value_counts()

In [None]:
# Largest publication_date value; the author recorded 2022-12-13 as the most
# recent paper.
df_p['publication_date'].max()

In [None]:
# Keep papers from 2018 onward, i.e. the five-year range of the data.
# NOTE(review): the comparison against '2018' presumably works on
# 'YYYY-MM-DD' text values — confirm the column format.
published_since_2018 = df_p['publication_date'] > '2018'
df_p = df_p.loc[published_since_2018]

In [None]:
# (rows, columns) of df_p at this point.
df_p.shape

In [None]:
# Number of distinct journals in df_p.
df_p['journal'].nunique()

In [None]:
# How many articles each journal has
counts = df_p['journal'].value_counts()

# Keep only the articles of journals that have more than 200 articles
frequent_journals = counts.loc[counts > 200].index
in_frequent_journal = df_p['journal'].isin(frequent_journals)
filtered_df = df_p.loc[in_frequent_journal]

In [None]:
# (rows, columns) of the filtered data.
filtered_df.shape

In [None]:
# Preview the first rows of the filtered data.
filtered_df.head()

In [None]:
# Number of distinct journals in the filtered data.
filtered_df['journal'].nunique()

In [None]:
import ast

# Parse the keywords cell of the first row as a Python literal.
first_keywords = filtered_df['AKE_keywords'].iloc[0]
ast.literal_eval(first_keywords)

In [None]:
# Save the filtered data set, without the index column.
# The file goes under ./Dataset/Complete because that is where the split
# section reads data_pubmed.csv from; the previous 'Data/Complete' target did
# not match that path.
filtered_df.to_csv('./Dataset/Complete/data_pubmed.csv', index=False)

## Split the data set into 60% train, 20% validation and 20% test

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the data set that will be split into train, validation and test.
df = pd.read_csv('./Dataset/Complete/data_pubmed.csv')

In [None]:
# Split the data set journal by journal so that every journal contributes
# 60% of its rows to train, 20% to validation and 20% to test.
journals = df['journal'].unique()
train_ratio = 0.6

# Collect the per-journal pieces and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration and starts from an empty DataFrame.
train_parts = []
val_parts = []
test_parts = []

for journal in journals:
    # Rows that belong to the current journal
    journal_data = df[df['journal'] == journal]

    # First cut: train_ratio for train, the remainder is shared out below
    train, val_test = train_test_split(journal_data, train_size=train_ratio, random_state=42)
    # Second cut: halve the remainder into validation and test
    val, test = train_test_split(val_test, train_size=0.5, random_state=42)

    train_parts.append(train)
    val_parts.append(val)
    test_parts.append(test)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# there were no journals to split.
train_data = pd.concat(train_parts) if train_parts else pd.DataFrame()
val_data = pd.concat(val_parts) if val_parts else pd.DataFrame()
test_data = pd.concat(test_parts) if test_parts else pd.DataFrame()

# Print the number of rows for each set
print("Training data:", len(train_data))
print("Validation data:", len(val_data))
print("Test data:", len(test_data))

In [None]:
# Number of distinct journals in each split (train, validation, test).
for split_frame in (train_data, val_data, test_data):
    print(split_frame['journal'].nunique())

In [None]:
# Per-journal row counts of each split, each preceded by a header line.
for header, split_frame in (
    ('Train set \n-----------------------------------------------------', train_data),
    ('Validation set \n-----------------------------------------------------', val_data),
    ('Test set \n-----------------------------------------------------', test_data),
):
    print(header)
    print(split_frame['journal'].value_counts())

In [None]:
# Write each split to its own CSV file, leaving out the index column.
for csv_path, split_frame in (
    ('./Dataset/data_pubmed_train.csv', train_data),
    ('./Dataset/data_pubmed_val.csv', val_data),
    ('./Dataset/data_pubmed_test.csv', test_data),
):
    split_frame.to_csv(csv_path, index=False)
