{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Data exploration of the articles (Automed data)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Articles found (eliminate duplicates)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Read csv\n", "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 831000 articles\n", "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get only the pubmed id\n", "df_p = df.loc[:,'AKE_pubmed_id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 44 duplciated articles\n", "duplicated_rows = df_p[df_p.duplicated(keep=False)]\n", "duplicated_rows.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Threre are duplicates max 2\n", "df_p.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the duplicated articles information\n", "# CHECK THE DUPLCIATES IN PUBMED WEB\n", "df.iloc[duplicated_rows.index]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Rows that will be eliminated because of duplicate\n", "# PROBLEMS: same pmcid, erratum, retracted.\n", "mask = [7016, 28652, 34115, 36974, 101609, 134736, 209387, 237868, 270683, 302438, 308649, 349159, 427168, 444565, 481005, 523773, 527998, 581868, 719666, 726571, 773107, 817015]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop the duplicates and save it\n", "df = df.drop(mask)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save data set after eliminating the duplicates\n", "df.to_csv('./Dataset/Found/Complete/data_pubmed_found_final.csv', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save data set after eliminating the duplicates\n", "df.to_csv('./Dataset/Complete/data_pubmed_all.csv', index=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Articles not found" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_not_found = pd.read_csv('./Dataset/NotFound/Complete/data_pubmed_not_found.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_not_found.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_not_found.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 11372 DUPLCIATED ARTICLES\n", "duplicated_rows = df_not_found[df_not_found['pcmid_AKE'].duplicated(keep=False)]\n", "duplicated_rows.index" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "# Read csv\n", "df = pd.read_csv('./Dataset/Found/Complete/data_pubmed_found_final.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get columns that we want: date, pmcid, title, abstract, keywords, journal\n", "df_p = df.loc[:,['publication_date',\n", " 'AKE_pubmed_id', 'AKE_pubmed_title', 'AKE_abstract',\n", " 'AKE_keywords', 'journal']]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p['journal'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Most recent paper 2022-12-13\n", "df_p['publication_date'].max()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get papers that are 2018 forward, papers that are in the 5 years range \n", "df_p = df_p[df_p['publication_date'] > '2018']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_p['journal'].nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Count the number of occurrences of each unique element\n", "counts = df_p['journal'].value_counts()\n", "\n", "# Filter the dataframe based on the count\n", "filtered_df = df_p[df_p['journal'].isin(counts[counts > 200].index)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filtered_df['journal'].nunique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ast\n", "ast.literal_eval(filtered_df.iloc[0]['AKE_keywords'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the filtered data set\n", "filtered_df.to_csv('Data/Complete/data_pubmed.csv', index=False)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Split data set to 60% train 20% validation and 20% test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('./Dataset/Complete/data_pubmed.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Split the dataset by journal\n", "journals = df['journal'].unique()\n", "train_ratio = 0.6\n", "\n", "train_data = pd.DataFrame()\n", "val_data = pd.DataFrame()\n", "test_data = pd.DataFrame()\n", "\n", "for journal in journals:\n", " # Filter the dataset by journal\n", " journal_data = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(train_data['journal'].nunique())\n", "print(val_data['journal'].nunique())\n", "print(test_data['journal'].nunique())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Train set \\n-----------------------------------------------------')\n", "print(train_data['journal'].value_counts())\n", "print('Validation set \\n-----------------------------------------------------')\n", "print(val_data['journal'].value_counts())\n", "print('Test set \\n-----------------------------------------------------')\n", "print(test_data['journal'].value_counts())" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the train, validation, and test sets to csv files\n", "train_data.to_csv('./Dataset/data_pubmed_train.csv', index=False)\n", "val_data.to_csv('./Dataset/data_pubmed_val.csv', index=False)\n", "test_data.to_csv('./Dataset/data_pubmed_test.csv', index=False)\n" ] }
], "metadata": { "kernelspec": { "display_name": "TFM", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }